In [1]:
import torch
import pandas as pd
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init
from tqdm import tqdm
from torchtext.vocab import GloVe
from torchtext import data

# data 

### my dataset 

In [2]:
def get_dataset(csv_data, id_field, text_field, label_field, is_final_valid = False):
    """Convert the columns of ``csv_data`` into a list of torchtext examples.

    Args:
        csv_data: Table indexed by column name (e.g. a pandas DataFrame).
            The ``'id'`` and ``'new'`` columns are always read;
            ``'meanGrade'`` is read only when ``is_final_valid`` is false.
        id_field: Field object paired with the ``'id'`` column.
        text_field: Field object paired with the ``'new'`` column.
        label_field: Field object paired with the ``'meanGrade'`` column;
            unused when ``is_final_valid`` is true.
        is_final_valid: When true, build unlabelled examples that carry only
            ``id`` and ``new``.

    Returns:
        A tuple ``(examples, fields)``: one ``data.Example.fromlist`` result
        per row, and the list of ``(name, field)`` pairs they were built with.
    """
    # The label column is the only difference between the two modes.
    schema = [('id', id_field), ('new', text_field)]
    if not is_final_valid:
        schema.append(('meanGrade', label_field))

    columns = [csv_data[column_name] for column_name, _ in schema]
    examples = [
        data.Example.fromlist(list(row), schema)
        for row in tqdm(zip(*columns))
    ]
    return examples, schema

### load data

In [None]:
# torchtext field definitions: TEXT tokenises with spaCy, LABEL holds the float
# regression target, ID carries the row identifier through batching.
TEXT = data.Field(sequential=True, tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float, use_vocab=False)
ID = data.LabelField(use_vocab=False)

train_path = "../data/task-1/train2.csv"
final_valid_path = "../data/task-1/dev2.csv"

df = pd.read_csv(train_path)
final_valid = pd.read_csv(final_valid_path)

# Hold out the last 1000 rows as the test split, the 1000 rows before them as
# the validation split, and train on whatever remains at the front.
test = df.iloc[-1000:].reset_index(drop=True)
train = df.iloc[:len(df) - 1000]
valid = train.iloc[-1000:].reset_index(drop=True)
train = train.iloc[:len(train) - 1000]

### split data 

In [None]:
# Labelled splits: convert each frame to examples, then wrap them in a Dataset.
train_examples, train_fields = get_dataset(train, ID, TEXT, LABEL)
train_data = data.Dataset(train_examples, train_fields)

valid_examples, valid_fields = get_dataset(valid, ID, TEXT, LABEL)
valid_data = data.Dataset(valid_examples, valid_fields)

test_examples, test_fields = get_dataset(test, ID, TEXT, LABEL)
test_data = data.Dataset(test_examples, test_fields)

# Unlabelled split: built without the 'meanGrade' column.
final_valid_examples, final_valid_fields = get_dataset(
    final_valid, ID, TEXT, LABEL, is_final_valid=True)
final_valid_data = data.Dataset(final_valid_examples, final_valid_fields)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(f'Number of final_valid examples: {len(final_valid_data)}')

### import Glove

In [4]:
# Build the token vocabulary from the training split only and request the
# "glove.840B.300d" pretrained vectors; torch.Tensor.normal_ is passed as the
# initialiser for tokens that have no pretrained vector.
TEXT.build_vocab(train_data,vectors="glove.840B.300d", unk_init=torch.Tensor.normal_)
# NOTE(review): LABEL was created with use_vocab=False, so this vocabulary is
# presumably only needed for inspection — confirm before removing.
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")


In [None]:
# Sanity-check the vocabularies: the 20 most frequent tokens, the label
# string-to-index mapping, and the 40 most frequent label values.
print(TEXT.vocab.freqs.most_common(20))
print(LABEL.vocab.stoi)
print(LABEL.vocab.freqs.most_common(40))

### batches example

In [82]:
# Mini-batch size shared by every iterator created below.
BATCH_SIZE = 32

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per dataset, unpacked in the same order as the tuple passed in.
# The training and evaluation functions read each batch through the field
# names: batch.new, batch.meanGrade and batch.id.
train_iterator, valid_iterator, test_iterator, final_valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data, final_valid_data), 
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.new), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    device=device)

# BiLSTM 

In [83]:
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMModel(nn.Module):
    """Single-layer bidirectional LSTM that maps a token-id sequence to ``output_dim`` values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        output_dim: Size of the final linear layer's output.
        pad_idx: Vocabulary index of the padding token (passed to
            ``nn.Embedding`` as ``padding_idx``).
        hidden_dim: Hidden size of each LSTM direction.
        dropout: Dropout probability applied to the embeddings and to the
            concatenated final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx, hidden_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return predictions of shape ``[batch_size, output_dim]``.

        Args:
            text: Token ids shaped ``[sent_len, batch_size]``.
        """
        # embedded: [sent_len, batch_size, embedding_dim]
        embedded = self.dropout(self.embedding(text))
        # The sequence is fed to the LSTM unpacked, so padded positions are
        # processed like any other token.
        _, (hidden, _) = self.lstm(embedded)
        # Join the final hidden states of the two directions:
        # [batch_size, hidden_dim * num_directions]
        hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
        # No squeeze(0) here: `hidden` is already 2-D, and squeezing dim 0
        # dropped the batch axis whenever a batch held a single example, which
        # made the callers' `.squeeze(1)` raise.
        return self.fc(hidden)

# training

## parameters

In [84]:
# Model hyper-parameters. INPUT_DIM and PAD_IDX are derived from the TEXT
# vocabulary so the embedding table lines up with the token indices.
INPUT_DIM = len(TEXT.vocab)
# Passed to the model as embedding_dim; the pretrained vectors copied into the
# embedding layer later must have this width.
EMBEDDING_DIM = 300
OUTPUT_DIM = 1 # Classification: num_labels/Regression: 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
# Hidden size per LSTM direction; the model's linear head sees twice this.
HIDDEN_SIZE = 16 
DROPOUT = 0.5 

model = BiLSTMModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX, HIDDEN_SIZE, DROPOUT)

In [85]:
def count_parameters(model):
    """Return the total number of scalar elements across ``model``'s trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,776,337 trainable parameters


In [86]:
# Overwrite the freshly initialised embedding table in place with the vectors
# attached to the TEXT vocabulary. As the last expression of the cell,
# copy_ also echoes the resulting weight tensor.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.4284e+00,  2.0316e+00, -1.8117e+00,  ..., -1.6376e+00,
         -9.5069e-01, -2.9310e-01],
        [ 4.0436e-01,  5.9245e-01,  7.6357e-04,  ..., -3.6533e-01,
         -1.1878e-01, -7.8427e-01],
        [-3.3637e-01,  4.1678e-02,  4.8849e-01,  ..., -1.6988e-01,
          2.0799e-02,  4.8053e-01],
        ...,
        [ 2.7073e-01, -8.7394e-02, -3.6834e-01,  ...,  3.6836e-02,
         -2.1718e-01, -3.4061e-01],
        [-5.4644e-02, -3.8569e-01, -1.1224e-01,  ..., -2.1691e-02,
         -8.6189e-02, -6.3430e-02],
        [ 1.6838e-01, -6.9567e-02, -9.5079e-02,  ...,  3.8989e-02,
          5.1054e-01, -9.0771e-02]])

In [87]:
# Index of the unknown-word token in the TEXT vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the <unk> and <pad> rows to all-zero vectors, replacing whatever the
# pretrained-vector copy put there.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## training

In [88]:
import torch.optim as optim

# Adam with its default hyper-parameters over every model parameter.
optimizer = optim.Adam(model.parameters())
# reduction='sum' makes each batch loss the SUM of squared errors, so the
# numbers reported by trainModel/evaluate are per-batch sums averaged over
# batches, not a per-example MSE.
criterion = nn.MSELoss(reduction='sum') # TODO
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [89]:
def trainModel(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Module called on ``batch.new``; its output is squeezed on dim 1.
        iterator: Sized iterable of batches exposing ``new`` and ``meanGrade``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, batch.meanGrade)``.

    Returns:
        The sum of the per-batch loss values divided by ``len(iterator)``.
    """
    model.train()
    running_total = 0

    for minibatch in iterator:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        outputs = model(minibatch.new).squeeze(1)
        batch_loss = criterion(outputs, minibatch.meanGrade)
        batch_loss.backward()
        optimizer.step()
        running_total += batch_loss.item()

    return running_total / len(iterator)

In [90]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it and return the mean batch loss.

    Args:
        model: Module called on ``batch.new``; its output is squeezed on dim 1.
        iterator: Sized iterable of batches exposing ``new`` and ``meanGrade``.
        criterion: Loss called as ``criterion(predictions, batch.meanGrade)``.

    Returns:
        The sum of the per-batch loss values divided by ``len(iterator)``.
    """
    model.eval()
    # Gradients are not needed for scoring.
    with torch.no_grad():
        batch_losses = [
            criterion(model(minibatch.new).squeeze(1), minibatch.meanGrade).item()
            for minibatch in iterator
        ]
    return sum(batch_losses) / len(iterator)

In [91]:
# Number of full passes over the training iterator.
N_EPOCHS = 10

# Lowest validation loss seen so far; starting at +inf means any finite
# first-epoch loss triggers a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss = trainModel(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    # Checkpoint only on a strict improvement, so 'lstm-model.pth' always holds
    # the weights with the lowest validation loss seen so far, while `model`
    # itself keeps training past that point.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstm-model.pth')
        print("save")
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValid Loss: {valid_loss:.3f}')
    
    

save
Epoch: 01
	Train Loss: 16.048
	Valid Loss: 11.289
save
Epoch: 02
	Train Loss: 11.640
	Valid Loss: 11.107
Epoch: 03
	Train Loss: 10.928
	Valid Loss: 11.122
save
Epoch: 04
	Train Loss: 9.672
	Valid Loss: 11.040
Epoch: 05
	Train Loss: 8.776
	Valid Loss: 11.081
Epoch: 06
	Train Loss: 7.896
	Valid Loss: 11.176
save
Epoch: 07
	Train Loss: 7.340
	Valid Loss: 10.966
Epoch: 08
	Train Loss: 6.755
	Valid Loss: 11.094
Epoch: 09
	Train Loss: 6.375
	Valid Loss: 11.707
Epoch: 10
	Train Loss: 5.794
	Valid Loss: 11.596


# Evaluation (RMSE)

In [92]:
import numpy as np
def myMSE(model, iterator):
    """Predict on every batch of ``iterator``, print the RMSE and return the per-row results.

    Despite the name, the printed metric is the root-mean-squared error
    between ``batch.meanGrade`` and the model output.

    Args:
        model: Module called on ``batch.new``; its output is squeezed on dim 1.
        iterator: Iterable of batches exposing ``new``, ``meanGrade`` and ``id``.

    Returns:
        A DataFrame with float columns ``id``, ``real`` and ``pred``, one row
        per example, in iteration order.
    """
    # Collect into Python lists and convert once at the end; np.append inside
    # the loop re-copied the whole array on every batch.
    ids, targets, preds = [], [], []

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            preds.extend(model(batch.new).squeeze(1).tolist())
            targets.extend(batch.meanGrade.tolist())
            ids.extend(batch.id.tolist())

    # Every column stays float64 (ids included) so downstream cells that round
    # and cast the ids keep working unchanged.
    df = pd.DataFrame({
        'id': np.asarray(ids, dtype=float),
        'real': np.asarray(targets, dtype=float),
        'pred': np.asarray(preds, dtype=float),
    })
    rmse = np.sqrt(np.mean((df['real'] - df['pred'])**2))

    print(rmse)
    return df

# Test the model from the last epoch

In [93]:
# RMSE of the last-epoch weights on the training split.
# NOTE(review): this rebinds `df`, which until now held the raw training CSV.
df = myMSE(model, train_iterator)

0.34131570232640285


In [94]:
# Show the training predictions ordered by true grade.
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
7651,1591.0,0.0,0.219359
5312,3034.0,0.0,0.666898
1214,5077.0,0.0,0.393513
5296,7204.0,0.0,1.059074
5290,4912.0,0.0,0.257849
...,...,...,...
7369,1664.0,2.8,1.849426
1469,7255.0,2.8,1.812325
1653,3500.0,2.8,1.941720
1561,1229.0,3.0,1.937473


In [95]:
# RMSE of the last-epoch weights on the validation split.
df = myMSE(model, valid_iterator)

0.6091482274242547


In [96]:
# Show the validation predictions ordered by true grade.
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
724,12928.0,0.0,0.598634
101,4984.0,0.0,0.336657
313,5169.0,0.0,1.125902
647,8479.0,0.0,1.180478
320,14376.0,0.0,0.377735
...,...,...,...
575,4142.0,2.6,1.768698
452,9899.0,2.6,1.111197
327,7693.0,2.6,1.365898
695,11170.0,2.8,1.242616


# Test the best model (lowest validation loss)

In [97]:
# Restore the checkpoint written by the training loop (lowest validation loss)
# before re-scoring. map_location remaps the stored tensors onto the current
# `device`, so a checkpoint saved on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load("lstm-model.pth", map_location=device))
df = myMSE(model, train_iterator)

0.3805651567891106


In [98]:
# Show the best-checkpoint training predictions ordered by true grade.
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
5974,5622.0,0.0,0.479883
1600,4123.0,0.0,0.266592
1606,1016.0,0.0,0.377563
5916,5879.0,0.0,0.602375
6850,5900.0,0.0,0.493271
...,...,...,...
4555,772.0,2.8,1.882243
6801,1664.0,2.8,1.615961
4798,9703.0,2.8,1.551853
4836,1229.0,3.0,1.833558


In [99]:
# RMSE of the currently loaded weights on the validation split.
df = myMSE(model, valid_iterator)

0.5923695042997721


In [100]:
# Show the validation predictions ordered by true grade.
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
724,12928.0,0.0,0.598558
101,4984.0,0.0,0.382028
313,5169.0,0.0,1.107366
647,8479.0,0.0,1.149551
320,14376.0,0.0,0.473654
...,...,...,...
575,4142.0,2.6,1.480583
452,9899.0,2.6,1.114315
327,7693.0,2.6,1.272936
695,11170.0,2.8,1.191589


In [101]:
# RMSE of the currently loaded weights on the held-out test split.
df = myMSE(model, test_iterator)

0.5962360053556518


In [103]:
# Show the test predictions ordered by predicted grade (not by true grade).
df.sort_values(by=['pred'])

Unnamed: 0,id,real,pred
555,13902.0,0.2,0.130202
954,5802.0,0.4,0.152644
758,10344.0,0.4,0.205137
509,10884.0,0.6,0.219578
663,12378.0,1.8,0.253998
...,...,...,...
658,9500.0,1.8,1.797751
939,12863.0,1.4,1.802699
553,14526.0,1.2,1.803328
778,5550.0,0.6,1.808452


In [70]:
def getFinal_Valid(model, iterator):
    """Predict on every batch of an unlabelled ``iterator`` and return ids with predictions.

    Args:
        model: Module called on ``batch.new``; its output is squeezed on dim 1.
        iterator: Iterable of batches exposing ``new`` and ``id``.

    Returns:
        A DataFrame with float columns ``id`` and ``pred``, one row per
        example, in iteration order.
    """
    # Collect into Python lists and convert once at the end; np.append inside
    # the loop re-copied the whole array on every batch.
    ids, preds = [], []

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            preds.extend(model(batch.new).squeeze(1).tolist())
            ids.extend(batch.id.tolist())

    # ids stay float64, as before, so callers that round and cast them to int
    # keep working unchanged.
    df = pd.DataFrame({
        'id': np.asarray(ids, dtype=float),
        'pred': np.asarray(preds, dtype=float),
    })

    return df

In [71]:
# Predict on the unlabelled final-validation split with the currently loaded weights.
df = getFinal_Valid(model, final_valid_iterator)
# getFinal_Valid returns the ids as floats, so convert them back to integers.
df['id'] = df['id'].round(0).astype(int)
df.sort_values(by=['id'])

Unnamed: 0,id,pred
1938,4,1.092751
940,13,1.099297
723,15,1.059714
2237,25,1.071770
436,30,0.852402
...,...,...
1866,15084,0.931598
71,15086,0.508013
590,15091,1.082323
1278,15093,0.532556


In [72]:
# Join the predictions with the original final-validation frame. With no `on=`
# argument pandas merges on the columns the two frames have in common.
# NOTE(review): this rebinds `data`, shadowing the `torchtext.data` module
# imported at the top of the notebook. Re-running earlier cells that call
# data.Field / data.Dataset / data.BucketIterator after this cell will fail.
data = pd.merge(df,final_valid)
# Keep only the two columns written to the output file.
data = data[['id','pred']] 
data.sort_values(by=['id'])

Unnamed: 0,id,pred
1938,4,1.092751
940,13,1.099297
723,15,1.059714
2237,25,1.071770
436,30,0.852402
...,...,...
1866,15084,0.931598
71,15086,0.508013
590,15091,1.082323
1278,15093,0.532556


In [74]:
# Write the id/pred table to disk without the DataFrame index column.
out_loc = '../output/task-1-output.csv'
data.to_csv(out_loc, index=False)