In [9]:
import torch
import pandas as pd
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init
from tqdm import tqdm
from torchtext.vocab import GloVe
from torchtext import data

# data 

### my dataset 

In [10]:
def get_dataset(csv_data, id_field, text_field, label_field, is_final_valid = False):
    fields = [('id', id_field), ('new', text_field), ('original2', text_field), ('meanGrade', label_field)]
    fields2 = [('id', id_field), ('new', text_field), ('original2', text_field)]
    examples = []
    if is_final_valid:
        for myid, text1, text2 in tqdm(zip(csv_data['id'], csv_data['new'], csv_data['original2'])):
            examples.append(data.Example.fromlist([myid, text1, text2], fields2))
        return examples, fields2
    else:
        for myid, text1, text2, label in tqdm(zip(csv_data['id'], csv_data['new'], csv_data['original2'], csv_data['meanGrade'])):
            examples.append(data.Example.fromlist([myid, text1, text2, label], fields))
        return examples, fields

### load data

In [11]:
TEXT = data.Field(tokenize='spacy',sequential=True)
LABEL = data.LabelField(use_vocab=False, dtype=torch.float)
ID = data.LabelField(use_vocab=False)

train_path = "data/task-1/train2.csv"
final_valid_path = "data/task-1/dev2.csv"
    
df = pd.read_csv(train_path)
final_valid = pd.read_csv(final_valid_path)

test = df[-1000:]
test = test.reset_index(drop=True)
train = df[:len(df)-1000]
valid = train[-1000:]
valid = valid.reset_index(drop=True)
train = train[:len(train)-1000]

### split data 

In [12]:
train_examples, train_fields = get_dataset(train, ID, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(valid, ID, TEXT, LABEL)
# test_examples, test_fields = get_dataset(test, TEXT, None, True)
test_examples, test_fields = get_dataset(test, ID, TEXT, LABEL)
final_valid_examples, final_valid_fields = get_dataset(final_valid, ID, TEXT, LABEL, True)


train_data = data.Dataset(train_examples, train_fields)
valid_data = data.Dataset(valid_examples, valid_fields)
test_data = data.Dataset(test_examples, test_fields)
final_valid_data = data.Dataset(final_valid_examples, final_valid_fields)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(f'Number of final_valid examples: {len(final_valid_data)}')

7652it [00:01, 7186.73it/s]
1000it [00:00, 8888.72it/s]
1000it [00:00, 8583.93it/s]
2419it [00:00, 6741.65it/s]


Number of training examples: 7652
Number of validation examples: 1000
Number of testing examples: 1000
Number of final_valid examples: 2419


### import Glove

In [13]:
TEXT.build_vocab(train_data,vectors="glove.840B.300d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")


Unique tokens in TEXT vocabulary: 12452
Unique tokens in LABEL vocabulary: 33


In [14]:
print(TEXT.vocab.freqs.most_common(20))
print(LABEL.vocab.stoi)
print(LABEL.vocab.freqs.most_common(40))

[('Trump', 2737), ('to', 2265), ("'", 2250), (',', 1881), (':', 1424), ('in', 1387), ('the', 1266), ("'s", 1219), ('of', 1204), ('-', 1036), ('for', 862), ('on', 843), ('a', 725), ('’s', 632), ('and', 615), ('is', 586), ('.', 498), ('with', 464), ('The', 429), ('’', 427)]
defaultdict(None, {0.8: 0, 0.6: 1, 1.0: 2, 0.4: 3, 1.2: 4, 0.2: 5, 1.4: 6, 1.6: 7, 0.0: 8, 1.8: 9, 2.0: 10, 2.2: 11, 2.4: 12, 2.6: 13, 1.3: 14, 1.1: 15, 2.8: 16, 0.9: 17, 1.5: 18, 0.5: 19, 0.7: 20, 1.9: 21, 1.7: 22, 0.3: 23, 2.1: 24, 3.0: 25, 0.8666666666666667: 26, 1.3333333333333333: 27, 1.5333333333333334: 28, 2.066666666666667: 29, 2.1333333333333333: 30, 2.3: 31, 2.5: 32})
[(0.8, 947), (0.6, 924), (1.0, 918), (0.4, 870), (1.2, 816), (0.2, 746), (1.4, 635), (1.6, 509), (0.0, 406), (1.8, 334), (2.0, 239), (2.2, 131), (2.4, 75), (2.6, 29), (1.3, 13), (1.1, 10), (2.8, 9), (0.9, 7), (1.5, 6), (0.5, 4), (1.9, 4), (0.7, 4), (1.7, 3), (3.0, 2), (2.1, 2), (0.3, 2), (2.066666666666667, 1), (2.1333333333333333, 1), (2.5, 1)

### baches example

In [15]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator, final_valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data, final_valid_data), 
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.new), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    device=device)

# Diff_BiLSTM 

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Diff_BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx, hidden_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=2) ### TODO
        self.fc_o = nn.Linear(hidden_dim*4, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text1, text2):
        # text[sent_len, batch_size]
        embedded1 = self.embedding(text1) 
        # embeded[sent_len, batch_size, embedding_dim]
        embedded1 = self.dropout(embedded1) 
        output1, (hidden1, cell1) = self.lstm(embedded1)
        hidden1 = self.dropout(torch.cat((hidden1[-2,:,:], hidden1[-1,:,:]), dim=1))
        # hidden[batch_size, hidden_dim * num_directions]
        
        embedded2 = self.embedding(text2) 
        embedded2 = self.dropout(embedded2) 
        output2, (hidden2, cell1) = self.lstm(embedded2)
        hidden2 = self.dropout(torch.cat((hidden2[-2,:,:], hidden2[-1,:,:]), dim=1))
        # hidden[batch_size, hidden_dim * num_directions]
        hidden = (hidden1-hidden2).squeeze(0)
        return self.fc_h(hidden)

# training

## parameters

In [17]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
OUTPUT_DIM = 1 # Classification: num_labels/Regression: 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
HIDDEN_SIZE = 16 
DROPOUT = 0.5 

model = BiLSTMModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX, HIDDEN_SIZE, DROPOUT)

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,776,337 trainable parameters


In [19]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
model.embedding.weight.requires_grad = False
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

tensor([[ 1.2156,  0.3010,  0.0606,  ..., -1.1458, -0.6260,  1.7256],
        [-0.5369, -1.2060, -0.2253,  ...,  0.7642, -0.8394,  0.8487],
        [-0.3364,  0.0417,  0.4885,  ..., -0.1699,  0.0208,  0.4805],
        ...,
        [ 0.2707, -0.0874, -0.3683,  ...,  0.0368, -0.2172, -0.3406],
        [-0.0546, -0.3857, -0.1122,  ..., -0.0217, -0.0862, -0.0634],
        [ 0.1684, -0.0696, -0.0951,  ...,  0.0390,  0.5105, -0.0908]])

## training

In [21]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters()) ### lr
criterion = nn.MSELoss(reduction='sum') 
model = model.to(device)
criterion = criterion.to(device)

In [22]:
def trainModel(model, iterator, optimizer, criterion):
    epoch_loss = 0
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.new, batch.original2).squeeze(1)
        loss = criterion(predictions, batch.meanGrade)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.new, batch.original2).squeeze(1)
            loss = criterion(predictions, batch.meanGrade)
        
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

In [None]:
N_EPOCHS = 40

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss = trainModel(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstm-model.pth')
        print("save")
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValid Loss: {valid_loss:.3f}')
    
    

# Evaluation (MSE)

In [17]:
import numpy as np
def myMSE(model, iterator):
    pred_list = np.array([])
    real_list = np.array([])
    id_list = np.array([])
    
#     Result_Dict = sorted(list(set(train['meanGrade'])))
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.new).squeeze(1)
            pred = np.array(predictions.data.tolist())
            real = np.array(batch.meanGrade.data.tolist())
            myid = np.array(batch.id.data.tolist())
            real_list = np.append(real_list, real)
            pred_list = np.append(pred_list, pred)
            id_list = np.append(id_list, myid)

    
# #     csv['pred_label'] = pred_list.round(0).astype(int)
#     csv['pred'] = [Result_Dict[i] for i in csv['pred_label']]


    df = pd.DataFrame({'id':id_list, 'real':real_list, 'pred':pred_list})
    rmse = np.sqrt(np.mean((df['real'] - df['pred'])**2))
            
    print(rmse)
    return df

# 测试最后一次的model

In [18]:
df = myMSE(model, train_iterator)

0.3361164227368936


In [19]:
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
779,11941.0,0.0,0.719049
345,6276.0,0.0,0.602416
2787,2969.0,0.0,0.248862
2785,4767.0,0.0,0.309740
6441,7860.0,0.0,0.509659
...,...,...,...
3638,7255.0,2.8,1.811278
2448,7594.0,2.8,1.631923
3086,590.0,2.8,1.947464
480,3404.0,3.0,1.699795


In [20]:
df = myMSE(model, valid_iterator)

0.6122093213384863


In [21]:
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
724,12928.0,0.0,0.407617
101,4984.0,0.0,0.357371
313,5169.0,0.0,1.207764
647,8479.0,0.0,1.044237
320,14376.0,0.0,0.288494
...,...,...,...
575,4142.0,2.6,1.704642
452,9899.0,2.6,1.109124
327,7693.0,2.6,1.295462
695,11170.0,2.8,1.675464


# 测试 best的model

In [22]:
model.load_state_dict(torch.load("lstm-model.pth"))
df = myMSE(model, train_iterator)

0.47532969415169535


In [23]:
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
1775,13379.0,0.0,0.960282
3175,12094.0,0.0,0.927450
7580,8069.0,0.0,1.147470
3152,8228.0,0.0,0.611814
1247,10522.0,0.0,0.968475
...,...,...,...
2411,9933.0,2.8,1.477611
1787,1664.0,2.8,1.250922
5289,7255.0,2.8,1.313781
5404,3404.0,3.0,1.375417


In [24]:
df = myMSE(model, valid_iterator)

0.5862284263531602


In [25]:
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
724,12928.0,0.0,0.661611
101,4984.0,0.0,0.604616
313,5169.0,0.0,1.001791
647,8479.0,0.0,1.141619
320,14376.0,0.0,0.576823
...,...,...,...
575,4142.0,2.6,1.042276
452,9899.0,2.6,0.881399
327,7693.0,2.6,1.367787
695,11170.0,2.8,1.414090


In [None]:
df = myMSE(model, test_iterator)

In [None]:
df.sort_values(by=['pred'])

In [None]:
def getFinal_Valid(model, iterator):
    pred_list = np.array([])
    id_list = np.array([])
    
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.new).squeeze(1)
            pred = np.array(predictions.data.tolist())
            myid = np.array(batch.id.data.tolist())
            pred_list = np.append(pred_list, pred)
            id_list = np.append(id_list, myid)

    df = pd.DataFrame({'id':id_list,'pred':pred_list})
            
    return df

In [None]:
df = getFinal_Valid(model, final_valid_iterator)
df['id'] = df['id'].round(0).astype(int)
df.sort_values(by=['id'])

In [None]:
data = pd.merge(df,final_valid)
data = data[['id','pred']] 
data.sort_values(by=['id'])

In [None]:
out_loc = '../output/task-1-output.csv'
data.to_csv(out_loc, index=False)