In [39]:
import torch
import pandas as pd
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init
from tqdm import tqdm
from torchtext.vocab import GloVe
from torchtext import data

# data 

### my dataset 

In [40]:
def get_dataset(csv_data, id_field, text_field, label_field, is_final_valid = False):
    fields = [('id', id_field), ('new', text_field), ('meanGrade', label_field)]
    fields2 = [('id', id_field), ('new', text_field)]
    examples = []
    if is_final_valid:
        for myid, text in tqdm(zip(csv_data['id'], csv_data['new'])):
            examples.append(data.Example.fromlist([myid, text], fields2))
        return examples, fields2
    else:
        for myid, text, label in tqdm(zip(csv_data['id'], csv_data['new'], csv_data['meanGrade'])):
            examples.append(data.Example.fromlist([myid, text, label], fields))
        return examples, fields

### load data

In [41]:
#tokenize = lambda x: x.split()
TEXT = data.RawField()
LABEL = data.LabelField(use_vocab=False, dtype=torch.float)
ID = data.LabelField(use_vocab=False)

train_path = "data/task-1/train2.csv"
final_valid_path = "data/task-1/dev2.csv"
    
df = pd.read_csv(train_path)
final_valid = pd.read_csv(final_valid_path)

test = df[-1000:]
test = test.reset_index(drop=True)
train = df[:len(df)-1000]
valid = train[-1000:]
valid = valid.reset_index(drop=True)
train = train[:len(train)-1000]

### split data 

In [42]:
train_examples, train_fields = get_dataset(train, ID, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(valid, ID, TEXT, LABEL)
# test_examples, test_fields = get_dataset(test, TEXT, None, True)
test_examples, test_fields = get_dataset(test, ID, TEXT, LABEL)
final_valid_examples, final_valid_fields = get_dataset(final_valid, ID, TEXT, LABEL, True)


train_data = data.Dataset(train_examples, train_fields)
valid_data = data.Dataset(valid_examples, valid_fields)
test_data = data.Dataset(test_examples, test_fields)
final_valid_data = data.Dataset(final_valid_examples, final_valid_fields)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(f'Number of final_valid examples: {len(final_valid_data)}')

7652it [00:00, 207245.16it/s]
1000it [00:00, 7881.64it/s]
1000it [00:00, 224992.17it/s]
2419it [00:00, 287227.42it/s]

Number of training examples: 7652
Number of validation examples: 1000
Number of testing examples: 1000
Number of final_valid examples: 2419





### import Glove

In [43]:
# TEXT.build_vocab(train_data,vectors="glove.840B.300d", unk_init=torch.Tensor.normal_) 
# TEXT.build_vocab(train_data) 
LABEL.build_vocab(train_data)
# print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")


Unique tokens in LABEL vocabulary: 33


In [44]:
# print(TEXT.vocab.freqs.most_common(20))
# print(TEXT.vocab.itos[3])
# print(LABEL.vocab.freqs.most_common(40))

### baches example

In [45]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator, final_valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data, final_valid_data), 
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.new), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    device=device)

# Elmo_BiLSTM 

In [59]:
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMModel(nn.Module):
    def __init__(self, embedding_dim, output_dim, hidden_dim, dropout):
        super().__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True) 
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, embedding):
        embedding = self.dropout(embedding) 
        # embedding[sent_len, betch_size, embedding_dim]
        
        # text[sent_len, batch_size]
        # embedded = self.embedding()
        # embeded[sent_len, batch_size, embedding_dim]
        output, (hidden, cell) = self.lstm(embedding) 
        hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
        # hidden[batch_size, hidden_dim * num_directions]
        return self.fc(hidden.squeeze(0))

# training

## parameters

In [60]:
EMBEDDING_DIM = 1024
OUTPUT_DIM = 1 # Classification: num_labels/Regression: 1
HIDDEN_SIZE = 16 
DROPOUT = 0.5 

model = BiLSTMModel(EMBEDDING_DIM, OUTPUT_DIM, HIDDEN_SIZE, DROPOUT)

In [61]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 133,409 trainable parameters


## training

In [62]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.MSELoss(reduction='sum') # TODO
model = model.to(device)
criterion = criterion.to(device)

### Elmo

In [63]:
from allennlp.modules.elmo import Elmo, batch_to_ids
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)

def get_elmo_embeddings(batch):
    sentences = [sen.split()for sen in batch]
    character_ids = batch_to_ids(sentences)
    embeddings = elmo(character_ids)
    embedding = embeddings['elmo_representations'][0].permute(1,0,2)
    ### embedding [sent_len, batch_size, embed_dim]
    return embedding
    

In [64]:
def trainModel(model, iterator, optimizer, criterion):
    epoch_loss = 0
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        embeddings = get_elmo_embeddings(batch.new)
        predictions = model(embeddings).squeeze(1)
        loss = criterion(predictions, batch.meanGrade)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        
    return epoch_loss / len(iterator)
        

In [65]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            embeddings = get_elmo_embeddings(batch.new)
            predictions = model(embeddings).squeeze(1)
            loss = criterion(predictions, batch.meanGrade)
        
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

In [66]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    trainModel(model, train_iterator, optimizer, criterion)
    
    train_loss = trainModel(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstm-model.pth')
        print("save")
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValid Loss: {valid_loss:.3f}')
    
    

save
Epoch: 01
	Train Loss: 12.229
	Valid Loss: 11.053
save
Epoch: 02
	Train Loss: 11.207
	Valid Loss: 10.730
save
Epoch: 03
	Train Loss: 10.704
	Valid Loss: 10.516
save
Epoch: 04
	Train Loss: 10.190
	Valid Loss: 10.205
Epoch: 05
	Train Loss: 9.744
	Valid Loss: 10.747
save
Epoch: 06
	Train Loss: 9.601
	Valid Loss: 10.102
Epoch: 07
	Train Loss: 9.250
	Valid Loss: 10.124
save
Epoch: 08
	Train Loss: 9.022
	Valid Loss: 9.897
Epoch: 09
	Train Loss: 8.791
	Valid Loss: 10.402
Epoch: 10
	Train Loss: 8.470
	Valid Loss: 10.549


# Evaluation (MSE)

In [69]:
import numpy as np
def myMSE(model, iterator):
    pred_list = np.array([])
    real_list = np.array([])
    id_list = np.array([])
    
#     Result_Dict = sorted(list(set(train['meanGrade'])))
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            embeddings = get_elmo_embeddings(batch.new)
            predictions = model(embeddings).squeeze(1)
            pred = np.array(predictions.data.tolist())
            real = np.array(batch.meanGrade.data.tolist())
            myid = np.array(batch.id.data.tolist())
            real_list = np.append(real_list, real)
            pred_list = np.append(pred_list, pred)
            id_list = np.append(id_list, myid)

    
# #     csv['pred_label'] = pred_list.round(0).astype(int)
#     csv['pred'] = [Result_Dict[i] for i in csv['pred_label']]


    df = pd.DataFrame({'id':id_list, 'real':real_list, 'pred':pred_list})
    rmse = np.sqrt(np.mean((df['real'] - df['pred'])**2))
            
    print(rmse)
    return df

# 测试最后一次的model

# 测试 best的model

In [70]:
model.load_state_dict(torch.load("lstm-model.pth"))
df = myMSE(model, train_iterator)

0.4811077845940743


In [71]:
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
6388,1617.0,0.0,1.049526
6259,8191.0,0.0,0.890024
3680,8804.0,0.0,1.234373
6267,12380.0,0.0,0.634069
3657,14836.0,0.0,0.789578
...,...,...,...
827,7594.0,2.8,1.367028
701,9933.0,2.8,1.557338
6036,590.0,2.8,1.505967
5406,1229.0,3.0,1.525018


In [72]:
df = myMSE(model, valid_iterator)

0.5627223148482773


In [73]:
df.sort_values(by=['real'])

Unnamed: 0,id,real,pred
647,319.0,0.0,0.762673
168,3431.0,0.0,0.848087
469,8208.0,0.0,0.528504
461,12928.0,0.0,0.855344
439,14417.0,0.0,1.009588
...,...,...,...
712,4142.0,2.6,1.023752
334,7693.0,2.6,0.932958
531,736.0,2.6,0.628055
452,11170.0,2.8,1.443883


In [74]:
df = myMSE(model, test_iterator)

0.5547148351001647


In [75]:
df.sort_values(by=['pred'])

Unnamed: 0,id,real,pred
170,10487.0,0.8,0.430558
52,13583.0,1.0,0.479546
692,12113.0,0.2,0.487714
233,2442.0,0.8,0.492824
12,11709.0,0.6,0.500772
...,...,...,...
271,9342.0,2.0,1.581871
332,12784.0,1.2,1.601785
947,11492.0,1.0,1.605963
344,1812.0,1.0,1.652382


In [78]:
def getFinal_Valid(model, iterator):
    pred_list = np.array([])
    id_list = np.array([])
    
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            embeddings = get_elmo_embeddings(batch.new)
            predictions = model(embeddings).squeeze(1)
            pred = np.array(predictions.data.tolist())
            myid = np.array(batch.id.data.tolist())
            pred_list = np.append(pred_list, pred)
            id_list = np.append(id_list, myid)

    df = pd.DataFrame({'id':id_list,'pred':pred_list})
            
    return df

In [79]:
df = getFinal_Valid(model, final_valid_iterator)
df['id'] = df['id'].round(0).astype(int)
df.sort_values(by=['id'])

Unnamed: 0,id,pred
1605,4,1.059387
1134,13,0.930356
688,15,0.910926
2135,25,0.815300
561,30,0.768042
...,...,...
1708,15084,0.989453
122,15086,0.641545
486,15091,1.152520
1454,15093,0.708814


In [80]:
data = pd.merge(df,final_valid)
data = data[['id','pred']] 
data.sort_values(by=['id'])

Unnamed: 0,id,pred
1605,4,1.059387
1134,13,0.930356
688,15,0.910926
2135,25,0.815300
561,30,0.768042
...,...,...
1708,15084,0.989453
122,15086,0.641545
486,15091,1.152520
1454,15093,0.708814


In [82]:
out_loc = 'save_outputs/task-1-output.csv'
data.to_csv(out_loc, index=False)