In [130]:
import pandas as pd
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import pdb
import torch.optim as optim

In [54]:
%matplotlib inline

In [157]:
train_data = pd.read_csv("../data/train.csv")
valid_data = pd.read_csv("../data/valid.csv")
test_data = pd.read_csv("../data/test.csv")

In [158]:
train_data.dropna(inplace=True)
valid_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


100%|██████████| 231508/231508 [00:00<00:00, 3744639.31B/s]


In [17]:
tokens_tensor = torch.tensor([indexed_tokens]).to(device)
segments_tensors = torch.tensor([segments_ids]).to(device)

In [18]:
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

100%|██████████| 407873900/407873900 [00:33<00:00, 12105734.55B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )

In [19]:
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )

In [151]:
class QDataSet(Dataset):
    def __init__(self, dataframe, tokenizer, seq_length=30):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        q1 = row.question1
        q2 = row.question2
        label = int(row.is_duplicate)
        #form tokens
        q1 = ["[CLS]"] + tokenizer.tokenize(q1) + ["[SEP]"]
        q2 = ["[CLS]"] + tokenizer.tokenize(q2) + ["[SEP]"]
        #get token ids
        q1_ids = tokenizer.convert_tokens_to_ids(q1)
        q2_ids = tokenizer.convert_tokens_to_ids(q2)
        #cut sentence larger than max len
        q1_ids = q1_ids[:self.seq_length]
        q2_ids = q2_ids[:self.seq_length]
        #init mast
        q1_mask = [1]*len(q1_ids)
        q2_mask = [1]*len(q2_ids)
        
    
        #add padding
        while len(q1_ids) < self.seq_length:
            q1_ids.append(0)
            q1_mask.append(0)
            
        while len(q2_ids) < self.seq_length:
            q2_ids.append(0)
            q2_mask.append(0)
            
        
        return np.array(q1_ids), np.array(q1_mask), np.array(q2_ids), np.array(q2_mask), label

In [159]:
dataset = QDataSet(train_data, tokenizer)

In [160]:
def train(model, optimizer, criteria, bert_model, train_data, valid_data, batch_size, shuffle, epoch, device):
    bert_model.eval()
    seq_length = train_data.seq_length
    input_type_ids = torch.zeros([batch_size, seq_length], dtype=torch.int64).to(device)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    for e in range(epoch):
        for i_batch, sample_batch in enumerate(train_loader):
            q1_ids, q1_mask, q2_ids, q2_mask, label = sample_batch
            q1_ids = torch.tensor(q1_ids).to(device)
            q2_ids = torch.tensor(q2_ids).to(device)
            label = torch.tensor(label).to(device)
            with torch.no_grad():
                q1_vecs, _ = bert_model(q1_ids, input_type_ids)
                q2_vecs, _ = bert_model(q2_ids, input_type_ids)
            q1_vecs, q2_vecs = q1_vecs[-1], q2_vecs[-1]
            
            optimizer.zero_grad()
            output = model(q1_vecs, q2_vecs)
            loss = criteria(output, label)
            loss.backward()
            optimizer.step()
            print(loss.item())
            break
            
            

In [161]:
class LSTM_FC(nn.Module):
    def __init__(self, input_size=768, hidden_size=100, fc_size=50):
        super(LSTM_FC, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size*2, fc_size)
        self.fc2 = nn.Linear(fc_size, 2)
        
    def forward(self, q1, q2):
        o1, _ = self.lstm(q1)
        o2, _ = self.lstm(q2)
        out = torch.cat([o1[:, -1, :], o2[:, -1, :]], dim=-1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        return out

In [162]:
clf = LSTM_FC()
criteria = nn.CrossEntropyLoss()
optimizer = optim.Adam(clf.parameters(), lr=1e-2)

In [None]:
train(model=clf ,optimizer=optimizer, criteria= criteria, bert_model=model, train_data=dataset, valid_data=None, 
      batch_size=32, shuffle=False, epoch=10, device=device)

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


0.706615149974823
0.6950178146362305
0.6931474804878235
0.6931474804878235
0.6931474804878235
0.6931474804878235


In [None]:
# ls = []
# for i in range(len(dataset)):
#     print()
#     q1, q2, _ = dataset[i]
#     l1, l2 = len(q1), len(q2)
#     ls.append(l1)
#     ls.append(l2)
# plt.hist(ls, bins=50, range=(0, 60))
# plt.xlabel("Length of a question")
# plt.ylabel("Count")
# plt.savefig("graphs/len_question.png")