In [1]:
!pip install pytorch_pretrained_bert
import pandas as pd
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import torch.nn as nn
import random
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.nn.utils.rnn import pack_padded_sequence as pack
import pdb
import torch.optim as optim
import torch.nn.init as weigth_init
from google.colab import drive
drive.mount('/content/drive')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
%matplotlib inline

In [0]:
dir_path = "drive/My Drive/quora"
train_data = pd.read_csv("{}/data/train.csv".format(dir_path))
valid_data = pd.read_csv("{}/data/valid.csv".format(dir_path))
test_data = pd.read_csv("{}/data/test.csv".format(dir_path))
train_data.dropna(inplace=True)
valid_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )

In [0]:
class QDataSet(Dataset):
    def __init__(self, dataframe, tokenizer, seq_length=30):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        q1 = row.question1
        q2 = row.question2
        
        exchange = random.choice([0, 1])
        if exchange == 1:
          q1, q2 = q2, q1
        
        label = int(row.is_duplicate)
        #form tokens
        q1 = ["[CLS]"] + tokenizer.tokenize(q1) + ["[SEP]"]
        q2 = ["[CLS]"] + tokenizer.tokenize(q2) + ["[SEP]"]
        #get token ids
        q1_ids = tokenizer.convert_tokens_to_ids(q1)
        q2_ids = tokenizer.convert_tokens_to_ids(q2)
        #cut sentence larger than max len
        q1_ids = q1_ids[:self.seq_length]
        q2_ids = q2_ids[:self.seq_length]
        #init mast
        q1_mask = [1]*len(q1_ids)
        q2_mask = [1]*len(q2_ids)
        
    
        #add padding
        while len(q1_ids) < self.seq_length:
            q1_ids.append(0)
            q1_mask.append(0)
            
        while len(q2_ids) < self.seq_length:
            q2_ids.append(0)
            q2_mask.append(0)
            
        
        return np.array(q1_ids), np.array(q1_mask), sum(q1_mask), np.array(q2_ids), np.array(q2_mask), sum(q2_mask), label

In [0]:
dataset = QDataSet(train_data, tokenizer)
valid_dataset = QDataSet(valid_data, tokenizer)

In [0]:
def sort_batch(data, seq_len, device):
    sorted_seq_len, sorted_idx = torch.sort(seq_len, dim=0, descending=True)
    sorted_data = data[sorted_idx.data]
    _, reverse_idx = torch.sort(sorted_idx, dim=0, descending=False)
    return sorted_data, sorted_seq_len.to(device), reverse_idx.to(device)
  
def softmax_mask(input, mask, device, axis=1, epsilon=1e-12):
    shift, _ = torch.max(input, axis, keepdim=True)
    shift = shift.expand_as(input).to(device)

    target_exp = torch.exp(input - shift) * mask

    normalize = torch.sum(target_exp, axis, keepdim=True).expand_as(target_exp)
    softm = target_exp / (normalize + epsilon)

    return softm.to(device)

In [0]:
def valid(model, bert_model, criteria, valid_data, batch_size, shuffle, device):
    model.eval()
    bert_model.eval()
    seq_length = valid_data.seq_length
    valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=shuffle)
    loss_arr = []
    for i_batch, sample_batch in enumerate(valid_loader):
        q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len, q2_len, label = get_embedding(sample_batch, seq_length, device, bert_model)

        output = model(q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len.to(device), q2_len.to(device))
        loss = criteria(output, label)
        loss_arr.append(loss.item())
    return loss_arr

In [0]:
def get_embedding(sample_batch, seq_length, device, bert_model):
  q1_ids, q1_mask, q1_len, q2_ids, q2_mask, q2_len, label = sample_batch
  input_type_ids = torch.zeros([q1_ids.shape[0], seq_length], dtype=torch.int64).to(device)
  
  q1_ids = torch.tensor(q1_ids).to(device)
  q2_ids = torch.tensor(q2_ids).to(device)
  label = torch.tensor(label).to(device)
  
  #sort the batch
  s_q1, s_q1_len, reverse_q1_idx = sort_batch(q1_ids, q1_len, device)
  s_q2, s_q2_len, reverse_q2_idx = sort_batch(q2_ids, q2_len, device)
  
  #get embedding
  with torch.no_grad():
      q1_vecs, _ = bert_model(s_q1, input_type_ids)
      q2_vecs, _ = bert_model(s_q2, input_type_ids)
  q1_vecs = pack(q1_vecs[-1], list(s_q1_len.data), batch_first=True)
  q2_vecs = pack(q2_vecs[-1], list(s_q2_len.data), batch_first=True)
  
  #get mask
  q1_mask = torch.tensor(q1_mask[:, :max(q1_len)]).to(device)
  q2_mask = torch.tensor(q2_mask[:, :max(q2_len)]).to(device)
  
  return q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len, q2_len, label

In [0]:
def train(model, optimizer, criteria, bert_model, train_data, valid_data, batch_size, shuffle, epoch, device, start_epoch):
    
    bert_model.eval()
    seq_length = train_data.seq_length
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    for e in range(start_epoch, start_epoch + epoch):
        model.train()
        for i_batch, sample_batch in enumerate(train_loader):
            q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len, q2_len, label = get_embedding(sample_batch, seq_length, device, bert_model)
            #get in the model
            optimizer.zero_grad()
            output = model(q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len.to(device), q2_len.to(device))
            loss = criteria(output, label)
            loss.backward()
            optimizer.step()
            if i_batch%50==0:
                print(i_batch, loss.item())
#             break
#         continue

        print("Validating the model")
        loss_arr = valid(model=model, bert_model=bert_model, criteria=criteria, valid_data=valid_data, batch_size=batch_size, shuffle=shuffle, device=device)
        print("Finish an epoch with validation loss {}, training loss {}".format(np.mean(loss_arr), loss.item()))         
        torch.save(model.state_dict(), "drive/My Drive/quora/trained_models/att_lstm/{0}_{1:.2f}_LSTMATT.pt".format(e, np.mean(loss_arr)))
        print("Saved the model.")
            

In [0]:
class LSTMMaskFC(nn.Module):
    def __init__(self, device, input_size=768, hidden_size=100, fc_size=50):
        super(LSTMMaskFC, self).__init__()
        self.device = device
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size*4, fc_size)
        self.fc2 = nn.Linear(fc_size, 2)
        
        for weight in self.lstm.parameters():
          if len(weight.size()) > 1:
            weigth_init.orthogonal(weight.data)

        
    def forward(self, q1, q2, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len, q2_len):

        
        #unpack
        o1, _ = unpack(q1, batch_first=True)
        o2, _ = unpack(q2, batch_first=True)
        
        o1 = o1[reverse_q1_idx.data]
        o2 = o2[reverse_q2_idx.data]
        
        #q1, q2 dot product
        q1_mask= q1_mask.unsqueeze(2)
        q2_mask = q2_mask.unsqueeze(2)
        
        M = torch.bmm(o1, o2.transpose(1, 2))
        M_mask = torch.bmm(q1_mask.float(), q2_mask.transpose(1, 2).float())
        
        #q1, q2 attention
        alpha = softmax_mask(M, M_mask, self.device, axis=1)
        beta = softmax_mask(M, M_mask, self.device, axis=2)
        
        out1 = torch.bmm(alpha.transpose(1, 2), o1)
        out2 = torch.bmm(beta, o2)
        out1, _ = self.lstm(out1)
        out2, _ = self.lstm(out2)
        
        out = torch.cat([out1[:, -1, :], out2[:, -1, :]], dim=1)
        out = F.relu(self.fc1(out))
        
        out = F.relu(self.fc2(out))
        return out

In [20]:
clf = LSTMMaskFC(device)
clf.to(device)
criteria = nn.CrossEntropyLoss()

  # This is added back by InteractiveShellApp.init_path()


In [0]:
optimizer = optim.Adam(clf.parameters(), lr=5e-3, weight_decay=1e-4)

In [0]:
train(model=clf ,optimizer=optimizer, criteria= criteria, bert_model=model, train_data=dataset, valid_data=valid_dataset, 
      batch_size=256, shuffle=True, epoch=20, device=device,start_epoch=0)

  """
  
  import sys


0 0.6931474804878235
50 0.6320931911468506
100 0.5813612341880798
150 0.5238048434257507
200 0.5572337508201599
250 0.5908418893814087
300 0.4915110766887665
350 0.5673909783363342
400 0.5592517852783203
450 0.5189058780670166
500 0.524715781211853
550 0.5247702598571777
600 0.451886385679245
650 0.5230859518051147
700 0.5193924307823181
750 0.5465618371963501
800 0.468985378742218
850 0.5537930727005005
900 0.5406456589698792
950 0.47395995259284973
1000 0.5079203844070435
1050 0.49402469396591187
1100 0.495651513338089
Validating the model
Finish an epoch with validation loss 0.5000382314754438, training loss 0.39936140179634094
Saved the model.
0 0.5125454664230347
50 0.5472338795661926
100 0.47860488295555115
150 0.5035973191261292
200 0.49229592084884644
250 0.4867461919784546
300 0.45573514699935913
350 0.5074822902679443
400 0.5281082987785339
450 0.45395585894584656
500 0.5155036449432373
550 0.4880293011665344
600 0.49315473437309265
650 0.4870426058769226
700 0.54478389024734