In [1]:
!pip install pytorch_pretrained_bert
import pandas as pd
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import torch.nn as nn
import random
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.nn.utils.rnn import pack_padded_sequence as pack
import pdb
import torch.optim as optim
from google.colab import drive
drive.mount('/content/drive')

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/5d/3c/d5fa084dd3a82ffc645aba78c417e6072ff48552e3301b1fa3bd711e03d4/pytorch_pretrained_bert-0.6.1-py3-none-any.whl (114kB)
[K    100% |████████████████████████████████| 122kB 4.5MB/s 
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.1
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /c

In [0]:
%matplotlib inline

In [0]:
dir_path = "drive/My Drive/quora"
train_data = pd.read_csv("{}/data/train.csv".format(dir_path))
valid_data = pd.read_csv("{}/data/valid.csv".format(dir_path))
test_data = pd.read_csv("{}/data/test.csv".format(dir_path))
train_data.dropna(inplace=True)
valid_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
model.to(device)

100%|██████████| 231508/231508 [00:00<00:00, 2628655.78B/s]
100%|██████████| 407873900/407873900 [00:09<00:00, 43365782.80B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
        )

In [0]:
class QDataSet(Dataset):
    def __init__(self, dataframe, tokenizer, seq_length=30):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        q1 = row.question1
        q2 = row.question2
        
        exchange = random.choice([0, 1])
        if exchange == 1:
          q1, q2 = q2, q1
        
        label = int(row.is_duplicate)
        #form tokens
        q1 = ["[CLS]"] + tokenizer.tokenize(q1) + ["[SEP]"]
        q2 = ["[CLS]"] + tokenizer.tokenize(q2) + ["[SEP]"]
        #get token ids
        q1_ids = tokenizer.convert_tokens_to_ids(q1)
        q2_ids = tokenizer.convert_tokens_to_ids(q2)
        #cut sentence larger than max len
        q1_ids = q1_ids[:self.seq_length]
        q2_ids = q2_ids[:self.seq_length]
        #init mast
        q1_mask = [1]*len(q1_ids)
        q2_mask = [1]*len(q2_ids)
        
    
        #add padding
        while len(q1_ids) < self.seq_length:
            q1_ids.append(0)
            q1_mask.append(0)
            
        while len(q2_ids) < self.seq_length:
            q2_ids.append(0)
            q2_mask.append(0)
            
        
        return np.array(q1_ids), np.array(q1_mask), sum(q1_mask), np.array(q2_ids), np.array(q2_mask), sum(q2_mask), label

In [0]:
dataset = QDataSet(train_data, tokenizer)
valid_dataset = QDataSet(valid_data, tokenizer)

In [0]:
def sort_batch(data, seq_len, device):
    sorted_seq_len, sorted_idx = torch.sort(seq_len, dim=0, descending=True)
    sorted_data = data[sorted_idx.data]
    _, reverse_idx = torch.sort(sorted_idx, dim=0, descending=False)
    return sorted_data, sorted_seq_len.to(device), reverse_idx.to(device)
  
def softmax_mask(input, mask, device, axis=1, epsilon=1e-12):
    shift, _ = torch.max(input, axis, keepdim=True)
    shift = shift.expand_as(input).to(device)

    target_exp = torch.exp(input - shift) * mask

    normalize = torch.sum(target_exp, axis, keepdim=True).expand_as(target_exp)
    softm = target_exp / (normalize + epsilon)

    return softm.to(device)

In [0]:
def valid(model, bert_model, criteria, valid_data, batch_size, shuffle, device):
    model.eval()
    bert_model.eval()
    seq_length = valid_data.seq_length
    valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=shuffle)
    loss_arr = []
    for i_batch, sample_batch in enumerate(valid_loader):
        q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len, label = get_embedding(sample_batch, seq_length, device, bert_model)

        output = model(q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len.to(device))
        loss = criteria(output, label)
        loss_arr.append(loss.item())
    return loss_arr

In [0]:
def get_embedding(sample_batch, seq_length, device, bert_model):
  q1_ids, q1_mask, q1_len, q2_ids, q2_mask, q2_len, label = sample_batch
  input_type_ids = torch.zeros([q1_ids.shape[0], seq_length], dtype=torch.int64).to(device)
  
  q1_ids = torch.tensor(q1_ids).to(device)
  q2_ids = torch.tensor(q2_ids).to(device)
  label = torch.tensor(label).to(device)
  
  #sort the batch
  s_q1, s_q1_len, reverse_q1_idx = sort_batch(q1_ids, q1_len, device)
  s_q2, s_q2_len, reverse_q2_idx = sort_batch(q2_ids, q2_len, device)
  
  #get embedding
  with torch.no_grad():
      q1_vecs, _ = bert_model(s_q1, input_type_ids)
      q2_vecs, _ = bert_model(s_q2, input_type_ids)
  q1_vecs = pack(q1_vecs[-1], list(s_q1_len.data), batch_first=True)
  q2_vecs = pack(q2_vecs[-1], list(s_q2_len.data), batch_first=True)
  
  #get mask
  q1_mask = torch.tensor(q1_mask[:, :max(q1_len)]).to(device)
  q2_mask = torch.tensor(q2_mask[:, :max(q2_len)]).to(device)
  
  return q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len, label

In [0]:
def train(model, optimizer, criteria, bert_model, train_data, valid_data, batch_size, shuffle, epoch, device, start_epoch):
    
    bert_model.eval()
    seq_length = train_data.seq_length
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    for e in range(start_epoch, start_epoch + epoch):
        model.train()
        for i_batch, sample_batch in enumerate(train_loader):
            q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len, label = get_embedding(sample_batch, seq_length, device, bert_model)
            #get in the model
            optimizer.zero_grad()
            output = model(q1_vecs, q2_vecs, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len.to(device))
            loss = criteria(output, label)
            loss.backward()
            optimizer.step()
            if i_batch%50==0:
                print(i_batch, loss.item())

        print("Validating the model")
        loss_arr = valid(model=model, bert_model=bert_model, criteria=criteria, valid_data=valid_data, batch_size=batch_size, shuffle=shuffle, device=device)
        print("Finish an epoch with validation loss {}, training loss {}".format(np.mean(loss_arr), loss.item()))         
        torch.save(model.state_dict(), "drive/My Drive/quora/trained_models/mask/{0}_{1:.2f}_LSTMATT.pt".format(e, np.mean(loss_arr)))
        print("Saved the model.")
            

In [0]:
class LSTMMaskFC(nn.Module):
    def __init__(self, device, input_size=768, hidden_size=100):
        super(LSTMMaskFC, self).__init__()
        self.device = device
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, 2)
        
        
    def forward(self, q1, q2, reverse_q1_idx, reverse_q2_idx, q1_mask, q2_mask, q1_len):
        #encode
        o1, _ = self.lstm(q1)
        o2, _ = self.lstm(q2)
        
        #unpack
        o1, _ = unpack(o1, batch_first=True)
        o2, _ = unpack(o2, batch_first=True)
        
        o1 = o1[reverse_q1_idx.data]
        o2 = o2[reverse_q2_idx.data]
        
        #q1, q2 dot product
        q1_mask= q1_mask.unsqueeze(2)
        q2_mask = q2_mask.unsqueeze(2)
        
        M = torch.bmm(o1, o2.transpose(1, 2))
        M_mask = torch.bmm(q1_mask.float(), q2_mask.transpose(1, 2).float())
        
        #q1, q2 attention
        alpha = softmax_mask(M, M_mask, self.device, axis=1)
        beta = softmax_mask(M, M_mask, self.device, axis=2)
        
        sum_beta = torch.sum(beta, dim=1, keepdim=True)
        q1_len = q1_len.unsqueeze(1).unsqueeze(2).expand_as(sum_beta)
        average_beta = sum_beta / q1_len.float()
        
        #q1-aware attention
        out = torch.bmm(alpha, average_beta.transpose(1, 2))
        out = o1*out
        out = F.relu(self.fc(out.sum(dim=1)))
        return out

In [0]:
clf = LSTMMaskFC(device)
clf.load_state_dict(torch.load("drive/My Drive/quora/trained_models/mask/17_0.34_LSTMATT.pt"))
clf.to(device)
criteria = nn.CrossEntropyLoss()

In [0]:
optimizer = optim.Adam(clf.parameters(), lr=1e-3, weight_decay=1e-4)

In [15]:
train(model=clf ,optimizer=optimizer, criteria= criteria, bert_model=model, train_data=dataset, valid_data=valid_dataset, 
      batch_size=256, shuffle=True, epoch=20, device=device,start_epoch=18)

  """
  
  import sys


0 0.1824077069759369
50 0.20536060631275177
100 0.16501109302043915
150 0.15312063694000244
200 0.206694096326828
250 0.1342277079820633
300 0.1170395240187645
350 0.1487385779619217
400 0.20356805622577667
450 0.17799563705921173
500 0.15932391583919525
550 0.17446279525756836
600 0.1701275110244751
650 0.18624603748321533
700 0.20870696008205414
750 0.11746397614479065
800 0.19709810614585876
850 0.1756007969379425
900 0.19221530854701996
950 0.23189397156238556
1000 0.20657096803188324
1050 0.19001325964927673
1100 0.19622261822223663
Validating the model
Finish an epoch with validation loss 0.33933663079004245, training loss 0.17236489057540894
Saved the model.
0 0.11882113665342331
50 0.14853674173355103
100 0.10981336981058121
150 0.1270381659269333
200 0.16955837607383728
250 0.17508895695209503
300 0.15802152454853058
350 0.171431303024292
400 0.25439929962158203
450 0.17356352508068085
500 0.15724749863147736
550 0.14954327046871185
600 0.16978959739208221
650 0.21486824750900

KeyboardInterrupt: ignored