In [1]:
from os.path import join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from transformers import DistilBertModel, DistilBertTokenizer

np.random.seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
DATA_DIR = join('..', 'data', 'raw')

In [3]:
train_df = pd.read_csv(join(DATA_DIR, 'train.csv'))
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
train_df = train_df.fillna("") 

In [5]:
SEP = '[SEP]'
train_df['question1'] = train_df['question1'].str.lower()
train_df['question2'] = train_df['question2'].str.lower()
train_df['concat_qns'] = train_df['question1'] + ' ' + SEP + ' ' + train_df['question2']

In [6]:
train_df.loc[0, 'concat_qns']

'what is the step by step guide to invest in share market in india? [SEP] what is the step by step guide to invest in share market?'

In [10]:
MAX_LEN = 512
BATCH_SIZE = 16

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_dict=False)

class QuoraDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_len):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        qns = str(self.X.iloc[index]['concat_qns'])
        encoded_qns = self.tokenizer.encode_plus(
            qns,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            pad_to_max_length=True,
            truncation=True,
            return_attention_mask=True,
        )
        ids = encoded_qns['input_ids']
        mask = encoded_qns['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'target': torch.tensor(self.y[index])
        }

    def __len__(self):
        return len(self.X)

X = train_df[['concat_qns']]
y = train_df['is_duplicate']
y = torch.tensor(y, dtype=torch.float32)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)


train_dataset = QuoraDataset(X_train[:256], y_train[:256], tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = QuoraDataset(X_val[:256], y_val[:256], tokenizer, MAX_LEN)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = QuoraDataset(X_val, y_val, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [11]:
class DistilBertClass(torch.nn.Module):
    def __init__(self, dropout=0.1):
        super(DistilBertClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=False)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(768, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.sigmoid(output)
        return output

def train(model, criterion, optimizer, scheduler=None, num_epochs=1):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            labels = batch['target'].to(device, dtype = torch.float)

            optimizer.zero_grad()
            outputs = model(ids, attention_mask=mask)
            loss = criterion(outputs.view(-1), labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        if scheduler: scheduler.step()
        print(f"Epoch {epoch+1}: loss = {total_loss:.2f}")
    return model


def evaluate(model):
    model.eval()
    with torch.no_grad():
        # train set
        correct = 0
        total = 0
        for batch in tqdm(train_loader):
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            labels = batch['target'].to(device, dtype = torch.float)

            outputs = model(ids, attention_mask=mask)
            predicted = (outputs > 0.5).float().view(-1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f"Train Accuracy: {accuracy:.2f}%")

        # validation set
        correct = 0
        total = 0
        for batch in tqdm(val_loader):
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            labels = batch['target'].to(device, dtype = torch.float)

            outputs = model(ids, attention_mask=mask)
            predicted = (outputs > 0.5).float().view(-1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f"Validation Accuracy: {accuracy:.2f}%")

In [12]:
model = DistilBertClass()
model.to(device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-5)
model = train(model, criterion, optimizer, num_epochs=2)

  'target': torch.tensor(self.y[index])
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [09:15<00:00, 34.75s/it]


Epoch 1: loss = 10.82


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [09:12<00:00, 34.53s/it]

Epoch 2: loss = 9.95





In [15]:
evaluate(model)

  'target': torch.tensor(self.y[index])
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [02:57<00:00, 11.11s/it]


Train Accuracy: 64.06%


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [02:57<00:00, 11.11s/it]

Validation Accuracy: 65.62%





In [16]:
model_new = DistilBertClass()
evaluate(model_new)

  'target': torch.tensor(self.y[index])
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [02:57<00:00, 11.10s/it]


Train Accuracy: 62.50%


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [02:57<00:00, 11.06s/it]

Validation Accuracy: 66.02%





In [17]:
output_model_file = '../models/pytorch_distilbert.bin'
output_vocab_file = '../models/vocab_distilbert.bin'

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Model saved')

Model saved
