# Libraries

In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [15]:
import time
import torch
import torch.utils
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification
from torch.optim import Adam


# Prepare dataset

In [4]:
train_full_df = pd.read_csv("quora data/train.tsv", sep="\t")
print(train_full_df.head())

       id    qid1    qid2                                          question1  \
0  133273  213221  213222  How is the life of a math student? Could you d...   
1  402555  536040  536041                How do I control my horny emotions?   
2  360472  364011  490273       What causes stool color to change to yellow?   
3  150662  155721    7256                        What can one do after MBBS?   
4  183004  279958  279959  Where can I find a power outlet for my laptop ...   

                                           question2  is_duplicate  
0  Which level of prepration is enough for the ex...             0  
1                 How do you control your horniness?             1  
2  What can cause stool to come out as little balls?             0  
3                       What do i do after my MBBS ?             1  
4  Would a second airport in Sydney, Australia be...             0  


In [5]:
train_df, dev_df = train_test_split(train_full_df, test_size=0.1, random_state=42)

In [6]:
len(train_df), len(dev_df)

(327461, 36385)

In [7]:
print(train_df.head())

            id    qid1    qid2  \
80519    51432   91228   91229   
349125   52249   92541   92542   
126605  135176  215925  215926   
6008    184003  281230  281231   
292788    5971   11714   11715   

                                                question1  \
80519          Which topic is the most followed in Quora?   
349125  What does it feel like to have sex with a rela...   
126605                     Can I get back my best friend?   
6008    Are there solar systems that act like a double...   
292788                    Why are there only few magnets?   

                                                question2  is_duplicate  
80519         What topic on Quora has the most followers?             1  
349125         Is it possible to eat more than you weigh?             0  
126605   How do I get back to my best friend as my lover?             0  
6008    As a Navy SEAL do you feel as if people are af...             0  
292788                     Why is there only few magnets? 

In [8]:
test_df = pd.read_csv("quora data/dev.tsv", sep="\t")
print(test_df.head())

       id    qid1    qid2                                          question1  \
0  201359  303345  303346            Why are African-Americans so beautiful?   
1  263843   69383  380476  I want to pursue PhD in Computer Science about...   
2  172974  266948  175089      Is there a reason why we should travel alone?   
3   15329   29298   29299  Why are people so obsessed with having a girlf...   
4  209794  314169  314170  What are some good baby girl names starting wi...   

                                           question2  is_duplicate  
0                    Why are hispanics so beautiful?             0  
1  I handle social media for a non-profit. Should...             0  
2             What are some reasons to travel alone?             1  
3                How can a single male have a child?             0  
4  What are some good baby girl names starting wi...             0  


In [9]:
test_df.shape

(40430, 6)

In [10]:
train_sentences1 = train_df['question1'].tolist()
train_sentences2 = train_df['question2'].tolist()
train_labels = train_df['is_duplicate'].tolist()

dev_sentences1 = dev_df['question1'].tolist()
dev_sentences2 = dev_df['question2'].tolist()
dev_labels = dev_df['is_duplicate'].tolist()

test_sentences1 = test_df['question1'].tolist()
test_sentences2 = test_df['question2'].tolist()
test_labels = test_df['is_duplicate'].tolist()

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_sentences1, train_sentences2, truncation=True, padding=True)
dev_encodings = tokenizer(dev_sentences1, dev_sentences2, truncation=True, padding=True)
test_encodings = tokenizer(test_sentences1, test_sentences2, truncation=True, padding=True)

In [12]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [13]:
train_dataset = CustomDataset(train_encodings, train_labels)
dev_dataset = CustomDataset(dev_encodings, dev_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Model

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = Adam(model.parameters(), lr=1e-5)

device = "mps" if torch.backends.mps.is_built() \
    else "gpu" if torch.cuda.is_available() else "cpu"

model.to(device)
print(f"Device: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device: mps


# Train, evaluate

In [56]:
start_time = time.time()
num_epochs = 3
correct_train=0
total_train=0
for epoch in range(num_epochs):
    print(f"training epoch no: {epoch+1}/{num_epochs}")
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        _, predicted = torch.max(outputs.logits, dim=1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

file_path = "trained model.pkl"
torch.save(model.state_dict(), file_path)

accuracy_train = correct_train/total_train
print(f'Train Accuracy: {accuracy_train}')

time_elapsed = time.time()-start_time
hours = int(time_elapsed // 3600)
minutes = int((time_elapsed % 3600) // 60)
print(f"training time elapsed: {hours} hours {minutes} minutes")

training epoch no: 1/3
training epoch no: 2/3
training epoch no: 3/3
Train Accuracy: 0.910253943726632
training time elapsed: 40 hours 47 minutes


In [16]:
model_state_dict = torch.load("trained model.pkl")
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [20]:
# validation
start_time = time.time()
model.eval()
with torch.no_grad():
    true_y = []
    predicted_combined = []
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)
        true_y.extend(labels.numpy())
        predicted_combined.extend(predicted.cpu().numpy())

accuracy = accuracy_score(true_y, predicted_combined)
print(f'Validation Accuracy: {accuracy*100:.2f}%')
print(classification_report(true_y, predicted_combined))

time_elapsed = time.time()-start_time
hours = int(time_elapsed // 3600)
minutes = int((time_elapsed % 3600) // 60)
print(f"validation time elapsed: {hours} hours {minutes} minutes")

Validation Accuracy: 90.36%
              precision    recall  f1-score   support

           0       0.94      0.90      0.92     23067
           1       0.84      0.91      0.87     13318

    accuracy                           0.90     36385
   macro avg       0.89      0.90      0.90     36385
weighted avg       0.91      0.90      0.90     36385

validation time elapsed: 0 hours 20 minutes


In [22]:
# test
start_time = time.time()
model.eval()
with torch.no_grad():
    true_y = []
    predicted_combined = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)
        true_y.extend(labels.numpy())
        predicted_combined.extend(predicted.cpu().numpy())

accuracy = accuracy_score(true_y, predicted_combined)
print(f'Test Accuracy: {accuracy*100:.2f}%')
print(classification_report(true_y, predicted_combined))

time_elapsed = time.time()-start_time
hours = int(time_elapsed // 3600)
minutes = int((time_elapsed % 3600) // 60)
print(f"Test time elapsed: {hours} hours {minutes} minutes")

Test Accuracy: 90.13%
              precision    recall  f1-score   support

           0       0.94      0.90      0.92     25545
           1       0.84      0.90      0.87     14885

    accuracy                           0.90     40430
   macro avg       0.89      0.90      0.90     40430
weighted avg       0.90      0.90      0.90     40430

Test time elapsed: 0 hours 14 minutes


# Without fine-tuning

In [23]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = Adam(model.parameters(), lr=1e-5)

device = "mps" if torch.backends.mps.is_built() \
    else "gpu" if torch.cuda.is_available() else "cpu"

model.to(device)
print(f"Device: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device: mps


In [24]:
# validation
start_time = time.time()
model.eval()
with torch.no_grad():
    true_y = []
    predicted_combined = []
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)
        true_y.extend(labels.numpy())
        predicted_combined.extend(predicted.cpu().numpy())

accuracy = accuracy_score(true_y, predicted_combined)
print(f'Validation Accuracy: {accuracy*100:.2f}%')
print(classification_report(true_y, predicted_combined))

time_elapsed = time.time()-start_time
hours = int(time_elapsed // 3600)
minutes = int((time_elapsed % 3600) // 60)
print(f"Validation time elapsed: {hours} hours {minutes} minutes")

Validation Accuracy: 51.25%
              precision    recall  f1-score   support

           0       0.70      0.40      0.51     23067
           1       0.40      0.70      0.51     13318

    accuracy                           0.51     36385
   macro avg       0.55      0.55      0.51     36385
weighted avg       0.59      0.51      0.51     36385

Validation time elapsed: 0 hours 21 minutes


In [25]:
# test
start_time = time.time()
model.eval()
with torch.no_grad():
    true_y = []
    predicted_combined = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)
        true_y.extend(labels.numpy())
        predicted_combined.extend(predicted.cpu().numpy())

accuracy = accuracy_score(true_y, predicted_combined)
print(f'Test Accuracy: {accuracy*100:.2f}%')
print(classification_report(true_y, predicted_combined))

time_elapsed = time.time()-start_time
hours = int(time_elapsed // 3600)
minutes = int((time_elapsed % 3600) // 60)
print(f"Test time elapsed: {hours} hours {minutes} minutes")

Test Accuracy: 51.13%
              precision    recall  f1-score   support

           0       0.70      0.40      0.51     25545
           1       0.40      0.70      0.51     14885

    accuracy                           0.51     40430
   macro avg       0.55      0.55      0.51     40430
weighted avg       0.59      0.51      0.51     40430

Test time elapsed: 0 hours 14 minutes
