In [1]:
from os.path import join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from nltk.tokenize import word_tokenize
from fasttext import load_model 
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
np.random.seed(42)

In [3]:
DATA_DIR = join('..', 'data', 'raw')
MODEL_DIR = join('..', 'models')

In [4]:
fasttext_model = load_model(join(MODEL_DIR, 'pretrained', 'cc.en.300.bin'))



In [5]:
train_df = pd.read_csv(join(DATA_DIR, 'train.csv'), index_col=0)
train_df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
fasttext_model.get_word_vector('hi').shape

(300,)

In [7]:
fasttext_model.get_word_vector('Hi') == fasttext_model.get_word_vector('hi') 

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [8]:
fasttext_model.get_sentence_vector("What's your name?").shape

(300,)

In [9]:
train_df = train_df.fillna("")

In [10]:
X = train_df[['question1', 'question2']]
X['question1'] = X['question1'].str.replace('\n', '', regex=True)
X['question2'] = X['question2'].str.replace('\n', '', regex=True)

X['q1_embeds'] = X['question1'].apply(lambda x: fasttext_model.get_sentence_vector(x.lower().strip()))
X['q2_embeds'] = X['question2'].apply(lambda x: fasttext_model.get_sentence_vector(x.lower().strip()))

y = train_df['is_duplicate']
y = torch.tensor(y, dtype=torch.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['question1'] = X['question1'].str.replace('\n', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['question2'] = X['question2'].str.replace('\n', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['q1_embeds'] = X['question1'].apply(lambda x: fasttext_model.get_senten

In [11]:
X = X[['q1_embeds', 'q2_embeds']]
X.head()

Unnamed: 0_level_0,q1_embeds,q2_embeds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[-0.017194528, -0.0021926684, 0.028786456, 0.0...","[-0.019007614, -0.0020667282, 0.025949523, 0.0..."
1,"[-0.011212165, 0.03627265, 0.0061234357, 0.038...","[-0.02273693, 0.02811398, 0.02587554, 0.051059..."
2,"[-0.0034232663, 0.0003161574, -0.012909933, 0....","[-0.019429542, 0.039420854, 0.024514169, 0.029..."
3,"[-0.01996699, -0.037113886, 0.0055986927, 0.03...","[-0.017514624, 0.01671183, -0.0049271486, 0.05..."
4,"[-0.04029714, 0.023224352, -0.026572602, 0.083...","[-0.025465617, 0.012547443, 0.013416384, 0.076..."


In [12]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

In [13]:
class CustomDataset(Dataset):

    def __init__(self, question_embeddings, labels):
        self.question_embeddings = question_embeddings
        self.labels = labels

    def __len__(self):
        return len(self.question_embeddings)

    def __getitem__(self, idx):
        q1_embed, q2_embed = self.question_embeddings.iloc[idx]
        x = torch.cat((torch.from_numpy(q1_embed), torch.from_numpy(q2_embed)), 0)
        return x.to(device), self.labels[idx].to(device)


class FCN(nn.Module):

    def __init__(self, input_dim, num_layers, hidden_dim):
        super(FCN, self).__init__()
        self.fcs = nn.ModuleList(
            [nn.Linear(input_dim, hidden_dim)]
            + [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers - 2)]
            + [nn.Linear(hidden_dim, 1)]
        )

    def forward(self, x):
        for fc in self.fcs[:-1]:
            x = F.relu(fc(x))
        x = self.fcs[-1](x)
        return F.sigmoid(x)

In [14]:
batch_size = 512

train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [22]:
model = FCN(input_dim=600, num_layers=4, hidden_dim=300)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [23]:
num_epochs = 20

for epoch in range(num_epochs):
    total_loss = 0
    for (questions, labels) in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        outputs = model(questions)
        loss = criterion(outputs.view(-1), labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: loss = {total_loss:.2f}")

Epoch 1/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 1: loss = 283.99


Epoch 2/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 2: loss = 245.45


Epoch 3/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 3: loss = 225.96


Epoch 4/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 4: loss = 213.38


Epoch 5/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 5: loss = 202.81


Epoch 6/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 6: loss = 193.80


Epoch 7/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 7: loss = 184.92


Epoch 8/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 8: loss = 176.59


Epoch 9/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 9: loss = 169.30


Epoch 10/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 10: loss = 161.62


Epoch 11/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 11: loss = 154.59


Epoch 12/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 12: loss = 148.11


Epoch 13/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 13: loss = 141.24


Epoch 14/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 14: loss = 135.14


Epoch 15/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 15: loss = 128.87


Epoch 16/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 16: loss = 123.37


Epoch 17/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 17: loss = 117.66


Epoch 18/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 18: loss = 113.97


Epoch 19/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 19: loss = 108.54


Epoch 20/20:   0%|          | 0/506 [00:00<?, ?it/s]

Epoch 20: loss = 103.48


In [24]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for questions, labels in tqdm(train_loader):
        outputs = model(questions)
        predicted = (outputs > 0.5).float().view(-1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f"Train Accuracy: {accuracy:.2f}%")

  0%|          | 0/506 [00:00<?, ?it/s]

Train Accuracy: 92.44%


In [25]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for questions, labels in tqdm(val_loader):
        outputs = model(questions)
        predicted = (outputs > 0.5).float().view(-1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f"Validation Accuracy: {accuracy:.2f}%")

  0%|          | 0/127 [00:00<?, ?it/s]

Validation Accuracy: 79.26%
