<a href="https://colab.research.google.com/github/ganeshred/NLPAuthorshipAttributionNLM/blob/main/Siamese_Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python --version

Python 3.10.11


In [2]:
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from torchtext import data
from torchtext import datasets

TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm')

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
import torch
from torchtext import data
from torchtext import datasets

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


In [6]:
import pandas as pd
df = pd.read_csv('/content/gdrive/MyDrive/task1.csv')

In [7]:
df.head()

Unnamed: 0,T1,T2,Same
0,Senegal has partnered with a UK-based health d...,I'm relieved that Harvey Weinstein will finall...,1
1,the man on the phone: what's it like making hi...,flame towers: luxury atop one of the world's t...,1
2,", Callum Michael RebelJenna Dewan has announce...",The current crop of twentysomethings are going...,1
3,learning to live with the coronavirus q if i h...,guinea-bissau: political chaos could boost coc...,1
4,"athletes allege abuse, racism at u. of illinoi...",contact tracing is key to america's coronaviru...,1


In [8]:

# TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm', include_lengths = True)
# LABEL = data.LabelField(sequential=False, use_vocab=False, dtype = torch.float)

# preprocessed_text = df['T1'].apply(
#     lambda x: TEXT.preprocess(x)
# )

# # load fastext simple embedding with 300d
# TEXT.build_vocab(
#     preprocessed_text, 
#     max_size = 30000,
#     vectors='glove.6B.50d',
#     unk_init = torch.Tensor.normal_
# )

# vocab = TEXT.vocab



In [9]:
import pandas as pd
import torch
import torchtext.data as data
import torchtext.vocab as vocab

# Load the DataFrame
# df = pd.read_csv('data.csv')

# Create the data fields
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', include_lengths=True)
LABEL = data.LabelField(sequential=False, use_vocab=False, dtype=torch.float)

preprocessed_text=[]
# Preprocess the text and combine into tuples
for t1,t2 in zip(df['T1'], df['T2']):
  if type(t2)==float:
    t2=''
  preprocessed_text.append((TEXT.preprocess(t1), TEXT.preprocess(t2)))

# preprocessed_text = [(TEXT.preprocess(t1), TEXT.preprocess(t2)) for t1, t2 in zip(df['T1'], df['T2'])]

# Build the vocabulary using GloVe embeddings
glove = vocab.GloVe(name='6B', dim=50)
TEXT.build_vocab(preprocessed_text, max_size=30000, vectors=glove)

# Extract the vocabulary
vocab = TEXT.vocab


In [10]:
ltoi = {l: i for i, l in enumerate(df['Same'].unique())}

In [11]:
ltoi

{1: 0, 0: 1}

In [12]:
df = df.dropna()

In [13]:
len(df)

9996

In [14]:
examples=[]
for i,row in df.iterrows():
  # print(example)
  examples.append(data.Example.fromlist(row, fields=[('text1', TEXT),('text2', TEXT), ('label', LABEL)]))

In [15]:
# list(examples[0].text1)

In [16]:
dataset = data.Dataset(examples, fields=[('text1', TEXT),('text2', TEXT), ('label', LABEL)])

In [17]:
train_dataset, test_dataset, valid_dataset = dataset.split(split_ratio=[0.65, 0.15, 0.2], stratified=True, strata_field='label')

In [18]:
len(train_dataset)

6497

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("We are working with ", device)
# device = 'cpu'
train_iter, test_iter, valid_iter = data.BucketIterator.splits(
    datasets=(train_dataset, test_dataset, valid_dataset), 
    batch_size=32,
    sort_within_batch = True,
    sort_key=lambda x: max(len(x.text1),len(x.text2)),
    device=device
)

We are working with  cuda


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim


In [21]:
class SiameseNetwork(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SiameseNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward_once(self, x):
        # x,_ = x
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        out = self.fc(hidden.squeeze(0))
        return out

    def forward(self, sentence1, sentence2):
        output1 = self.forward_once(sentence1)
        output2 = self.forward_once(sentence2)
        return output1, output2

In [22]:
def train(model, train_loader, criterion, optimizer, threshold):
    model.train()
    loss_avg = []
    # correct_avg = []
    correct=0
    total=0
    for i, batch in enumerate(train_loader):
        sentence1, sentence2 = batch.text1, batch.text2
        sentence1, sentence1_lengths = sentence1
        sentence2, sentence2_lengths = sentence2
        sentence1 = torch.transpose(sentence1, 0, 1)
        sentence2 = torch.transpose(sentence2, 0, 1)
        label = batch.label.float()
        optimizer.zero_grad()
        output1, output2 = model(sentence1, sentence2)
        loss = criterion(output1, output2, label)
        loss.backward()
        optimizer.step()

        # Calculate accuracy and loss for this batch
        # predictions = (output1 - output2) < threshold
        # correct = (predictions == label.byte()).sum().item()
        # accuracy = correct / label.size(0)
        loss_avg.append(loss.item())
        total += batch.label.size(0)
        euclidean_distance = F.pairwise_distance(output1, output2)
        predictions = (euclidean_distance <= threshold).float()
        correct += (predictions == batch.label).sum().item()
        # correct_avg.append(correct/total)

    # Calculate average accuracy and loss for the epoch
    # print(correct_avg)
    epoch_loss = sum(loss_avg) / len(loss_avg)
    # epoch_acc = sum(correct_avg) / len(correct_avg)

    # Return accuracy and loss for the epoch
    return 100 * correct / total, epoch_loss



# def train(model, train_loader, criterion, optimizer, threshold):
#     model.train()
#     loss_avg=[]
#     correct_avg=[]
#     for i,batch in enumerate(train_loader):
#         # print(batch.size())
#         sentence1, sentence2 = batch.text1, batch.text2
#         sentence1,sentence1_lengths = sentence1
#         sentence2,sentence2_lengths = sentence2
#         sentence1 = torch.transpose(sentence1, 0, 1)
#         sentence2 = torch.transpose(sentence2, 0, 1)
#         # print(sentence1.size())
#         # print(sentence2.size())
#         label = batch.label.float()
#         optimizer.zero_grad()
#         output1, output2 = model(sentence1, sentence2)
#         # print(output1.size())
#         # print(output2.size())
#         # print(label.size())
#         loss = criterion(output1, output2, label)
#         loss.backward()
#         optimizer.step()
#         predicted = (euclidean_distance <= threshold).float()
#         total += batch.label.size(0)
#         loss_avg.append(loss.item())

#         # print('Step [{}/{}], Loss: {:.4f}'.format(i+1, len(train_loader), loss.item()))
#         # correct += (predicted == batch.label).sum().item()



#     return correct*100/total, sum(loss_avg)/len(loss_avg)

#     # print('Accuracy of the network on the test sentences: {} %'.format(100 * correct / total))






In [23]:
def test(model, test_loader, threshold):
    model.eval()
    correct = 0
    total = 0
    loss_avg = []
    with torch.no_grad():
        for batch in test_loader:
            sentence1, sentence2 = batch.text1, batch.text2
            sentence1, sentence1_lengths = sentence1
            sentence2, sentence2_lengths = sentence2
            sentence1 = torch.transpose(sentence1, 0, 1)
            sentence2 = torch.transpose(sentence2, 0, 1)
            label = batch.label.float()
            optimizer.zero_grad()
            output1, output2 = model(sentence1, sentence2)
            loss = criterion(output1, output2, label)
            loss_avg.append(loss.item())
            # output1, output2 = model(sentence1, sentence2)
            euclidean_distance = F.pairwise_distance(output1, output2)
            predicted = (euclidean_distance <= threshold).float()
            total += batch.label.size(0)
            correct += (predicted == batch.label).sum().item()
    # print('Accuracy of the network on the test sentences: {} %'.format(100 * correct / total))
    return sum(loss_avg) / len(loss_avg), 100 * correct / total


In [24]:
import torch.nn as nn
import torch.nn.functional as F


class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2, keepdim=True)
        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))

        return loss_contrastive


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [26]:
import torch.optim as optim
from torch.utils.data import DataLoader

# Define batch size
# batch_size = 32
# def collate_fn(batch):
#     text1 = torch.cat([x.text1 for x in batch], dim=1)
#     text2 = torch.cat([x.text2 for x in batch], dim=1)
#     label = torch.cat([x.label for x in batch])
#     return text1, text2, label


# # Create data loaders
# train_loader = DataLoader(train_iter, batch_size=batch_size, shuffle=True,collate_fn=collate_fn)
# test_loader = DataLoader(test_iter, batch_size=batch_size,collate_fn=collate_fn)
# valid_loader = DataLoader(valid_iter, batch_size=batch_size,collate_fn=collate_fn)


In [27]:
embedding_dim = 50
hidden_dim = 50
dropout = 0.2
lr = 0.001
num_epochs = 10

# Initialize the model and move it to the device
model = SiameseNetwork(len(TEXT.vocab), embedding_dim, hidden_dim)
model.to(device)

# Define loss function and optimizer
criterion = ContrastiveLoss(margin=1)
optimizer = optim.Adam(model.parameters(), lr=lr)


In [28]:
# for epoch in range(num_epochs):
#     train(model, train_iter, criterion, optimizer, threshold=0.5)
#     print(f"Epoch {epoch+1} completed")

#     # Evaluate the model on validation set
#     val_loss, val_acc = test(model, valid_iter, criterion)
#     print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

for epoch in range(num_epochs):
    train_acc, train_loss = train(model, train_iter, criterion, optimizer, threshold=0.5)
    print(f"Epoch {epoch+1} completed. Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

    # Evaluate the model on validation setr
    val_loss, val_acc = test(model, valid_iter, 0.5)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")



Epoch 1 completed. Train Loss: 0.3382, Train Accuracy: 56.6723
Validation Loss: 0.3271, Validation Accuracy: 59.0000
Epoch 2 completed. Train Loss: 0.3087, Train Accuracy: 60.3355
Validation Loss: 0.3428, Validation Accuracy: 59.1333
Epoch 3 completed. Train Loss: 0.3055, Train Accuracy: 59.7507
Validation Loss: 0.3298, Validation Accuracy: 58.1333
Epoch 4 completed. Train Loss: 0.2986, Train Accuracy: 60.0277
Validation Loss: 0.3344, Validation Accuracy: 54.0000
Epoch 5 completed. Train Loss: 0.2975, Train Accuracy: 57.5189
Validation Loss: 0.3302, Validation Accuracy: 59.3333
Epoch 6 completed. Train Loss: 0.2939, Train Accuracy: 55.5487
Validation Loss: 0.3235, Validation Accuracy: 59.6667
Epoch 7 completed. Train Loss: 0.2917, Train Accuracy: 54.6098
Validation Loss: 0.3256, Validation Accuracy: 57.9333
Epoch 8 completed. Train Loss: 0.2851, Train Accuracy: 52.6859
Validation Loss: 0.3295, Validation Accuracy: 57.0000
Epoch 9 completed. Train Loss: 0.2808, Train Accuracy: 53.2092
V