In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import sys
import os
import pyarrow.parquet as pq
import torch

In [75]:
def read_parquet_in_batches(file_path: str, batch_size=100000):
    """Lazily stream a Parquet file as a sequence of pandas DataFrames.

    Args:
        file_path: Location of the Parquet file to open.
        batch_size: Passed through to ``ParquetFile.iter_batches``.

    Yields:
        One DataFrame per record batch, in the order the file produces them.
        A cumulative progress percentage is printed right before each yield.
    """
    source = pq.ParquetFile(file_path)
    row_total = source.metadata.num_rows

    rows_seen = 0
    for record_batch in source.iter_batches(batch_size=batch_size):
        frame = record_batch.to_pandas()
        rows_seen = rows_seen + len(frame)

        # Share of the file's rows handed out so far, this batch included.
        progress = (rows_seen / row_total) * 100
        print(f'Progress: {progress:.2f}%')

        yield frame

In [76]:
# Number of leading batches to keep; they are consumed downstream as
# batches[0] and batches[1].
N_BATCHES = 2
# Variant id columns are removed; SiameseDataset reads the remaining
# columns by position.
ID_COLUMNS = ['variantid1', 'variantid2']

batches = []
for batch in read_parquet_in_batches(file_path='../data/train/siamence_main_pic.parquet'):
    # A non-in-place drop keeps this cell idempotent on re-run.
    batches.append(batch.drop(columns=ID_COLUMNS))
    if len(batches) == N_BATCHES:
        # Stop reading early: the rest of the file is not needed.
        break

# Fail here with a clear message instead of a later IndexError.
assert len(batches) == N_BATCHES, (
    f'expected {N_BATCHES} batches, but the file only yielded {len(batches)}'
)

Progress: 8.56%
Progress: 17.12%


In [78]:
from torch.utils.data import Dataset, DataLoader

class SiameseDataset(Dataset):
    """Map-style dataset serving (tensor1, tensor2, label) triples from a DataFrame.

    Columns are addressed by position: column 0 is read as the label and
    columns 1 and 2 as the two inputs of the pair.
    """

    # Positional column indices used by __getitem__.
    _LABEL_COL = 0
    _FIRST_COL = 1
    _SECOND_COL = 2

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def _float_tensor(self, idx, col):
        """Return the cell at (idx, col) as a float32 tensor."""
        cell = self.dataframe.iloc[idx, col]
        # The cell must expose .tolist() (e.g. a NumPy array or NumPy scalar).
        return torch.tensor(cell.tolist(), dtype=torch.float32)

    def __getitem__(self, idx):
        first = self._float_tensor(idx, self._FIRST_COL)
        second = self._float_tensor(idx, self._SECOND_COL)
        target = self._float_tensor(idx, self._LABEL_COL)
        return first, second, target

# The first streamed batch is used for training, the second for evaluation.
train_dataframe = batches[0]
test_dataframe = batches[1]

# Wrap each frame in a dataset and expose it through a mini-batch loader.
train_dataset = SiameseDataset(train_dataframe)
test_dataset = SiameseDataset(test_dataframe)

BATCH_SIZE = 32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps runs repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [79]:
import torch.nn as nn
import torch.nn.functional as F

class SiamenceNetwork(nn.Module):
    """Siamese MLP: both inputs pass through the same stack of linear layers.

    Args:
        input_dim: Size of each input vector.
        hidden_dims: Widths of the hidden layers; each is followed by a ReLU.
        embedding_dim: Size of the output embedding.

    The defaults build a 128 -> 64 -> 32 -> 10 network whose parameters live
    under ``fc.0``, ``fc.2`` and ``fc.4``.
    """

    def __init__(self, input_dim=128, hidden_dims=(64, 32), embedding_dim=10):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Final projection has no activation.
        layers.append(nn.Linear(in_features, embedding_dim))
        self.fc = nn.Sequential(*layers)

    def forward_one(self, x):
        """Embed one batch of inputs with the shared layers."""
        return self.fc(x)

    def forward(self, input1, input2):
        """Embed both sides of the pair with the same weights.

        Returns:
            Tuple (output1, output2) holding the embeddings of input1 and input2.
        """
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        return output1, output2

In [80]:
class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss over pairs of embeddings.

    Label convention: 1 marks a similar pair (its squared distance is
    penalised); 0 marks a dissimilar pair (penalised only while the pair is
    closer than ``margin``).

    Args:
        margin: Distance beyond which dissimilar pairs add no loss.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the mean contrastive loss of the batch as a scalar tensor."""
        euclidean_distance = F.pairwise_distance(output1, output2)

        # A (B, 1) label against a (B,) distance would broadcast to (B, B) and
        # silently combine every label with every pair. Align the shapes when
        # the element counts agree; other shapes keep normal broadcasting.
        if (label.shape != euclidean_distance.shape
                and label.numel() == euclidean_distance.numel()):
            label = label.reshape(euclidean_distance.shape)

        # Similar pairs: squared distance.
        similar_term = label * torch.pow(euclidean_distance, 2)
        # Dissimilar pairs: squared shortfall to the margin, zero once past it.
        dissimilar_term = (1 - label) * torch.pow(
            F.relu(self.margin - euclidean_distance), 2
        )
        return torch.mean(similar_term + dissimilar_term)
        

In [81]:
# Select the compute device: CUDA when present, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'device - {device}')

# Build the network and the loss on that device, then attach the optimizer.
model = SiamenceNetwork()
model = model.to(device)
criterion = ContrastiveLoss()
criterion = criterion.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device - cuda


In [82]:
num_epochs = 100

# Put the model in training mode explicitly; the evaluation helper in this
# notebook switches it to eval mode and never switches it back.
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0.0  # running sum of per-sample losses for this epoch
    for tensor1, tensor2, label in train_dataloader:
        # Move the batch to the selected device
        tensor1 = tensor1.to(device)
        tensor2 = tensor2.to(device)
        label = label.to(device)

        optimizer.zero_grad()

        # Forward pass
        output1, output2 = model(tensor1, tensor2)

        # Compute loss
        loss = criterion(output1, output2, label)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so
        # a smaller final batch is not over-counted.
        epoch_loss += loss.item() * tensor1.size(0)

    # Per-sample average over the training set. The loader iterated above is
    # train_dataloader, so its dataset is the one to normalise by.
    epoch_loss /= len(train_dataloader.dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

Epoch [1/100], Loss: 0.4117
Epoch [2/100], Loss: 0.4028
Epoch [3/100], Loss: 0.3997
Epoch [4/100], Loss: 0.3974
Epoch [5/100], Loss: 0.3956
Epoch [6/100], Loss: 0.3946
Epoch [7/100], Loss: 0.3937
Epoch [8/100], Loss: 0.3930
Epoch [9/100], Loss: 0.3920
Epoch [10/100], Loss: 0.3917
Epoch [11/100], Loss: 0.3912
Epoch [12/100], Loss: 0.3906
Epoch [13/100], Loss: 0.3897
Epoch [14/100], Loss: 0.3899
Epoch [15/100], Loss: 0.3893
Epoch [16/100], Loss: 0.3888
Epoch [17/100], Loss: 0.3884
Epoch [18/100], Loss: 0.3885
Epoch [19/100], Loss: 0.3885
Epoch [20/100], Loss: 0.3877
Epoch [21/100], Loss: 0.3873
Epoch [22/100], Loss: 0.3873
Epoch [23/100], Loss: 0.3871
Epoch [24/100], Loss: 0.3868
Epoch [25/100], Loss: 0.3865
Epoch [26/100], Loss: 0.3861
Epoch [27/100], Loss: 0.3859
Epoch [28/100], Loss: 0.3858
Epoch [29/100], Loss: 0.3852
Epoch [30/100], Loss: 0.3852
Epoch [31/100], Loss: 0.3849
Epoch [32/100], Loss: 0.3842
Epoch [33/100], Loss: 0.3842
Epoch [34/100], Loss: 0.3840
Epoch [35/100], Loss: 0

In [105]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

def test_model_with_accuracy(model, dataloader, criterion, device, threshold=0.5):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    num_samples = 0
    true_values = []
    predicted_values = []
    
    with torch.no_grad():  # Disable gradient calculation
        for tensor1, tensor2, label in dataloader:
            tensor1 = tensor1.to(device)
            tensor2 = tensor2.to(device)
            label = label.to(device)

            # Forward pass
            output1, output2 = model(tensor1, tensor2)

            # Compute loss
            loss = criterion(output1, output2, label)
            total_loss += loss.item() * tensor1.size(0)  # Accumulate loss
            num_samples += tensor1.size(0)

            # Compute predictions
            euclidean_distance = F.pairwise_distance(output1, output2)
            predictions = (euclidean_distance < threshold).float()
            
            # Move to CPU and detach before appending to lists
            predicted_values.extend(predictions.cpu().detach().numpy())
            true_values.extend(label.cpu().detach().numpy())
            
    
    true_values = np.array(true_values)
    predicted_values = np.array(predicted_values)
    
    average_loss = total_loss / num_samples
    accuracy = accuracy_score(true_values, predicted_values)
    recall = recall_score(true_values, predicted_values)
    f1 = f1_score(true_values, predicted_values)
    return average_loss, accuracy, recall, f1

# Sweep the decision threshold over 0.0, 0.1, ..., 0.9 and record each metric
# so the curves can be plotted afterwards.
thresholds = []
losses = []
accuracies = []
recalls = []
f1s = []

for i in range(0, 10):
    threshold = i * 0.1
    test_loss, test_accuracy, test_recall, test_f1 = test_model_with_accuracy(
        model, test_dataloader, criterion, device, threshold=threshold
    )
    thresholds.append(threshold)
    losses.append(test_loss)
    accuracies.append(test_accuracy)
    recalls.append(test_recall)
    # Store the scalar F1 value for this threshold.
    f1s.append(test_f1)
    print(f'Threshold: {threshold:.4f}, Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, Recall: {test_recall:.4f}, f1: {test_f1:.4f}, ')

Threshold: 0.0000, Loss: 0.3784, Accuracy: 0.5185, Recall: 0.0000, f1: 0.0000, 
Threshold: 0.1000, Loss: 0.3784, Accuracy: 0.7109, Recall: 0.6450, f1: 0.6824, 
Threshold: 0.2000, Loss: 0.3784, Accuracy: 0.7192, Recall: 0.7012, f1: 0.7063, 
Threshold: 0.3000, Loss: 0.3781, Accuracy: 0.7142, Recall: 0.7569, f1: 0.7183, 
Threshold: 0.4000, Loss: 0.3783, Accuracy: 0.6866, Recall: 0.8250, f1: 0.7171, 
Threshold: 0.5000, Loss: 0.3782, Accuracy: 0.6313, Recall: 0.8939, f1: 0.7001, 
Threshold: 0.6000, Loss: 0.3781, Accuracy: 0.5707, Recall: 0.9487, f1: 0.6803, 
Threshold: 0.7000, Loss: 0.3781, Accuracy: 0.5225, Recall: 0.9795, f1: 0.6639, 
Threshold: 0.8000, Loss: 0.3784, Accuracy: 0.4970, Recall: 0.9929, f1: 0.6553, 
Threshold: 0.9000, Loss: 0.3785, Accuracy: 0.4863, Recall: 0.9981, f1: 0.6517, 


In [1]:
import matplotlib.pyplot as plt

# Plot each metric against the distance threshold collected by the sweep.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresholds, f1s, label='F1 score', color='blue')
ax.plot(thresholds, accuracies, label='Accuracy', color='red')
ax.plot(thresholds, recalls, label='Recall', color='green')

ax.set_title('Metrics vs. distance threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.legend()
ax.grid(True)
plt.show()

NameError: name 'thresholds' is not defined

<Figure size 1000x600 with 0 Axes>