In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import sys
import os
import pyarrow.parquet as pq
import torch

In [2]:
def read_parquet_in_batches(file_path: str, batch_size=100000):

    parquet_file = pq.ParquetFile(file_path)

    total_rows = parquet_file.metadata.num_rows
    processed_rows = 0

    for batch in parquet_file.iter_batches(batch_size=batch_size):
        batch_df = batch.to_pandas()

        processed_rows += len(batch_df)
        progress = (processed_rows / total_rows) * 100
        print(f'Progress: {progress:.2f}%')
        
        yield batch_df

In [5]:
counter = 0
batches = []
for batch in read_parquet_in_batches(file_path='../data/train/train_data_validate.parquet'):
    if not batches:
        batches.append(batch)
    else:
        batches.append(batch)
        break
    
for batch in batches:
    batch.drop(columns=['variantid1', 'variantid2'], inplace=True)

Progress: 8.56%
Progress: 17.12%


In [6]:
batches[0].head()

Unnamed: 0,embedding1,embedding2,target
0,"[0.5318107604980469, 0.35363996028900146, -0.7...","[0.5318107604980469, 0.35363996028900146, -0.7...",1
1,"[0.4308440089225769, 0.7620932459831238, 0.793...","[0.5668608546257019, 0.9573432803153992, 1.017...",1
2,"[-0.36238163709640503, 0.4316844344139099, -0....","[-0.25123998522758484, 0.3757574260234833, -0....",0
3,"[0.7327960729598999, -0.7488707900047302, 0.55...","[0.7327960729598999, -0.7488707900047302, 0.55...",1
4,"[-1.3140270709991455, -0.8071212768554688, 0.7...","[-0.49589139223098755, -0.5760805606842041, 0....",0


In [11]:
test_tensor = torch.tensor(batches[0].iloc[0, 1])

In [12]:
test_tensor.shape

torch.Size([320])

In [13]:
from torch.utils.data import Dataset, DataLoader

class SiameseDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Get the tensors and label from the dataframe
        tensor1 = torch.tensor(self.dataframe.iloc[idx, 0].tolist(), dtype=torch.float32)
        tensor2 = torch.tensor(self.dataframe.iloc[idx, 1].tolist(), dtype=torch.float32)
        label = torch.tensor(self.dataframe.iloc[idx, 2].tolist(), dtype=torch.float32)
        
        return tensor1, tensor2, label

# Load your dataframe
# dataframe = pd.read_csv('../data/train/siamence_main_pic.parquet')
train_dataframe = batches[0]
test_dataframe = batches[1]

# Initialize the dataset and dataloader
train_dataset = SiameseDataset(train_dataframe)
test_dataset = SiameseDataset(test_dataframe)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [14]:
import torch.nn as nn
import torch.nn.functional as F

class SiamenceNetwork(nn.Module):
    def __init__(self):
        super(SiamenceNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(320,128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 10)
        )
        
    def forward_one(self, x):
        return self.fc(x)
    
    def forward(self, input1, input2):
        outpu1 = self.forward_one(input1)
        outpu2 = self.forward_one(input2)
        return outpu1, outpu2

In [15]:
class ContrastiveLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        # Compute Euclidean distance
        euclidean_distance = F.pairwise_distance(output1, output2)
        
        # Contrastive loss
        loss = torch.mean(
            (label) * torch.pow(euclidean_distance, 2) +  # Similar pairs: distance squared
            (1 - label) * torch.pow(F.relu(self.margin - euclidean_distance), 2)  # Dissimilar pairs: margin - distance squared
        )
        return loss
        

In [38]:
# Initialize model, loss, and optimizer

# Check if GPU is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'device - {device}')

# Initialize model, loss, and optimizer
model = SiamenceNetwork().to(device)
criterion = ContrastiveLoss().to(device) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device - cuda


In [47]:
def train_model(model, dataloader, criterion, optimizer, device, num_epochs, CE=False):
    for epoch in range(num_epochs):
        epoch_loss = 0.0  # To keep track of epoch loss
        for tensor1, tensor2, label in dataloader:
            # Move data to GPU
            tensor1 = tensor1.to(device)
            tensor2 = tensor2.to(device)
            label = label.to(device)

            optimizer.zero_grad()

            # Forward pass
            if CE:
                output = model(tensor1, tensor2)
                loss = criterion(output, label.unsqueeze(1))
            else:
                output1, output2 = model(tensor1, tensor2)
                loss = criterion(output1, output2, label)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item() * tensor1.size(0)  # Accumulate loss

        # Average loss for the epoch
        epoch_loss /= len(dataloader.dataset)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

In [40]:
num_epochs = 4
train_model(model, train_dataloader, criterion, optimizer, device, num_epochs)

Epoch [1/4], Loss: 0.1989
Epoch [2/4], Loss: 0.1655
Epoch [3/4], Loss: 0.1532
Epoch [4/4], Loss: 0.1454


In [49]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

def test_model_with_accuracy(model, dataloader, criterion, device, threshold=0.5, CE=False):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    num_samples = 0
    true_values = []
    predicted_values = []
    
    with torch.no_grad():  # Disable gradient calculation
        for tensor1, tensor2, label in dataloader:
            tensor1 = tensor1.to(device)
            tensor2 = tensor2.to(device)
            label = label.to(device)

            # Forward pass
            if CE:
                output = model(tensor1, tensor2)
                loss = criterion(output, label.unsqueeze(1))
                predictions = (output < threshold).float()
            else:
                output1, output2 = model(tensor1, tensor2)
                loss = criterion(output1, output2, label)
                euclidean_distance = F.pairwise_distance(output1, output2)
                predictions = (euclidean_distance < threshold).float()
            
            total_loss += loss.item() * tensor1.size(0)  # Accumulate loss
            num_samples += tensor1.size(0)
            
            # Move to CPU and detach before appending to lists
            predicted_values.extend(predictions.cpu().detach().numpy())
            true_values.extend(label.cpu().detach().numpy())
            
    
    true_values = np.array(true_values)
    predicted_values = np.array(predicted_values)
    
    average_loss = total_loss / num_samples
    accuracy = accuracy_score(true_values, predicted_values)
    recall = recall_score(true_values, predicted_values)
    f1 = f1_score(true_values, predicted_values)
    return average_loss, accuracy, recall, f1


In [43]:
thresholds = []
losses = []
accuracies = []
recalls = []
f1s = []

threshold = 0
for i in range(3, 8):
    threshold = i*0.1
    test_loss, test_accuracy, test_recall, test_f1 = test_model_with_accuracy(model, test_dataloader, criterion, device, threshold=threshold)
    thresholds.append(threshold)
    losses.append(test_loss)
    accuracies.append(test_accuracy)
    recalls.append(test_recall)
    f1s.append(f1s)
    print(f'Threshold: {threshold:.4f}, Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, Recall: {test_recall:.4f}, f1: {test_f1:.4f}, ')

Threshold: 0.3000, Loss: 0.1612, Accuracy: 0.7516, Recall: 0.5670, f1: 0.6873, 
Threshold: 0.4000, Loss: 0.1612, Accuracy: 0.7821, Recall: 0.7106, f1: 0.7585, 
Threshold: 0.5000, Loss: 0.1612, Accuracy: 0.7791, Recall: 0.8074, f1: 0.7788, 
Threshold: 0.6000, Loss: 0.1612, Accuracy: 0.7567, Recall: 0.8741, f1: 0.7758, 
Threshold: 0.7000, Loss: 0.1612, Accuracy: 0.7231, Recall: 0.9173, f1: 0.7613, 


In [44]:
class NewNetwork(nn.Module):
    def __init__(self, pretrained_fc):
        super(NewNetwork, self).__init__()
        # Use the pre-trained fully connected layers
        self.fc = pretrained_fc
        
        # New layers after concatenation
        self.concat_fc = nn.Sequential(
            nn.Linear(20, 64),  # 10 + 10 = 20
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # Sigmoid activation for binary classification
        )

    def forward(self, input1, input2):
        # Get outputs from the pretrained fc layers
        output1 = self.fc(input1)
        output2 = self.fc(input2)
        
        # Concatenate the two outputs along the feature dimension
        concat_output = torch.cat((output1, output2), dim=1)
        
        # Pass the concatenated output through the new layers
        final_output = self.concat_fc(concat_output)
        
        return final_output

In [45]:
import torch
import torch.nn as nn
import torch.optim as optim

new_model = NewNetwork(model.fc)
new_model = new_model.to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(new_model.parameters(), lr=0.001)  # Adam optimizer

# Optionally freeze the pretrained layers
for param in new_model.fc.parameters():
    param.requires_grad = False

In [48]:
num_epochs=10
train_model(new_model, train_dataloader, criterion, optimizer, device, num_epochs, CE=True)

Epoch [1/10], Loss: 0.4923
Epoch [2/10], Loss: 0.4253
Epoch [3/10], Loss: 0.4171
Epoch [4/10], Loss: 0.4127
Epoch [5/10], Loss: 0.4097
Epoch [6/10], Loss: 0.4091
Epoch [7/10], Loss: 0.4075
Epoch [8/10], Loss: 0.4067
Epoch [9/10], Loss: 0.4052
Epoch [10/10], Loss: 0.4038


In [50]:
thresholds = []
losses = []
accuracies = []
recalls = []
f1s = []

threshold = 0
for i in range(3, 8):
    threshold = i*0.1
    test_loss, test_accuracy, test_recall, test_f1 = test_model_with_accuracy(new_model, test_dataloader, criterion, device, threshold=threshold, CE=True)
    thresholds.append(threshold)
    losses.append(test_loss)
    accuracies.append(test_accuracy)
    recalls.append(test_recall)
    f1s.append(f1s)
    print(f'Threshold: {threshold:.4f}, Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, Recall: {test_recall:.4f}, f1: {test_f1:.4f}, ')

Threshold: 0.3000, Loss: 0.4799, Accuracy: 0.2409, Recall: 0.1310, f1: 0.1426, 
Threshold: 0.4000, Loss: 0.4799, Accuracy: 0.2248, Recall: 0.1823, f1: 0.1847, 
Threshold: 0.5000, Loss: 0.4799, Accuracy: 0.2184, Recall: 0.2370, f1: 0.2260, 
Threshold: 0.6000, Loss: 0.4799, Accuracy: 0.2219, Recall: 0.2963, f1: 0.2683, 
Threshold: 0.7000, Loss: 0.4799, Accuracy: 0.2347, Recall: 0.3649, f1: 0.3147, 


In [24]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10,6))
# plt.plot(thresholds, f1s, label='F1 score', color='blue')
# plt.plot(thresholds, accuracies, label='Accuracy', color='red')
# plt.plot(thresholds, recalls, label='Recall', color='greed')

# plt.xlabel('Threshold')
# plt.ylabel('Score')
# plt.legend()
# plt.grid(True)
# plt.show()