In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import sys
import os
import pyarrow.parquet as pq
import torch

In [2]:
def read_parquet_in_batches(file_path: str, batch_size: int = 10000):
    """Lazily stream a Parquet file as a sequence of pandas DataFrames.

    Args:
        file_path: Path of the Parquet file to open.
        batch_size: Row count passed to ``ParquetFile.iter_batches`` for
            each record batch.

    Yields:
        One DataFrame per record batch, in the order the file produces them.

    Side effects:
        After each batch is converted, prints the cumulative share of the
        file's rows processed so far as ``Progress: NN.NN%``.
    """
    parquet_file = pq.ParquetFile(file_path)
    total_rows = parquet_file.metadata.num_rows
    processed_rows = 0

    for batch in parquet_file.iter_batches(batch_size=batch_size):
        batch_df = batch.to_pandas()

        processed_rows += len(batch_df)
        # Guard the division in case the metadata reports zero rows.
        progress = (processed_rows / total_rows) * 100 if total_rows else 100.0
        print(f'Progress: {progress:.2f}%')

        yield batch_df

In [3]:
# Pull only the first batch for interactive inspection. next() states that
# intent directly and raises StopIteration right here if the generator yields
# nothing, instead of leaving `batch` undefined for the cells that follow.
batch = next(read_parquet_in_batches(file_path='../data/train/resnet.parquet'))

Progress: 0.44%


In [4]:
# Preview the first rows of the first batch to inspect its columns.
batch.head()

Unnamed: 0,variantid,main_pic_embeddings_resnet_v1,pic_embeddings_resnet_v1
0,47920382,"[[0.8170074820518494, 0.9416620135307312, 0.31...","[[0.20931944251060486, -0.29257065057754517, -..."
1,49801845,"[[-0.43339717388153076, -0.17318281531333923, ...",
2,49853444,"[[0.11314830183982849, -0.34010639786720276, -...",
3,49893028,"[[0.25037717819213867, 0.33753663301467896, 0....",
4,49987483,"[[0.43453288078308105, 0.09419603645801544, -0...","[[-0.18672508001327515, -0.2851635217666626, -..."


In [19]:
# Build a float32 tensor from the batch's main-picture embedding column.
# The column is first converted to a NumPy array and then to a Python list
# before being handed to torch.tensor — presumably to unwrap the object-dtype
# storage of the nested embeddings; TODO confirm whether np.stack would be an
# equivalent, faster route.
arr = torch.tensor(np.array(batch['main_pic_embeddings_resnet_v1']).tolist(), dtype=torch.float32)

In [21]:
# Check the dimensions of the tensor built from the embedding column.
arr.shape


torch.Size([10000, 1, 128])

In [None]:
from torch.utils.data import Dataset, DataLoader

class SiameseDataset(Dataset):
    """Map-style dataset that serves (tensor, tensor, label) triples from a DataFrame.

    Each row is read positionally: columns 0 and 1 hold the two inputs of a
    pair and column 2 holds the label.
    """

    def __init__(self, dataframe):
        # Keep a reference to the frame; rows are converted on demand.
        self.dataframe = dataframe

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Convert the first three cells of row `idx` to tensors, in column order.
        return tuple(
            torch.tensor(self.dataframe.iloc[idx, column])
            for column in range(3)
        )

# Load the Siamese training pairs. The path points at a .parquet file, so it
# is read with read_parquet; read_csv would try to parse it as delimited text.
dataframe = pd.read_parquet('../data/train/siamence_main_pic.parquet')

# Wrap the frame in the dataset and serve it in shuffled mini-batches of 32.
dataset = SiameseDataset(dataframe)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
import torch.nn as nn
import torch.functional as F

class SiamenceNetwork(nn.Module):
    """Siamese network: one shared MLP applied to both inputs of a pair.

    The shared branch maps 128 input features through 64 and 32 hidden
    units (ReLU after each hidden layer) to a 10-dimensional output.
    """

    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            # The activation class is spelled `ReLU`; `nn.ReLu` does not exist
            # and raised AttributeError when the model was constructed.
            nn.ReLU(),
            nn.Linear(32, 10)
        )

    def forward_one(self, x):
        """Run a single input through the shared branch and return its output."""
        return self.fc(x)

    def forward(self, input1, input2):
        """Embed both inputs with the same weights.

        Returns:
            A tuple ``(output1, output2)`` holding the branch output for
            ``input1`` and ``input2`` respectively.
        """
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        return output1, output2