In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

In [2]:
def read_parquet_in_batches(file_path: str, batch_size: int = 10000):
    """Lazily read a Parquet file and yield it as pandas DataFrames.

    The file is opened with ``pq.ParquetFile`` and consumed through
    ``iter_batches`` so that only one batch is materialised at a time.
    After each batch is converted, a ``Progress: xx.xx%`` line is printed,
    computed from the rows yielded so far against the row count stored in
    the file metadata.

    Parameters
    ----------
    file_path : str
        Path of the Parquet file to read.
    batch_size : int, default 10000
        Value forwarded to ``ParquetFile.iter_batches(batch_size=...)``.

    Yields
    ------
    pandas.DataFrame
        The result of ``to_pandas()`` for each batch, in file order.

    Notes
    -----
    This is a generator: nothing is opened or read until iteration starts.
    """
    parquet_file = pq.ParquetFile(file_path)

    total_rows = parquet_file.metadata.num_rows
    processed_rows = 0

    for batch in parquet_file.iter_batches(batch_size=batch_size):
        batch_df = batch.to_pandas()

        processed_rows += len(batch_df)
        # Guard the percentage against a zero row count in the metadata so an
        # empty batch cannot raise ZeroDivisionError; an empty file is
        # reported as fully processed.
        progress = (processed_rows / total_rows) * 100 if total_rows else 100.0
        print(f'Progress: {progress:.2f}%')

        yield batch_df

In [8]:
# Enrich every row of the train set with the main-picture ResNet embedding of
# both variants in the pair, then persist the result as one Parquet file.

# Load the embedding lookup ONCE, before the loop. It does not depend on the
# current batch, so re-reading the file for every batch only repeats the same
# disk read. Only the two columns that are actually used are requested.
pic_df = pd.read_parquet(
    '../data/train/resnet.parquet',
    columns=['variantid', 'main_pic_embeddings_resnet_v1'],
).set_index('variantid')

# Collect processed batches here; they are concatenated after the loop.
# NOTE(review): every batch stays in memory until the final concat, so peak
# memory is still the size of the full enriched train set.
batches = []

# Iterate over the train parquet file in batches
for train_batch in read_parquet_in_batches(file_path='../data/train/train.parquet'):
    # Map each variant id to its embedding; ids that are absent from the
    # lookup index come back as NaN, so no NaN pre-initialisation is needed.
    train_batch['main_pic1'] = train_batch['variantid1'].map(pic_df['main_pic_embeddings_resnet_v1'])
    train_batch['main_pic2'] = train_batch['variantid2'].map(pic_df['main_pic_embeddings_resnet_v1'])

    # Collect the processed batch
    batches.append(train_batch)

# Concatenate all processed batches and save to a single Parquet file.
# Skipped when the train file produced no batches, because pd.concat raises
# on an empty list.
if batches:
    # NOTE(review): each batch keeps its own index, so labels may repeat in
    # final_df — confirm whether ignore_index=True is wanted here.
    final_df = pd.concat(batches)
    final_df.to_parquet('../data/train/siamence_main_pic.parquet')

Progress: 0.86%
