In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa

In [2]:
def read_parquet_in_batches(file_path: str, batch_size=10000):
    """Lazily yield the contents of a Parquet file as pandas DataFrames.

    The file is read one record batch at a time via ``iter_batches``. After
    each batch is converted, the cumulative share of rows read so far
    (relative to the row count stored in the Parquet metadata) is printed.

    Args:
        file_path: Path of the Parquet file to read.
        batch_size: Value forwarded to ``iter_batches`` as ``batch_size``.

    Yields:
        A pandas DataFrame holding the rows of the current batch.
    """
    source = pq.ParquetFile(file_path)
    row_count = source.metadata.num_rows

    rows_seen = 0
    for record_batch in source.iter_batches(batch_size=batch_size):
        frame = record_batch.to_pandas()
        rows_seen = rows_seen + len(frame)

        # Report progress before handing the frame to the consumer.
        progress = (rows_seen / row_count) * 100
        print(f'Progress: {progress:.2f}%')

        yield frame

In [3]:
# File paths
train_file_path = '../data/train/train.parquet'
pic_file_path = '../data/train/resnet.parquet'
output_file_path = '../data/train/siamence_main_pic.parquet'

# Read the entire pic parquet file into a DataFrame indexed by variant ID, and
# select the embedding column once so the loop does not re-select it per batch.
pic_df = pd.read_parquet(pic_file_path).set_index('variantid')
main_pic_embeddings = pic_df['main_pic_embeddings_resnet_v1']

# The Parquet writer is created lazily from the first batch's schema.
# `first_batch` stays True (and no writer exists) until that has happened.
first_batch = True

try:
    # Iterate over the train parquet file in batches
    for train_batch in read_parquet_in_batches(train_file_path):
        # Look up the embedding for each side of the pair by variant ID.
        # Variant IDs that are not in pic_df's index map to NaN.
        train_batch['main_pic1'] = train_batch['variantid1'].map(main_pic_embeddings)
        train_batch['main_pic2'] = train_batch['variantid2'].map(main_pic_embeddings)

        # Convert DataFrame to PyArrow Table for appending.
        # NOTE(review): each batch's schema is inferred independently; a batch
        # whose new columns are entirely missing may infer a different type
        # than the first batch and be rejected by write_table — confirm.
        table = pa.Table.from_pandas(train_batch)

        if first_batch:
            # Create the Parquet writer using the first batch's schema
            writer = pq.ParquetWriter(output_file_path, table.schema)
            first_batch = False

        # Append the current batch to the Parquet file
        writer.write_table(table)
finally:
    # Close the writer even if a batch failed, so the file handle is released
    # instead of being left open for the rest of the kernel session.
    if not first_batch:
        writer.close()