In [1]:
import pyarrow.parquet as pq
import pandas as pd
import random

# Source dataset that the sample is drawn from
INPUT_FILE = "sentences.parquet"

# Destination file for each split (train / test / validation)
TRAIN_FILE, TEST_FILE, VAL_FILE = "train.parquet", "test.parquet", "val.parquet"

# Number of rows wanted in each split
TRAIN_SIZE, TEST_SIZE, VAL_SIZE = 1_000_000, 100_000, 100_000

# Total number of rows to sample from the input: 1.2M
TOTAL_SIZE = sum((TRAIN_SIZE, TEST_SIZE, VAL_SIZE))

# Draw a uniform random sample of up to TOTAL_SIZE rows from INPUT_FILE in a
# single streaming pass (reservoir sampling, Algorithm R), shuffle it, split
# it into train / test / val, and write each split to its own parquet file.

# One seed drives both the sampling and the shuffle so a rerun on the same
# input produces the same three files.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)  # private RNG; the global `random` state is untouched

reservoir = []  # sampled rows, each stored as a plain tuple
columns = None  # column labels, captured from the first batch
n = 0  # total rows seen so far

# Read in batches so the whole file is never loaded at once
parquet = pq.ParquetFile(INPUT_FILE)

for batch in parquet.iter_batches(batch_size=10_000):
    df_batch = batch.to_pandas()
    if columns is None:
        columns = df_batch.columns
    # name=None yields plain tuples instead of namedtuples; the column labels
    # are re-attached from `columns` when the DataFrame is built below.
    for row in df_batch.itertuples(index=False, name=None):
        n += 1
        if len(reservoir) < TOTAL_SIZE:
            # Fill phase: the first TOTAL_SIZE rows are always kept.
            reservoir.append(row)
            continue
        # Replacement phase: row number n takes a uniformly chosen slot with
        # probability TOTAL_SIZE / n, which keeps the sample uniform.
        j = rng.randrange(n)
        if j < TOTAL_SIZE:
            reservoir[j] = row

if not reservoir:
    # Without this check an empty input would surface later as an obscure error.
    raise ValueError(f"{INPUT_FILE!r} contains no rows; nothing to sample.")

# Convert reservoir to DataFrame
df = pd.DataFrame(reservoir, columns=columns)

# Shuffle the sample
df = df.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)

# Split into train, test, val.
# The boundaries are scaled by the number of rows actually sampled: with a
# full reservoir they equal TRAIN_SIZE and TRAIN_SIZE + TEST_SIZE exactly,
# and with a short input every split keeps its share instead of train taking
# almost everything.
sampled = len(df)
if sampled < TOTAL_SIZE:
    print(
        f"Warning: input has only {sampled:,} rows (fewer than the requested "
        f"{TOTAL_SIZE:,}); splitting proportionally."
    )
train_end = sampled * TRAIN_SIZE // TOTAL_SIZE
test_end = train_end + sampled * TEST_SIZE // TOTAL_SIZE

train_df = df.iloc[:train_end]
test_df = df.iloc[train_end:test_end]
val_df = df.iloc[test_end:]

# Save to parquet
train_df.to_parquet(TRAIN_FILE, index=False)
test_df.to_parquet(TEST_FILE, index=False)
val_df.to_parquet(VAL_FILE, index=False)

print("✅ Done! Saved train, test, and val parquet files.")


✅ Done! Saved train, test, and val parquet files.
