In [2]:
import vaex
import dask.dataframe as dd
import pandas as pd
import numpy as np
import glob
import os

def process_review_data(input_dir, output_dir, chunksize=100000):
    """Convert review TSV files to Parquet chunks and preview them with Vaex.

    Every ``*.tsv`` file directly inside ``input_dir`` is read with pandas in
    chunks of ``chunksize`` rows. Each chunk is written to
    ``output_dir`` as ``review_chunk_{file_index}_{chunk_index}.parquet``
    (snappy-compressed) through a single-partition Dask DataFrame. All
    written chunks are then opened together with Vaex and the head of the
    combined DataFrame is printed.

    Args:
        input_dir (str): Directory containing the input TSV files.
        output_dir (str): Directory to save the output Parquet chunks.
            It is created if it does not exist yet.
        chunksize (int, optional): Number of rows to process per chunk.
            Useful for very large files to avoid memory issues.
            Defaults to 100000.

    Returns:
        None. Progress and the preview are reported via ``print``. If no
        TSV files are found, a message is printed and nothing is written.
    """
    # 1. Find all TSV files in the input directory.
    # glob returns matches in arbitrary order; sorting keeps the file index
    # used in the chunk names reproducible between runs.
    tsv_pattern = os.path.join(input_dir, "*.tsv")
    tsv_files = sorted(glob.glob(tsv_pattern))

    if not tsv_files:
        print(f"No TSV files found in: {input_dir}")
        return

    print(f"Found {len(tsv_files)} TSV files to process.")

    # Make sure the destination exists before the first chunk is written.
    os.makedirs(output_dir, exist_ok=True)

    # 2. Process TSV files in chunks and convert to Parquet
    parquet_files = []
    for i, tsv_file in enumerate(tsv_files):
        print(f"Processing file: {tsv_file} ({i+1}/{len(tsv_files)})")
        # Reading in chunks bounds memory use to roughly `chunksize` rows.
        chunks = pd.read_csv(tsv_file, sep='\t', chunksize=chunksize)
        for j, chunk in enumerate(chunks):
            print(f"  Processing chunk: {j+1}")
            # One partition per chunk: the chunk is already small.
            dask_chunk = dd.from_pandas(chunk, npartitions=1)
            # (file index, chunk index) makes the chunk name unique.
            parquet_chunk_path = os.path.join(
                output_dir, f"review_chunk_{i}_{j}.parquet"
            )
            # NOTE(review): Dask's to_parquet treats the path as a dataset
            # directory - confirm Vaex opens these paths as intended.
            dask_chunk.to_parquet(parquet_chunk_path, compression='snappy')
            parquet_files.append(parquet_chunk_path)

    print(f"Saved Parquet chunks to: {output_dir}")

    # 3. Load all Parquet chunks into a single Vaex DataFrame
    print("Loading Parquet files into Vaex...")
    vaex_df = vaex.open(parquet_files)  # a list of paths is opened as one DataFrame

    print("Vaex DataFrame loaded.")
    print(vaex_df.head())  # show the first few rows


# 4.  Example usage of the function
if __name__ == "__main__":
    # Working directories for the example run: TSV input and Parquet output.
    input_dir, output_dir = "input_data", "output_parquet"

    # Create both directories up front; exist_ok makes re-runs harmless.
    os.makedirs(input_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

#,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,cleaned_review_body,cleaned_review_headline,cleaned_product_title,combined_review,tokens
0,US,53096400.0,R63J84G1LOX6R,1563890119,763188000.0,The Sandman Vol. 1: Preludes and Nocturnes,Books,4,0,1,N,N,1995-08-13,'first issues series starting point also contain...,ignore review,sandman vol preludes nocturnes,'ignore review first issues series starting poin...,'ignore review first issues series starting poin...
1,US,53096400.0,R1BALOA11Z06MT,1559947608,381721000.0,The 22 Immutable Laws of Marketing,Books,4,0,0,N,N,1995-08-17,'ive always partial immutable laws tape entertai...,awesome,immutable laws marketing,'awesome ive always partial immutable laws tape ...,'awesome ive always partial immutable laws tape ...
2,US,53096300.0,R1LLAY5W5PZUS4,0671701800,860650000.0,Contact,Books,5,1,2,N,N,1995-08-30,'book first contact aliens written prominent mem...,read book good,contact,'read book good book first contact aliens writte...,'read book good book first contact aliens writte...
3,US,53096300.0,R3R9VTJ82FXECQ,0425132153,624270000.0,Good Omens,Books,5,0,0,N,N,1995-09-11,'quite possibly funniest book ever read terry pr...,funniest book ever written antichrist,good omens,'funniest book ever written antichrist quite pos...,'funniest book ever written antichrist quite pos...
4,US,51747700.0,R1P5J3FNBWTFXY,0517122707,161411000.0,A Confederacy of Dunces,Books,5,0,0,N,N,1995-10-17,'story behind book almost better work make mista...,winner didnt last mothers love remained,confederacy dunces,'winner didnt last mothers love remained story b...,'winner didnt last mothers love remained story b...
5,US,53095900.0,R2XQTCOBWNCEAC,0345350499,627223000.0,The Mists of Avalon,Books,5,0,0,N,N,1995-11-02,'marion zimmer bradley brings king arthurs court...,one best books ive ever read,mists avalon,'one best books ive ever read marion zimmer brad...,'one best books ive ever read marion zimmer brad...
6,US,53096500.0,R87VUTMFOE6N,B000002OTL,877615000.0,Pulp Fiction: Music From The Motion Picture,Music,5,0,1,N,N,1995-11-11,'pulp fiction movie one favorite movies soundtra...,pulp fiction soundtrack excellent,pulp fiction music motion picture,'pulp fiction soundtrack excellent pulp fiction ...,'pulp fiction soundtrack excellent pulp fiction ...
7,US,53096500.0,RVS529AN21RR3,6302136105,701401000.0,The Doors [VHS],Video,5,1,2,N,N,1995-11-11,'doors one favorite movies val kilmer great acto...,doors best movie ever,doors vhs,'doors best movie ever doors one favorite movies...,'doors best movie ever doors one favorite movies...
8,US,53095700.0,RG6LPC4W8LN29,0399134204,31287900.0,The Joy Luck Club,Books,5,0,1,N,N,1995-11-15,'book way another gives deeper view chinese set ...,gives deeper understanding chinese culture,joy luck club,'gives deeper understanding chinese culture book...,'gives deeper understanding chinese culture book...
9,US,53071300.0,RDFIE1IF0IYFH,0887306667,381721000.0,'The 22 Immutable Laws of Marketing: Violate Th...,Books,5,0,0,N,N,1996-05-12,'book fast read covers strategic highlevel marke...,one best marketing books ever written timeless,immutable laws marketing violate risk,'one best marketing books ever written timeless ...,'one best marketing books ever written timeless ...
