# In [None]:
import os
import pandas as pd
import glob

def clean_and_save_data(input_dir: str, output_dir: str) -> None:
    """
    Deduplicate every .parquet file in a directory and save the results.

    Each ``*.parquet`` file directly inside ``input_dir`` is read into a
    DataFrame, fully duplicated rows are dropped, and the result is written
    to ``output_dir`` twice: as ``<name>.parquet`` and as a gzip-compressed
    ``<name>.csv.gz`` (without the index column). ``<name>`` is the input
    filename with only its final extension removed, so a file such as
    ``part.v1.parquet`` yields ``part.v1.parquet`` and ``part.v1.csv.gz``.

    The path of each file is printed before it is processed.

    Parameters:
    input_dir (str): Directory containing the .parquet files to read.
    output_dir (str): Directory where the cleaned files will be saved;
        created if it does not exist.
    """
    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # Get all .parquet files in the input directory; glob returns them in
    # arbitrary order, so sort for a reproducible processing order.
    parquet_files = sorted(glob.glob(os.path.join(input_dir, "*.parquet")))

    # Process each .parquet file individually
    for file in parquet_files:
        print(file)
        # Read the .parquet file and drop rows that are exact duplicates
        df = pd.read_parquet(file).drop_duplicates()

        # Strip only the final extension. Splitting on the first '.' would
        # truncate dotted names ("part.v1.parquet" -> "part") and could make
        # distinct inputs overwrite each other's output.
        base_filename = os.path.splitext(os.path.basename(file))[0]

        # Define the output paths for .parquet and .csv.gz files
        parquet_output_path = os.path.join(output_dir, f'{base_filename}.parquet')
        csv_output_path = os.path.join(output_dir, f'{base_filename}.csv.gz')

        # Write the cleaned DataFrame to .parquet
        df.to_parquet(parquet_output_path)

        # Write the cleaned DataFrame to .csv.gz
        df.to_csv(csv_output_path, index=False, compression='gzip')

if __name__ == "__main__":
    # Script entry point: deduplicate the measurement files under the
    # hard-coded input directory and write the results to the
    # "measurements_clean" directory.
    clean_and_save_data(
        input_dir='/home/cdsw/data/OPDI/v002/measurements',
        output_dir='/home/cdsw/data/OPDI/v002/measurements_clean/',
    )


# Output of a previous run (one line printed per processed file):
# /home/cdsw/data/OPDI/v002/measurements/measurements_20240629_20240709.parquet
# /home/cdsw/data/OPDI/v002/measurements/measurements_20240619_20240629.parquet
# /home/cdsw/data/OPDI/v002/measurements/measurements_20240609_20240619.parquet
# /home/cdsw/data/OPDI/v002/measurements/measurements_20240530_20240609.parquet
