In [9]:
import pandas as pd
import glob
import os

# --- CONFIGURATION ---
# Season to consolidate. It is embedded in the output file names and also
# selects the input folder below.
YEAR = 2022
# Root data folder; the consolidated CSV files are written here.
OUTPUT_FOLDER = 'data'
# The folder containing your yearly parquet files (e.g., data/2022).
# Derived from YEAR so the data that is read always matches the year used in
# the output file names. Assumes per-season folders live at
# <OUTPUT_FOLDER>/<YEAR>; override this value if your layout differs.
INPUT_FOLDER = f'{OUTPUT_FOLDER}/{YEAR}'

def consolidate_sessions(input_dir, year, session_suffix):
    """
    Merge all parquet files of one session type into a single DataFrame.

    Files are discovered as ``<input_dir>/*_<session_suffix>.parquet`` and
    concatenated in sorted file-name order with a fresh 0..n-1 index.

    Args:
        input_dir: Directory to search. It is treated as a literal path:
            glob metacharacters in it (``[``, ``*``, ``?``) are escaped.
        year: Not used by this function; kept so existing callers that pass
            it positionally keep working.
        session_suffix: Session code matched at the end of the file name,
            e.g. 'Q' or 'R'.

    Returns:
        The concatenated DataFrame, or None when no file matches or none of
        the matching files could be read.
    """
    # Escape only the directory part: a folder name such as "data[2022]" would
    # otherwise be parsed as a character class and silently match nothing.
    search_pattern = os.path.join(
        glob.escape(input_dir), f"*_{session_suffix}.parquet"
    )
    file_list = sorted(glob.glob(search_pattern))

    if not file_list:
        print(f"No files found for session type: {session_suffix} in {input_dir}")
        return None

    print(f"Merging {len(file_list)} files for session {session_suffix}...")

    df_list = []
    failed_files = []
    for file in file_list:
        try:
            df_list.append(pd.read_parquet(file))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge,
            # but remember it so the partial result is reported below.
            print(f"Error reading {file}: {e}")
            failed_files.append(file)

    if failed_files:
        # The "Merging N files" line above was printed before reading, so make
        # it explicit that the result covers fewer files than announced.
        print(
            f"WARNING: skipped {len(failed_files)} of {len(file_list)} files "
            f"for session {session_suffix}; merged data is incomplete."
        )

    if df_list:
        return pd.concat(df_list, ignore_index=True)
    return None

def main():
    """
    Export one consolidated CSV per session type for the configured season.

    For each session type (qualifying 'Q', race 'R') the matching parquet
    files under INPUT_FOLDER are merged via consolidate_sessions() and written
    to OUTPUT_FOLDER as ``f1_<YEAR>_all_<name>.csv`` without the index column.
    A session type for which consolidate_sessions() returns None is skipped
    and no file is written for it.
    """
    # Create the destination up front so to_csv does not fail when the folder
    # is missing. An empty OUTPUT_FOLDER means "current directory".
    if OUTPUT_FOLDER:
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # (session suffix, label used in the log line, output file-name stem)
    sessions = (
        ('Q', 'Qualifying', 'qualifying'),
        ('R', 'Race', 'races'),
    )

    for suffix, label, stem in sessions:
        session_df = consolidate_sessions(INPUT_FOLDER, YEAR, suffix)
        if session_df is None:
            continue

        output_path = os.path.join(OUTPUT_FOLDER, f'f1_{YEAR}_all_{stem}.csv')
        session_df.to_csv(output_path, index=False)
        print(f"SUCCESS: {label} CSV saved to {output_path} ({len(session_df)} rows)")

# Run the export only when this code is executed as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Merging 22 files for session Q...
SUCCESS: Qualifying CSV saved to data/f1_2022_all_qualifying.csv (6985 rows)
Merging 22 files for session R...
SUCCESS: Race CSV saved to data/f1_2022_all_races.csv (23577 rows)


In [7]:
import pandas as pd

import os

# Load the dataset
file_path = 'data/raw_f1_2024.parquet'
if not os.path.isfile(file_path):
    # Fail early with an actionable message: a relative path resolves against
    # the kernel's working directory, which is the usual cause of this error.
    raise FileNotFoundError(
        f"Input file not found: {file_path!r} "
        f"(resolved to {os.path.abspath(file_path)!r}, "
        f"working directory {os.getcwd()!r}). "
        "Create the file first or point file_path at an existing parquet file."
    )
df = pd.read_parquet(file_path)


# --- SAVING THE DATAFRAME ---

# 1. Save as Parquet (Best for ML - keeps data types and is compressed)
# Make sure you have 'pyarrow' installed: pip install pyarrow
output_parquet = 'data/f1_full_data_processed.parquet'
df.to_parquet(output_parquet, engine='pyarrow', index=False)
print(f"Data successfully saved to Parquet: {output_parquet}")

# 2. Save as CSV (Optional - if you need to read it with a text editor)
output_csv = 'data/f1_full_data_processed.csv'
df.to_csv(output_csv, index=False)
print(f"Data successfully saved to CSV: {output_csv}")

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw_f1_2024.parquet'