In [2]:
%pip install pyarrow requests pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os

import pandas as pd
import pyarrow.parquet as pq

# 1. SETUP: Define paths and targets
file_path = r"C:\Users\Gangadhar\OneDrive\Study\EBS\Hackathon\Business-Analytics-Hackathon\data\data-2025-12.parquet"

# UPDATED: Added 'Hattenheim' for Schloss Campus students
target_stations = [
    "Frankfurt(Main)Hbf",
    "Frankfurt-Höchst",
    "Wiesbaden Hbf",
    "Wiesbaden-Biebrich",
    "Eltville",
    "Oestrich-Winkel", # Burg Campus
    "Hattenheim",      # Schloss Campus (NEW)
    "Geisenheim",
    "Rüdesheim(Rhein)",
    "Mainz Hbf"
]

print("⏳ Re-loading data with Schloss Campus (Hattenheim) included...")

try:
    # 2. LOAD
    table = pq.read_table(
        file_path, 
        columns=['station_name', 'time', 'delay_in_min', 'is_canceled', 'train_type', 'train_name']
    )
    df = table.to_pandas()
    
    # 3. FILTER
    df_filtered = df[df['station_name'].isin(target_stations)].copy()
    
    # 4. CLEAN & FEATURE ENGINEER
    df_filtered['time'] = pd.to_datetime(df_filtered['time'])
    df_filtered['hour'] = df_filtered['time'].dt.hour
    df_filtered['weekday'] = df_filtered['time'].dt.day_name()
    
    # Save the corrected dataset
    output_file = "ebs_commute_data.csv"
    df_filtered.to_csv(output_file, index=False)
    print(f"✅ Corrected dataset saved! ({len(df_filtered):,} rows)")
    print("   Includes: Oestrich-Winkel (Burg) and Hattenheim (Schloss)")

except Exception as e:
    print(f"❌ Error: {e}")

⏳ Re-loading data with Schloss Campus (Hattenheim) included...
✅ Corrected dataset saved! (54,271 rows)
   Includes: Oestrich-Winkel (Burg) and Hattenheim (Schloss)
