In [1]:
import pandas as pd
import numpy as np

try:
    # Force the station name/id columns to be read as strings.
    column_dtypes = dict.fromkeys(
        [
            'start_station_name', 'start_station_id',
            'end_station_name', 'end_station_id',
        ],
        str,
    )
    df_full = pd.read_csv('citi_bike_2022_with_weather.csv', dtype=column_dtypes)

    # --- CLEANING ---
    # Parse the trip start timestamps so the .dt accessor can be used below.
    df_full['started_at'] = pd.to_datetime(df_full['started_at'])

    # Keep only rows whose start timestamp falls in calendar year 2022;
    # .copy() detaches the result from df_full.
    is_2022 = df_full['started_at'].dt.year.eq(2022)
    df_2022_only = df_full.loc[is_2022].copy()
    print(f"Filtered data to {len(df_2022_only):,} trips from 2022.")

    # --- SAMPLING ---
    # Only these three columns are carried into the sample file.
    columns_to_keep = ['started_at', 'start_station_name', 'member_casual']
    df_to_sample = df_2022_only.loc[:, columns_to_keep]

    # One seeded uniform draw per row; a row is kept when its draw is at or
    # below the sampling fraction (0.005, i.e. roughly 0.5% of rows).
    np.random.seed(32)
    sample_fraction = 0.005
    mask = np.random.rand(len(df_to_sample)) <= sample_fraction
    df_small_sample = df_to_sample.loc[mask]

    # --- OUTPUT ---
    # Write the sample without the DataFrame index column.
    output_filename = 'citi_bike_2022_small_sample.csv'
    df_small_sample.to_csv(output_filename, index=False)

    print(f"\nSUCCESS! Created a clean random sample with {len(df_small_sample):,} rows.")
    print(f"File saved as '{output_filename}'.")

except FileNotFoundError:
    # Any FileNotFoundError raised in the try body is reported with this message.
    print("\nERROR: The main data file 'citi_bike_2022_with_weather.csv' was not found.")
except Exception as e:
    # Every other failure is reported by message only; nothing is re-raised.
    print(f"\nAn error occurred: {e}")

Filtered data to 29,838,166 trips from 2022.

SUCCESS! Created a clean random sample with 149,372 rows.
File saved as 'citi_bike_2022_small_sample.csv'.
