In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Define paths
RAW_DIR = Path("../input/raw/trip_records")
YELLOW_DIR = RAW_DIR / "yellow_taxi"
GREEN_DIR = RAW_DIR / "green_taxi"
# NOTE(review): the folder is spelled "HVHFV" (not "HVFHV") — presumably this matches the name on disk; confirm.
HVFHV_DIR = RAW_DIR / "HVHFV"


def load_parquet_dir(directory):
    """Read every ``*.parquet`` file in ``directory`` into a single dataframe.

    Files are read in sorted path order so the row order of the result is the
    same on every run (``Path.glob`` itself makes no ordering guarantee).

    Parameters
    ----------
    directory : pathlib.Path
        Folder that directly contains the parquet files.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If the folder contains no ``*.parquet`` file.
    """
    parquet_files = sorted(directory.glob("*.parquet"))
    if not parquet_files:
        # Fail with the offending path instead of pandas' generic "No objects to concatenate"
        raise FileNotFoundError(f"No parquet files found in {directory}")
    return pd.concat([pd.read_parquet(path) for path in parquet_files], ignore_index=True)


# Read all parquet files from each folder and concatenate into single dataframes
df_yellow = load_parquet_dir(YELLOW_DIR)
df_green = load_parquet_dir(GREEN_DIR)
df_hvfhv = load_parquet_dir(HVFHV_DIR)

print(f"Yellow taxi records: {len(df_yellow):,}")
print(f"Green taxi records: {len(df_green):,}")
print(f"HVFHV records: {len(df_hvfhv):,}")

Yellow taxi records: 13,389,587
Green taxi records: 221,006
HVFHV records: 78,608,184


In [None]:
# Lowercase every column label so the three trip tables share one naming convention
for frame in (df_yellow, df_green, df_hvfhv):
    frame.columns = frame.columns.str.lower()

print("Column names standardized to lowercase")
for label, frame in (("Yellow", df_yellow), ("Green", df_green), ("HVFHV", df_hvfhv)):
    print(f"{label} columns: {list(frame.columns)}")

Column names standardized to lowercase
Yellow columns: ['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'ratecodeid', 'store_and_fwd_flag', 'pulocationid', 'dolocationid', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee']
Green columns: ['vendorid', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'ratecodeid', 'pulocationid', 'dolocationid', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge']
HVFHV columns: ['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num', 'request_datetime', 'on_scene_datetime', 'pickup_datetime', 'dropoff_datetime', 'pulocationid', 'dolocationid', 'trip_miles', 'trip_time', 'base_passenger_fare', 'tolls', 'bcf', 's

# Data Cleaning

In [None]:
# Check data types and null values for all dataframes
def check_data_quality(df, name):
    """Print the dtype, null count and null percentage of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Label shown in the banner above the table.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    banner = '=' * 60
    print(banner)
    print(f"{name} - Shape: {df.shape}")
    print(banner)

    # Scan the frame for nulls once and reuse the result for both columns of the report
    null_count = df.isnull().sum()
    info_df = pd.DataFrame({
        'dtype': df.dtypes,
        'null_count': null_count,
        'null_pct': (null_count / len(df) * 100).round(2)
    })
    print(info_df)
    print()

# Same report for each of the three trip tables
for frame, label in ((df_yellow, "Yellow Taxi"), (df_green, "Green Taxi"), (df_hvfhv, "HVFHV")):
    check_data_quality(frame, label)

Yellow Taxi - Shape: (13389587, 19)
                                dtype  null_count  null_pct
vendorid                        int32           0      0.00
tpep_pickup_datetime   datetime64[us]           0      0.00
tpep_dropoff_datetime  datetime64[us]           0      0.00
passenger_count               float64     1221622      9.12
trip_distance                 float64           0      0.00
ratecodeid                    float64     1221622      9.12
store_and_fwd_flag             object     1221622      9.12
pulocationid                    int32           0      0.00
dolocationid                    int32           0      0.00
payment_type                    int64           0      0.00
fare_amount                   float64           0      0.00
extra                         float64           0      0.00
mta_tax                       float64           0      0.00
tip_amount                    float64           0      0.00
tolls_amount                  float64           0      0.00
impr

**Finding:** None of the relevant columns for the analysis have null values

## Yellow Taxi Data Cleaning

In [None]:
# Yellow Taxi - column selection and dtype conversion based on the data dictionary
# Columns kept for the analysis:
# - tpep_pickup_datetime, tpep_dropoff_datetime: datetime
# - pulocationid, dolocationid: nullable integer
# - trip_distance, fare_amount, total_amount: float (not converted here)

datetime_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
int_cols = ['pulocationid', 'dolocationid']
cols_to_keep = datetime_cols + int_cols + ['trip_distance', 'fare_amount', 'total_amount']

# Drop every column the analysis does not use
df_yellow = df_yellow[cols_to_keep]

# Timestamps -> datetime, zone ids -> nullable Int64; remaining columns are left untouched
for col in cols_to_keep:
    if col in datetime_cols:
        df_yellow[col] = pd.to_datetime(df_yellow[col])
    elif col in int_cols:
        df_yellow[col] = df_yellow[col].astype('Int64')

print("Yellow Taxi data types after conversion:")
print(df_yellow.dtypes)

Yellow Taxi data types after conversion:
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
pulocationid                      Int64
dolocationid                      Int64
trip_distance                   float64
fare_amount                     float64
total_amount                    float64
dtype: object


In [None]:
# Keep only yellow taxi trips picked up in the sampled months (January, April, July, October)
# NOTE(review): only the month is tested, not the year — confirm pickups from other years cannot slip through.
valid_months = [1, 4, 7, 10]
rows_before = len(df_yellow)

df_yellow = df_yellow.loc[
    lambda trips: trips['tpep_pickup_datetime'].dt.month.isin(valid_months)
]

rows_after = len(df_yellow)
rows_dropped = rows_before - rows_after

print(f"Yellow Taxi - Rows before: {rows_before:,}")
print(f"Yellow Taxi - Rows after: {rows_after:,}")
print(f"Yellow Taxi - Rows dropped: {rows_dropped:,} ({rows_dropped/rows_before*100:.2f}%)")

Yellow Taxi - Rows before: 13,389,587
Yellow Taxi - Rows after: 13,389,480
Yellow Taxi - Rows dropped: 107 (0.00%)


In [None]:
def audit_numeric(df, col):
    """Print a quick anomaly audit for one numeric column.

    The report contains ``describe()`` statistics, the count and share of null,
    zero and negative values, and the number of values outside the 1st-99th
    percentile band.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : str
        Name of the numeric column to audit.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    print(f"\n{'='*40}")
    print(f"Auditing: {col}")
    print(f"{'='*40}")

    series = df[col]

    # Basic stats
    print("\nBasic Stats:")
    print(series.describe())

    # Check for suspicious values; each mask is built once and reused for count and share
    print("\nPotential Issues:")
    checks = (
        ("Nulls", series.isna()),
        ("Zeros", series == 0),
        ("Negatives", series < 0),
    )
    for label, mask in checks:
        print(f"  {label}: {mask.sum():,} ({mask.mean()*100:.2f}%)")

    # Extreme values: anything outside the 1st-99th percentile band (this is not an IQR rule)
    q1, q99 = series.quantile([0.01, 0.99])
    print(f"\n  1st percentile: {q1}")
    print(f"  99th percentile: {q99}")

    extreme_low = (series < q1).sum()
    extreme_high = (series > q99).sum()
    print(f"  Below 1st pctl: {extreme_low:,}")
    print(f"  Above 99th pctl: {extreme_high:,}")

# Audit the three numeric columns kept for yellow taxis
for col in ('fare_amount', 'trip_distance', 'total_amount'):
    audit_numeric(df_yellow, col)


Auditing: fare_amount

Basic Stats:
count    1.338948e+07
mean     1.913892e+01
std      1.935871e+01
min     -2.261200e+03
25%      9.300000e+00
50%      1.350000e+01
75%      2.190000e+01
max      5.000000e+03
Name: fare_amount, dtype: float64

Potential Issues:
  Nulls: 0 (0.00%)
  Zeros: 4,781 (0.04%)
  Negatives: 229,042 (1.71%)

  1st percentile: -7.9
  99th percentile: 80.0
  Below 1st pctl: 131,107
  Above 99th pctl: 132,193

Auditing: trip_distance

Basic Stats:
count    1.338948e+07
mean     4.837142e+00
std      4.096487e+02
min      0.000000e+00
25%      1.020000e+00
50%      1.760000e+00
75%      3.360000e+00
max      3.663430e+05
Name: trip_distance, dtype: float64

Potential Issues:
  Nulls: 0 (0.00%)
  Zeros: 235,868 (1.76%)
  Negatives: 0 (0.00%)

  1st percentile: 0.0
  99th percentile: 20.13
  Below 1st pctl: 0
  Above 99th pctl: 133,652

Auditing: total_amount

Basic Stats:
count    1.338948e+07
mean     2.774446e+01
std      2.408909e+01
min     -2.265450e+03
25% 

**Findings:**
* Negative and zero values in fare_amount and total_amount might be refunds or cancelled trips. Since there is no context to confirm this, I will drop these rows.
* The maximum trip distance is 366,343 miles, which is roughly 15 times the circumference of the Earth (about 24,900 miles). New York City spans about 35 miles, so I will drop any rows that lie outside the 0.1 to 100 mile range. This also drops the zero-distance trips, which would make no sense to include in the analysis.

In [None]:
rows_before = len(df_yellow)

# Keep trips with a strictly positive fare and total, and a distance inside the 0.1-100 range (bounds included)
df_yellow = df_yellow.loc[
    lambda trips: trips['fare_amount'].gt(0)
    & trips['total_amount'].gt(0)
    & trips['trip_distance'].between(0.1, 100)
]

rows_after = len(df_yellow)
rows_dropped = rows_before - rows_after

print(f"Yellow Taxi - Rows before filtering: {rows_before:,}")
print(f"Yellow Taxi - Rows after filtering: {rows_after:,}")
print(f"Yellow Taxi - Rows dropped: {rows_dropped:,} ({rows_dropped/rows_before*100:.2f}%)")

Yellow Taxi - Rows before filtering: 13,389,480
Yellow Taxi - Rows after filtering: 12,896,459
Yellow Taxi - Rows dropped: 493,021 (3.68%)


## Green Taxi Data Cleaning

In [None]:
# Green Taxi - column selection and dtype conversion based on the data dictionary
# Columns kept for the analysis:
# - lpep_pickup_datetime, lpep_dropoff_datetime: datetime
# - pulocationid, dolocationid: nullable integer
# - trip_distance, fare_amount, total_amount: float (not converted here)

datetime_cols = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']
int_cols = ['pulocationid', 'dolocationid']
cols_to_keep = datetime_cols + int_cols + ['trip_distance', 'fare_amount', 'total_amount']

# Drop every column the analysis does not use
df_green = df_green[cols_to_keep]

# Timestamps -> datetime, zone ids -> nullable Int64; remaining columns are left untouched
for col in cols_to_keep:
    if col in datetime_cols:
        df_green[col] = pd.to_datetime(df_green[col])
    elif col in int_cols:
        df_green[col] = df_green[col].astype('Int64')

print("Green Taxi data types after conversion:")
print(df_green.dtypes)

Green Taxi data types after conversion:
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
pulocationid                      Int64
dolocationid                      Int64
trip_distance                   float64
fare_amount                     float64
total_amount                    float64
dtype: object


In [None]:
# Keep only green taxi trips picked up in January, April, July or October
# NOTE(review): only the month is tested, not the year — confirm pickups from other years cannot slip through.
valid_months = [1, 4, 7, 10]
rows_before = len(df_green)

df_green = df_green.loc[
    lambda trips: trips['lpep_pickup_datetime'].dt.month.isin(valid_months)
]

rows_after = len(df_green)
rows_dropped = rows_before - rows_after

print(f"Green Taxi - Rows before: {rows_before:,}")
print(f"Green Taxi - Rows after: {rows_after:,}")
print(f"Green Taxi - Rows dropped: {rows_dropped:,} ({rows_dropped/rows_before*100:.2f}%)")

Green Taxi - Rows before: 221,006
Green Taxi - Rows after: 220,967
Green Taxi - Rows dropped: 39 (0.02%)


In [None]:
# Audit the three numeric columns kept for green taxis
for col in ('fare_amount', 'trip_distance', 'total_amount'):
    audit_numeric(df_green, col)


Auditing: fare_amount

Basic Stats:
count    220967.000000
mean         18.017807
std          16.993607
min        -450.000000
25%          10.000000
50%          13.500000
75%          20.500000
max        1422.600000
Name: fare_amount, dtype: float64

Potential Issues:
  Nulls: 0 (0.00%)
  Zeros: 187 (0.08%)
  Negatives: 719 (0.33%)

  1st percentile: 3.0
  99th percentile: 77.9
  Below 1st pctl: 988
  Above 99th pctl: 2,203

Auditing: trip_distance

Basic Stats:
count    220967.000000
mean         16.692203
std         992.180284
min           0.000000
25%           1.130000
50%           1.860000
75%           3.230000
max      201421.680000
Name: trip_distance, dtype: float64

Potential Issues:
  Nulls: 0 (0.00%)
  Zeros: 11,648 (5.27%)
  Negatives: 0 (0.00%)

  1st percentile: 0.0
  99th percentile: 16.0
  Below 1st pctl: 0
  Above 99th pctl: 2,207

Auditing: total_amount

Basic Stats:
count    220967.000000
mean         23.869384
std          19.033013
min        -451.000000
2

**Findings:** Same logic from the yellow taxi dataframe applies to the green taxi dataframe

In [None]:
rows_before = len(df_green)

# Keep trips with a strictly positive fare and total, and a distance inside the 0.1-100 range (bounds included)
df_green = df_green.loc[
    lambda trips: trips['fare_amount'].gt(0)
    & trips['total_amount'].gt(0)
    & trips['trip_distance'].between(0.1, 100)
]

rows_after = len(df_green)
rows_dropped = rows_before - rows_after

print(f"Green Taxi - Rows before filtering: {rows_before:,}")
print(f"Green Taxi - Rows after filtering: {rows_after:,}")
print(f"Green Taxi - Rows dropped: {rows_dropped:,} ({rows_dropped/rows_before*100:.2f}%)")

Green Taxi - Rows before filtering: 220,967
Green Taxi - Rows after filtering: 206,982
Green Taxi - Rows dropped: 13,985 (6.33%)


## HVFHV Data Cleaning

In [None]:
# HVFHV - column selection and dtype conversion based on the data dictionary
# Columns kept for the analysis:
# - hvfhs_license_num: category (HV0002=Juno, HV0003=Uber, HV0004=Via, HV0005=Lyft)
# - pickup_datetime, dropoff_datetime: datetime
# - pulocationid, dolocationid, trip_time: nullable integer
# - trip_miles, base_passenger_fare, tips: float (not converted here)
# - shared_request_flag, wav_request_flag, wav_match_flag: category (Y/N)

datetime_cols = ['pickup_datetime', 'dropoff_datetime']
int_cols = ['pulocationid', 'dolocationid', 'trip_time']
category_cols = ['hvfhs_license_num']
flag_cols = ['shared_request_flag', 'wav_request_flag', 'wav_match_flag']
cols_to_keep = [
    'hvfhs_license_num',
    'pickup_datetime', 'dropoff_datetime',
    'pulocationid', 'dolocationid',
    'trip_miles', 'trip_time',
    'base_passenger_fare', 'tips',
    'shared_request_flag', 'wav_request_flag', 'wav_match_flag',
]

# Drop every column the analysis does not use
df_hvfhv = df_hvfhv[cols_to_keep]

# Timestamps -> datetime
for col in datetime_cols:
    df_hvfhv[col] = pd.to_datetime(df_hvfhv[col])

# Integer columns -> nullable Int64; licence number and Y/N flags -> category
for target_dtype, columns in (('Int64', int_cols), ('category', category_cols + flag_cols)):
    for col in columns:
        df_hvfhv[col] = df_hvfhv[col].astype(target_dtype)

print("HVFHV data types after conversion:")
print(df_hvfhv.dtypes)

HVFHV data types after conversion:
hvfhs_license_num            category
pickup_datetime        datetime64[us]
dropoff_datetime       datetime64[us]
pulocationid                    Int64
dolocationid                    Int64
trip_miles                    float64
trip_time                       Int64
base_passenger_fare           float64
tips                          float64
shared_request_flag          category
wav_request_flag             category
wav_match_flag               category
dtype: object


In [None]:
# Keep only HVFHV trips picked up in January, April, July or October
# NOTE(review): only the month is tested, not the year — confirm pickups from other years cannot slip through.
valid_months = [1, 4, 7, 10]
rows_before = len(df_hvfhv)

df_hvfhv = df_hvfhv.loc[
    lambda trips: trips['pickup_datetime'].dt.month.isin(valid_months)
]

rows_after = len(df_hvfhv)
rows_dropped = rows_before - rows_after

print(f"HVFHV - Rows before: {rows_before:,}")
print(f"HVFHV - Rows after: {rows_after:,}")
print(f"HVFHV - Rows dropped: {rows_dropped:,} ({rows_dropped/rows_before*100:.2f}%)")

HVFHV - Rows before: 78,608,184
HVFHV - Rows after: 78,608,184
HVFHV - Rows dropped: 0 (0.00%)


In [None]:
# Audit the four numeric columns kept for HVFHV trips
for col in ('trip_miles', 'trip_time', 'base_passenger_fare', 'tips'):
    audit_numeric(df_hvfhv, col)


Auditing: trip_miles

Basic Stats:
count    7.860818e+07
mean     5.056058e+00
std      5.870998e+00
min      0.000000e+00
25%      1.570000e+00
50%      3.000000e+00
75%      6.340000e+00
max      4.555200e+02
Name: trip_miles, dtype: float64

Potential Issues:
  Nulls: 0 (0.00%)
  Zeros: 11,186 (0.01%)
  Negatives: 0 (0.00%)

  1st percentile: 0.481
  99th percentile: 27.14
  Below 1st pctl: 785,498
  Above 99th pctl: 786,026

Auditing: trip_time

Basic Stats:
count     78608184.0
mean     1183.277328
std       835.949352
min              0.0
25%            599.0
50%            966.0
75%           1525.0
max          52060.0
Name: trip_time, dtype: Float64

Potential Issues:
  Nulls: 0 (0.00%)
  Zeros: 8 (0.00%)
  Negatives: 0 (0.00%)

  1st percentile: 196.0
  99th percentile: 4115.0
  Below 1st pctl: 775,474
  Above 99th pctl: 785,470

Auditing: base_passenger_fare

Basic Stats:
count    7.860818e+07
mean     2.564873e+01
std      2.241229e+01
min     -4.309000e+01
25%      1.2320

**Findings:** 
* Limiting trip distance to 0.1 to 100 miles
* Dropping negative and zero values for base_passenger_fare
* The tips and trip_time columns look fine

In [None]:
rows_before = len(df_hvfhv)

# Keep trips with a strictly positive base fare and a distance inside the 0.1-100 range (bounds included)
df_hvfhv = df_hvfhv.loc[
    lambda trips: trips['base_passenger_fare'].gt(0)
    & trips['trip_miles'].between(0.1, 100)
]

rows_after = len(df_hvfhv)
rows_dropped = rows_before - rows_after

print(f"HVFHV - Rows before filtering: {rows_before:,}")
print(f"HVFHV - Rows after filtering: {rows_after:,}")
print(f"HVFHV - Rows dropped: {rows_dropped:,} ({rows_dropped/rows_before*100:.2f}%)")

HVFHV - Rows before filtering: 78,608,184
HVFHV - Rows after filtering: 78,549,718
HVFHV - Rows dropped: 58,466 (0.07%)


# Data Aggregation

In [None]:
def add_time_features(df, pickup_col, dropoff_col):
    """Add pickup-time and trip-duration columns used by the aggregation step.

    The new columns are written onto ``df`` itself (the frame is modified in
    place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip-level frame containing the two datetime columns.
    pickup_col : str
        Name of the pickup datetime column.
    dropoff_col : str
        Name of the dropoff datetime column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``pickup_hour``, ``day_of_week``, ``month``,
        ``is_weekend`` (1/0) and ``trip_minutes`` added.
    """
    pickup = df[pickup_col]

    df['pickup_hour'] = pickup.dt.hour  # hour of day only; minutes and seconds are discarded
    df['day_of_week'] = pickup.dt.dayofweek
    df['month'] = pickup.dt.month
    # Vectorised flag instead of a per-row Python lambda; pandas numbers Monday=0, so 5 and 6 are the weekend
    df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')
    df['trip_minutes'] = (df[dropoff_col] - pickup).dt.total_seconds() / 60

    return df

# Derive the time-based columns on each trip table
df_yellow = add_time_features(df_yellow, pickup_col='tpep_pickup_datetime', dropoff_col='tpep_dropoff_datetime')
df_green = add_time_features(df_green, pickup_col='lpep_pickup_datetime', dropoff_col='lpep_dropoff_datetime')
df_hvfhv = add_time_features(df_hvfhv, pickup_col='pickup_datetime', dropoff_col='dropoff_datetime')

# Preview only the newly added columns
time_feature_cols = ['pickup_hour', 'day_of_week', 'month', 'is_weekend', 'trip_minutes']
print("Sample from df_yellow:")
df_yellow[time_feature_cols].head()

Time features added to all dataframes
Yellow new columns: ['pickup_hour', 'day_of_week', 'month', 'is_weekend', 'trip_minutes']
Sample from df_yellow:


Unnamed: 0,pickup_hour,day_of_week,month,is_weekend,trip_minutes
0,0,0,1,0,19.8
1,0,0,1,0,6.6
2,0,0,1,0,17.916667
3,0,0,1,0,8.3
4,0,0,1,0,6.1


In [None]:
def aggregate_to_zone_time(df, vehicle_type, fare_col='fare_amount', distance_col='trip_distance',
                           group_cols=('pulocationid', 'pickup_hour', 'is_weekend')):
    """Aggregate trip data to zone x hour x is_weekend level (or any other grouping).

    Parameters
    ----------
    df : pandas.DataFrame
        Trip-level frame that contains ``pickup_hour``, ``trip_minutes``, the
        grouping columns and the fare / distance columns named below.
    vehicle_type : str
        Label written to the ``vehicle_type`` column of every output row.
    fare_col : str, default 'fare_amount'
        Column averaged, medianed and summed as the fare.
    distance_col : str, default 'trip_distance'
        Column averaged as the trip distance.
    group_cols : sequence of str, default ('pulocationid', 'pickup_hour', 'is_weekend')
        Columns to group by. The default reproduces the original fixed grouping;
        pass e.g. ``['pulocationid']`` for a zone-only summary.

    Returns
    -------
    pandas.DataFrame
        One row per group with ``trip_count``, ``avg_fare``, ``median_fare``,
        ``avg_trip_distance``, ``avg_trip_minutes``, ``total_fare`` and
        ``vehicle_type``.
    """
    agg = df.groupby(list(group_cols)).agg(
        # counts non-null pickup_hour values per group
        trip_count=('pickup_hour', 'count'),
        avg_fare=(fare_col, 'mean'),
        median_fare=(fare_col, 'median'),
        avg_trip_distance=(distance_col, 'mean'),
        avg_trip_minutes=('trip_minutes', 'mean'),
        total_fare=(fare_col, 'sum')
    ).reset_index()

    agg['vehicle_type'] = vehicle_type

    return agg

# Zone x hour x is_weekend summaries, one per vehicle type.
# Yellow and green use the function's default fare/distance columns; HVFHV names its own.
agg_yellow = aggregate_to_zone_time(df_yellow, 'yellow')
agg_green = aggregate_to_zone_time(df_green, 'green')
agg_hvfhv = aggregate_to_zone_time(
    df_hvfhv,
    'hvfhv',
    fare_col='base_passenger_fare',
    distance_col='trip_miles',
)

print(f"Yellow aggregated: {agg_yellow.shape}")
print(f"Green aggregated: {agg_green.shape}")
print(f"HVFHV aggregated: {agg_hvfhv.shape}")

agg_yellow.head()

Yellow aggregated: (11278, 10)
Green aggregated: (5017, 10)
HVFHV aggregated: (12444, 10)


Unnamed: 0,pulocationid,pickup_hour,is_weekend,trip_count,avg_fare,median_fare,avg_trip_distance,avg_trip_minutes,total_fare,vehicle_type
0,1,0,0,2,46.6,46.6,9.4,23.816667,93.2,yellow
1,1,2,0,1,100.0,100.0,1.06,4.983333,100.0,yellow
2,1,5,0,8,73.8125,57.95,14.02625,15.702083,590.5,yellow
3,1,6,0,3,30.7,11.4,7.36,14.088889,92.1,yellow
4,1,6,1,3,132.166667,119.0,8.53,10.077778,396.5,yellow


In [None]:
# Stack the three per-vehicle summaries into one table with a fresh index
per_vehicle_aggregates = [agg_yellow, agg_green, agg_hvfhv]
df_aggregated = pd.concat(per_vehicle_aggregates, ignore_index=True)

print(f"Combined aggregated dataframe shape: {df_aggregated.shape}")
print("\nVehicle type distribution:")
print(df_aggregated['vehicle_type'].value_counts())

Combined aggregated dataframe shape: (28739, 10)

Vehicle type distribution:
vehicle_type
hvfhv     12444
yellow    11278
green      5017
Name: count, dtype: int64


In [None]:
# Create zone-level aggregation (group by pulocationid only)
def aggregate_to_zone(df, vehicle_type, fare_col='fare_amount', distance_col='trip_distance'):
    """Summarise trips per pickup zone.

    Groups ``df`` by ``pulocationid`` and returns one row per zone with the trip
    count, mean and median fare, mean distance, mean trip minutes and total
    fare, plus a ``vehicle_type`` column set to the given label.
    """
    # output column -> (source column, aggregation)
    metrics = {
        'trip_count': ('pickup_hour', 'count'),
        'avg_fare': (fare_col, 'mean'),
        'median_fare': (fare_col, 'median'),
        'avg_trip_distance': (distance_col, 'mean'),
        'avg_trip_minutes': ('trip_minutes', 'mean'),
        'total_fare': (fare_col, 'sum'),
    }

    per_zone = df.groupby(['pulocationid']).agg(**metrics).reset_index()
    per_zone['vehicle_type'] = vehicle_type
    return per_zone

# Zone-level summaries, one per vehicle type.
# Yellow and green use the function's default fare/distance columns; HVFHV names its own.
agg_zone_yellow = aggregate_to_zone(df_yellow, 'yellow')
agg_zone_green = aggregate_to_zone(df_green, 'green')
agg_zone_hvfhv = aggregate_to_zone(
    df_hvfhv,
    'hvfhv',
    fare_col='base_passenger_fare',
    distance_col='trip_miles',
)

# Stack the three summaries into df_aggregated_zone with a fresh index
per_vehicle_zone_aggregates = [agg_zone_yellow, agg_zone_green, agg_zone_hvfhv]
df_aggregated_zone = pd.concat(per_vehicle_zone_aggregates, ignore_index=True)

print(f"Zone-level aggregated dataframe shape: {df_aggregated_zone.shape}")
print("\nVehicle type distribution:")
print(df_aggregated_zone['vehicle_type'].value_counts())

Zone-level aggregated dataframe shape: (767, 8)

Vehicle type distribution:
vehicle_type
hvfhv     262
yellow    261
green     244
Name: count, dtype: int64


In [None]:
# Read the taxi zone lookup table and lowercase its column names
ZONE_LOOKUP_PATH = Path("../input/raw/other/taxi_zone_lookup.csv")
df_zones = pd.read_csv(ZONE_LOOKUP_PATH).rename(columns=str.lower)

print(f"Zone lookup shape: {df_zones.shape}")
print(f"Columns: {list(df_zones.columns)}")
df_zones.head()

Zone lookup shape: (265, 4)
Columns: ['locationid', 'borough', 'zone', 'service_zone']


Unnamed: 0,locationid,borough,zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [None]:
# Join zone information to df_aggregated (zone x hour x is_weekend level)
zone_attribute_cols = ['borough', 'zone', 'service_zone']
df_aggregated = (
    df_aggregated
    # Remove attributes left by an earlier run of this cell so a re-run does not
    # produce suffixed duplicates (borough_x / borough_y, ...)
    .drop(columns=zone_attribute_cols, errors='ignore')
    .merge(
        df_zones[['locationid'] + zone_attribute_cols],
        left_on='pulocationid',
        right_on='locationid',
        how='left',
        # Raise if the lookup has duplicate ids instead of silently duplicating aggregate rows
        validate='many_to_one',
    )
    .drop(columns=['locationid'])
)

print(f"df_aggregated shape after join: {df_aggregated.shape}")
print(f"Columns: {list(df_aggregated.columns)}")
df_aggregated.head()

df_aggregated shape after join: (28739, 13)
Columns: ['pulocationid', 'pickup_hour', 'is_weekend', 'trip_count', 'avg_fare', 'median_fare', 'avg_trip_distance', 'avg_trip_minutes', 'total_fare', 'vehicle_type', 'borough', 'zone', 'service_zone']


Unnamed: 0,pulocationid,pickup_hour,is_weekend,trip_count,avg_fare,median_fare,avg_trip_distance,avg_trip_minutes,total_fare,vehicle_type,borough,zone,service_zone
0,1,0,0,2,46.6,46.6,9.4,23.816667,93.2,yellow,EWR,Newark Airport,EWR
1,1,2,0,1,100.0,100.0,1.06,4.983333,100.0,yellow,EWR,Newark Airport,EWR
2,1,5,0,8,73.8125,57.95,14.02625,15.702083,590.5,yellow,EWR,Newark Airport,EWR
3,1,6,0,3,30.7,11.4,7.36,14.088889,92.1,yellow,EWR,Newark Airport,EWR
4,1,6,1,3,132.166667,119.0,8.53,10.077778,396.5,yellow,EWR,Newark Airport,EWR


In [None]:
# Join zone information to df_aggregated_zone (zone level only)
zone_attribute_cols = ['borough', 'zone', 'service_zone']
df_aggregated_zone = (
    df_aggregated_zone
    # Remove attributes left by an earlier run of this cell so a re-run does not
    # produce suffixed duplicates (borough_x / borough_y, ...)
    .drop(columns=zone_attribute_cols, errors='ignore')
    .merge(
        df_zones[['locationid'] + zone_attribute_cols],
        left_on='pulocationid',
        right_on='locationid',
        how='left',
        # Raise if the lookup has duplicate ids instead of silently duplicating aggregate rows
        validate='many_to_one',
    )
    .drop(columns=['locationid'])
)

print(f"df_aggregated_zone shape after join: {df_aggregated_zone.shape}")
print(f"Columns: {list(df_aggregated_zone.columns)}")
df_aggregated_zone.head()

df_aggregated_zone shape after join: (767, 11)
Columns: ['pulocationid', 'trip_count', 'avg_fare', 'median_fare', 'avg_trip_distance', 'avg_trip_minutes', 'total_fare', 'vehicle_type', 'borough', 'zone', 'service_zone']


Unnamed: 0,pulocationid,trip_count,avg_fare,median_fare,avg_trip_distance,avg_trip_minutes,total_fare,vehicle_type,borough,zone,service_zone
0,1,153,83.239804,90.0,9.177516,15.02756,12735.69,yellow,EWR,Newark Airport,EWR
1,2,14,51.946429,53.75,14.197143,30.564286,727.25,yellow,Queens,Jamaica Bay,Boro Zone
2,3,500,35.98078,36.5,9.24226,43.164167,17990.39,yellow,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,20292,17.499698,15.6,2.928168,16.181122,355103.87,yellow,Manhattan,Alphabet City,Yellow Zone
4,6,165,11.293273,0.01,7.442242,27.574747,1863.39,yellow,Staten Island,Arrochar/Fort Wadsworth,Boro Zone


In [None]:
# Save aggregated dataframes to parquet
PROCESSED_DIR = Path("../input/processed")
# Create the output folder if it is missing so the writes below do not fail on a fresh checkout
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Save df_aggregated (zone x hour x is_weekend level)
time_zone_path = PROCESSED_DIR / "aggregated_data_time_zone.parquet"
df_aggregated.to_parquet(time_zone_path, index=False)
print(f"Saved df_aggregated to {time_zone_path}")

# Save df_aggregated_zone (zone level only)
zone_path = PROCESSED_DIR / "aggregated_data_zone.parquet"
df_aggregated_zone.to_parquet(zone_path, index=False)
print(f"Saved df_aggregated_zone to {zone_path}")

Saved df_aggregated to ..\input\processed\aggregated_data_time_zone.parquet
Saved df_aggregated_zone to ..\input\processed\aggregated_data_zone.parquet
