In [1]:
from helper_functions import get_weather_data, get_aqi_data
import pandas as pd
import holidays

In [3]:
# (lat, lon) pair passed to the weather and AQI helpers for each location key.
locations = {
    'NYC': (40.7562, -73.9826),
    'CHI': (41.8758, -87.6328),
}

# Merged weather + AQI frame per location key, filled in by the loop below.
data_dict = {}

for state, coords in locations.items():
    lat, lon = coords

    # NOTE(review): both helpers are assumed to return DataFrames that carry
    # 'date' and 'state' columns (they are used as join keys below) - confirm.
    weather_df = get_weather_data(state, lat, lon)
    aqi_df = get_aqi_data(state, lat, lon)

    # Inner join (the pandas default): only (date, state) pairs present in both
    # frames are kept; clashing column names get the _weather / _aqi suffixes.
    merged = pd.merge(
        left=weather_df,
        right=aqi_df,
        how='inner',
        on=['date', 'state'],
        suffixes=('_weather', '_aqi'),
    )

    # Persist the raw merged frame for this location, then keep it in memory.
    merged.to_csv(f"data/{state}_data_raw.csv", index=False)
    data_dict[state] = merged

In [5]:
# Pull the per-location frames out of data_dict under their own names.
nyc_df, chi_df = data_dict["NYC"], data_dict["CHI"]

In [7]:
# Load the combined daily ridership file.
# NOTE(review): dtype={2: str} forces one column to be read as text; the key 2
# looks like a positional column reference - confirm which column it targets.
ridership_df = pd.read_csv(
    'data/Combined_Daily_Transit_Ridership.csv',
    dtype={2: str},
)

In [9]:
# Parse 'standardized_date' into a datetime 'date' column, drop the now-redundant
# source column, and rename 'city' to 'state' so the keys match the other DataFrames.
ridership_df = (
    ridership_df
    .assign(date=pd.to_datetime(ridership_df['standardized_date']))
    .drop(columns=['standardized_date'])
    .rename(columns={'city': 'state'})
)

In [23]:
# Merge all 3 sources: stack the NYC and CHI weather/AQI frames, then left-join
# them onto ridership so every ridership row is kept even without a match.
nyc_chi_combined = pd.concat([nyc_df, chi_df], axis=0)
combined_df = ridership_df.merge(nyc_chi_combined, on=['date', 'state'], how='left')

# Add weekend indicator (dt.weekday: Monday=0 ... Sunday=6, so >= 5 is Sat/Sun)
combined_df['is_weekend'] = (combined_df['date'].dt.weekday >= 5).astype(int)

# Add holiday indicator with a window
holiday_window = 3  # days on each side of a holiday that count as "adjacent"

# Build the calendar for one extra year on each side of the data. Otherwise dates
# close to a year boundary cannot see holidays that fall in the neighbouring year
# (e.g. 29-31 Dec of the last year in the data vs. New Year's Day of the next).
data_years = combined_df['date'].dt.year.dropna().astype(int)
if data_years.empty:
    holiday_years = []
else:
    holiday_years = list(range(int(data_years.min()) - 1, int(data_years.max()) + 2))
us_holidays = holidays.US(years=holiday_years)
combined_df['is_holiday'] = combined_df['date'].dt.date.isin(us_holidays).astype(int)

# Shift the whole holiday index by every offset in [-holiday_window, +holiday_window]
# in one vectorised step per offset, instead of appending one window per holiday.
# Offset 0 is included, so the holiday itself is also flagged as "adjacent".
holiday_dates = pd.to_datetime(sorted(us_holidays))
shifted_dates = [
    holiday_dates + pd.Timedelta(days=shift)
    for shift in range(-holiday_window, holiday_window + 1)
]
window_dates = shifted_dates[0].append(shifted_dates[1:]).unique()

combined_df['is_holiday_adjacent'] = combined_df['date'].isin(window_dates).astype(int)

In [27]:
# Spot-check the calendar flags by displaying every row for a single date.
on_spot_check_date = combined_df['date'] == '12/28/2024'
combined_df.loc[on_spot_check_date]

Unnamed: 0,date,unit_id,daily_ridership,state,mode,rain_sum,rain_max,snowfall_sum,snowfall_max,relative_humidity_2m_min,...,us_aqi_ozone_mean,us_aqi_sulphur_dioxide_min,us_aqi_sulphur_dioxide_max,us_aqi_sulphur_dioxide_mean,us_aqi_min_lag,us_aqi_max_lag,us_aqi_mean_lag,is_weekend,is_holiday,is_holiday_adjacent
289314,2024-12-28,B1,7696,NYC,bus,0.393701,0.066929,0.0,0.0,67.0,...,2.041300,4.252999,14.503819,7.172392,2.0,3.0,3.0,1,0,1
289315,2024-12-28,B100,1089,NYC,bus,0.393701,0.066929,0.0,0.0,67.0,...,2.041300,4.252999,14.503819,7.172392,2.0,3.0,3.0,1,0,1
289316,2024-12-28,B101,0,NYC,bus,0.393701,0.066929,0.0,0.0,67.0,...,2.041300,4.252999,14.503819,7.172392,2.0,3.0,3.0,1,0,1
289317,2024-12-28,B103,3568,NYC,bus,0.393701,0.066929,0.0,0.0,67.0,...,2.041300,4.252999,14.503819,7.172392,2.0,3.0,3.0,1,0,1
289318,2024-12-28,B106,0,NYC,bus,0.393701,0.066929,0.0,0.0,67.0,...,2.041300,4.252999,14.503819,7.172392,2.0,3.0,3.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947314,2024-12-28,41670,421,CHI,train,0.043307,0.015748,0.0,0.0,76.0,...,17.562422,1.363141,3.217012,2.240095,2.0,2.0,2.0,1,0,1
947315,2024-12-28,41680,207,CHI,train,0.043307,0.015748,0.0,0.0,76.0,...,17.562422,1.363141,3.217012,2.240095,2.0,2.0,2.0,1,0,1
947316,2024-12-28,41690,834,CHI,train,0.043307,0.015748,0.0,0.0,76.0,...,17.562422,1.363141,3.217012,2.240095,2.0,2.0,2.0,1,0,1
947317,2024-12-28,41700,4485,CHI,train,0.043307,0.015748,0.0,0.0,76.0,...,17.562422,1.363141,3.217012,2.240095,2.0,2.0,2.0,1,0,1


In [105]:
# Write the fully merged, feature-enriched frame to disk without the index column.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
combined_df.to_csv("data/final.csv", index=False)