In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells visible in this part of the notebook read
# from or write to the mount - confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Required libraries
import os
import pandas as pd
from datetime import datetime
import requests
from tqdm import tqdm

# URLs of the CSVs from GitHub
def get_csv_urls_from_github():
    """Return the download URLs of every CSV in the repo's ``raw_data`` folder.

    Queries the GitHub contents API for ``GMasso19/NYC_Subway/raw_data`` and
    keeps the entries whose name ends in ``.csv``.

    Returns:
        list[str]: One download URL per CSV file. An empty list is returned
        (after printing a message) when the request fails, the API answers
        with a non-200 status, or the payload is not a directory listing.
    """
    api_url = 'https://api.github.com/repos/GMasso19/NYC_Subway/contents/raw_data'
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(api_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Error fetching CSV URLs: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching CSV URLs: {response.status_code}")
        return []

    try:
        contents = response.json()
    except ValueError as exc:
        # Body was not valid JSON (e.g. an HTML error page from a proxy).
        print(f"Error fetching CSV URLs: {exc}")
        return []

    if not isinstance(contents, list):
        # A directory listing is a JSON array; anything else (such as an
        # error object) cannot be iterated as file entries.
        print("Error fetching CSV URLs: unexpected response format")
        return []

    return [
        entry['download_url']
        for entry in contents
        if entry.get('name', '').endswith('.csv') and entry.get('download_url')
    ]

# Load and combine all data (without weather)
def load_all_data(csv_urls):
    """Read every ridership CSV and stack the Manhattan subway rows.

    For each source the ``transit_timestamp`` column is parsed to datetime, a
    ``date`` column (calendar date of the timestamp) is added, rows are
    filtered to ``transit_mode == 'subway'`` and ``borough == 'Manhattan'``,
    and those two now-constant columns are dropped.

    Args:
        csv_urls: Iterable of CSV locations accepted by ``pd.read_csv``.

    Returns:
        pd.DataFrame: All filtered rows with a fresh 0..n-1 index. An empty
        DataFrame is returned when ``csv_urls`` is empty or no file could be
        loaded.
    """
    frames = []
    for url in tqdm(csv_urls, desc="Processing CSV files"):
        try:
            df = pd.read_csv(url)
            df['transit_timestamp'] = pd.to_datetime(df['transit_timestamp'])
            df['date'] = df['transit_timestamp'].dt.date
            # Filter only subway and Manhattan
            keep = (df['transit_mode'] == 'subway') & (df['borough'] == 'Manhattan')
            # Drop the now-redundant columns
            frames.append(df[keep].drop(columns=['transit_mode', 'borough']))
        except Exception as e:
            # Best-effort load: report the failing file and carry on.
            print("Error in:", url, str(e))

    if not frames:
        # pd.concat raises ValueError on an empty list, which would mask the
        # real problem (no URLs, or every download failed).
        print("No CSV data could be loaded; returning an empty DataFrame.")
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Discover the CSV files, load them into one frame, then show a short preview.
csv_urls = get_csv_urls_from_github()
combined_data = load_all_data(csv_urls)
print("Combined CSVs preview (no weather):", combined_data.head(), sep="\n")

Processing CSV files: 100%|██████████| 818/818 [15:00<00:00,  1.10s/it]


Combined CSVs preview (no weather):
    transit_timestamp station_complex_id                   station_complex  \
0 2021-09-23 03:00:00                316                         50 St (1)   
1 2022-07-22 19:00:00                323    Christopher St-Sheridan Sq (1)   
2 2022-07-22 19:00:00                323    Christopher St-Sheridan Sq (1)   
3 2021-09-23 12:00:00                403                         33 St (6)   
4 2021-09-23 07:00:00                602  14 St-Union Sq (L,N,Q,R,W,4,5,6)   

  payment_method               fare_class_category  ridership  transfers  \
0      metrocard       Metrocard - Unlimited 7-Day        8.0        0.0   
1      metrocard             Metrocard - Full Fare      111.0        1.0   
2      metrocard                 Metrocard - Other        9.0        0.0   
3      metrocard             Metrocard - Full Fare      193.0        8.0   
4      metrocard  Metrocard - Seniors & Disability       35.0       10.0   

         date  
0  2021-09-23  
1  202

In [9]:
# Number of rows in the combined ridership frame
print(combined_data.shape[0])

12274749


In [4]:
# Calendar features derived from each row's transit_timestamp
_ts = combined_data['transit_timestamp'].dt
combined_data['year'] = _ts.year
combined_data['month'] = _ts.month
combined_data['day'] = _ts.day
combined_data['day_of_week'] = _ts.dayofweek
combined_data['day_name'] = _ts.day_name()
# day_of_week values 5 and 6 are treated as the weekend
combined_data['is_weekend'] = combined_data['day_of_week'].isin([5, 6])

print(
    combined_data.loc[
        :, ['transit_timestamp', 'year', 'month', 'day', 'day_name', 'is_weekend']
    ].head()
)

    transit_timestamp  year  month  day  day_name  is_weekend
0 2021-09-23 03:00:00  2021      9   23  Thursday       False
1 2022-07-22 19:00:00  2022      7   22    Friday       False
2 2022-07-22 19:00:00  2022      7   22    Friday       False
3 2021-09-23 12:00:00  2021      9   23  Thursday       False
4 2021-09-23 07:00:00  2021      9   23  Thursday       False


In [5]:
# Check which calendar years are present in the data: the distinct values of
# the 'year' column, displayed in ascending order.

sorted(combined_data['year'].unique())

[np.int32(2020),
 np.int32(2021),
 np.int32(2022),
 np.int32(2023),
 np.int32(2024)]

In [6]:
!pip install meteostat tqdm scikit-learn

Collecting meteostat
  Downloading meteostat-1.6.8-py3-none-any.whl.metadata (4.6 kB)
Downloading meteostat-1.6.8-py3-none-any.whl (31 kB)
Installing collected packages: meteostat
Successfully installed meteostat-1.6.8


In [7]:
# Weather imports
from meteostat import Daily, Point

# Weather data
def get_weather_data(start=None, end=None, lat=40.7128, lon=-74.0060):
    """Fetch daily weather from meteostat for a point and date range.

    Called with no arguments this behaves exactly as before: daily weather for
    the point (40.7128, -74.0060) from 2020-01-01 through 2024-12-31.

    Args:
        start: First day to fetch (``datetime``). Defaults to 2020-01-01.
        end: Last day to fetch (``datetime``). Defaults to 2024-12-31.
        lat: Latitude of the location. Defaults to 40.7128.
        lon: Longitude of the location. Defaults to -74.0060.

    Returns:
        pd.DataFrame: Columns ``date``, ``tmin``, ``tmax``, ``prcp`` and
        ``wspd``, where ``date`` is the calendar date taken from meteostat's
        ``time`` index so it can be joined on the ridership ``date`` column.
    """
    if start is None:
        start = datetime(2020, 1, 1)
    if end is None:
        end = datetime(2024, 12, 31)

    location = Point(lat, lon)
    # fetch() returns a frame indexed by 'time'; reset_index exposes it as a column.
    weather = Daily(location, start, end).fetch().reset_index()
    weather['date'] = weather['time'].dt.date
    return weather[['date', 'tmin', 'tmax', 'prcp', 'wspd']]


# Attach the daily weather to every ridership row via the shared 'date' column.
# A left join keeps all ridership rows even where no weather row matches.
weather_data = get_weather_data()
combined_with_weather = pd.merge(combined_data, weather_data, how='left', on='date')

# Count rows per weather column that ended up without a value
missing_weather = combined_with_weather.loc[:, ['prcp', 'wspd', 'tmin', 'tmax']].isnull().sum()
print("Missing weather data:\n", missing_weather)

# Show the first rows of the merged frame
print(combined_with_weather.head())




Missing weather data:
 prcp    0
wspd    0
tmin    0
tmax    0
dtype: int64
    transit_timestamp station_complex_id                   station_complex  \
0 2021-09-23 03:00:00                316                         50 St (1)   
1 2022-07-22 19:00:00                323    Christopher St-Sheridan Sq (1)   
2 2022-07-22 19:00:00                323    Christopher St-Sheridan Sq (1)   
3 2021-09-23 12:00:00                403                         33 St (6)   
4 2021-09-23 07:00:00                602  14 St-Union Sq (L,N,Q,R,W,4,5,6)   

  payment_method               fare_class_category  ridership  transfers  \
0      metrocard       Metrocard - Unlimited 7-Day        8.0        0.0   
1      metrocard             Metrocard - Full Fare      111.0        1.0   
2      metrocard                 Metrocard - Other        9.0        0.0   
3      metrocard             Metrocard - Full Fare      193.0        8.0   
4      metrocard  Metrocard - Seniors & Disability       35.0       10.0   

In [13]:
# Build the tz-aware, hour-floored key used to join against the events data.
# Both DST edge cases become NaT instead of raising:
#   - ambiguous:   wall times that occur twice when clocks fall back
#   - nonexistent: wall times skipped when clocks spring forward (pandas'
#                  default for these is to raise)
combined_with_weather['timestamp'] = (
    combined_with_weather['transit_timestamp']
    .dt.tz_localize('America/New_York', ambiguous='NaT', nonexistent='NaT')
    .dt.floor('h')
)

In [14]:
# Grab the hourly NYC events data: read the CSV straight from GitHub and ask
# pandas to parse its 'timestamp' column as dates.
# NOTE(review): this URL is under the codezilla69 account, while the ridership
# CSVs are listed from GMasso19/NYC_Subway - confirm this is the intended source.

events_url = "https://raw.githubusercontent.com/codezilla69/NYC_Subway/main/helper_data/nyc_events_hourly_2020_2025.csv"
events_df = pd.read_csv(events_url, parse_dates=['timestamp'])


In [19]:
# Normalise the events timestamps: parse as UTC-aware datetimes, then express
# them in New York local time so they are comparable with the ridership key.
# NOTE(review): utc=True treats any timezone-less value as UTC - confirm the
# CSV's timestamps are not naive New York wall-clock times.
events_df['timestamp'] = (
    events_df['timestamp']
    .pipe(pd.to_datetime, utc=True)
    .dt.tz_convert('America/New_York')
)

In [20]:
# Left-join the events onto the ridership + weather rows by 'timestamp', so
# ridership rows with no matching event row are kept.
# NOTE(review): if events_df holds more than one row per timestamp this join
# repeats ridership rows - confirm the events file has one row per hour.
merged_df = pd.merge(combined_with_weather, events_df, on='timestamp', how='left')

print(merged_df.head(), merged_df.columns, sep="\n")


    transit_timestamp station_complex_id                   station_complex  \
0 2021-09-23 03:00:00                316                         50 St (1)   
1 2022-07-22 19:00:00                323    Christopher St-Sheridan Sq (1)   
2 2022-07-22 19:00:00                323    Christopher St-Sheridan Sq (1)   
3 2021-09-23 12:00:00                403                         33 St (6)   
4 2021-09-23 07:00:00                602  14 St-Union Sq (L,N,Q,R,W,4,5,6)   

  payment_method               fare_class_category  ridership  transfers  \
0      metrocard       Metrocard - Unlimited 7-Day        8.0        0.0   
1      metrocard             Metrocard - Full Fare      111.0        1.0   
2      metrocard                 Metrocard - Other        9.0        0.0   
3      metrocard             Metrocard - Full Fare      193.0        8.0   
4      metrocard  Metrocard - Seniors & Disability       35.0       10.0   

         date  year  month  ...  is_weekend  tmin  tmax  prcp  wspd  \
0  