Downloading the weather and TLC data used in this project

In [None]:
#importing libraries
from urllib.request import urlretrieve
import requests
import pandas as pd
import os

Loading in the TLC trip data. Code derived from `Python_PreReq_Notebook.ipynb` from MAST30034.

In [None]:
# All downloaded data is stored under `datasets/`, one level above the
# current working directory
output_relative_dir = '../datasets/'

# Create the output root and one sub-folder per TLC dataset.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(output_relative_dir, exist_ok=True)
for target_dir in ('yellow_tlc_data', 'hvfhv_data'):
    os.makedirs(os.path.join(output_relative_dir, target_dir), exist_ok=True)

# Study period: March 2024 to June 2025, excluding December-February.
# range(3, 12) yields March (3) through November (11); the end value is excluded.
# The cut-off after June 2025 is applied inside load_TLC_data.
YEARS = ['2024', '2025']
MONTHS = range(3, 12)

# URL prefixes; "<year>-<month>.parquet" is appended for each monthly file
url_yellow = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"
url_hvfhv  = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_"

def load_TLC_data(folder, url_base, overwrite=False):
    """
    Download monthly TLC parquet files into a folder.

    Iterates over the module-level YEARS and MONTHS and, for each pair,
    downloads `{url_base}{year}-{month}.parquet` to
    `{folder}/{year}-{month}.parquet` (month zero-padded to two digits).
    The loop stops as soon as it reaches July 2025, so June 2025 is the
    last month downloaded.

    Args:
        folder: destination directory; created if it does not exist.
        url_base: URL prefix to which "<year>-<month>.parquet" is appended.
        overwrite: when False (default), files already present in `folder`
            are skipped so re-running the notebook does not download them
            again. Pass True to re-download and replace every file.

    Raises:
        Any exception raised by `urlretrieve` (e.g. on network or HTTP
        errors) is re-raised after the partial download has been removed.
    """
    # make the function usable on its own, without relying on an earlier cell
    os.makedirs(folder, exist_ok=True)

    for year in YEARS:
        for month in MONTHS:

            #stopping at july 2025 (month is still an int here)
            if ((year == '2025') and (month == 7)):
                print("completed")
                return

            #0-fill i.e 1 -> 01, 2 -> 02, etc
            # (separate name so the integer loop variable is not rebound)
            month_str = str(month).zfill(2)
            print(f"Begin month {month_str}")

            # construct URL and output path
            url = f'{url_base}{year}-{month_str}.parquet'
            output_path = os.path.join(folder, f"{year}-{month_str}.parquet")

            # cache guard: the files are large, so skip any already on disk
            if os.path.exists(output_path) and not overwrite:
                print(f"Skipping {year}-{month_str}, already downloaded")
                continue

            # Download to a temporary name and rename only on success, so an
            # interrupted download never leaves a truncated file at the final
            # path (which the cache guard would otherwise treat as complete).
            partial_path = output_path + ".part"
            try:
                urlretrieve(url, partial_path)
            except BaseException:
                # BaseException so a kernel interrupt also cleans up
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
            os.replace(partial_path, output_path)

            print(f"Completed {year}-{month_str}")

# Fetch both TLC datasets, each into its own sub-folder of the output directory
for dataset_folder, dataset_url in (
    ('yellow_tlc_data', url_yellow),
    ('hvfhv_data', url_hvfhv),
):
    load_TLC_data(os.path.join(output_relative_dir, dataset_folder), dataset_url)

Begin month 03
Completed 2024-03
Begin month 04
Completed 2024-04
Begin month 05
Completed 2024-05
Begin month 06
Completed 2024-06
Begin month 07
Completed 2024-07
Begin month 08
Completed 2024-08
Begin month 09
Completed 2024-09
Begin month 10
Completed 2024-10
Begin month 11
Completed 2024-11
Begin month 03
Completed 2025-03
Begin month 04
Completed 2025-04
Begin month 05
Completed 2025-05
Begin month 06
Completed 2025-06
completed
Begin month 03
Completed 2024-03
Begin month 04
Completed 2024-04
Begin month 05
Completed 2024-05
Begin month 06
Completed 2024-06
Begin month 07
Completed 2024-07
Begin month 08
Completed 2024-08
Begin month 09
Completed 2024-09
Begin month 10
Completed 2024-10
Begin month 11
Completed 2024-11
Begin month 03
Completed 2025-03
Begin month 04
Completed 2025-04
Begin month 05
Completed 2025-05
Begin month 06
Completed 2025-06
completed


Downloading the hourly weather data (rain and apparent temperature) used in this project

In [None]:
# Location: New York City. The weather data source lists NYC at
# 40.712778, -74.006111; the request below uses 40.7143, -74.006.

# Open-Meteo archive API URL: hourly rain and apparent temperature from
# 2024-03-01 to 2025-06-30, requested in the America/New_York timezone
url = ("https://archive-api.open-meteo.com/v1/archive?latitude=40.7143&"
       "longitude=-74.006&start_date=2024-03-01&end_date=2025-06-30&"
       "hourly=rain,apparent_temperature&timezone=America%2FNew_York")

# Requesting data. The timeout stops the cell from hanging indefinitely if
# the server never answers, and raise_for_status() turns an HTTP error into
# an immediate, readable exception instead of a KeyError on "hourly" below.
response = requests.get(url, timeout=60)
response.raise_for_status()

# changing to python dictionary structure
data = response.json()

# Extract hourly time, rain and apparent temperature
df = pd.DataFrame({
    "time": data["hourly"]["time"],
    "rain": data["hourly"]["rain"],
    "temp": data["hourly"]["apparent_temperature"]
    })

# parse timestamps and cast to millisecond precision to match the structure
# of the TLC data
df["time"] = pd.to_datetime(df["time"]).astype("datetime64[ms]")

# remove unwanted months (December, January, February)
df = df[~df["time"].dt.month.isin([12, 1, 2])]


# save to data folder
weather_dir = "../datasets/weather_data"

# create the folder if needed; exist_ok=True makes the cell safe to re-run
os.makedirs(weather_dir, exist_ok=True)

# NOTE: the file is named rain.parquet but holds both the rain and temp
# columns; the name is kept so existing readers of this path keep working.
pathway = os.path.join(weather_dir, "rain.parquet")
df.to_parquet(pathway, index=False)