In [1]:
from datetime import datetime
from pathlib import Path
from typing import Optional

import requests

def download_one_file_of_raw_data(
    year: int,
    month: int,
    save_dir: str = "./data/raw/",
    timeout: float = 60.0,
) -> Optional[Path]:
    """
    Download a single month's yellow-taxi trip file and save it under save_dir.

    The response body is streamed to a temporary ``.part`` file in chunks and
    only renamed to its final name once the transfer has completed, so the
    whole file is never held in memory and an interrupted download cannot
    leave a truncated ``.parquet`` file behind.

    Parameters:
        year (int): The year of the data to download.
        month (int): The month of the data to download (formatted as two digits).
        save_dir (str): Directory to save the downloaded file; created if missing.
        timeout (float): Seconds passed to ``requests.get`` as its ``timeout``,
            so a stalled connection raises instead of hanging forever.

    Returns:
        Optional[Path]: The path to the saved file, or None when the server
        answers with any status code other than 200.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the target directory or file cannot be written.
    """
    file_name = f"yellow_tripdata_{year}-{month:02d}.parquet"
    URL = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{file_name}'

    # stream=True defers fetching the body; the context manager releases the
    # connection even when we return early or an exception is raised.
    with requests.get(URL, stream=True, timeout=timeout) as response:
        # Anything other than 200 is treated as "no file for this month".
        if response.status_code != 200:
            print(f"File not found for {year}-{month:02d}: {URL}")
            return None

        path = Path(save_dir) / file_name
        path.parent.mkdir(parents=True, exist_ok=True)  # Create directories if needed

        # Write to a sibling temp file first; rename only after a complete transfer.
        partial_path = path.with_name(path.name + ".part")
        try:
            with partial_path.open("wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
            partial_path.replace(path)
        finally:
            # Only still present if the transfer or rename failed part-way.
            if partial_path.exists():
                partial_path.unlink()

    print(f"Downloaded: {path}")
    return path

def download_full_dataset(start_year: int, start_month: int, end_year: int, end_month: int, save_dir: str = "./data/raw/"):
    """
    Download every monthly file from start_year/start_month through
    end_year/end_month, both ends inclusive.

    Months before start_month in the first year and after end_month in the
    last year are skipped. A failure on one month is reported and the loop
    moves on, so one bad month does not abort the remaining downloads.

    Parameters:
        start_year (int): The starting year.
        start_month (int): The starting month (applies to start_year only).
        end_year (int): The ending year.
        end_month (int): The ending month (applies to end_year only).
        save_dir (str): Directory to save the downloaded files.
    """
    for year in range(start_year, end_year + 1):
        # Only the first and last year are partial; clamp to the calendar range
        # so out-of-range month arguments cannot request month 0 or 13.
        first_month = max(start_month, 1) if year == start_year else 1
        last_month = min(end_month, 12) if year == end_year else 12

        for month in range(first_month, last_month + 1):
            try:
                download_one_file_of_raw_data(year, month, save_dir)
            except Exception as e:
                # Deliberately broad: best-effort bulk download, report and continue.
                print(f"Error downloading {year}-{month:02d}: {e}")

# Example usage: fetch every monthly file from January 2009 through September 2024
download_full_dataset(
    start_year=2009,
    start_month=1,
    end_year=2024,
    end_month=9,
)


Downloaded: data\raw\yellow_tripdata_2009-01.parquet
Downloaded: data\raw\yellow_tripdata_2009-02.parquet
Downloaded: data\raw\yellow_tripdata_2009-03.parquet
Downloaded: data\raw\yellow_tripdata_2009-04.parquet
Downloaded: data\raw\yellow_tripdata_2009-05.parquet
Downloaded: data\raw\yellow_tripdata_2009-06.parquet
Downloaded: data\raw\yellow_tripdata_2009-07.parquet
Downloaded: data\raw\yellow_tripdata_2009-08.parquet
Downloaded: data\raw\yellow_tripdata_2009-09.parquet
Downloaded: data\raw\yellow_tripdata_2009-10.parquet
Downloaded: data\raw\yellow_tripdata_2009-11.parquet
Downloaded: data\raw\yellow_tripdata_2009-12.parquet
Downloaded: data\raw\yellow_tripdata_2010-01.parquet
Downloaded: data\raw\yellow_tripdata_2010-02.parquet
Downloaded: data\raw\yellow_tripdata_2010-03.parquet
Downloaded: data\raw\yellow_tripdata_2010-04.parquet
Downloaded: data\raw\yellow_tripdata_2010-05.parquet
Downloaded: data\raw\yellow_tripdata_2010-06.parquet
Downloaded: data\raw\yellow_tripdata_2010-07.p

In [5]:
# Fetch only January 2022, using the helper's default save_dir.
# NOTE(review): the recorded output below names `rides_2022-01.parquet`, while the
# function defined above builds the file name `yellow_tripdata_2022-01.parquet` —
# the output looks stale; confirm with Restart Kernel -> Run All.
download_one_file_of_raw_data(year=2022, month=1)


Downloaded: data\raw\rides_2022-01.parquet


WindowsPath('data/raw/rides_2022-01.parquet')

In [9]:
import pandas as pd

# The download helper in this notebook saves files as
# `yellow_tripdata_<year>-<month>.parquet`; no code here creates a `rides_*.parquet`
# file, so read the name that is actually produced.
file_path = "./data/raw/yellow_tripdata_2022-01.parquet"  # Relative to the notebooks folder
rides = pd.read_parquet(file_path)

# Preview only the first 20 rows rather than dumping the whole frame.
print(rides.head(20))


    VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0          1  2022-01-01 00:35:40   2022-01-01 00:53:29              2.0   
1          1  2022-01-01 00:33:43   2022-01-01 00:42:07              1.0   
2          2  2022-01-01 00:53:21   2022-01-01 01:02:19              1.0   
3          2  2022-01-01 00:25:21   2022-01-01 00:35:23              1.0   
4          2  2022-01-01 00:36:48   2022-01-01 01:14:20              1.0   
5          1  2022-01-01 00:40:15   2022-01-01 01:09:48              1.0   
6          2  2022-01-01 00:20:50   2022-01-01 00:34:58              1.0   
7          2  2022-01-01 00:13:04   2022-01-01 00:22:45              1.0   
8          2  2022-01-01 00:30:02   2022-01-01 00:44:49              1.0   
9          2  2022-01-01 00:48:52   2022-01-01 00:53:28              1.0   
10         2  2022-01-01 00:55:03   2022-01-01 01:04:25              1.0   
11         2  2022-01-01 00:31:06   2022-01-01 00:34:14              3.0   
12         2