### Import required modules and set configuration constants

In [1]:
import os
import requests
from datetime import datetime, timedelta

In [None]:
# Destination root for the downloaded parquet files. Set the
# TRIP_DATA_OUTPUT_FOLDER environment variable to point the notebook at a
# different location without editing this cell; the original path is the default.
OUTPUT_FOLDER = os.environ.get(
    'TRIP_DATA_OUTPUT_FOLDER',
    r'C:\Users\gauth\Desktop\Extended Medallion Architecture\storage_raw\trip_data',
)
# Base URL that each monthly file name is appended to.
BASE_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data'
# First month to pull when no previously downloaded file is found.
START_YEAR = 2023
START_MONTH = 1
# Headers sent with every download request.
HEADERS = {"User-Agent": "Mozilla/5.0","Accept": "*/*"}


### Scan the folder to get the latest month

In [3]:
def get_latest_downloaded_month(folder):
    """Return the most recent month for which a green-trip file exists.

    The folder tree is walked recursively. Only files named
    ``green_tripdata_<YYYY-MM>.parquet`` are considered; files whose date part
    cannot be parsed are ignored.

    Args:
        folder: Root directory to scan. It is created if it does not exist.

    Returns:
        A ``datetime`` for the first day of the latest month found, or
        ``None`` when the folder was missing or holds no matching file.
    """
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
        return None

    found_months = []

    for _dirpath, _dirnames, filenames in os.walk(folder):
        for name in filenames:
            is_green_parquet = name.startswith("green_tripdata_") and name.endswith(".parquet")
            if not is_green_parquet:
                continue

            stamp = name.replace("green_tripdata_", "").replace(".parquet", "")
            try:
                found_months.append(datetime.strptime(stamp, "%Y-%m"))
            except ValueError:
                # Not a YYYY-MM stamp; skip this file.
                continue

    return max(found_months, default=None)


In [4]:
def add_months(source_date, months):
    """Shift a date by a number of months and snap it to the first of the month.

    Args:
        source_date: Date whose year and month are the starting point.
        months: Number of months to add; may be zero or negative.

    Returns:
        A ``datetime`` at day 1 of the resulting month.
    """
    # Work with a zero-based month index so divmod rolls the year over cleanly.
    year_shift, month_index = divmod(source_date.month - 1 + months, 12)
    return datetime(source_date.year + year_shift, month_index + 1, 1)

In [5]:
def incremental_load():
    """Download every monthly green-trip parquet file not yet on disk.

    Resumes from the month after the latest file found under ``OUTPUT_FOLDER``
    (or from ``START_YEAR``/``START_MONTH`` when none is found) and keeps
    requesting consecutive months until the server answers 404 or 403, or an
    error occurs. Files are stored as
    ``<OUTPUT_FOLDER>/year=<YYYY>/green_tripdata_<YYYY-MM>.parquet``.

    Each file is streamed to a ``.part`` file and renamed only after the
    download completes, so an interrupted or failed download never leaves
    a file that a later run would count as already ingested.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    last_date = get_latest_downloaded_month(OUTPUT_FOLDER)

    if last_date:
        next_date = add_months(last_date, 1)
        print(f"Data found till {last_date.strftime('%Y-%m')}. Updates starting from {next_date.strftime('%Y-%m')}")
    else:
        next_date = datetime(START_YEAR, START_MONTH, 1)
        print(f"No existing data found. Pulling batch from {next_date.strftime('%Y-%m')}")

    while True:
        year = next_date.strftime('%Y')
        month = next_date.strftime('%m')
        file_name = f"green_tripdata_{year}-{month}.parquet"
        url = f"{BASE_URL}/{file_name}"

        year_folder = os.path.join(OUTPUT_FOLDER, f"year={year}")
        save_path = os.path.join(year_folder, file_name)
        # The ".part" suffix keeps an unfinished download from matching
        # the "*.parquet" pattern used to detect already-downloaded months.
        temp_path = f"{save_path}.part"

        try:
            print(f"Downloading {file_name}")
            # The context manager releases the streamed connection on every
            # exit path, including the early breaks below.
            with requests.get(url, headers = HEADERS, stream=True, timeout = 30) as response:

                # Stop if file doesn't exist
                if response.status_code == 404:
                    print("No more files available. Stopping.")
                    break

                # Stop if blocked
                if response.status_code == 403:
                    print(f"Access denied / blocked for {year}-{month}. Stopping safely.")
                    break

                # Any other 4xx/5xx must not be saved as if it were data.
                response.raise_for_status()

                # Create the partition folder only once there is a file to put in it.
                os.makedirs(year_folder, exist_ok=True)

                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)

            # Publish the file under its final name only after a complete download.
            os.replace(temp_path, save_path)

            print(f"Successfully ingested {file_name}")

            next_date = add_months(next_date, 1)

        except Exception as e:
            # Remove any partial download; it is fine if none was created.
            try:
                os.remove(temp_path)
            except OSError:
                pass
            print(f"Exception: {e}")
            break


In [6]:
# Run the ingestion only when this code executes as the main module.
if __name__ == "__main__":
    incremental_load()

Data found till 2025-11. Updates starting from 2025-12
Downloading green_tripdata_2025-12.parquet
Access denied / blocked for 2025-12. Stopping safely.
