In [4]:
import requests
import re
import os
import time

# --- Remote sources ---------------------------------------------------------
# HTML page that is scraped for links to the monthly trip-record files.
URL = 'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'
# Prefix that every trip-data download link is expected to start with.
URL_PATH = 'https://d37ci6vzurychx.cloudfront.net/trip-data'
# Direct link to the taxi zone lookup CSV.
LOOKUP_URL = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv'

# --- Local output directories (relative to the working directory) -----------
CSV_OUTPUT_PATH = 'storage/taxi_industry/csv'
PARQUET_OUTPUT_PATH = 'storage/taxi_industry/parquet'

# File-name prefixes of the trip tables to download.
TABLE_NAMES = ['yellow_tripdata', 'green_tripdata']

# Lookup table

In [5]:
# Download the taxi zone lookup CSV into CSV_OUTPUT_PATH.

# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(CSV_OUTPUT_PATH, exist_ok=True)

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(LOOKUP_URL, timeout=30)
if response.status_code == 200:
    # Write-only binary mode is enough; the file is never read back here.
    with open(f'{CSV_OUTPUT_PATH}/taxi_zone_lookup.csv', 'wb') as file:
        file.write(response.content)
else:
    # Report the failure instead of silently leaving the file missing.
    print(f"Failed to download taxi_zone_lookup.csv (status {response.status_code})")

# Trip data

In [6]:
# Scrape the trip-record page for parquet links and download every file that
# is not already present in PARQUET_OUTPUT_PATH.

REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection hangs the cell


def _download_with_retries(link, destination, max_retries=3, backoff=1):
    """Download ``link`` to ``destination``, retrying with exponential backoff.

    Args:
        link: URL of the file to fetch.
        destination: Final path of the downloaded file.
        max_retries: Maximum number of attempts.
        backoff: Seconds to wait after the first failed attempt; doubled
            after every further failure.

    Returns:
        True when the file was written, False otherwise. A 403 response is
        treated as permanent and is not retried.
    """
    filename = os.path.basename(destination)
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(link, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as error:
            # Connection errors and timeouts are transient: retry them too.
            failure = f'error: {error}'
        else:
            if response.status_code == 200:
                # Write to a temporary name and rename afterwards, so an
                # interrupted write never leaves a truncated .parquet file
                # that the skip-if-exists check would treat as complete.
                partial_path = f'{destination}.part'
                with open(partial_path, 'wb') as file:
                    file.write(response.content)
                os.replace(partial_path, destination)
                return True
            if response.status_code == 403:
                print(f"Failed to download {filename} (status {response.status_code}), ACCESS DENIED")
                return False
            failure = f'status {response.status_code}'

        if attempt == max_retries:
            # No point sleeping when there is no further attempt.
            print(f"Failed to download {filename} ({failure}) after {max_retries} attempts.")
            return False
        print(f"Failed to download {filename} ({failure}), retrying in {backoff} seconds...")
        time.sleep(backoff)
        backoff *= 2
    return False


response = requests.get(URL, timeout=REQUEST_TIMEOUT)
if response.status_code != 200:
    # Keep going (the regex will simply find nothing) but say why.
    print(f"Failed to fetch {URL} (status {response.status_code}); no download links will be found.")

download_links = []
for table in TABLE_NAMES:
    # example: href="https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
    #          href="https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-02.parquet"
    #
    # [^"]* (instead of a greedy .*) keeps a match inside a single href value
    # even when several links share one line of HTML; re.escape stops the
    # dots in URL_PATH from acting as wildcards.
    pattern = rf'href="({re.escape(URL_PATH)}/[^"]*{re.escape(table)}_202[45][^"]*\.parquet)"'  # ONLY 2024 and 2025 data
    download_links.extend(re.findall(pattern, response.text))
print("download_links=",download_links)

os.makedirs(PARQUET_OUTPUT_PATH, exist_ok=True)

# A set gives constant-time membership checks for the skip test below.
existing_files = set(os.listdir(PARQUET_OUTPUT_PATH))

for link in download_links:
    filename = link.split('/')[-1]
    if filename in existing_files:
        continue
    print(f'Downloading {filename}...')
    _download_with_retries(link, f'{PARQUET_OUTPUT_PATH}/{filename}')

print('Finish!')

download_links= ['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-02.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-03.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-04.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-05.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-07.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-08.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-09.parquet', 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-