# Download TLC Data
This notebook downloads the TLC trip record data (plus the taxi zone shapefile and lookup table) from <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>.
Most of this code is copied from the Week 2 tutorial content.

In [1]:
import os
import zipfile
from urllib.request import urlretrieve

In [2]:
# Root folder (relative to this notebook) that raw downloads land in.
# The trailing slash matters: later cells build paths by string concatenation.
output_relative_dir = '../data/landing/'

# exist_ok=True makes makedirs a no-op when the directory is already there,
# so no separate os.path.exists() check is needed and the cell can be re-run.
os.makedirs(output_relative_dir, exist_ok=True)

# now, for each type of data set we will need, we will create the paths
for target_dir in ('tlc_data',):  # taxi_zones should already exist
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

The data timeframe is 2022-01 to 2022-06 (see `DATA_RANGE` below), chosen because it is the latest timeframe we can use that still falls within the timeframe of the Legally Operating Businesses dataset. The data is limited to 6 months to reduce the amount of data downloaded and processed, as my system is not very powerful.

In [3]:
# Months to download, keyed by year (as a string).
# range(1, 7) yields the month numbers 1 through 6.
DATA_RANGE = {"2022": range(1, 7)}

# Shared URL prefix of the monthly trip files; the download cell appends
# "<year>-<month>.parquet" to it to form the full URL.
URL_TEMPLATE = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "fhvhv_tripdata_"
)

In [4]:
# Folder that receives the monthly trip parquet files.
tlc_output_dir = output_relative_dir + 'tlc_data'

# Fetch one parquet file per (year, month) pair listed in DATA_RANGE.
for year in DATA_RANGE:
    for month in DATA_RANGE[year]:
        # 0-fill i.e 1 -> 01, 2 -> 02, etc
        month = str(month).zfill(2)

        # generate url
        url = f'{URL_TEMPLATE}{year}-{month}.parquet'
        # generate output location and filename
        # (despite the name, output_dir is the path of the file, not a folder)
        output_dir = f"{tlc_output_dir}/{year}-{month}.parquet"

        # Skip months that are already on disk so that re-running the
        # notebook does not download every file again.
        if os.path.exists(output_dir):
            print(f"Skipping {month}-{year} (already downloaded)")
            continue

        print(f"Begin month {month}")

        # Download to a temporary ".part" file and rename it only once the
        # transfer has finished; an interrupted download therefore never
        # leaves a truncated parquet file under the final name.
        partial_path = output_dir + ".part"
        urlretrieve(url, partial_path)
        os.replace(partial_path, output_dir)

        print(f"Completed {month}-{year}")

Begin month 01
Completed 01-2022
Begin month 02
Completed 02-2022
Begin month 03
Completed 03-2022
Begin month 04
Completed 04-2022
Begin month 05
Completed 05-2022
Begin month 06
Completed 06-2022


In [6]:
# Fetch the zipped taxi zone data and store it next to the landing folder.
# Note: despite its "_dir" suffix, shapefile_output_dir is the path of the
# .zip file itself; the unzip cell reads it.
shapefile_output_dir = "../data/taxi_zones.zip"
shapefile_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"

# Kept as the cell's last expression so the (filename, headers) tuple
# returned by urlretrieve is displayed as the cell output.
urlretrieve(url=shapefile_url, filename=shapefile_output_dir)

('../data/taxi_zones.zip', <http.client.HTTPMessage at 0x14ff13860>)

In [9]:
# Unzip the taxi zone data into ../data/taxi_zones.
# zipfile is imported in the import cell at the top of the notebook, so all
# of the notebook's dependencies are declared in one place.
with zipfile.ZipFile(shapefile_output_dir, 'r') as zip_ref:
    zip_ref.extractall("../data/taxi_zones")

In [11]:
# Fetch the taxi zone lookup CSV and save it inside ../data/taxi_zones,
# the folder the unzip cell extracts the taxi zone archive into.
# Note: despite its "_dir" suffix, lookup_output_dir is the CSV file path.
lookup_output_dir = "../data/taxi_zones/taxi_zone_lookup.csv"
lookup_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

# Kept as the cell's last expression so the (filename, headers) tuple
# returned by urlretrieve is displayed as the cell output.
urlretrieve(url=lookup_url, filename=lookup_output_dir)

('../data/taxi_zones/taxi_zone_lookup.csv',
 <http.client.HTTPMessage at 0x1633126c0>)