# NYC Taxi Project: Getting the Data

In [2]:
## NOTES:
## TODO: dotenv configuration
#  TODO: Commenting and Formatting

## Alternative Methods: 
# [] url read into pandas and then export to parquet (pandas to pyarrow export? or pandas export?)
# [] incorporate gpu or parallelization?

In [3]:
# import packages
import os
import requests
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as csv
from pyarrow import fs
from hdfs import InsecureClient
from tqdm.notebook import tqdm

In [4]:
# quick script to create all year-month dates for nyc-taxi files
dates = []
for year in list(map(str, range(2017, 2021))):
    for month in list(map(str, range(1, 13))):
        if len(month) == 1:
            month = '0' + month
        else:
            pass
        date = year + '-' + month
        dates.append(date)

In [None]:
# loop over urls to download files
# nyc-taxi files are stored on s3 with a standard filename convention
failed_links = []
csv_file_path = os.path.join('./data', 'staging', 'nyc-taxi')
for i, date in enumerate(tqdm(dates)):
    filename = f'yellow_taxi_{date}.csv'
    if filename in os.listdir(csv_file_path):
        print(f'File already exisits: {csv_file_path+filename}')
        continue
    else:
        url = f"https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{date}.csv"
        print(f'fetching yellow taxi {date} data from: \n{url}')
        try:    
            file_req = requests.get(url)
            url_content = file_req.content
            csv_file = open(f'{csv_file_path}/yellow_taxi_{date}.csv', 'wb')
            csv_file.write(url_content)
            csv_file.close()
        
        except ConnectionError:
            failed_links.append(url)

print(f'{i+1 - len(failed_links)}/{i+1} files successfully fetched')

In [29]:
# check total size of data files in csv format
csv_size = round(sum(os.path.getsize(os.path.join(csv_file_path, f)) for f in os.listdir(csv_file_path))/1e9, 2)

print(f'Total csv data size: {csv_size} GB')

Total csv data size: 29.16 GB


In [None]:
# get lookup table for taxi location ids
url = 'https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv'
print(f'fetching yellow taxi locations table...')
file_req = requests.get(url)
url_content = file_req.content
csv_file = open(f'./data/taxi_location_lookup_table.csv', 'wb')
csv_file.write(url_content)
csv_file.close()

In [None]:
# convert csv files to parquet files
parquet_file_path = os.path.join('./data', 'nyc-taxi')
tq_csv_files = tqdm([file for file in os.listdir(csv_file_path)])
for csv_file in tq_csv_files:
    parquet_file = csv_file.replace('csv', 'parquet')
    if parquet_file in os.listdir(parquet_file_path): 
        print(f'{parquet_file} already exists')
        continue
    tq_csv_files.set_description(f'Converting {csv_file} to parquet...')
    try:
        table = csv.read_csv(f'{csv_file_path}/{csv_file}')
    except pa.lib.ArrowInvalid:
        df = pd.read_csv(f'{csv_file_path}/{csv_file}', low_memory=False)
        table = pa.Table.from_pandas(df)
    pq.write_table(table, f'{parquet_file_path}/{parquet_file}', compression='snappy')

In [31]:
# check total file size of parquet data files
prqt_size = round(sum(os.path.getsize(os.path.join(parquet_file_path, f)) for f in os.listdir(parquet_file_path)) / 1e9, 2)
print(f'Total parquet data size: {prqt_size} GB')

Total parquet data size: 5.64 GB


In [8]:
# load data into hdfs
namenode_URI = 'http://localhost:9870'
hadoop_user = 'hadoop'

hdfs = InsecureClient(namenode_URI, user=hadoop_user)
hdfs_path = '/user/hadoop/input/'

for parquet in os.listdir(parquet_file_path):
    local_path = os.path.join('./data', 'nyc-taxi', parquet)
    hdfs.upload(hdfs_path, local_path, n_threads=4)

hdfs.list(hdfs_path)

HdfsError: Remote path '/user/hadoop/input/yellow_taxi_2019-07.parquet' already exists.