<a href="https://colab.research.google.com/github/dvignoles/nyc-transpo/blob/master/nyc_transpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import requests, zipfile, io, os, re, csv

# Local directories that hold the downloaded taxi and bike CSV files.
TAXI_CSV_PATH = "./taxi_data"
BIKE_CSV_PATH = "./bike_data"

# Only June data for 2013-2018 is used: June 2013 is when Citi Bike started, and 2013-2018 data is available for both taxi and bikes.
YEARS = [str(year) for year in range(2013, 2019)]

In [0]:
# Set up directories and util functions to store the downloaded csv files

# exist_ok=True makes this a no-op when the directory is already there, so the
# cell can be re-run safely and there is no gap between checking and creating.
os.makedirs(TAXI_CSV_PATH, exist_ok=True)
os.makedirs(BIKE_CSV_PATH, exist_ok=True)

def save_csv(file_path, http_data):
  """Write the CSV text carried by an HTTP response to a local file.

  Args:
    file_path: Destination path; an existing file is overwritten.
    http_data: Response-like object whose `.text` attribute holds the CSV text.
  """
  # Parse from a file-like object with newline="" instead of text.splitlines():
  # splitlines() cuts quoted fields that contain line breaks and also splits on
  # other Unicode line separators that are not CSV record boundaries.
  source = io.StringIO(http_data.text, newline="")
  # newline="" lets the csv writer control line endings itself (otherwise
  # platforms that translate "\n" end up with blank lines between rows).
  with open(file_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(csv.reader(source))
        
def get_csv_name(data_type, year):
  """Return the local path of the June CSV file for a data set and year.

  Args:
    data_type: Either "taxi" or "bike"; selects the storage directory.
    year: Year as a string (e.g. "2013"); it is concatenated into the name.

  Returns:
    "<directory>/<year>-06.csv", where the directory is TAXI_CSV_PATH or
    BIKE_CSV_PATH.

  Raises:
    ValueError: If data_type is neither "taxi" nor "bike".
  """
  # Compare by value: `is` tests object identity and only appears to work when
  # both strings happen to be the same interned object.
  if data_type == "taxi":
    return TAXI_CSV_PATH + "/" + year + "-06.csv"
  elif data_type == "bike":
    return BIKE_CSV_PATH + "/" + year + "-06.csv"
  else:
    raise ValueError("Data type must be taxi or bike")

# Taxi Dataframes


In [0]:
# Download and save the files first so they don't have to be redownloaded every time

for n in YEARS:
  file_path = get_csv_name("taxi", n)
  if not os.path.isfile(file_path):
    # n is already the full four-digit year, so it follows the file-name prefix
    # directly (a "201" prefix here would produce e.g. "2012013-06.csv").
    data = requests.get("https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_" + n + "-06.csv")
    # Fail loudly on an HTTP error instead of caching the error body as a CSV,
    # which the isfile() check above would then treat as already downloaded.
    data.raise_for_status()
    save_csv(file_path, data)
    
# Load the actual data

def _read_taxi_sample(year):
  """Read the first 100000 rows of the saved taxi CSV for the given year."""
  csv_path = get_csv_name("taxi", year)
  return pd.read_csv(csv_path, nrows=100000, low_memory=False)

df_taxi_2013 = _read_taxi_sample("2013")
df_taxi_2014 = _read_taxi_sample("2014")
df_taxi_2015 = _read_taxi_sample("2015")
df_taxi_2016 = _read_taxi_sample("2016")
df_taxi_2017 = _read_taxi_sample("2017")
df_taxi_2018 = _read_taxi_sample("2018")

# Citi Bike Dataframes
Citi Bike publishes its trip data as zip archives rather than plain csv files, so we must download each archive, extract the csv file it contains, and then load it.

In [0]:
# Matches archive members whose name starts with digits and ends in ".csv".
# Raw string so the regex escapes (\d, \.) are not treated as string escapes.
reg_pattern = re.compile(r"^\d+.*\.csv$")

for n in YEARS:
  # See if the file exists
  file_path = get_csv_name("bike", n)
  if os.path.isfile(file_path):
    continue

  # If not, request and save it.
  # The 2017 and 2018 archive URLs end in ".csv.zip"; the other years end in ".zip".
  if n != "2017" and n != "2018":
    req_url = "https://s3.amazonaws.com/tripdata/" + n + "06-citibike-tripdata.zip"
  else:
    req_url = "https://s3.amazonaws.com/tripdata/" + n + "06-citibike-tripdata.csv.zip"
  response = requests.get(req_url)
  # Fail loudly on an HTTP error instead of handing an error body to ZipFile.
  response.raise_for_status()
  # The context manager closes the archive once the member has been extracted.
  with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    orig_file_name = list(filter(reg_pattern.match, z.namelist()))[0]
    z.extract(orig_file_name)
  # extract() wrote the member under the current working directory; renaming
  # moves it to its final path, so no separate cleanup of the extracted copy is needed.
  os.rename(orig_file_name, file_path)

In [0]:
def _read_bike_sample(year):
  """Read the first 100000 rows of the saved bike CSV for the given year."""
  csv_path = get_csv_name("bike", year)
  return pd.read_csv(csv_path, nrows=100000, low_memory=False)

df_bike_2013 = _read_bike_sample("2013")
df_bike_2014 = _read_bike_sample("2014")
df_bike_2015 = _read_bike_sample("2015")
df_bike_2016 = _read_bike_sample("2016")
df_bike_2017 = _read_bike_sample("2017")
df_bike_2018 = _read_bike_sample("2018")