<a href="https://colab.research.google.com/github/dvignoles/nyc-transpo/blob/master/nyc_transpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import requests, zipfile, io, os, re, csv

# Local directories that hold the downloaded CSV files.
TAXI_CSV_PATH = "./taxi_data"
BIKE_CSV_PATH = "./bike_data"

# Only June data from 2013-2018 is used: Citi Bike started in June 2013, and
# 2013-2018 data is available for both taxis and bikes.
YEARS = [str(year) for year in range(2013, 2019)]

In [0]:
# Set up directories and util functions to store the downloaded csv files

# Create the download directories when they are missing. exist_ok=True keeps
# this cell safe to re-run and removes the gap between checking for the
# directory and creating it that the isdir() + mkdir() pair had.
for csv_dir in (TAXI_CSV_PATH, BIKE_CSV_PATH):
  os.makedirs(csv_dir, exist_ok=True)

def save_csv(file_path, http_data):
  """Parse the CSV text of an HTTP response and write it to ``file_path``.

  Args:
    file_path: Destination path; the file is opened in "w" mode, so any
      existing content is replaced.
    http_data: Object exposing the response body as a string through its
      ``.text`` attribute (e.g. a ``requests.Response``).
  """
  # Feed the csv reader a file-like object opened with newline="" instead of
  # text.splitlines(): splitlines() drops the newline inside quoted multi-line
  # fields and also splits on characters such as \x0c or \u2028.
  reader = csv.reader(io.StringIO(http_data.text, newline=""))
  # newline="" is required by the csv module for output files; without it the
  # writer's "\r\n" terminators become "\r\r\n" on platforms that translate "\n".
  with open(file_path, "w", newline="") as f:
    csv.writer(f).writerows(reader)
        
def get_csv_name(data_type, year):
  """Return the local path of the June CSV file for a dataset and year.

  Args:
    data_type: Either "taxi" or "bike".
    year: Year as a string (e.g. "2013"); it is concatenated into the path.

  Returns:
    The dataset directory joined with "<year>-06.csv".

  Raises:
    Exception: If ``data_type`` is neither "taxi" nor "bike".
  """
  # Compare by value with ==. The previous `is` comparison tested object
  # identity, which only works for strings the interpreter happens to intern
  # and triggers a SyntaxWarning on recent Python versions.
  if data_type == "taxi":
    return TAXI_CSV_PATH + "/" + year + "-06.csv"
  elif data_type == "bike":
    return BIKE_CSV_PATH + "/" + year + "-06.csv"
  else:
    raise Exception("Data type must be taxi or bike")

# Taxi Dataframes
Since these downloads take a long time, do NOT choose "Reset all runtimes" when Colab asks you to! Doing so wipes all downloaded files, and you will have to download them again.

In [0]:
# Download and save the files first so they don't have to be redownloaded every time

for n in YEARS:
  file_path = get_csv_name("taxi", n)
  if os.path.isfile(file_path):
    continue  # already downloaded by an earlier run

  data = requests.get("https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_" + n + "-06.csv")
  # Fail loudly on an HTTP error. Otherwise the error body would be written to
  # file_path as if it were the CSV and, because existing files are skipped
  # above, it would be reused on every later run.
  data.raise_for_status()
  save_csv(file_path, data)

# Column dtypes passed to pd.read_csv for the taxi files, grouped by target type.
# Several columns are listed under two spellings (e.g. vendor_id / VendorID),
# presumably because the header names differ between yearly files -- TODO confirm.
dtypes = {
    column: dtype
    for dtype, columns in (
        ("str", (
            "vendor_id",
            "VendorID",
            "pickup_datetime",
            "dropoff_datetime",
            "tpep_pickup_datetime",
            "tpep_dropoff_datetime",
            "store_and_fwd_flag",
            "payment_type",
        )),
        (np.int32, (
            "passenger_count",
            "PULocationID",
            "DOLocationID",
            "rate_code",
            "RateCodeID",
        )),
        (np.float32, (
            "trip_distance",
            "pickup_longitude",
            "pickup_latitude",
            "dropoff_longitude",
            "dropoff_latitude",
            "fare_amount",
            "surcharge",
            "mta_tax",
            "tip_amount",
            "tolls_amount",
            "total_amount",
        )),
    )
    for column in columns
}

# Load the actual data

# Number of rows read from the top of each taxi CSV (not the whole file).
TAXI_SAMPLE_ROWS = 1000000


def load_taxi_df(year, nrows=TAXI_SAMPLE_ROWS):
  """Read the first ``nrows`` rows of the taxi CSV for ``year``.

  Args:
    year: Year as a string (e.g. "2013"), forwarded to ``get_csv_name``.
    nrows: Maximum number of rows to read; defaults to ``TAXI_SAMPLE_ROWS``.

  Returns:
    A DataFrame parsed with the ``dtypes`` mapping defined in this notebook.
  """
  return pd.read_csv(get_csv_name("taxi", year), dtype=dtypes, nrows=nrows)


# One frame per year, kept under explicit names.
df_taxi_2013 = load_taxi_df("2013")
df_taxi_2014 = load_taxi_df("2014")
df_taxi_2015 = load_taxi_df("2015")
df_taxi_2016 = load_taxi_df("2016")
df_taxi_2017 = load_taxi_df("2017")
df_taxi_2018 = load_taxi_df("2018")

In [101]:
# Preview the first rows of the 2013 taxi frame to check the parsed columns.
df_taxi_2013.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2013-06-03 00:02:12,2013-06-03 00:10:07,1,1.3,-73.981583,40.773529,1,N,-73.981827,40.782124,CRD,7.5,0.5,0.5,2.12,0.0,10.62
1,CMT,2013-06-03 00:03:03,2013-06-03 00:19:27,1,4.9,-73.999565,40.728367,1,N,-73.952927,40.729546,CRD,17.0,0.5,0.5,3.6,0.0,21.6
2,CMT,2013-06-03 00:01:30,2013-06-03 00:28:11,1,17.700001,-73.788445,40.641151,2,N,-73.985451,40.744194,CRD,52.0,0.0,0.5,5.0,5.33,62.830002
3,CMT,2013-06-03 00:04:14,2013-06-03 00:27:50,1,12.1,-73.862816,40.768875,1,N,-74.008797,40.738842,CRD,34.5,0.5,0.5,7.1,0.0,42.599998
4,CMT,2013-06-03 00:04:53,2013-06-03 00:10:46,1,1.1,-73.964905,40.806881,1,N,-73.962349,40.794987,CRD,6.5,0.5,0.5,1.5,0.0,9.0


# Citi Bike Dataframes
Since Citi Bike does not provide csv files directly, we must download the zip archives, extract the csv files from them, and load those instead.

In [0]:
# Matches archive members whose name starts with a digit and ends in ".csv".
# Written as a raw string so that \d and \. reach the regex engine unchanged
# instead of being treated as (invalid) Python string escapes.
reg_pattern = re.compile(r"^\d+.*\.csv$")

for n in YEARS:
  # Skip years whose csv has already been extracted
  file_path = get_csv_name("bike", n)
  if os.path.isfile(file_path):
    continue

  # The 2017 and 2018 archives are named "...tripdata.csv.zip"; the others
  # are named "...tripdata.zip"
  if n in ("2017", "2018"):
    req_url = "https://s3.amazonaws.com/tripdata/" + n + "06-citibike-tripdata.csv.zip"
  else:
    req_url = "https://s3.amazonaws.com/tripdata/" + n + "06-citibike-tripdata.zip"

  response = requests.get(req_url)
  # Stop on an HTTP error rather than handing an error body to ZipFile
  response.raise_for_status()

  # The context manager closes the archive once the member is extracted
  with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    csv_members = [name for name in z.namelist() if reg_pattern.match(name)]
    if not csv_members:
      raise FileNotFoundError("No csv file matching the expected name found in " + req_url)
    orig_file_name = csv_members[0]
    # extract() returns the path it actually created in the working directory
    extracted_path = z.extract(orig_file_name)

  # Move the extracted file to its final name; no copy is left behind
  os.rename(extracted_path, file_path)

In [0]:
# The dtypes can't be passed in at the moment because some columns that should be integers contain non-numeric strings.
# Other columns that should be integers are read as floats instead, because pandas can't store NaN values in plain integer columns.
"""
dtypes = {
    "tripduration": np.int32,
    "starttime": "str",
    "stoptime": "str",
    "start station id": np.float32,
    "start station name": "str",
    "start station latitude": np.float32,
    "start station longitude": np.float32,
    "end station id": np.float32,
    "end station name": "str",
    "end station latitude": np.float32,
    "end station longitude": np.float32,
    "bikeid": np.int32,
    "usertype": "str",
    "birth year": np.float32,
    "gender": np.int32
}
"""


def _read_bike_csv(year):
  """Read the whole Citi Bike csv for ``year`` using pandas' default dtype inference."""
  return pd.read_csv(get_csv_name("bike", year))


# One frame per year, kept under explicit names.
df_bike_2013 = _read_bike_csv("2013")
df_bike_2014 = _read_bike_csv("2014")
df_bike_2015 = _read_bike_csv("2015")
df_bike_2016 = _read_bike_csv("2016")
df_bike_2017 = _read_bike_csv("2017")
df_bike_2018 = _read_bike_csv("2018")

In [102]:
# Preview the first rows of the 2013 Citi Bike frame to check the parsed columns.
df_bike_2013.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,695,2013-06-01 00:00:01,2013-06-01 00:11:36,444,Broadway & W 24 St,40.742354,-73.989151,434.0,9 Ave & W 18 St,40.743174,-74.003664,19678,Subscriber,1983.0,1
1,693,2013-06-01 00:00:08,2013-06-01 00:11:41,444,Broadway & W 24 St,40.742354,-73.989151,434.0,9 Ave & W 18 St,40.743174,-74.003664,16649,Subscriber,1984.0,1
2,2059,2013-06-01 00:00:44,2013-06-01 00:35:03,406,Hicks St & Montague St,40.695128,-73.995951,406.0,Hicks St & Montague St,40.695128,-73.995951,19599,Customer,,0
3,123,2013-06-01 00:01:04,2013-06-01 00:03:07,475,E 15 St & Irving Pl,40.735243,-73.987586,262.0,Washington Park,40.691782,-73.97373,16352,Subscriber,1960.0,1
4,1521,2013-06-01 00:01:22,2013-06-01 00:26:43,2008,Little West St & 1 Pl,40.705693,-74.016777,310.0,State St & Smith St,40.689269,-73.989129,15567,Subscriber,1983.0,1
