# NYC Yellow cab dataset

In [1]:
import os
from time import time

import numpy as np
import pandas as pd

## 1. Download Data

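Read only the `TARGET_COLUMNS` from each monthly CSV in the S3 repository. Each file is still transferred in full; the other columns are simply skipped while parsing.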

__Warning:__ Even with an excellent connection, each month takes around 3 min to download, so __~40 min__ for the full year.

In [2]:
%%time

# Columns kept from each monthly file; every other column is skipped at parse time.
TARGET_COLUMNS = ['tpep_pickup_datetime', 'PULocationID']

# One DataFrame per month of 2017, appended in calendar order (January first).
months = []
for month_number in range(1, 13):
    # Zero-pad to two digits to match the file naming scheme (01 .. 12).
    month = str(month_number).zfill(2)
    url = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-{}.csv".format(month)

    print('--| ' + url)
    started = time()
    monthly_trips = pd.read_csv(url, usecols=TARGET_COLUMNS)
    months.append(monthly_trips)

    # Wall-clock seconds spent fetching and parsing this month.
    print('-->', int(time() - started), 'seconds\n')

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-01.csv
--> 75 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-02.csv
--> 147 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-03.csv
--> 81 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-04.csv
--> 199 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-05.csv
--> 313 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-06.csv
--> 137 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-07.csv
--> 113 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-08.csv
--> 158 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-09.csv
--> 214 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-10.csv
--> 92 seconds

--| https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2017-11.csv
--> 408 

## 2. Data Preprocessing

### 2.1 Build a single DataFrame

Merge `months` into a single DataFrame: `yellow`.

In [3]:
# Stack the monthly frames into one DataFrame with a fresh 0..n-1 row index.
yellow = pd.concat(months, ignore_index=True)

# Rename by source column name instead of by position: read_csv(usecols=...)
# returns columns in file order, not in the order they were requested, so a
# positional assignment could silently swap the two labels. The relabelling
# is done in place (no copy of the data), and an unexpected source column
# raises a KeyError instead of being mislabelled.
column_names = {
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'taxi_zone',
}
yellow.columns = [column_names[source_name] for source_name in yellow.columns]

### 2.2 Create an hourly timestamp

Truncate `pickup_datetime` to the hour, then group together all trips that start in the same `taxi_zone` during the same hour and count them.

In [5]:
# Parse the raw pickup values into timezone-aware (UTC) timestamps and snap
# each one down to the start of its hour, so trips share one key per hour.
# NOTE(review): utc=True labels the values as UTC; confirm the source
# timestamps are not actually local New York time.
yellow['pickup_datetime'] = pd.to_datetime(yellow['pickup_datetime'], utc=True).dt.floor('h')

# Every row is one trip: give each a weight of 1.0 so that summing per
# (hour, zone) group yields the number of trips (stored as a float).
yellow['trip_counter'] = np.ones(len(yellow))
hourly_zone_keys = ['pickup_datetime', 'taxi_zone']
yellow = yellow.groupby(hourly_zone_keys).sum()

### 2.3 Index on time
Set `pickup_datetime` as index.

In [6]:
# Turn both group keys back into ordinary columns, then promote only the
# hourly timestamp to the index; taxi_zone stays a regular column.
yellow = yellow.reset_index().set_index('pickup_datetime')

### 2.4 Last cleaning step
Filter out unwanted rows (records added retrospectively) by keeping only pickups between 2017-01-01 00:00 and 2017-12-31 23:00.

In [7]:
# Label-based slice on the time index; both bounds are inclusive, so the
# last hour of 2017 is kept and anything outside the year is dropped.
yellow = yellow.loc['2017-01-01 00:00':'2017-12-31 23:00']

## 3. Export Data

In [8]:
# Folder that receives the exported CSV, relative to the working directory.
PATH = 'data/' # Modify to fit your data folder.

In [9]:
# Create the output folder first: writing into a missing directory fails, and
# failing here would throw away the long download and aggregation above.
# An empty PATH means "current directory", which needs no creation.
if PATH:
    os.makedirs(PATH, exist_ok=True)

# Build the file name with os.path.join so PATH works with or without a
# trailing separator. index=True writes the pickup_datetime index as the
# first CSV column.
yellow.to_csv(os.path.join(PATH, 'yellow.csv'), index=True)