### Main Updates
* Dataset now includes average passenger count and trip duration
* Final version of fare hike correction
* For routes with no ride, replace non-count quantities by average 

### load packages

In [6]:
# for automatic reloading of modules (quick debugging)
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append('..') # add parent directory to path

import numpy as np 
import pandas as pd
from utils import processing as pr
import matplotlib.pyplot as plt
import geopandas as gpd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### load data for range of time of choice
Two years and three months of data.

In [8]:
# preprocess months and years to an (N,2)-array
months = np.arange(1,3+1,1)
years = np.arange(2023,2023+1,1)
month_year = np.meshgrid(years, months)
month_year = np.array(month_year).T.reshape(-1,2)

In [9]:
month_year

array([[2023,    1],
       [2023,    2],
       [2023,    3]])

### Load Inland Manhattan Taxi Zone Data

In [10]:
zdf = pd.read_csv('../assets/taxi_zone_lookup.csv')
zdf = zdf[(zdf['Borough'] == 'Manhattan') & (~zdf['LocationID'].isin([103, 104, 105, 153, 194, 202]))]
taxi_zones = list(zdf['LocationID'].values)

In [11]:
# generate data
ts = pr.generate_processed_data(month_year, vehicle_type='yellow', by_value=['PULocationID', 'DOLocationID'], additional_features=True, taxi_zones=taxi_zones)
ts.head()

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:22<00:00,  7.47s/it]


Unnamed: 0,pickup_datetime,DOLocationID,PULocationID,counts,total_amount,tip_amount,fare_amount,trip_distance,passenger_count,trip_duration
0,2023-01-01 00:00:00-05:00,4,4,0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-01-01 00:00:00-05:00,12,4,0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-01-01 00:00:00-05:00,13,4,0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-01-01 00:00:00-05:00,24,4,0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-01-01 00:00:00-05:00,41,4,0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
ts.pickup_datetime.min(), ts.pickup_datetime.max()

(Timestamp('2023-01-01 00:00:00-0500', tz='America/New_York'),
 Timestamp('2023-03-31 23:00:00-0400', tz='America/New_York'))

In [13]:
# basic checks, notice our code accounts for Daylight Savings Time
print('Min Date: ', ts.pickup_datetime.min())
print('Max Date: ', ts.pickup_datetime.max())
print('Taxi Zones: ' + str(ts.PULocationID.unique().min()) + ' to ' + str(ts.PULocationID.unique().max()) )
print('Memory Usage (MB): ', ts.memory_usage().sum() / 1024**2)

Min Date:  2023-01-01 00:00:00-05:00
Max Date:  2023-03-31 23:00:00-04:00
Taxi Zones: 4 to 263
Memory Usage (MB):  336.8226318359375


### postprocessing
* Adjust quantities affected by rate hike
* Account for routes with no counts in a given hour

In [15]:
ts_p = pr.postprocess_data(ts)

### Save Taxi Dataframe and Geopandas Dataframe

In [16]:
os.system('mkdir -p ../../processed_taxi_data')

# save processed taxi dataframe
ts_p.to_pickle('../../processed_taxi_data/adjusted_yellow_2023-01_2023-03.pkl')

In [28]:
# Geopandas dataframe
gdf = gpd.read_file('../assets/tableau/taxi_zones/taxi_zones.shp')  # read shapefile
gdf = gdf[(gdf['borough'] == 'Manhattan') & (~gdf['LocationID'].isin([103, 104, 105, 153, 194, 202]))]

# extract coordinates of zones: https://gis.stackexchange.com/questions/412817/how-to-get-lat-and-lon-from-a-geopandas-geodataframe-polygon
gdf = gdf.to_crs(4326) # reproject data
gdf['lon'] = gdf.centroid.x  
gdf['lat'] = gdf.centroid.y
gdf.reset_index(drop=True, inplace=True)

# save geopandas dataframe
gdf.to_pickle('../../processed_taxi_data/manhattan_taxi_zones.pkl')


  gdf['lon'] = gdf.centroid.x

  gdf['lat'] = gdf.centroid.y
