# Data Processing

In [None]:
import pandas as pd
import datetime
import time
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns  #Plots
import pickle #to save the model
import warnings

In [ ]:
%%bash
# Fetch the taxi-zone lookup table (CSV) and the zone boundaries (GeoJSON)
# into the working directory.
# Stop at the first failing command so a failed download surfaces as a cell
# error instead of being masked by the exit status of a later command.
set -euo pipefail

# Explicit -O targets: re-running the cell overwrites the files instead of
# leaving numbered duplicates such as "taxi+_zone_lookup.csv.1" behind.
wget -O 'taxi+_zone_lookup.csv' 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'
# The URL is quoted because it contains the shell metacharacters ? and &.
wget -O taxi_zones.geojson 'https://data.cityofnewyork.us/api/geospatial/d3c5-ddgc?method=export&format=GeoJSON'

In [None]:
%%bash
# Download the twelve monthly 2022 yellow-taxi trip files and collect them in
# the yellow_tripdata_2022 directory. The cell is safe to re-run: months that
# are already present are skipped and the directory may already exist.
set -euo pipefail

data_dir=yellow_tripdata_2022
base_url=https://d37ci6vzurychx.cloudfront.net/trip-data

# -p: no error when the directory already exists.
mkdir -p "$data_dir"

for i in {1..12}
do
    name="yellow_tripdata_2022-$(printf "%02d" "$i").parquet"
    if [ -f "$data_dir/$name" ]; then
        continue
    fi
    # Download into the working directory first and move the file only after
    # wget succeeded, so an interrupted download never lands in data_dir.
    wget -O "$name" "$base_url/$name"
    mv "$name" "$data_dir/"
done


In [None]:

import datetime
import pandas as pd

# Read the parquet files in the yellow_tripdata_2022 directory into one DataFrame.
trips = pd.read_parquet('yellow_tripdata_2022')

# Remove outliers: keep only trips whose pickup falls inside calendar year 2022.
# The upper bound is exclusive at 2023-01-01 so that every pickup on
# 31 December is kept; comparing with `<= datetime(2022, 12, 31)` stops at
# midnight at the start of that day and drops the rest of it.
trips = trips[
    (trips['tpep_pickup_datetime'] >= datetime.datetime(2022, 1, 1))
    & (trips['tpep_pickup_datetime'] < datetime.datetime(2023, 1, 1))
]
trips.head()


In [ ]:
# Show the dtype of every column in the trips frame.
trips.dtypes

In [ ]:
# (rows, columns) of the trips frame.
trips.shape

In [None]:
# Zone lookup table: only the LocationID and Zone columns are read, and
# LocationID becomes the index.
taxi_zones = (
    pd.read_csv('taxi+_zone_lookup.csv', usecols=['LocationID', 'Zone'])
    .set_index(['LocationID'])
)
taxi_zones

In [None]:
import json

# Load the taxi-zone boundaries. GeoJSON is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform's default
# locale encoding.
with open('taxi_zones.geojson', encoding='utf-8') as fd:
    geojson = json.load(fd)

## Temporal Analysis




In [None]:
# Add the pickup weekday and pickup hour as new columns on `trips`.
pickup_ts = trips['tpep_pickup_datetime']
trips['PU_dayofweek'] = pickup_ts.dt.dayofweek
trips['PU_hour'] = pickup_ts.dt.hour

# One row per (weekday, hour) pair; `count` holds the number of non-null
# PU_dayofweek values in that group.
gb_time = (
    trips
    .groupby(by=['PU_dayofweek', 'PU_hour'], as_index=False)
    .agg(count=pd.NamedAgg(column='PU_dayofweek', aggfunc='count'))
)
gb_time

In [None]:
import plotly.express as px

# Trip counts per pickup hour; each bar is split into one coloured segment per
# weekday. A title and readable axis/legend labels are set so the figure can
# be understood on its own.
b = px.bar(
    gb_time,
    x='PU_hour',
    y='count',
    color='PU_dayofweek',
    color_continuous_scale='sunset_r',
    title='Yellow taxi pickups by hour of day (2022)',
    labels={
        'PU_hour': 'Pickup hour',
        'count': 'Number of trips',
        # pandas numbers weekdays from Monday=0 to Sunday=6.
        'PU_dayofweek': 'Day of week (0 = Monday)',
    },
)
b.show()

In [None]:
# Trip counts per pickup weekday; each bar is split into one coloured segment
# per pickup hour. A title and readable axis/legend labels are set so the
# figure can be understood on its own.
b = px.bar(
    gb_time,
    x='PU_dayofweek',
    y='count',
    color='PU_hour',
    color_continuous_scale='sunset_r',
    title='Yellow taxi pickups by day of week (2022)',
    labels={
        # pandas numbers weekdays from Monday=0 to Sunday=6.
        'PU_dayofweek': 'Day of week (0 = Monday)',
        'count': 'Number of trips',
        'PU_hour': 'Pickup hour',
    },
)
b.show()

## Spatial Analysis


In [None]:
# Count trips per pickup zone; PULocationID stays a regular column because
# the grouping is done with as_index=False.
trips_by_pu_zone = trips.groupby(['PULocationID'], as_index=False)
gb_pu_location = trips_by_pu_zone.agg(
    count=pd.NamedAgg(column='PULocationID', aggfunc='count')
)
gb_pu_location

In [None]:
import plotly.graph_objects as go

# Choropleth of pickup counts per taxi zone. Each entry of `locations` is
# matched against the `properties.location_id` field of the GeoJSON features,
# and the zone is coloured by the corresponding `count`.
fig = go.Figure(
    go.Choroplethmapbox(
        geojson=geojson,
        featureidkey='properties.location_id',
        locations=gb_pu_location['PULocationID'],
        z=gb_pu_location['count'],
        colorscale="Viridis",
        colorbar_title_text='Number of trips',
        marker_opacity=0.7,
        marker_line_width=0.1
    )
)
fig.update_layout(
    # Title so the map can be understood without reading the code.
    title_text='Yellow taxi pickups by taxi zone (2022)',
    mapbox_style="carto-positron",
    mapbox_zoom=9,
    # Initial map centre (latitude / longitude).
    mapbox_center={"lat": 40.7158, "lon": -73.9805},
    height=600,
)
fig.show()