In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shapefile
from pandas.plotting import table
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime
import geopandas as gpd
import folium
from datetime import date
from folium.plugins import FastMarkerCluster
import os
# Silence ALL warnings (not only deprecation warnings) to keep the notebook output clean
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Month names covered by the analysis, in calendar order (March to December).
months = [
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
]
# One raw CSV per month, named "<month>_taxi.csv"; index-aligned with `months`.
fname = [month_name + "_taxi.csv" for month_name in months]


In [3]:

# Taxi zone polygons (one row per LocationID) and the zone lookup table.
sf = gpd.read_file("../raw_data/taxi_zones.shp")
zone = pd.read_csv("../raw_data/taxi+_zone_lookup.csv")

# Convert the geometry shapes to latitude and longitude.
# Reproduced from Lab 2 for MAST30034.
# Reproject the whole GeoDataFrame (rather than assigning a reprojected geometry
# column) so the frame's CRS metadata always matches its geometry.
sf = sf.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

In [4]:
# Calendar month numbers 3..12 (March to December), index-aligned with the
# month-name and file-name lists used by the processing loop.
month_number = list(range(3, 13))


In [5]:
# Monthly preprocessing pipeline. For each raw CSV in `fname` this cell:
#   1. keeps only 2020 pickups that fall in the file's own month,
#   2. saves a trip-distance vs passenger-count scatter plot,
#   3. drops invalid trips and writes the cleaned trips to CSV,
#   4. writes per-day / per-borough summaries (passengers, rides, distance) to CSV,
#   5. saves three zone-level choropleth maps as HTML.
RAW_DIR = "../raw_data/"
PREPROCESSED_DIR = "../preprocessed_data/"
PLOTS_DIR = "../plots/"


def _save_scatter(trips, out_path):
    """Save a passenger-count vs trip-distance scatter plot of `trips` to `out_path`."""
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.set_title("Passenger Count according to Trip Distance")
    ax.set_xlabel("Trip Distance")
    ax.set_ylabel("Passenger Count")
    ax.scatter(trips["trip_distance"], trips["passenger_count"])
    fig.savefig(out_path)
    # Close the figure: clearing it would keep one open figure per month alive.
    plt.close(fig)


def _join_zones(per_day):
    """Left-join a per-day/per-zone table onto the zone shapes in `sf` (adds `borough`)."""
    return sf.set_index('LocationID').join(per_day.set_index('LocationID')).reset_index()


def _save_choropleth(geo_json, data, value_column, legend_name, out_path):
    """Draw `value_column` of `data` per LocationID on a folium map and save it as HTML.

    `data` must hold exactly one row per LocationID. Returns the folium map.
    """
    zone_map = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
    # Adapted from lab 2 of MAST30034
    folium.Choropleth(
        geo_data=geo_json,  # geoJSON
        name='choropleth',  # name of plot
        data=data,  # data source
        columns=['LocationID', value_column],  # the columns required
        key_on='properties.LocationID',  # this is from the geoJSON's properties
        fill_color='OrRd',  # color scheme
        fill_opacity=0.9,
        line_opacity=0.5,
        legend_name=legend_name,  # legend title
    ).add_to(zone_map)
    zone_map.save(out_path)
    return zone_map


start_time = datetime.now()

# The three lists are consumed in lockstep; fail loudly if they drift apart.
assert len(fname) == len(months) == len(month_number)

# Every iteration writes into these folders; create them if they are missing.
for out_dir in (PREPROCESSED_DIR, PLOTS_DIR):
    os.makedirs(out_dir, exist_ok=True)

# The style is loop-invariant, so set it once. Newer matplotlib releases ship this
# style under the "seaborn-v0_8-" prefix; fall back to that name when needed.
style_name = 'seaborn-darkgrid'
if style_name not in plt.style.available and 'seaborn-v0_8-darkgrid' in plt.style.available:
    style_name = 'seaborn-v0_8-darkgrid'
plt.style.use(style_name)

for file_name, month_name, month_no in zip(fname, months, month_number):
    df = pd.read_csv(RAW_DIR + file_name)

    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    pickup_ts = df["tpep_pickup_datetime"].dt
    # Code adapted from https://queirozf.com/entries/pandas-dataframe-examples-manipulating-date-and-time
    if file_name == "march_taxi.csv":
        # March is only kept from the 8th onwards.
        in_month = (pickup_ts.month == 3) & (pickup_ts.day >= 8)
    else:
        in_month = pickup_ts.month == month_no
    # .copy() so the column assignment below writes to an independent frame
    # instead of a slice of the raw data.
    df = df.loc[in_month & (pickup_ts.year == 2020)].copy()
    df["pickup_day"] = df["tpep_pickup_datetime"].dt.strftime("%m-%d-%y")

    # NOTE(review): `desc` is not used anywhere in this cell; kept in case a later
    # cell reads it.
    desc = df[["passenger_count", "trip_distance"]].describe()
    _save_scatter(df, PLOTS_DIR + month_name + "_desc_plot.png")

    ## Checked for valid trips since distance and fare amount has to be greater than 0
    ## to count and pickup/dropoff locations have to be valid
    valid_trip = (
        (df["passenger_count"] > 0)
        & (df["trip_distance"] > 0) & (df["trip_distance"] < 2000)
        & (df["fare_amount"] > 0) & (df["fare_amount"] < 1000)
        & (df["PULocationID"] > 0) & (df["PULocationID"] < 264)
        & (df["DOLocationID"] > 0) & (df["DOLocationID"] < 264)
    )
    df = df.loc[valid_trip]
    df.to_csv(PREPROCESSED_DIR + month_name + "_preprocess.csv")

    ## Join data with their respective geographies
    pickup_gdf = gpd.GeoDataFrame(
        pd.merge(df, sf, left_on='PULocationID', right_on='LocationID')
    ).drop('PULocationID', axis=1)
    # NOTE(review): `dropoff_gdf` is not used anywhere in this cell; if no later
    # cell needs it, removing this merge would save noticeable time and memory.
    dropoff_gdf = gpd.GeoDataFrame(
        pd.merge(df, sf, left_on='DOLocationID', right_on='LocationID')
    ).drop('DOLocationID', axis=1)
    pickup_gdf['count'] = 1
    dropoff_gdf['count'] = 1

    # Group data based on day and LocationID. The column is selected BEFORE
    # aggregating so pandas never tries to average/sum geometry, datetime or
    # string columns.
    by_day_zone = pickup_gdf.groupby(["pickup_day", "LocationID"])
    rides_per_day = by_day_zone[['count']].count().reset_index()
    passenger_per_day = by_day_zone[['passenger_count']].mean().reset_index()
    distance_per_day = by_day_zone[['trip_distance']].mean().reset_index()

    # Merged group day data with geodata
    rides_per_day_merged = _join_zones(rides_per_day)
    passenger_per_day_merged = _join_zones(passenger_per_day)
    distance_per_day_merged = _join_zones(distance_per_day)

    # Get mean number of passengers per borough
    passenger_per_day_per_borough = (
        passenger_per_day_merged.groupby(["pickup_day", "borough"])[['passenger_count']]
        .mean()
        .reset_index()
    )
    passenger_per_day_per_borough.to_csv(PREPROCESSED_DIR + month_name + "_passengers.csv")

    # Get count of pickups per borough
    rides_per_day_per_borough = (
        rides_per_day_merged.groupby(["pickup_day", "borough"])[['count']]
        .sum()
        .reset_index()
    )
    # pickup_day was written as month-day-year above; parse it with that exact format.
    rides_per_day_per_borough["pickup_day"] = pd.to_datetime(
        rides_per_day_per_borough["pickup_day"], format="%m-%d-%y"
    )
    rides_per_day_per_borough.to_csv(PREPROCESSED_DIR + month_name + "_rides_per_day.csv")

    # Get average distance per borough
    distance_per_day_per_borough = (
        distance_per_day_merged.groupby(["pickup_day", "borough"])[['trip_distance']]
        .mean()
        .reset_index()
    )
    distance_per_day_per_borough.to_csv(PREPROCESSED_DIR + month_name + "_distance.csv")

    # One polygon per zone that has at least one pickup this month.
    pickup_geoJSON = pickup_gdf[['LocationID', 'geometry']].drop_duplicates('LocationID').to_json()

    # folium.Choropleth turns `data` into a LocationID -> value lookup, so it needs
    # exactly one row per zone. The per-day tables hold one row per (day, zone), which
    # would leave a single day's value on the map; aggregate over the whole month.
    by_zone = pickup_gdf.groupby("LocationID")
    rides_per_zone = by_zone[['count']].count().reset_index()
    passenger_per_zone = by_zone[['passenger_count']].mean().reset_index()
    distance_per_zone = by_zone[['trip_distance']].mean().reset_index()

    # Draw folium choropleth maps: trips, mean passenger count, mean trip distance.
    m_pickups = _save_choropleth(
        pickup_geoJSON, rides_per_zone, 'count', 'Trips',
        PLOTS_DIR + month_name + 'foliumChoroplethPickups.html',
    )
    m_pickups = _save_choropleth(
        pickup_geoJSON, passenger_per_zone, 'passenger_count', 'Passenger Count',
        PLOTS_DIR + month_name + 'foliumChoroplethPassengers.html',
    )
    m_pickups = _save_choropleth(
        pickup_geoJSON, distance_per_zone, 'trip_distance', 'Trip Distance',
        PLOTS_DIR + month_name + 'foliumChoroplethDistance.html',
    )


end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:06:25.517436


<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>

<Figure size 1080x1080 with 0 Axes>