## Analysis

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from statsmodels.formula.api import ols
import statsmodels.api as sm

import pandas as pd
import numpy as np
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

In [None]:
# Create (or reuse) the Spark session used for the full-data aggregations below
spark = (
    SparkSession.builder.appName("MAST30034 Project 1 analysis")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "3g")
    # The key must be spelled "spark.executor.memory"; a misspelled key is not
    # a setting Spark recognises, so the intended 4g limit was never applied.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [None]:
# Load the curated data: the train/test splits stay in Spark, the sample is
# read straight into pandas for plotting and statistical tests.
train_sdf, test_sdf = (
    spark.read.parquet(path)
    for path in ('../data/curated/train', "../data/curated/test")
)
sample_df = pd.read_parquet('../data/curated/sample_data')

### Distribution of Trip Time 

In [None]:
# Histogram of the target variable "trip_time" from the pandas sample
sns.displot(sample_df["trip_time"])

# displot opens its own figure, so the current axes belong to the new plot
ax = plt.gca()
ax.set_title("Distribution of Trip Time", fontsize=14)
ax.set_xlabel("Trip Time (s)", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.ticklabel_format(style='sci', axis='y')  # scientific notation for counts

plt.savefig("../plots/trip_time_dist.png", bbox_inches='tight')
plt.show()

In [None]:
# Trip time on the natural-log scale, plotted with a KDE curve over the histogram
log_time = np.log(sample_df["trip_time"])
sns.displot(log_time, kde=True)

# displot opens its own figure, so the current axes belong to the new plot
ax = plt.gca()
ax.set_title("Distribution of Log Transformed Trip Time", fontsize=14)
ax.set_xlabel("Log transformed trip time (s)", fontsize=12)
ax.set_ylabel("Count", fontsize=12)

plt.savefig("../plots/log_trip_time_dist.png", bbox_inches='tight')
plt.show()

### Geospatial Visualisations

In [None]:
# Taxi-zone polygons ("sf" = shape file) and the zone lookup table
sf = gpd.read_file("../data/raw/taxi_zones/taxi_zones.shp")
zones = pd.read_csv("../data/raw/taxi_zones/taxi+_zone_lookup.csv")

# Reproject the polygons to longitude/latitude (WGS84)
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

# Keep only zones present in both the lookup table and the shapefile
gdf = gpd.GeoDataFrame(
    pd.merge(zones, sf, on='LocationID', how='inner')
)

# GeoJSON string with one feature per LocationID, used by the choropleth maps
geoJSON = (
    gdf[['LocationID', 'geometry']]
    .drop_duplicates('LocationID')
    .to_json()
)


def _lat_lon_centroid(shape):
    """Return the centroid of a zone polygon as a (latitude, longitude) tuple."""
    centre = shape.centroid
    return (centre.y, centre.x)


# Zone centroids, later used as marker positions on the maps
gdf['centroid'] = gdf['geometry'].apply(_lat_lon_centroid)
gdf[['Zone', 'LocationID', 'centroid']].head()

In [None]:
def _median_trip_time_by(sdf, location_col):
    """Approximate median trip_time per value of location_col, as a pandas frame."""
    median_col = F.percentile_approx("trip_time", 0.5).alias("median_trip_time")
    return sdf.groupBy(location_col).agg(median_col).toPandas()


def _attach_zone_geometry(median_df, location_col):
    """Inner-join the zone polygons onto median_df and drop the redundant key."""
    zone_shapes = gdf[['LocationID', 'geometry']]
    joined = median_df.merge(zone_shapes, left_on=location_col, right_on='LocationID')
    return joined.drop('LocationID', axis=1)


# median trip time over the whole training data, per pickup / dropoff location
pickup_median_time = _median_trip_time_by(train_sdf, "PULocationID")
dropoff_median_time = _median_trip_time_by(train_sdf, "DOLocationID")

# attach each zone's geometry to the computed medians
pickup_df = _attach_zone_geometry(pickup_median_time, 'PULocationID')
dropoff_df = _attach_zone_geometry(dropoff_median_time, 'DOLocationID')

In [None]:
def draw_map(df, columns, legend_name):
    """Build a folium choropleth of the taxi zones with airport markers.

    Relies on the notebook-level ``geoJSON`` (zone shapes) and ``gdf``
    (zone names and centroids).

    Parameters
    ----------
    df : pandas.DataFrame
        Data used to shade the zones.
    columns : list of str
        Columns of ``df`` handed to the choropleth; callers in this notebook
        pass the location-id column followed by the value column.
    legend_name : str
        Caption shown on the colour legend.

    Returns
    -------
    folium.Map
        The map with the choropleth layer and one marker per airport zone.
    """
    zone_map = folium.Map(location=[40.73, -73.74], width=600, height=500, tiles="cartodbpositron", zoom_start=10)

    folium.Choropleth(
        geo_data=geoJSON,                # zone shapes
        name='choropleth',
        data=df.reset_index(),           # values to shade by
        columns=columns,
        key_on='properties.LocationID',  # GeoJSON property the data is matched on
        fill_color='YlOrRd',
        line_opacity=0.1,
        fill_opacity=0.7,
        legend_name=legend_name
    ).add_to(zone_map)

    # put a plane marker on the centroid of every zone whose name mentions "Airport"
    airport_zones = gdf.loc[gdf['Zone'].str.contains('Airport'), ['Zone', 'centroid']]
    for zone_name, coord in airport_zones.values:
        plane_marker = folium.Marker(
            location=coord,
            popup=zone_name,
            icon=folium.Icon(color='blue', icon='glyphicon-plane'),
        )
        zone_map.add_child(plane_marker)

    return zone_map

In [None]:
# choropleth of the median trip time per pickup location, saved as HTML
pickup_map = draw_map(
    df=pickup_df,
    columns=['PULocationID', 'median_trip_time'],
    legend_name="Median Trip Duration (second)",
)
pickup_map.save('../plots/pickup_location_vs_median_trip_time_map.html')
pickup_map

In [None]:
# choropleth of the median trip time per dropoff location, saved as HTML
dropoff_map = draw_map(
    df=dropoff_df,
    columns=['DOLocationID', 'median_trip_time'],
    legend_name="Median Trip Duration (second)",
)
dropoff_map.save('../plots/dropoff_location_vs_median_trip_time_map.html')
dropoff_map

It can be seen that trips to or from the three airports usually take longer. Trips within Manhattan also generally have a longer duration.
Therefore, we create new binary attributes that indicate whether a trip is within Manhattan or is an airport trip.

In [None]:
# list the LocationID of every zone whose name mentions "Airport"
is_airport_zone = gdf['Zone'].str.contains('Airport')
gdf.loc[is_airport_zone, ['LocationID', 'Zone']]

In [None]:
def add_location_attribute(sdf):
    """Add four boolean location flags to a Spark DataFrame of trips.

    ``Newark_trip``, ``JFK_trip`` and ``LaGuardia_trip`` are true when the
    pickup or the dropoff LocationID equals that airport's zone id.
    ``Manhattan_trip`` is true when the pickup or the dropoff LocationID
    belongs to a zone whose borough (in the notebook-level ``gdf``) contains
    "Manhattan".

    Parameters
    ----------
    sdf : pyspark.sql.DataFrame
        Must contain ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pyspark.sql.DataFrame
        ``sdf`` with the four flag columns appended.
    """
    # zone id used for each airport flag
    airport_zone_ids = {'Newark_trip': 1, 'JFK_trip': 132, 'LaGuardia_trip': 138}
    for flag_name, zone_id in airport_zone_ids.items():
        touches_airport = (sdf['PULocationID'] == zone_id) | (sdf['DOLocationID'] == zone_id)
        sdf = sdf.withColumn(flag_name, touches_airport.cast('BOOLEAN'))

    # NOTE(review): this flags trips with EITHER end in Manhattan, not only
    # trips entirely within the borough - confirm that this is the intent.
    in_manhattan = gdf['Borough'].str.contains('Manhattan')
    manhattan_ids = gdf.loc[in_manhattan, 'LocationID'].tolist()
    touches_manhattan = sdf['PULocationID'].isin(manhattan_ids) | sdf['DOLocationID'].isin(manhattan_ids)
    return sdf.withColumn('Manhattan_trip', touches_manhattan.cast('BOOLEAN'))

In [None]:
# replace the pickup/dropoff location ids with the four binary location attributes
train_sdf = add_location_attribute(train_sdf).drop("PULocationID", "DOLocationID")
test_sdf = add_location_attribute(test_sdf).drop("PULocationID", "DOLocationID")

train_sdf.printSchema()

### Categorical Variables
- ANOVA test  
The normality assumption for ANOVA states that the distribution of Y within each group is normally distributed. Hence, log transformation is applied to trip time before doing ANOVA tests.  

- Line plot   
The median trip time of the whole training data is computed and used to demonstrate the relationship between day of week and trip time.

In [None]:
# Log-transform trip time on a copy of the sample. Overwriting
# sample_df['trip_time'] in place would apply the log a second time whenever
# this cell is re-run, silently changing the ANOVA result.
anova_df = sample_df.assign(trip_time=np.log(sample_df['trip_time']))

# fit an OLS model with the two discrete attributes
ols_model = ols(
    formula="trip_time ~ C(day_of_week) + C(congestion_zone)",
    data=anova_df
).fit()

# display the type-2 ANOVA table of the fitted model
table = sm.stats.anova_lm(ols_model, typ=2)
# NOTE: set_option is global, so every DataFrame displayed after this cell
# also uses scientific notation.
pd.set_option('display.float_format', '{:.2e}'.format)
table

In [None]:
# approximate median trip time for each day of the week (full training data)
median_by_day = F.percentile_approx("trip_time", 0.5).alias("median_trip_time")
trip_time_vs_day = train_sdf.groupBy("day_of_week").agg(median_by_day).toPandas()

# Spark returns the groups in no particular order: arrange them Monday to Sunday
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
trip_time_vs_day = (
    trip_time_vs_day
    .set_index('day_of_week')
    .reindex(order)
    .reset_index()
)

In [None]:
# median trip time against weekday, drawn as a line plot
ax = sns.lineplot(data=trip_time_vs_day, x="day_of_week", y="median_trip_time")
ax.set_title("Median Trip Duration in Different Days of a Week", fontsize=14)
ax.set_xlabel("")  # the weekday names on the ticks are self-explanatory
ax.set_ylabel("Median trip duration (s)", fontsize=13)
plt.xticks(rotation=30, fontsize=12)

plt.savefig("../plots/median_trip_time_vs_day.png", bbox_inches='tight')
plt.show()

### Continuous Variables
- Correlation heatmap   
Pearson correlations between trip time and all continuous attributes are calculated and plotted as a heatmap.

- Line plot  
The median trip time in different hours of a day is computed and used to demonstrate the relationship between hour of day and trip time.

In [None]:
# compute the correlation matrix of the continuous attributes on the full training set
CORR_COLS = [
    'trip_time', 'trip_miles', 'Temperature (F)', 'Dew Point (F)', 'hour_of_day',
    'Humidity (%)', 'Wind Speed (mph)', 'Pressure (in)', 'Precipitation (in)',
]

# pack the listed columns into a single vector column
features = "correlation_features"
assembler = VectorAssembler(inputCols=CORR_COLS, outputCol=features)

# NOTE(review): dropna('any') has no subset, so rows with a null in ANY column
# (not only CORR_COLS) are excluded - confirm this is intended.
complete_rows = train_sdf.dropna('any')
feature_vector = assembler.transform(complete_rows).select(features)

# Correlation.corr yields a one-row DataFrame whose single cell is the matrix
corr_matrix_dense = Correlation.corr(feature_vector, features)
corr_matrix = corr_matrix_dense.head()[0].toArray().tolist()

df_corr = pd.DataFrame(corr_matrix, index=CORR_COLS, columns=CORR_COLS)
df_corr

In [None]:
# annotated heatmap of the correlation matrix, with readable axis labels
plt.figure(figsize=(10, 6))
sns.set(font_scale=1.1)  # global seaborn setting: also affects later plots
labels = [
    'Trip time', 'Trip miles', 'Temperature', 'Dew point', 'Hour of Day',
    'Humidity', 'Wind speed', 'Pressure ', 'Precipitation',
]
ax = sns.heatmap(df_corr, annot=True, xticklabels=labels, yticklabels=labels)
ax.set_title('Pearson Correlation Metric For Continuous Variables', fontsize=15)

plt.savefig("../plots/corr_matrix.png", bbox_inches='tight')
plt.show()

In [None]:
# approximate median trip time for each hour of the day (full training data)
median_by_hour = F.percentile_approx("trip_time", 0.5).alias("median_trip_time")
trip_time_vs_hour = train_sdf.groupBy("hour_of_day").agg(median_by_hour).toPandas()

# median trip time against hour of day, with a tick every three hours
ax = sns.lineplot(data=trip_time_vs_hour, x="hour_of_day", y="median_trip_time")
ax.set_title("Median Trip Duration in Different Hours of a Day", fontsize=14)
ax.set_xlabel("Hour of day", fontsize=13)
plt.xticks(np.arange(0, 24, 3))
ax.set_ylabel("Median trip duration (s)", fontsize=13)

plt.savefig("../plots/median_trip_time_vs_hour.png", bbox_inches='tight')
plt.show()

### Feature Selection 
Based on the feature correlations and the analysis above, we drop the attributes that contribute little to explaining the response variable "trip time".  

The following attributes are retained:  
- Discrete attribute  
    - JFK trip
    - Newark trip
    - LaGuardia trip
    - Manhattan trip
    - day of week
    - congestion zone
- Continuous attribute
    - trip miles
    - temperature
    - hour of day

In [None]:
# retained attributes, with the response variable "trip_time" listed last
chosen_attr = [
    'trip_miles', 'day_of_week', 'hour_of_day',
    'JFK_trip', 'Newark_trip', 'LaGuardia_trip', 'Manhattan_trip',
    'congestion_zone', 'Temperature (F)', 'trip_time',
]

# keep the same columns, in the same order, in both splits
train_sdf = train_sdf.select(*chosen_attr)
test_sdf = test_sdf.select(*chosen_attr)

# show one record vertically as a sanity check
train_sdf.show(1, vertical=True, truncate=100)

In [None]:
# write the processed splits to parquet, replacing any previous output
train_sdf.write.parquet('../data/curated/new_train', mode='overwrite')
test_sdf.write.parquet('../data/curated/new_test', mode='overwrite')