In [1]:
import math
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the training data from the relative input directory into the working frame used throughout the notebook
df = pd.read_csv('../input/train.csv')

In [3]:
# Preview the first five rows to check the available columns
df.head(5)

## Taxi Usage

We will begin by looking at the total NYC Yellow Taxi Cab usage over time in 2016. To do this we need to break up the datetime value provided into month, day of week and hour of day columns for more in-depth analysis. I will be focusing on the pickup_datetime for this, as the pickup and dropoff times should occur within the same hour the majority of the time, although we will inspect this in more detail later in the analysis.

In [4]:
# Convert the date to a pandas datetime format
# NOTE(review): the format string uses '/' as the date separator - confirm this
# matches the timestamps in train.csv, since strict parsing raises on a mismatch.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format="%Y/%m/%d %H:%M:%S")
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'], format="%Y/%m/%d %H:%M:%S")

# Pull out the month, day of week and hour of day and make a new feature for each
df['month'] = df['pickup_datetime'].dt.month
df['dow'] = df['pickup_datetime'].dt.dayofweek
df['hour'] = df['pickup_datetime'].dt.hour

# Count number of pickups made per month, day of week and hour of day.
# Series.value_counts() replaces the deprecated top-level pd.value_counts();
# sort_index() orders the counts chronologically instead of by frequency.
month_usage = df['month'].value_counts().sort_index()
dow_usage = df['dow'].value_counts().sort_index()
hour_usage = df['hour'].value_counts().sort_index()

In [5]:
# Set custom xtick labels
x_tick_labels_month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
x_tick_labels_day = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Build the layout explicitly: two panels on the top row and one full-width
# panel underneath. Creating a 2x2 grid first and then placing a (2, 1, 2)
# subplot over it leaves the unused bottom axes behind on current matplotlib.
fig = plt.figure(figsize=(19, 15))
usage_panels = [
    # (axes, counts, bar colour, title, x label, custom tick labels or None)
    (fig.add_subplot(2, 2, 1), month_usage, 'b', 'Pickups over Month of Year', 'Month', x_tick_labels_month),
    (fig.add_subplot(2, 2, 2), dow_usage, 'r', 'Pickups over Day of Week', 'Day of Week', x_tick_labels_day),
    (fig.add_subplot(2, 1, 2), hour_usage, 'g', 'Pickups over Hour of Day', 'Hour of Day', None),
]

# Axes are held in local names rather than a variable called `figure`, which
# would collide with the bokeh `figure` factory imported later in the notebook.
for panel_ax, counts, bar_color, title, xlabel, tick_labels in usage_panels:
    counts.plot.bar(alpha=0.5, color=bar_color, ax=panel_ax, rot=0)
    panel_ax.set_title(title, fontsize=20)
    panel_ax.set_xlabel(xlabel, fontsize=18)
    panel_ax.set_ylabel('Count', fontsize=18)
    if tick_labels is not None:
        # Bars are drawn at positions 0..n-1 regardless of the index values
        panel_ax.set_xticks(range(len(counts)))
        panel_ax.set_xticklabels(tick_labels, rotation=0, fontsize=18)
    panel_ax.tick_params(axis='y', labelsize=18)

fig.tight_layout()
# print the total number of Taxi pickups
print ("There were a total of %d Taxi pickups made" % (len(df)))

As we can see, the training dataset does not contain all entries for 2016; instead we get 1458644 trips occurring between January and June. Taxi pickups peak in March, although usage is fairly consistent across all six months. Usage is lowest in January, which was surprising to me given New Year's Eve after-parties and the fact that it is particularly cold in NYC around this time, which I thought might deter people from walking distances they might otherwise walk in the summer months.

Exploring pickups over the day of the week shows an increased demand towards the end of the week, Thursday, Friday and Saturday. Monday is the least busy day of the week, closely followed by Sunday.

Looking further we can observe that 18:00 and 19:00 are peak times in terms of pickup demand, whilst there is a substantial drop off in demand after midnight.

### Pickup Locations

We can plot this data to see whether the **pickup** location varies with hour of day. To do this I will be using the datashader package which has some very nice tutorial notebooks, one of which deals with New York Taxi data much like this data set, so I will be adapting it for the use cases below. You can find more about it here: https://github.com/bokeh/datashader . It is very useful for plotting large amounts of data.

In [6]:
# First we have to convert the GPS data into meters west and north of Greenwich (Web Mercator format)

def lonlat_to_meters(df, lon_name, lat_name):
    """Project a longitude/latitude column pair to Web Mercator metres, in place.

    Parameters
    ----------
    df : DataFrame holding the two coordinate columns (values in degrees).
    lon_name : name of the longitude column; overwritten with the x value.
    lat_name : name of the latitude column; overwritten with the y value.

    Returns
    -------
    None. The two columns of ``df`` are replaced with the projected values.
    """
    latitude = df[lat_name]
    longitude = df[lon_name]

    # Half of the circumference of a sphere of radius 6378137 m
    half_circumference = 2 * np.pi * 6378137 / 2.0

    easting = longitude * half_circumference / 180.0
    northing = (
        np.log(np.tan((90 + latitude) * np.pi / 360.0)) / (np.pi / 180.0)
        * half_circumference / 180.0
    )

    df.loc[:, lon_name] = easting
    df.loc[:, lat_name] = northing
    
# Overwrite the pickup and dropoff coordinate columns of df in place with their projected values.
# Re-running this cell applies the conversion a second time to the already-converted columns.
lonlat_to_meters(df, 'pickup_longitude', 'pickup_latitude')
lonlat_to_meters(df, 'dropoff_longitude', 'dropoff_latitude')

In [7]:
from bokeh.plotting import figure, output_notebook, show
from datashader.bokeh_ext import InteractiveImage

# Direct bokeh output to the notebook
output_notebook()

# Plot extent shared by the bokeh figure and the datashader canvases below.
# Presumably in the same projected metre coordinates as the converted columns - TODO confirm.
x_range=(-8250000,-8210000)
y_range=(4965000,4990000)

def base_plot(tools='pan,wheel_zoom,reset',plot_width=900, plot_height=600, **plot_args):
    """Create a borderless bokeh figure over the module-level plot extent.

    Parameters
    ----------
    tools : toolbar specification passed to ``figure``.
    plot_width, plot_height : figure size passed to ``figure``.
    **plot_args : any further keyword arguments forwarded to ``figure``.

    Returns
    -------
    The figure, with its axes hidden and both grids' line colour set to None.
    """
    plot = figure(
        tools=tools,
        plot_width=plot_width,
        plot_height=plot_height,
        x_range=x_range,
        y_range=y_range,
        outline_line_color=None,
        min_border=0,
        min_border_left=0,
        min_border_right=0,
        min_border_top=0,
        min_border_bottom=0,
        **plot_args
    )

    # Strip the decorations so only the rendered image is visible
    plot.axis.visible = False
    for grid in (plot.xgrid, plot.ygrid):
        grid.grid_line_color = None
    return plot
    
# Marker styling options (not referenced elsewhere in the visible notebook)
options = dict(line_color=None, fill_color='blue', size=5)

# Widen the notebook container so the large plots fit.
# `display` is imported from IPython.display: importing it from
# IPython.core.display is deprecated.
from IPython.display import HTML, display
display(HTML("<style>.container { width:90% !important; }</style>"))

In [8]:
import datashader as ds
from datashader import transfer_functions as tf

In [9]:
# Aggregate the pickup points onto an 800x500 canvas covering the fixed plot extent, then shade the aggregate.
# NOTE(review): img is assigned but not displayed by this cell.
cvs = ds.Canvas(plot_width=800, plot_height=500, x_range=x_range, y_range=y_range)
agg = cvs.points(df, 'pickup_longitude', 'pickup_latitude')
img = tf.shade(agg)

### Plot showing taxi pickup locations, colour coded by time
The colour scheme is as follows: red (midnight), yellow (4am), green (8am), cyan (noon), blue (4pm), purple (8pm).

In [10]:
from datashader.bokeh_ext import InteractiveImage
# Convert the pickup hour to a categorical value so it can be used with ds.count_cat below
df['EachHour'] = pd.to_datetime(df['pickup_datetime']).dt.hour.astype('category')

# Create a list of colours from red (midnight), yellow (4am), green (8am), cyan (noon), blue (4pm), purple (8pm)
# 24 entries, one per hour of the day
colors = ["#FF0000","#FF3F00","#FF7F00","#FFBF00","#FFFF00","#BFFF00","#7FFF00","#3FFF00",
          "#00FF00","#00FF3F","#00FF7F","#00FFBF","#00FFFF","#00BFFF","#007FFF","#003FFF",
          "#0000FF","#3F00FF","#7F00FF","#BF00FF","#FF00FF","#FF00BF","#FF007F","#FF003F",]

def colorized_images(x_range, y_range, w, h):
    """Render pickup locations coloured by pickup hour for the requested view.

    Parameters
    ----------
    x_range, y_range : extent of the view to render.
    w, h : width and height of the canvas in pixels.

    Returns
    -------
    The shaded image after ``tf.dynspread`` has been applied.
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    # One count per 'EachHour' category for every pixel
    hourly_counts = canvas.points(
        df, 'pickup_longitude', 'pickup_latitude', ds.count_cat('EachHour')
    )
    shaded = tf.shade(hourly_counts, color_key=colors)
    return tf.dynspread(shaded, threshold=0.5, max_px=4)

# Build a black-background figure at 1.5x the default size and hand it to InteractiveImage together with the rendering callback.
# NOTE(review): `responsive` is forwarded to bokeh's figure() through **plot_args - confirm the installed bokeh version accepts it.
p = base_plot(background_fill_color="black", responsive=True, plot_width=int(900*1.5), plot_height=int(600*1.5))
InteractiveImage(p, colorized_images)

This plot is interactive and you can click the side tool-bar to enable scroll-zooming.

We can see that the majority of pickups in mid-town are between 4-8 pm whilst further down towards the south pickups are later in the evening and toward midnight. This shows where people are going from, but does not show where people are going to, we will now produce the same plot for the **dropoffs**.

### Plot showing taxi dropoff locations, colour coded by time
The colour scheme is as follows: red (midnight), yellow (4am), green (8am), cyan (noon), blue (4pm), purple (8pm).

In [11]:
# Convert the dropoff hour to a categorical value so it can be used with ds.count_cat below
df['EachHourD'] = pd.to_datetime(df['dropoff_datetime']).dt.hour.astype('category')

# Create a list of colours from red (midnight), yellow (4am), green (8am), cyan (noon), blue (4pm), purple (8pm)
# 24 entries, one per hour of the day
colors = ["#FF0000","#FF3F00","#FF7F00","#FFBF00","#FFFF00","#BFFF00","#7FFF00","#3FFF00",
          "#00FF00","#00FF3F","#00FF7F","#00FFBF","#00FFFF","#00BFFF","#007FFF","#003FFF",
          "#0000FF","#3F00FF","#7F00FF","#BF00FF","#FF00FF","#FF00BF","#FF007F","#FF003F",]

def colorized_images(x_range, y_range, w, h):
    """Render dropoff locations coloured by dropoff hour for the requested view.

    NOTE(review): this name is also defined in an earlier cell; running this
    cell rebinds it to the dropoff version.

    Parameters
    ----------
    x_range, y_range : extent of the view to render.
    w, h : width and height of the canvas in pixels.

    Returns
    -------
    The shaded image after ``tf.dynspread`` has been applied.
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    # One count per 'EachHourD' category for every pixel
    hourly_counts = canvas.points(
        df, 'dropoff_longitude', 'dropoff_latitude', ds.count_cat('EachHourD')
    )
    shaded = tf.shade(hourly_counts, color_key=colors)
    return tf.dynspread(shaded, threshold=0.5, max_px=4)

# Build a black-background figure at 1.5x the default size and hand it to InteractiveImage together with the rendering callback.
# NOTE(review): `responsive` is forwarded to bokeh's figure() through **plot_args - confirm the installed bokeh version accepts it.
p = base_plot(background_fill_color="black", responsive=True, plot_width=int(900*1.5), plot_height=int(600*1.5))
InteractiveImage(p, colorized_images)

From this plot we can see that people are being dropped off just south of Central Park or at the Southern tip of Manhattan early in the morning between 4am and 8am. We also see a lot more points plotted outside of Manhattan indicating that people may live outside and be commuting from the city in the evening (purple/red colours).

## Taxi Vendors

The data set provides a vendor_id which associates the taxi ride with a specific provider. We can look to see whether the number of pickups varies across providers and whether server access varies across providers. The store_and_fwd_flag indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server.

In [12]:
# Count number of pickups made per vendor, over month, day of week and hour of day
# (rows: time bucket, columns: vendor_id)
month_vendor = df.groupby(['month', 'vendor_id']).size().unstack()
dow_vendor = df.groupby(['dow', 'vendor_id']).size().unstack()
hour_vendor = df.groupby(['hour', 'vendor_id']).size().unstack()

# Count trips per store_and_fwd_flag value for each vendor (rows: flag value, columns: vendor_id)
server = df.groupby(['store_and_fwd_flag', 'vendor_id']).size().unstack()

In [13]:
# Set custom xtick labels
x_tick_labels_month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
x_tick_labels_day = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(9, 8))

vendor_panels = [
    # (counts, title, x label, custom tick labels or None, show legend)
    (month_vendor, 'Pickups over Month of Year', 'Month', x_tick_labels_month, False),
    (dow_vendor, 'Pickups over Day of Week', 'Day of Week', x_tick_labels_day, False),
    (hour_vendor, 'Pickups over Hour of Day', 'Hour of Day', None, False),
    (server, 'Vehicle Server Access', ' ', None, True),
]

# Axes are held in local names rather than a variable called `figure`:
# rebinding `figure` would shadow the bokeh factory that base_plot() calls,
# so re-running the interactive cells afterwards would fail.
for panel_ax, (counts, title, xlabel, tick_labels, show_legend) in zip(axes.ravel(), vendor_panels):
    counts.plot.bar(stacked=True, colormap='coolwarm', alpha=0.7,
                    ax=panel_ax, legend=show_legend, rot=0)
    panel_ax.set_title(title, fontsize=13)
    panel_ax.set_xlabel(xlabel, fontsize=12)
    panel_ax.set_ylabel('Count', fontsize=12)
    if tick_labels is not None:
        # Bars are drawn at positions 0..n-1; take them from the plotted frame
        # itself instead of relying on counts computed in another cell.
        panel_ax.set_xticks(range(len(counts)))
        panel_ax.set_xticklabels(tick_labels, rotation=0, fontsize=12)

fig.tight_layout()

We can see that the two providers handle approximately 50 % of the customers each, across all times of the month, week and day. We can also see that only vendor 1 has cars with server access and that this access was only available for a small number of taxi runs provided by vendor 1.

## Multiple Passengers

The data contains a passenger count, so it might be interesting to observe how the number of concurrent passengers changes over time. For instance, perhaps passengers are more likely to share on a Friday or Saturday evening when groups may be travelling together to bars or clubs.

In [14]:
print ("Max passengers at once: %d" % df['passenger_count'].max())
# Two decimals for the mean: %d would silently truncate it to a whole number
print ("Average passengers at once: %.2f" % df['passenger_count'].mean())

In [15]:
# Count number of pickups made per passenger count, over month, day of week and hour of day
# (rows: time bucket, columns: passenger_count)
month_pass = df.groupby(['month', 'passenger_count']).size().unstack()
dow_pass = df.groupby(['dow', 'passenger_count']).size().unstack()
hour_pass = df.groupby(['hour', 'passenger_count']).size().unstack()

In [16]:
# Set custom xtick labels
x_tick_labels_month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
x_tick_labels_day = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Build the layout explicitly: two panels on the top row and one full-width
# panel underneath. Creating a 2x2 grid first and then placing a (2, 1, 2)
# subplot over it leaves the unused bottom axes behind on current matplotlib.
fig = plt.figure(figsize=(19, 15))
ax_month = fig.add_subplot(2, 2, 1)
ax_dow = fig.add_subplot(2, 2, 2)
ax_hour = fig.add_subplot(2, 1, 2)

passenger_panels = [
    # (axes, counts, title, x label, custom tick labels or None)
    (ax_month, month_pass, 'Multi-passenger # over Month of Year', 'Month', x_tick_labels_month),
    (ax_dow, dow_pass, 'Multi-passenger # over Day of Week', 'Day of Week', x_tick_labels_day),
    (ax_hour, hour_pass, 'Multi-passenger # over Hour of Day', 'Hour of Day', None),
]

# Axes are held in local names rather than a variable called `figure`, which
# would shadow the bokeh `figure` factory used by base_plot().
for panel_ax, counts, title, xlabel, tick_labels in passenger_panels:
    counts.plot.bar(stacked=True, colormap='coolwarm', alpha=0.8,
                    ax=panel_ax, legend=False, rot=0)
    panel_ax.set_title(title, fontsize=20)
    panel_ax.set_xlabel(xlabel, fontsize=18)
    panel_ax.set_ylabel('Count', fontsize=18)
    if tick_labels is not None:
        # Bars are drawn at positions 0..n-1 regardless of the index values
        panel_ax.set_xticks(range(len(counts)))
        panel_ax.set_xticklabels(tick_labels, rotation=0)
    panel_ax.tick_params(axis='both', labelsize=18)

# A single legend on the wide bottom panel serves all three plots
ax_hour.legend(loc="upper left")

fig.tight_layout()

Here we see that the maximum number of passengers in one vehicle was 9, whilst the average was 1. We also see that passenger counts over 6 or 7 were very rare and the vast majority of taxi rides carried 1 or 2 people. This distribution was fairly consistent across months and days of the week and hour of day.

## Journey Distance

We can use the Haversine formula\* to approximate the journey distance from the pickup co-ordinates to the drop-off co-ordinates. This is an approximation because it is 'as the crow flies' and does not represent the actual distance a taxi cab might have to travel while obeying traffic laws and navigating the grid system of roads. We can then investigate how journey distance changes over time.

[\*] credit: https://gist.github.com/rochacbruno/2883505 & https://nathanrooy.github.io/posts/2016-09-07/haversine-with-python/

In [17]:
# Define Haversine formula

# 3959 # radius of the great circle in miles...some algorithms use 3956
# 6371 # radius in kilometers...some algorithms use 6367
# 3959 * 5280 # radius in feet
# 6371 * 1000 # radius in meters

def distance(origin, destination, radius=6371):
    """Great-circle distance between two points using the Haversine formula.

    Parameters
    ----------
    origin : (lat, lon) pair in decimal degrees.
    destination : (lat, lon) pair in decimal degrees.
    radius : sphere radius; the result is in the same unit. Defaults to
        6371 (kilometres); see the table above for other units.

    Returns
    -------
    float : distance along the surface of the sphere.
    """
    lat1, lon1 = origin
    lat2, lon2 = destination

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat / 2) * math.sin(dlat / 2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon / 2) * math.sin(dlon / 2)

    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise. Explicit comparisons are
    # used so that NaN inputs still propagate to the result.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

In [18]:
# Read in original data again as lat lon had previously been overwritten for datashader plots
df1 = pd.read_csv('../input/train.csv')

# Haversine great-circle distance for every trip at once. The arithmetic is
# done on whole numpy arrays instead of looping over rows with iterrows().
earth_radius_km = 6371

pickup_lat = np.radians(df1['pickup_latitude'].values)
dropoff_lat = np.radians(df1['dropoff_latitude'].values)
dlat = dropoff_lat - pickup_lat
dlon = np.radians(df1['dropoff_longitude'].values - df1['pickup_longitude'].values)

a = np.sin(dlat / 2) ** 2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon / 2) ** 2
# Guard against rounding pushing `a` marginally outside [0, 1] (NaN is preserved)
a = np.clip(a, 0.0, 1.0)
dist = earth_radius_km * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# df and df1 are read from the same file, so the values are attached by row position
assert len(dist) == len(df), "df and df1 must have the same number of rows"
df['distance (km)'] = dist

In [19]:
# Two decimals: %d would silently truncate the values to whole kilometres
print ("Max distance (km): %.2f" % df['distance (km)'].max())
print ("Average distance (km): %.2f" % df['distance (km)'].mean())

In [20]:
# Calculate average journey distance for month, day of week and hour of day
# (computed on the unfiltered data, so extreme distances are included in the means)
month_distance = df.groupby('month')['distance (km)'].mean()
dow_distance = df.groupby('dow')['distance (km)'].mean()
hour_distance = df.groupby('hour')['distance (km)'].mean()

In [21]:
# Keep only journeys of at least 1 km and under 25 km (used for the histogram only)
filtered = df[(df['distance (km)'] >= 1) & (df['distance (km)'] < 25)]

In [22]:
# Set custom xtick labels
x_tick_labels_month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
x_tick_labels_day = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(9, 8))
# Row-major order: histogram top-left, then month, day of week and hour.
# Axes are held in local names rather than a variable called `figure`, which
# would shadow the bokeh `figure` factory used by base_plot().
ax_hist, ax_month, ax_dow, ax_hour = axes.ravel()

# Histogram of the filtered distances. Drawn with matplotlib directly because
# sns.distplot is deprecated; alpha matches distplot's default bar opacity.
ax_hist.hist(filtered['distance (km)'].dropna(), bins=30, color='m', alpha=0.4)
# Title matches the 25 km upper bound used to build `filtered`
ax_hist.set_title('Frequency of Journey Distances < 25km', fontsize=13)
ax_hist.set_xlabel('Distance (km)', fontsize=12)
ax_hist.set_ylabel('Count', fontsize=12)

distance_panels = [
    # (axes, means, bar colour, title, x label, custom tick labels or None)
    (ax_month, month_distance, 'b', 'Average Journey Distance over Month', 'Month', x_tick_labels_month),
    (ax_dow, dow_distance, 'r', 'Average Journey Distance over Day of the Week', 'Day of Week', x_tick_labels_day),
    (ax_hour, hour_distance, 'g', 'Average Journey Distance over Hour of Day', 'Hour', None),
]
for panel_ax, means, bar_color, title, xlabel, tick_labels in distance_panels:
    means.plot.bar(alpha=0.5, color=bar_color, ax=panel_ax, rot=0)
    panel_ax.set_title(title, fontsize=13)
    panel_ax.set_xlabel(xlabel, fontsize=12)
    panel_ax.set_ylabel('Average Journey Distance (km)', fontsize=12)
    if tick_labels is not None:
        # Bars are drawn at positions 0..n-1 regardless of the index values
        panel_ax.set_xticks(range(len(means)))
        panel_ax.set_xticklabels(tick_labels, rotation=0, fontsize=12)

fig.tight_layout()

# could add plot against vendor_id here as an extra

We can see that there have either been some very extravagant taxi trips, or there has been a data entry error as the maximum journey distance is recorded as 1240 kilometers. The length of Manhattan is 21.6 kilometers, so to put this in perspective, this ride could have taken you as far as Nashville Tennessee.

Average trip distance is pretty stable across month and day of week. The longest average distances occur early morning each day at 05:00 and 06:00, perhaps when people are commuting in to work.

## Journey Duration

Here we will do a similar time based analysis on the duration of each journey. Then, having already calculated an approximate journey distance we will then calculate an average speed for the journey, perhaps giving insight into when traffic is good or bad in the city.

In [23]:
# Average journey duration per month, day of week and hour of day,
# divided by 60 to express each mean in minutes
month_duration = df.groupby('month')['trip_duration'].mean() / 60
dow_duration = df.groupby('dow')['trip_duration'].mean() / 60
hour_duration = df.groupby('hour')['trip_duration'].mean() / 60

In [24]:
# Two decimals: %d would silently truncate the values to whole minutes
print ("Max duration (mins): %.2f" % (df['trip_duration'].max() / 60))
print ("Average duration (mins): %.2f" % (df['trip_duration'].mean() / 60))

In [25]:
# Trip durations converted to minutes (a Series, not a DataFrame)
filtered2 = df['trip_duration'] / 60

# Keep only journeys of at least 1 minute and under 60 minutes (used for the histogram only)
filtered2 = filtered2[(filtered2 >= 1) & (filtered2 < 60)] 

In [26]:
# Set custom xtick labels
x_tick_labels_month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
x_tick_labels_day = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(9, 8))
# Row-major order: histogram top-left, then month, day of week and hour.
# Axes are held in local names rather than a variable called `figure`, which
# would shadow the bokeh `figure` factory used by base_plot().
ax_hist, ax_month, ax_dow, ax_hour = axes.ravel()

# Histogram of the filtered durations. Drawn with matplotlib directly because
# sns.distplot is deprecated; alpha matches distplot's default bar opacity.
ax_hist.hist(filtered2.dropna(), bins=30, color='m', alpha=0.4)
ax_hist.set_title('Frequency of Journey Durations < 60 mins', fontsize=13)
ax_hist.set_xlabel('Average Duration (mins)', fontsize=12)
ax_hist.set_ylabel('Count', fontsize=12)

duration_panels = [
    # (axes, means, bar colour, title, x label, custom tick labels or None)
    (ax_month, month_duration, 'b', 'Average Journey Duration over Month', 'Month', x_tick_labels_month),
    (ax_dow, dow_duration, 'r', 'Average Journey Duration over Day of the Week', 'Day of Week', x_tick_labels_day),
    (ax_hour, hour_duration, 'g', 'Average Journey Duration over Hour of Day', 'Hour', None),
]
for panel_ax, means, bar_color, title, xlabel, tick_labels in duration_panels:
    means.plot.bar(alpha=0.5, color=bar_color, ax=panel_ax, rot=0)
    panel_ax.set_title(title, fontsize=13)
    panel_ax.set_xlabel(xlabel, fontsize=12)
    panel_ax.set_ylabel('Average Journey Duration (mins)', fontsize=12)
    if tick_labels is not None:
        # Bars are drawn at positions 0..n-1 regardless of the index values
        panel_ax.set_xticks(range(len(means)))
        panel_ax.set_xticklabels(tick_labels, rotation=0, fontsize=12)

fig.tight_layout()

# could add plot against vendor_id here as an extra

Again we see that there may be either a data entry error or a very long taxi ride as the max recorded journey duration is 58771 minutes or 1.34 months. 

### Journey Speed

In [27]:
# Average speed per trip: distance divided by the duration in minutes, scaled to an hourly rate.
# NOTE(review): a trip_duration of 0 would give an infinite or NaN speed - confirm whether such rows exist.
df['speed (km/h)'] = df['distance (km)'] / (df['trip_duration'] / 60) * 60 # multiply 60 to get km per hour from km per min

In [28]:
# Two decimals: %d would silently truncate the values, and it raises if the
# maximum is infinite, whereas %.2f prints 'inf'
print ("Max speed (km/h): %.2f" % df['speed (km/h)'].max())
print ("Average speed (km/h): %.2f" % df['speed (km/h)'].mean())

In [29]:
# Calculate average journey speed for month, day of week and hour of day
month_speed = df.groupby('month')['speed (km/h)'].mean()
dow_speed = df.groupby('dow')['speed (km/h)'].mean()
hour_speed = df.groupby('hour')['speed (km/h)'].mean()

# Keep only speeds of at least 1 km/h and under 50 km/h (used for the histogram only)
filtered3 = df['speed (km/h)']
filtered3 = filtered3[(filtered3 >= 1) & (filtered3 < 50)] 

In [30]:
# Set custom xtick labels
x_tick_labels_month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
x_tick_labels_day = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(9, 8))
# Row-major order: histogram top-left, then month, day of week and hour.
# Axes are held in local names rather than a variable called `figure`, which
# would shadow the bokeh `figure` factory used by base_plot().
ax_hist, ax_month, ax_dow, ax_hour = axes.ravel()

# Histogram of the filtered speeds. Drawn with matplotlib directly because
# sns.distplot is deprecated; alpha matches distplot's default bar opacity.
ax_hist.hist(filtered3.dropna(), bins=30, color='m', alpha=0.4)
ax_hist.set_title('Frequency of Journey Speeds < 50 km/h', fontsize=13)
ax_hist.set_xlabel('Average Speed (km/h)', fontsize=12)
ax_hist.set_ylabel('Count', fontsize=12)

speed_panels = [
    # (axes, means, bar colour, title, x label, custom tick labels or None)
    (ax_month, month_speed, 'b', 'Average Journey Speed over Month of Year', 'Month', x_tick_labels_month),
    (ax_dow, dow_speed, 'r', 'Average Journey Speed over Day of the Week', 'Day of Week', x_tick_labels_day),
    (ax_hour, hour_speed, 'g', 'Average Journey Speed over Hour of Day', 'Hour', None),
]
for panel_ax, means, bar_color, title, xlabel, tick_labels in speed_panels:
    means.plot.bar(alpha=0.5, color=bar_color, ax=panel_ax, rot=0)
    panel_ax.set_title(title, fontsize=13)
    panel_ax.set_xlabel(xlabel, fontsize=12)
    panel_ax.set_ylabel('Average Journey Speed (km/h)', fontsize=12)
    if tick_labels is not None:
        # Bars are drawn at positions 0..n-1 regardless of the index values
        panel_ax.set_xticks(range(len(means)))
        panel_ax.set_xticklabels(tick_labels, rotation=0, fontsize=12)

fig.tight_layout()

Now we can finally see that it is likely to be a data entry error, as the max speed is recorded at 9274 km per hour. However, we can see that the average journey speed is 14 km per hour. Trips get slower during mid-week but are much faster on Sundays. Also, trips in the early morning are faster than during the day; trips at 05:00 are considerably faster, which makes sense as there should be less traffic. We can answer the crucial question, 'When is the fastest time to take a cab?'.... Sunday - 05:00.