The goal of this project is to analyze Uber ride data using several of the data visualization frameworks available for Python. The dataset contains Uber pickups in New York City from April to September 2014. The data was gathered by FiveThirtyEight and is available on Kaggle.
The variables in the dataset are:
name
description
Date/Time
The date and hour of Uber pickup
Lat
The Latitude of the Uber pickup
Lon
The longitude of the Uber pickup
Base
The TLC (Taxi & Limousine Commission) base company code affiliated with the Uber pickup
import pandas as pd
# Read the six monthly pickup files (April through September 2014) and stack
# them into a single frame. The per-month frames are never bound to names,
# so there is nothing left over to delete afterwards.
data = pd.concat([
    pd.read_csv(csv_path)
    for csv_path in (
        './data/uber-raw-data-apr14.csv',
        './data/uber-raw-data-may14.csv',
        './data/uber-raw-data-jun14.csv',
        './data/uber-raw-data-jul14.csv',
        './data/uber-raw-data-aug14.csv',
        './data/uber-raw-data-sep14.csv',
    )
])

# Parse the pickup timestamp strings into real datetimes.
data['Date/Time'] = pd.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')

# Break the timestamp out into one column per calendar/clock component; each
# column is named after the `.dt` accessor attribute it is read from.
for part in ('day', 'month', 'year', 'day_of_week', 'hour', 'minute', 'second'):
    data[part] = getattr(data['Date/Time'].dt, part)

data.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
Date/Time
Lat
Lon
Base
day
month
year
day_of_week
hour
minute
second
0
2014-04-01 00:11:00
40.7690
-73.9549
B02512
1
4
2014
1
0
11
0
1
2014-04-01 00:17:00
40.7267
-74.0345
B02512
1
4
2014
1
0
17
0
2
2014-04-01 00:21:00
40.7316
-73.9873
B02512
1
4
2014
1
0
21
0
3
2014-04-01 00:28:00
40.7588
-73.9776
B02512
1
4
2014
1
0
28
0
4
2014-04-01 00:33:00
40.7594
-73.9722
B02512
1
4
2014
1
0
33
0
# Lookup tables turning the numeric codes stored in the `day_of_week` and
# `month` columns into display names. Weekdays are numbered from 0 (Monday);
# months cover April (4) through September (9).
days_names = dict(enumerate(
    ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
))
months_names = dict(zip(
    range(4, 10),
    ('April', 'May', 'June', 'July', 'August', 'September'),
))
Trips by hour of the day
# Count pickups per hour of day into a one-column frame named 'Total',
# indexed by hour.
hour_data = data.groupby('hour')['hour'].count().to_frame().rename(columns={'hour': 'Total'})

import matplotlib.pyplot as plt

# Work with the 'Total' column as a Series so every coordinate handed to
# matplotlib below is a plain scalar rather than a one-row Series.
hourly_total = hour_data['Total']
split_hour = 15  # the chart contrasts pickups before vs. from 15:00 onwards

# Derive the annotation figures from the data instead of hard-coding them.
# The two groups are disjoint: hour 15 belongs only to the "from 15:00" side
# (the previous hard-coded figures counted hour 15 in both groups).
trips_before_split = int(hourly_total[hourly_total.index < split_hour].sum())
trips_from_split = int(hourly_total[hourly_total.index >= split_hour].sum())
peak_hour = int(hourly_total.idxmax())
peak_trips = int(hourly_total.max())
split_level = hourly_total.loc[split_hour]

fig, ax = plt.subplots(figsize=(11, 6), dpi=100)

ax.plot(hour_data, color='#09091a')
ax.set_xlim(0, 23)
ax.set_xticks(range(24))
ax.set_ylim(0, peak_trips + 5000)  # leave headroom above the tallest point
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Hour', fontsize=14, color='#222233')

plt.suptitle('Number of trips during the day', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('Most trips in 2014 took place after 15:00',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Vertical guide at the split hour, with one caption on each side of it.
ax.vlines(split_hour, ymin=0, ymax=peak_trips, color='#c0c0c8')
ax.text(15.3, split_level / 1.2,
        f'{trips_from_split:,} trips \n were recorded between\n 15:00 and 23:00 hours.',
        color='#222233')
ax.text(14.8, split_level / 3,
        f'{trips_before_split:,} trips \n were recorded between\n 00:00 and 15:00.',
        horizontalalignment='right', color='#222233')

# Mark and label the busiest hour.
ax.plot(peak_hour, peak_trips, 'o', color='#222233')
ax.text(peak_hour, peak_trips * 1.02,
        f'Pickups peak at {peak_hour:02d}:00 with {peak_trips:,} trips.')

plt.show()
# Cross-tabulate pickups with one row per hour of day and one column per
# month, then swap the month numbers for their names via months_names.
data_hour_month = pd.crosstab(
    index=data['hour'],
    columns=data['month'],
).rename(columns=months_names)
data_hour_month
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
month
April
May
June
July
August
September
hour
0
11910
13875
14514
17953
21451
24133
1
7769
8186
9167
11527
14471
16107
2
4935
5372
6189
8562
10105
10702
3
5040
5946
6937
9199
10376
10789
4
6095
6945
7701
10040
11774
12675
5
9476
10789
11955
14932
16525
20262
6
18498
21015
22030
23456
24907
33307
7
24924
27413
30834
32545
34064
43314
8
22843
25460
29771
33387
34566
44477
9
17939
20507
24298
28486
30195
38542
10
17865
20801
23584
28558
30706
37634
11
18774
22055
24155
30120
31778
38821
12
19425
23595
25233
30900
32106
39193
13
22603
27699
28937
35832
35764
45042
14
27190
34363
34428
41357
40644
52643
15
35324
43087
41586
46053
48197
61219
16
42003
49127
48162
52403
53481
68224
17
45475
51508
50452
58260
57122
73373
18
43003
48965
45013
57268
55390
75040
19
38923
42387
38203
52332
53008
69660
20
36244
40731
40108
51859
51674
63988
21
36964
42217
40791
49528
51354
60606
22
30645
35556
35614
42218
46008
51817
23
20649
24836
24182
29346
33609
36568
# Stacked bar chart: one bar per hour of day, one segment per month.
# September, the last column, gets the accent colour; earlier months are greys.
fig, ax = plt.subplots(figsize=(11, 6), dpi=100)
data_hour_month.plot(
    kind='bar',
    stacked=True,
    color=['#d9d9d9', '#999999', '#747474', '#5d5d5d', '#3f3f3f', '#1fbad6'],
    ax=ax,
)
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Hour', fontsize=14, color='#222233')
plt.legend(title='Month')

plt.suptitle('Number of trips by month and hour', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('In the month of September, more trips were registered',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Anchor the caption at the top of the 17:00 bar. The bar's height is the sum
# of that hour's monthly counts, taken here as a plain scalar from the plotted
# table rather than passing a one-row Series as the y coordinate.
bar_top_17 = int(data_hour_month.loc[17].sum())
ax.text(17, bar_top_17, 'Peak hours are \n the same every month.',
        horizontalalignment='right', color='#222233')

plt.show()
# Pickups per day-of-month (1-31), counted once and reused below.
daily_trips = data.day.value_counts()
# Mean number of pickups per day-of-month, rounded to a whole number.
trips_avg = round(daily_trips.mean(), 0)
# Days whose pickup count exceeds that mean, ordered by day number.
days_above_avg = daily_trips[daily_trips > trips_avg].to_frame().sort_index()

# One bar colour per day-of-month, in day order. Exactly 31 entries are built
# (the previous loop produced a 32nd colour that no bar could use).
palette = []
for day in range(1, 32):
    if day == 30:
        palette.append('#1fbad6')   # accent colour for day 30
    elif day == 31:
        palette.append('#d9d9d9')   # lightest grey for day 31
    elif day in days_above_avg.index:
        palette.append('#3f3f3f')   # dark grey: above-average day
    else:
        palette.append('#999999')   # mid grey: at or below average
import seaborn as sns
import matplotlib.pyplot as plt

# White figure/axes background and a 10x6 default figure size for seaborn.
sns.set(rc={'figure.figsize': (10, 6),
            'axes.facecolor': 'white',
            'figure.facecolor': 'white'})

# One bar per day-of-month, coloured by the precomputed `palette`.
ax = sns.countplot(data=data, x='day', palette=palette)
# The x axis shows the day of the month (it was previously mislabelled 'Hour').
ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('')

plt.suptitle('Number of trips by day and month', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('17 out of 31 days are above average trips.',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Dashed reference line at the average, labelled at its right-hand end.
ax.hlines(trips_avg, xmin=-0.5, xmax=31, ls='--', colors='k')
ax.text(31, trips_avg, f"Average = {int(trips_avg)}", va='center')

# Bars sit at positions 0..30, so day 31 is at x=30 and day 30 at x=29.
ax.text(30, daily_trips.loc[31], f"{daily_trips.loc[31]} trips")
ax.text(29, daily_trips.loc[30], f"{daily_trips.loc[30]} trips", color='#1fbad6', weight='bold')
ax;
Trips by week day and month
# Build a labelled copy of the data: numeric month and weekday codes are
# replaced by their names, while `data` itself keeps the numeric codes.
data2 = data.copy().replace({
    'month': months_names,
    'day_of_week': days_names,
})
import plotly.express as px

# Blue shades from dark to light, assigned to the weekdays in order.
palette = ['#0d47a1', '#1565c0', '#1976d2', '#1e88e5', '#2196f3', '#42a5f5', '#64b5f6', '#90caf9']

# Grouped histogram: months on the x axis, one bar per weekday in each group.
# category_orders must list the exact values found in data2['day_of_week'].
# The earlier list contained 'Quinta' and omitted 'Friday', so the weekdays
# were not drawn in calendar order.
px.histogram(data2, x='month', color='day_of_week', barmode='group',
             labels={'month': 'Months', 'day_of_week': 'Day of week'},
             title='Trips by week day and month',
             color_discrete_sequence=palette,
             category_orders={'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                              'Friday', 'Saturday', 'Sunday']}
             ).update_layout(yaxis_title='',
                             plot_bgcolor='rgb(255, 255, 255)')
from plotnine import ggplot
from plotnine import *
import plotnine as p9
# Total pickups per month as a two-column frame: 'month' (4-9) and 'Total'.
trips = data.groupby('month').size().reset_index(name='Total')

# Every bar uses the same blue; one palette entry per month.
palette = ('#2d9dff',) * 6
p9.options.figure_size = (10, 6)

# Horizontal bar chart of pickups per month, each bar labelled with its total
# and the numeric month ticks relabelled with month names.
(
    ggplot(trips)
    + aes(x='month', y='Total', fill='factor(month)')
    + geom_col()
    + coord_flip()
    + geom_text(aes(label='Total'), ha='right')
    + labs(y='Trips', x='Months', title='Trips by month')
    + theme_minimal()
    + theme(legend_position='none')
    + scale_x_continuous(
        breaks=[4, 5, 6, 7, 8, 9],
        labels=['April', 'May', 'June', 'July', 'August', 'September'],
    )
    + scale_fill_manual(values=palette)
)
# Total pickups per dispatching base, as a frame with columns 'Base' and 'Total'.
base_trips = data.groupby('Base').size().reset_index(name='Total')
base_trips
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
Base
Total
0
B02512
205673
1
B02598
1393113
2
B02617
1458853
3
B02682
1212789
4
B02764
263899
import altair as alt
# Horizontal bar chart of total pickups per base.
bars = (
    alt.Chart(base_trips, title='Trips by Base')
    .mark_bar()
    .encode(x=alt.X('Total'), y=alt.Y('Base'))
)

# Text layer derived from the bars: each total is drawn in white,
# right-aligned and nudged 3px to the left of the bar's end.
text = bars.mark_text(
    align='right', baseline='middle', dx=-3, color='#ffffff',
).encode(text=alt.Text('Total'))

# Overlay the labels on the bars and fix the chart height.
alt.layer(bars, text).properties(height=200)
# Cross-tabulate pickups with one row per base and one column per month,
# then swap the month numbers for their names via months_names.
month_base_trips = pd.crosstab(
    index=data['Base'],
    columns=data['month'],
).rename(columns=months_names)
month_base_trips
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
month
April
May
June
July
August
September
Base
B02512
35536
36765
32509
35021
31472
34370
B02598
183263
260549
242975
245597
220129
240600
B02617
108001
122734
184460
310160
355803
377695
B02682
227808
222883
194926
196754
173280
197138
B02764
9908
9504
8974
8589
48591
178333
from bokeh .io import show
from bokeh .models import ColumnDataSource , FactorRange
from bokeh .plotting import figure
# Nested categorical x axis: one (base, month) factor per cell of the
# base-by-month table, base-major and months in column order.
x = [
    (base_code, month_name)
    for base_code in month_base_trips.index
    for month_name in month_base_trips.columns
]
# Bar heights, looked up in the same order as the factors above.
counts = [month_base_trips.loc[factor] for factor in x]

source = ColumnDataSource(data={'x': x, 'counts': counts})

p = figure(
    x_range=FactorRange(*x),
    plot_height=350,
    title="Trips by base and month",
    toolbar_location=None,
    tools="",
)
p.vbar(x='x', top='counts', width=0.9, source=source)

# Cosmetics: bars start at zero, a little side padding, slanted tick labels
# and no vertical grid lines.
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None

show(p)
Trips by Base and day of week
# Labelled copy of the data: month and weekday codes replaced by their names.
data2 = data.copy().replace({
    'month': months_names,
    'day_of_week': days_names,
})

# Pickups per base (rows) and weekday name (columns).
base_days_week_trips = pd.crosstab(index=data2['Base'], columns=data2['day_of_week'])
base_days_week_trips
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
day_of_week
Friday
Monday
Saturday
Sunday
Thursday
Tuesday
Wednesday
Base
B02512
33319
25460
26773
20490
35032
31670
32929
B02598
229908
163542
198832
146652
235157
202378
216644
B02617
234379
176416
206554
164452
240216
214167
222669
B02682
201594
143372
170160
126511
205091
176198
189863
B02764
41939
32682
43795
32075
39649
39376
34383
import pygal
from pygal .style import LightenStyle
# Grouped bar chart rendered to SVG: bases along the x axis, one series per
# weekday, coloured with shades derived from a single base colour.
dark_lighten_style = LightenStyle('#336676')

bar_chart = pygal.Bar(style=dark_lighten_style, height=250)
bar_chart.title = 'Trips by Base and day of week'
bar_chart.x_labels = base_days_week_trips.index.values[:]

# Add the series in calendar order; the crosstab's own columns are sorted
# alphabetically, so the order is spelled out here.
weekday_order = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
for weekday in weekday_order:
    bar_chart.add(weekday, base_days_week_trips[weekday])

bar_chart.render_to_file('trips_base_week_day.svg')
Heatmap: trips by hour and day
# Pickups per hour of day (rows) and day of month (columns), in thousands.
trips = pd.crosstab(data.hour, data.day) / 1_000

import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(10, 10))
# 13 discrete blue shades over the 0-13 (thousand trips) range.
im = ax.imshow(trips, cmap=plt.get_cmap("Blues", 13), vmin=0, vmax=13)

# One major tick per cell, labelled with the day / hour it represents.
ax.set_xticks(np.arange(len(trips.columns)), labels=trips.columns, fontsize=10)
ax.set_yticks(np.arange(len(trips.index)), labels=trips.index, fontsize=10)
ax.set_title("Trips by hour and day", fontsize=20)

cbar = ax.figure.colorbar(im, ticks=np.arange(14), fraction=0.035, ax=ax)
# Colourbar label (previously misspelled "thounsands").
cbar.ax.set_ylabel("Trips in thousands", rotation=-90, va="bottom", fontsize=12)

# Replace the frame with a white grid drawn on the cell borders: minor ticks
# sit half a cell off the majors, and only the minor grid is shown.
ax.spines[:].set_visible(False)
ax.set_xticks(np.arange(trips.shape[1] + 1) - .5, minor=True)
ax.set_yticks(np.arange(trips.shape[0] + 1) - .5, minor=True)
ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
ax.tick_params(which="minor", bottom=False, left=False)

ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('Hour', fontsize=12)
plt.show()
Heatmap: Trips by month and day
import seaborn as sns

# Pickups per month (rows) and day of month (columns), in thousands.
trips = pd.crosstab(data.month, data.day) / 1_000
# Same table with month names on the index, used for plotting
# ("corridas" is Portuguese for "rides").
corridas_plot = trips.rename(index=months_names)

fig, ax = plt.subplots(figsize=(20, 7))
# Plot the renamed table so the rows are labelled with month names; the
# un-renamed `trips` was plotted before, leaving `corridas_plot` unused and
# the rows labelled 4-9.
sns.heatmap(corridas_plot,
            vmin=0,
            vmax=45,
            cmap=plt.get_cmap("Blues", 9),
            ax=ax,
            linewidths=2)
ax.set_title('Trips by month and day', fontsize=20)
ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('', fontsize=12)
ax.collections[0].colorbar.set_label('Trips in thousands', fontsize=12)
Heatmap: Trips by month and week day
# Pickups per month (rows) and weekday (columns), in thousands, with both
# axes relabelled from numeric codes to names.
trips = (pd.crosstab(data.month, data.day_of_week) / 1_000).rename(
    index=months_names,
    columns=days_names,
)

import plotly.graph_objs as go

layout = go.Layout(title='Trips by month and week day')

# Heatmap trace: weekdays across, months down, small gaps between cells and
# a fixed 0-165 colour range.
plot = go.Heatmap(
    x=trips.columns,
    y=trips.index,
    z=trips.to_numpy(),
    colorscale='Blues',
    zmin=0,
    zmax=165,
    xgap=2,
    ygap=2,
    colorbar={'title': 'Trips in thousands'},
)

fig = go.Figure(data=plot, layout=layout)
fig.show()
Heatmap: Trips by Base and month
# Long-format table with one row per (Base, month): the month shown by name
# and the pickup count in 'Total', expressed in thousands to two decimals.
trips = (
    data.groupby(['Base', 'month'])['hour']
    .count()
    .reset_index(name='Total')
    .replace({'month': months_names})
)
trips['Total'] = (trips['Total'] / 1000).round(2)

from plotnine import *
import plotnine as p9

p9.options.figure_size = (10, 6)

# Tile heatmap: months across (in calendar order), bases down, each tile
# shaded by and labelled with its total.
(
    ggplot(trips)
    + aes(x='month', y='Base', fill='Total')
    + geom_tile(aes(width=.95, height=.95))
    + geom_text(aes(label='Total'), size=10)
    + labs(y='Base', x='', title='Trips by Base and month')
    + theme_minimal()
    + scale_fill_gradient(low='#cbe7ff', high='#08306b')
    + scale_x_discrete(limits=('April', 'May', 'June', 'July', 'August', 'September'))
)
Heatmap: Trips by base and week day
# Long-format table with one row per (Base, weekday): the weekday shown by
# name and the pickup count in 'Total', in thousands to two decimals.
trips = (
    data.groupby(['Base', 'day_of_week'])['hour']
    .count()
    .reset_index(name='Total')
    .replace({'day_of_week': days_names})
)
trips['Total'] = (trips['Total'] / 1000).round(2)
trips.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
Base
day_of_week
Total
0
B02512
Monday
25.46
1
B02512
Tuesday
31.67
2
B02512
Wednesday
32.93
3
B02512
Thursday
35.03
4
B02512
Friday
33.32
import altair as alt
# Calendar order for the x axis, passed explicitly as the sort order.
_weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday']

# Rectangle heatmap: weekdays across, bases down, colour from 'Total'.
alt.Chart(trips, title='Trips by Base and week day').mark_rect().encode(
    x=alt.X('day_of_week', axis=alt.Axis(title='Week day'), sort=_weekday_order),
    y=alt.Y('Base'),
    color=alt.Color('Total', scale=alt.Scale(scheme='blues')),
).properties(height=300, width=300)
import pandas as pd
import numpy as np
from bokeh .plotting import figure
from bokeh .tile_providers import get_provider , WIKIMEDIA
from bokeh .io import output_notebook , show
from pyproj import Proj , transform
import warnings

# Suppress all warnings for the rest of the session (presumably to hide the
# deprecation warnings raised by the legacy pyproj calls below).
warnings.filterwarnings("ignore")

# NOTE: despite the names, `inProj` (EPSG:3857, Web Mercator) is the *target*
# projection and `outProj` (EPSG:4326, lon/lat) is the *source* in every
# transform() call below.
inProj = Proj(init='epsg:3857')
outProj = Proj(init='epsg:4326')

# Project each distinct pickup location once. The coordinates are handed to
# transform() as whole arrays, replacing the earlier Python loop that called
# transform() once per point.
unique_points = data[["Lon", "Lat"]].drop_duplicates()
lons, lats = transform(outProj, inProj,
                       unique_points["Lon"].to_numpy(),
                       unique_points["Lat"].to_numpy())

data_map = pd.DataFrame({"MercatorX": lons, "MercatorY": lats})
data_map.head()

wikimedia = get_provider(WIKIMEDIA)

# Map window: (ny_lon1, ny_lat1) is the south-west corner and
# (ny_lon2, ny_lat2) the north-east corner, so both ranges run low -> high.
# The longitudes were previously given east-first, which reversed the x axis.
ny_lon1, ny_lat1 = transform(outProj, inProj, -74.15, 40.58)
ny_lon2, ny_lat2 = transform(outProj, inProj, -73.7, 40.92)

p = figure(plot_width=900, plot_height=700,
           x_range=(ny_lon1, ny_lon2), y_range=(ny_lat1, ny_lat2),
           x_axis_type="mercator", y_axis_type="mercator",
           title="Uber rides in NY")
p.add_tile(wikimedia)

# One small translucent dot per distinct pickup location.
p.circle(x="MercatorX", y="MercatorY",
         size=2,
         fill_color="dodgerblue", line_color="dodgerblue",
         fill_alpha=0.3,
         source=data_map)
show(p)
import matplotlib .pyplot as plt
import numpy as np
import pandas as pd
from cartopy import crs as ccrs
from cartopy import feature as cfeature
# Bounding box of the plotted region (degrees), and its centre, which is
# used as the origin of the map projection.
latN = 40.92
latS = 40.58
lonW = -74.15
lonE = -73.7
cLat = (latN + latS) / 2
cLon = (lonW + lonE) / 2

# Fixed colour per base code so each base is recognisable in the legend.
base_colors = {'B02512': 'red', 'B02598': 'green', 'B02617': 'blue', 'B02682': 'yellow', 'B02764': 'gray'}
bases = data.Base.unique()

proj = ccrs.LambertConformal(central_longitude=cLon, central_latitude=cLat)
res = '10m'  # Finest feature scale (slowest to display); coarser options are '50m' and '110m'.

fig = plt.figure(figsize=(18, 12))
ax = plt.subplot(1, 1, 1, projection=proj)
ax.set_extent([lonW, lonE, latS, latN])
ax.add_feature(cfeature.OCEAN.with_scale(res))
ax.add_feature(cfeature.COASTLINE.with_scale(res))
ax.set_title('New York Map on Uber rides during 2014 (Apr-Sep) by Base')

# The pickup coordinates are plain lon/lat, so tell cartopy to interpret them
# as PlateCarree when placing them on the Lambert map.
lonlat = ccrs.PlateCarree()
for base in bases:
    # Select this base's pickups once and read both coordinates from the
    # result, instead of filtering the full frame twice per base.
    pickups = data.loc[data.Base == base, ['Lon', 'Lat']]
    ax.scatter(pickups.Lon, pickups.Lat, s=9, c=base_colors[base],
               edgecolor=None, alpha=0.75,
               transform=lonlat, label=base)

plt.legend()
plt.show()