# Aggregate Analysis
## Dataset: Cycle Share Dataset
#### David Snowberger - 26 Mar 18

In [1]:
# imports
import numpy as np
import pandas as pd
from datetime import date

In [2]:
# data frames
# trip.csv is read with header=None after skipping its first TRIP_ROWS_TO_SKIP
# lines, so df_trp ends up with integer column labels instead of names.
# NOTE(review): why those leading lines are skipped is not recorded in this
# notebook -- presumably they are unusable; confirm against the raw file.
TRIP_ROWS_TO_SKIP = 50794

df_stn = pd.read_csv('data/station.csv')
df_wea = pd.read_csv('data/weather.csv')
df_trp = pd.read_csv('data/trip.csv', header=None, skiprows=TRIP_ROWS_TO_SKIP)

In [3]:
# Preview the first five rows of the station table.
df_stn.head(n=5)

Unnamed: 0,station_id,name,lat,long,install_date,install_dockcount,modification_date,current_dockcount,decommission_date
0,BT-01,3rd Ave & Broad St,47.618418,-122.350964,10/13/2014,18,,18,
1,BT-03,2nd Ave & Vine St,47.615829,-122.348564,10/13/2014,16,,16,
2,BT-04,6th Ave & Blanchard St,47.616094,-122.341102,10/13/2014,16,,16,
3,BT-05,2nd Ave & Blanchard St,47.61311,-122.344208,10/13/2014,14,,14,
4,CBD-03,7th Ave & Union St,47.610731,-122.332447,10/13/2014,20,,20,


In [4]:
# Preview the first five trip rows. The columns carry integer labels because
# the trip file is loaded with header=None.
df_trp.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,431,10/13/2014 10:31,10/13/2014 10:48,SEA00298,985.935,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1960.0
1,432,10/13/2014 10:32,10/13/2014 10:48,SEA00195,926.375,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1970.0
2,433,10/13/2014 10:33,10/13/2014 10:48,SEA00486,883.831,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Female,1988.0
3,434,10/13/2014 10:34,10/13/2014 10:48,SEA00333,865.937,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Female,1977.0
4,435,10/13/2014 10:34,10/13/2014 10:49,SEA00202,923.923,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1971.0


In [5]:
# Preview the first five rows of the weather table.
df_wea.head(n=5)

Unnamed: 0,Date,Max_Temperature_F,Mean_Temperature_F,Min_TemperatureF,Max_Dew_Point_F,MeanDew_Point_F,Min_Dewpoint_F,Max_Humidity,Mean_Humidity,Min_Humidity,...,Mean_Sea_Level_Pressure_In,Min_Sea_Level_Pressure_In,Max_Visibility_Miles,Mean_Visibility_Miles,Min_Visibility_Miles,Max_Wind_Speed_MPH,Mean_Wind_Speed_MPH,Max_Gust_Speed_MPH,Precipitation_In,Events
0,10/13/2014,71,62.0,54,55,51,46,87,68,46,...,29.79,29.65,10,10,4,13,4,21,0.0,Rain
1,10/14/2014,63,59.0,55,52,51,50,88,78,63,...,29.75,29.54,10,9,3,10,5,17,0.11,Rain
2,10/15/2014,62,58.0,54,53,50,46,87,77,67,...,29.71,29.51,10,9,3,18,7,25,0.45,Rain
3,10/16/2014,71,61.0,52,49,46,42,83,61,36,...,29.95,29.81,10,10,10,9,4,-,0.0,Rain
4,10/17/2014,64,60.0,57,55,51,41,87,72,46,...,29.78,29.73,10,10,6,8,3,-,0.14,Rain


In [6]:
# Convert dates to a more usable format
# Each listed column is overwritten in place with its parsed datetime version.
for stn_col in ('install_date', 'modification_date'):
    df_stn[stn_col] = pd.to_datetime(df_stn[stn_col])

# Trip columns 2 and 1 hold the two timestamps of each trip.
for trp_col in (2, 1):
    df_trp[trp_col] = pd.to_datetime(df_trp[trp_col])

df_wea['Date'] = pd.to_datetime(df_wea['Date'])

# Month is the calendar month number (1-12) only, so any later grouping on it
# pools the same month from different years together.
df_trp['Month'] = df_trp[1].dt.month
df_wea['Month'] = df_wea['Date'].dt.month

In [7]:
# Mean timedelta of all trips
# Elapsed time per trip is column 2 minus column 1 (both already parsed to
# datetimes); the cell displays the mean of those differences.
trip_elapsed = df_trp[2].sub(df_trp[1])
trip_elapsed.mean()

Timedelta('0 days 00:20:02.417385')

In [8]:
# Mode of rider age
# Column 11 is used as the rider's birth year. Age is measured relative to
# REFERENCE_YEAR (the year of this analysis), not the year each trip was taken.
REFERENCE_YEAR = 2018
df_trp['age'] = REFERENCE_YEAR - df_trp[11]

# Count trips per age and show the most frequent ages, largest count first;
# the top row is the mode. nlargest() selects by count, so no particular age
# label has to be hard-coded as a cut-off and the index does not need to be
# sorted for the selection to work.
N_TOP_AGES = 6
df_trp.groupby('age')['age'].count().nlargest(N_TOP_AGES)

age
31.0    13904
33.0     8793
37.0     7797
36.0     7569
34.0     7355
30.0     7044
Name: age, dtype: int64

In [9]:
# Mean and median precipitation by month (inches)
# Rows are grouped on the calendar-month column once and both statistics are
# taken from that same grouping.
precip_by_month = df_wea.groupby('Month')['Precipitation_In']
mu_ppm = precip_by_month.mean()
med_ppm = precip_by_month.median()

print('Mean:\n{}'.format(mu_ppm))
print('\nMedian:\n{}'.format(med_ppm))

Mean:
Month
1     0.143548
2     0.168421
3     0.156935
4     0.051333
5     0.012419
6     0.030500
7     0.012097
8     0.018226
9     0.041000
10    0.189000
11    0.187833
12    0.236290
Name: Precipitation_In, dtype: float64

Median:
Month
1     0.020
2     0.040
3     0.025
4     0.000
5     0.000
6     0.000
7     0.000
8     0.000
9     0.000
10    0.040
11    0.035
12    0.100
Name: Precipitation_In, dtype: float64


In [10]:
# Mean current_dockcount for each station_id
# NOTE(review): if every station_id appears on exactly one row, this simply
# lists each station's current_dockcount -- confirm whether a single overall
# average across stations was intended instead.
df_stn.groupby('station_id')['current_dockcount'].mean()

station_id
BT-01     18
BT-03     16
BT-04     16
BT-05     14
CBD-03    20
CBD-04    18
CBD-05    20
CBD-06    18
CBD-07    20
CBD-13    18
CD-01      0
CH-01     16
CH-02     20
CH-03     16
CH-05     16
CH-06     16
CH-07     18
CH-08     26
CH-09     16
CH-12     14
CH-15     16
CH-16     18
DPD-01    18
DPD-03    24
EL-01     16
EL-03     18
EL-05     18
FH-01      0
FH-04     20
ID-04     16
PS-04     18
PS-05     18
SLU-01    20
SLU-02    18
SLU-04    18
SLU-07    18
SLU-15    20
SLU-16    20
SLU-17    16
SLU-18     0
SLU-19    16
SLU-20    20
SLU-21    20
SLU-22    18
UD-01     18
UD-02     18
UD-04     16
UD-07     16
UW-01      0
UW-02     14
UW-04     16
UW-06     14
UW-07     14
UW-10     16
UW-11     16
WF-01     24
WF-03     18
WF-04     18
Name: current_dockcount, dtype: int64

In [11]:
# The data cannot show directly whether the dock count usually goes up or down
# after a modification: each station carries a single modification_date (which
# may be only the latest of several), and current_dockcount reflects the net
# effect of all modifications rather than one specific change.
# The closest available approximation compares install vs. current dock counts.

# stations that have a modification date recorded
stn_mod = df_stn[df_stn['modification_date'].notna()]

# net decrease: fewer docks now than at install time
shrunk = stn_mod['current_dockcount'] < stn_mod['install_dockcount']
dec = shrunk.sum()

# net increase: more docks now than at install time
grew = stn_mod['current_dockcount'] > stn_mod['install_dockcount']
inc = grew.sum()

# The difference between the two counts shows which direction is more common:
# a negative value means net decreases outnumber net increases (subject to the
# assumptions described above).
inc - dec

-3

In [12]:
# male vs female ridership
# Column 10 holds the gender label recorded on each trip row, so these are
# counts of trips, not of distinct riders.
gender_col = df_trp[10]
# trips whose gender label is 'Female'
f = (gender_col == 'Female').sum()
# trips whose gender label is 'Male'
m = (gender_col == 'Male').sum()
[f, m]

[30330, 112940]

In [13]:
# most and least common months to ride
# Trips per calendar month (1-12). Month carries no year, so each count pools
# that month across every year present in the trip data.
trips_per_month = df_trp.groupby('Month')['Month'].count()

# Most
# nlargest(1) / nsmallest(1) pick the extreme month from the counts themselves,
# so the answer is computed rather than written into the code as a month label.
most = trips_per_month.nlargest(1)

# Least
least = trips_per_month.nsmallest(1)

print('Most:\n{}\n\nLeast:\n{}'.format(most,least))

Most:
Month
7    32150
Name: Month, dtype: int64

Least:
Month
12    10880
Name: Month, dtype: int64


In [14]:
# Average year-round temp
# Plain mean of the Mean_Temperature_F column over every row of the weather
# table; pandas skips missing readings by default.
mean_temp_f = df_wea['Mean_Temperature_F']
mean_temp_f.mean()

56.5843023255814