In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import random
import time
# Register the Mapbox access token with plotly express (needed by the
# px.scatter_mapbox figures in this notebook).
# The token file is read inside a context manager so the handle is closed,
# and the content is stripped because a trailing newline in the file would
# otherwise be passed along as part of the token.
# NOTE(review): absolute, user-specific path — consider making it configurable.
with open('/Users/bkuo/Local Documents/Bootcamp 2020/Project/Capstone/CitiBikeCapStone/EDA/mapbox_token.txt') as token_file:
    px.set_mapbox_access_token(token_file.read().strip())

In [None]:
# # sampling from original data source

# start_time = time.time()

# p = 0.1  # 10% of the lines
# # keep the header, then take only 10% of lines
# # if random from [0,1] interval is greater than 0.1 the row will be skipped
# tripdata_dwn = pd.read_csv(
#          '/Users/bkuo/Local Documents/Bootcamp 2020/Project/Capstone/CitiBikeCapStone/Data/Trip data/riders_cleaned.csv.gz',
#          compression='gzip',         
#          skiprows=lambda i: i>0 and random.random() > p
# )

# print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Load the down-sampled rider extract (gzip-compressed CSV) into a DataFrame.
tripdata_dwn = pd.read_csv(
    '/Users/bkuo/Local Documents/Bootcamp 2020/Project/Capstone/CitiBikeCapStone/Data/Trip data/downsampled_rider.csv.gz',
    compression='gzip',
)


In [None]:
# Column names of the loaded sample.
tripdata_dwn.columns

In [None]:
# Preview the first rows of the loaded sample.
tripdata_dwn.head()

In [None]:
# Summary statistics of the loaded sample (DataFrame.describe defaults).
tripdata_dwn.describe()

In [None]:
# Per-column dtype and non-null count, plus memory usage.
tripdata_dwn.info()

In [None]:
# Number of missing values in each column.
tripdata_dwn.isna().sum()

In [None]:
# Work on a copy so the loaded sample (tripdata_dwn) is left unmodified by
# the cleaning and feature cells that follow.
tripdata = tripdata_dwn.copy() 

In [None]:
# Clean 'birth year' into an integer column, with 0 meaning "unknown".
#
# The column can hold numbers, missing values and the literal string '\N'
# (presumably a NULL marker from the export). The previous version ran
# `tripdata[tripdata['birth year'] == '\\N'] = 0`, which writes 0 into EVERY
# column of the matching rows (station names, timestamps, duration, ...),
# silently destroying those trips. Only 'birth year' is modified here.
#
# pd.to_numeric(errors='coerce') maps '\N' (and any other non-numeric text)
# to NaN and also parses float-formatted strings such as '1985.0', which a
# direct int conversion cannot handle.
birth_year = pd.to_numeric(tripdata['birth year'], errors='coerce')
tripdata['birth year'] = birth_year.fillna(0).astype(int)

In [None]:
# Derived features on the working frame.

# Parse both trip timestamps into datetime columns.
for ts_col in ('starttime', 'stoptime'):
    tripdata[ts_col] = pd.to_datetime(tripdata[ts_col])

# Age relative to 2020; a birth year of 0 is passed through unchanged as 0.
birth_year = tripdata['birth year']
tripdata['age'] = np.where(birth_year != 0, 2020 - birth_year, birth_year)

# peak is 1 when the start hour is strictly between 5 and 20, otherwise 0.
start_hour = tripdata['start_hour']
tripdata['peak'] = ((start_hour > 5) & (start_hour < 20)).astype('int64')

tripdata.info()

In [None]:
# Column names of the working copy.
tripdata.columns

In [None]:
# Pairwise scatter plots of trip duration against rider and time attributes.
df = tripdata
fig = px.scatter_matrix(
    df,
    dimensions=['tripduration', 'gender', 'dayofweek', 'age', 'peak'],
)
fig.show()

In [None]:
# Station lookup table: one row per (station, station_id) with coordinates.
# Start- and end-station columns are renamed to a common schema, stacked,
# de-duplicated, and the coordinates averaged within each pair.
s_lat_lon = (
    tripdata[['start station name', 'start station id',
              'start station latitude', 'start station longitude']]
    .drop_duplicates()
    .rename(columns={
        'start station name': 'station',
        'start station id': 'station_id',
        'start station latitude': 'latitude',
        'start station longitude': 'longitude',
    })
)
e_lat_lon = (
    tripdata[['end station name', 'end station id',
              'end station latitude', 'end station longitude']]
    .drop_duplicates()
    .rename(columns={
        'end station name': 'station',
        'end station id': 'station_id',
        'end station latitude': 'latitude',
        'end station longitude': 'longitude',
    })
)

stn_lat_lon = (
    pd.concat([s_lat_lon, e_lat_lon])
    .drop_duplicates()
    .groupby(['station', 'station_id'])
    .mean()
    .reset_index()
)

# Keep rows whose name does not contain "temporarily removed" and is not the
# placeholder value 0. Names for which the string test yields NaN compare
# unequal to False and are therefore dropped as well.
not_removed = stn_lat_lon['station'].str.contains("temporarily removed").eq(False)
not_placeholder = stn_lat_lon['station'].ne(0)
stn_lat_lon = stn_lat_lon[not_removed & not_placeholder]

In [None]:
# df = stn_lat_lon
# neighborhood = px.scatter_mapbox(df, lat="latitude", lon="longitude", color="station",
#                         color_continuous_scale=px.colors.cyclical.IceFire,size_max=15, zoom=10)
# neighborhood.show()

In [None]:
# Number of rides started at each station, attached to the station lookup.
start_counts = (
    tripdata
    .groupby(['start station id', 'start station name'])['tripduration']
    .count()
    .to_frame()
)

# Left join keeps every station in the lookup; the right-hand keys refer to
# the index levels of start_counts.
present_stations = (
    pd.merge(
        stn_lat_lon,
        start_counts,
        left_on=['station', 'station_id'],
        right_on=['start station name', 'start station id'],
        how='left',
    )
    .reset_index()
    .rename(columns={'tripduration': 'ride_num'})
)

# Stations without a matching start count get 0 rides.
present_stations['ride_num'] = present_stations['ride_num'].fillna(0)

present_stations.head()


In [None]:
# issue: same id/coordinate with different name (station id 504 as example)
start_station_cols = [
    'start station id',
    'start station name',
    'start station latitude',
    'start station longitude',
]
tripdata.loc[tripdata['start station id'] == 504, start_station_cols].drop_duplicates()

In [None]:
# Continuous colour scale for the maps: a list of [position, colour] stops,
# with positions running from 0 to 1.
colorscale = [
    [position, color]
    for position, color in zip(
        (0, 0.5, 1),
        ('rgb(38, 53, 113)', 'rgb(57, 162, 225)', 'rgb(234, 32, 41)'),
    )
]

In [None]:
# Map of starting-station demand: one point per station, coloured by the
# number of rides started there.
df = present_stations
plot = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    color="ride_num",
    hover_name="station",
    color_continuous_scale=colorscale,
    size_max=2,
    zoom=10,
)
plot.show()

In [None]:
# Rides started per station and start hour, joined to station coordinates.
hourly_starts = (
    tripdata
    .groupby(['start station id', 'start station name', 'start_hour'])['tripduration']
    .count()
    .reset_index()
)

present_stations_hr = pd.merge(
    stn_lat_lon,
    hourly_starts,
    left_on=['station', 'station_id'],
    right_on=['start station name', 'start station id'],
    how='left',
).rename(columns={'tripduration': 'ride_num'})

# Drop lookup stations that had no match on the right-hand side of the join.
present_stations_hr = present_stations_hr[present_stations_hr['start station id'].notna()]

present_stations_hr.describe()

In [None]:
# Starting-station demand by hour: animated map with one frame per start
# hour, each station coloured by the rides started there in that hour.
df = present_stations_hr
plot = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    color="ride_num",
    hover_name="station",
    labels={"ride_num": "Ride Number", "start_hour": "Start Hour"},
    color_continuous_scale=colorscale,  # alternative: px.colors.diverging.Temps
    size_max=15,
    animation_frame="start_hour",
    # animation_group identifies the same object across frames, so it must
    # be the station; it was previously the ride count itself, which pairs
    # up unrelated stations that happen to share a count.
    animation_group="station",
    color_continuous_midpoint=500,
    zoom=10,
)
plot.show()

In [None]:
# Rides ended per station and stop hour, joined to station coordinates.
hourly_ends = (
    tripdata
    .groupby(['end station id', 'end station name', 'stop_hour'])['tripduration']
    .count()
    .reset_index()
)

present_stations_hr2 = pd.merge(
    stn_lat_lon,
    hourly_ends,
    left_on=['station', 'station_id'],
    right_on=['end station name', 'end station id'],
    how='left',
).rename(columns={'tripduration': 'ride_num'})

# Drop lookup stations that had no match on the right-hand side of the join.
present_stations_hr2 = present_stations_hr2[present_stations_hr2['end station id'].notna()]

present_stations_hr2.head()

In [None]:
# Ending-station demand by hour: animated map with one frame per stop hour,
# each station coloured by the rides that ended there in that hour.
# (The previous comment said "starting station"; this cell plots end stations.)
df = present_stations_hr2
plot = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    color="ride_num",
    hover_name="station",
    color_continuous_scale=colorscale,  # alternative: px.colors.diverging.Temps
    size_max=15,
    animation_frame="stop_hour",
    # animation_group identifies the same object across frames, so it must
    # be the station; it was previously the ride count itself, which pairs
    # up unrelated stations that happen to share a count.
    animation_group="station",
    # color_continuous_midpoint=500,
    zoom=10,
)
plot.show()

In [None]:
# station pairs: single biggest pair is central park& 6 ave!
# Count rides per (start station, end station), drop the rows whose start
# station name is the placeholder 0, and order from busiest to quietest.
stn_pair = (
    tripdata
    .groupby(['start station name', 'end station name'])['tripduration']
    .count()
    .to_frame('count')
    .reset_index()
    .loc[lambda pairs: pairs['start station name'] != 0]
    .sort_values('count', ascending=False)
)
stn_pair.head(20)

In [None]:
# Summary statistics for all station pairs, then for the leading 25%, 10%,
# 5% and 0.5% of rows of stn_pair.
print(stn_pair.describe())
for top_share in (.25, .10, .05, .005):
    print(stn_pair.head(int(stn_pair.shape[0] * top_share)).describe())

In [None]:
# busiest dow by season
# seaborn is imported here because the notebook's import cell does not
# include it.
import seaborn as sns

sns.set(rc={'figure.figsize': (12, 8)})

# Ride counts laid out as a season x day-of-week grid.
hmap = (
    tripdata.groupby(['season', 'dayofweek'])['tripduration']
    .count()
    .reset_index()
    .rename(columns={'tripduration': 'demand'})
    # Keyword arguments are required: pandas >= 2.0 no longer accepts the
    # pivot arguments positionally.
    .pivot(index='season', columns='dayofweek', values='demand')
)
ax = sns.heatmap(hmap)

# Background colour and palette; set after the heatmap above has been drawn,
# so they apply to seaborn figures created from here on.
sns.set(rc={'axes.facecolor': '#f3f3f3', 'figure.facecolor': '#f3f3f3'})
colors = ["#39a2e1", "#263571", "#ea2029"]
sns.set_palette(sns.color_palette(colors))


In [None]:
# busiest time by dow
# Ride counts laid out as a start-hour x day-of-week grid.
hmap = (
    tripdata.groupby(['start_hour', 'dayofweek'])['tripduration']
    .count()
    .reset_index()
    .rename(columns={'tripduration': 'demand'})
    # Keyword arguments are required: pandas >= 2.0 no longer accepts the
    # pivot arguments positionally.
    .pivot(index='start_hour', columns='dayofweek', values='demand')
)
ax = sns.heatmap(hmap)


In [None]:
# ride count by seasons
# Rides per (start hour, season); rows with the placeholder season 0 are
# excluded before plotting one line per season.
df = (
    tripdata.groupby(['start_hour', 'season'])['tripduration']
    .count()
    .reset_index()
    .rename(columns={'tripduration': 'ride_num'})
    .loc[lambda counts: counts['season'] != 0]
)
fig = px.line(
    df,
    x="start_hour",
    y="ride_num",
    color='season',
    title="Ride Count by Hour by Seasons",
)
fig.update_layout(hovermode="x unified")

fig.show()

In [None]:
# Median ride length in minutes per season (season 0 rows excluded).
#
# Only the needed columns are selected and trip_mins is added with .assign,
# which avoids writing into a filtered slice of tripdata
# (SettingWithCopyWarning). The median is taken on trip_mins alone, because
# a frame-wide groupby median raises on non-numeric columns in pandas >= 2.0.
df = (
    tripdata.loc[tripdata.season != 0, ['season', 'tripduration']]
    .assign(trip_mins=lambda trips: trips['tripduration'] / 60)
    .groupby('season', as_index=False)['trip_mins']
    .median()
)
fig = px.bar(df, x='season', y="trip_mins",
             title="Riding Length by Season")
fig.update_layout(hovermode="x unified")

fig.show()

In [None]:
# ride time by age and gender... but age>80~100 seems strange
#
# Median ride time in minutes per (age, gender), excluding rows where age or
# gender is 0.
# Both conditions are combined into one mask on tripdata: the earlier
# version filtered `df` with a mask built from the larger `tripdata`, which
# makes pandas reindex the mask and emit a warning. trip_mins is added with
# .assign rather than written into a filtered slice, and the median is taken
# on trip_mins alone because a frame-wide groupby median raises on
# non-numeric columns in pandas >= 2.0.
known_rider = (tripdata.age != 0) & (tripdata.gender != 0)
df = (
    tripdata.loc[known_rider, ['age', 'gender', 'tripduration']]
    .assign(trip_mins=lambda trips: trips['tripduration'] / 60)
    .groupby(['age', 'gender'], as_index=False)['trip_mins']
    .median()
)
fig = px.bar(df, x='age', y="trip_mins", color='gender',
             title="Ride Time by Age and Gender")
fig.update_layout(hovermode="x unified")

fig.show()

In [None]:
# same graph as above, but only ages <100 and only look at genders with available data
#
# The three conditions are combined into one mask on tripdata: the earlier
# version filtered `df` with masks built from the larger `tripdata`, which
# makes pandas reindex the mask and emit a warning. trip_mins is added with
# .assign rather than written into a filtered slice, and the median is taken
# on trip_mins alone because a frame-wide groupby median raises on
# non-numeric columns in pandas >= 2.0.
plausible_rider = (
    (tripdata.age != 0)
    & (tripdata.age < 100)
    & (tripdata.gender != 0)
)
df = (
    tripdata.loc[plausible_rider, ['age', 'gender', 'tripduration']]
    .assign(trip_mins=lambda trips: trips['tripduration'] / 60)
    .groupby(['age', 'gender'], as_index=False)['trip_mins']
    .median()
)
fig = px.bar(df, x='age', y="trip_mins", color='gender', barmode='overlay',
             title="Ride Time by Age and Gender")
fig.update_layout(hovermode="x unified")

fig.show()

In [None]:
# overall riding trend through the years
#
# Monthly ride counts for rows with year > 2010, drawn as a single line.
# The grouping is by month only: the previous version also grouped by
# 'season', which this un-coloured line never uses and which yields two
# points at the same x position whenever one month holds more than one
# season value.
df = (
    tripdata[tripdata.year > 2010]
    .groupby(pd.Grouper(key="starttime", freq="M"))['tripduration']
    .count()
    .reset_index()
    .rename(columns={'tripduration': 'ride_num', 'starttime': 'time'})
)
fig = px.line(df, x="time", y="ride_num",
              title="Demand Trend")
fig.update_layout(hovermode="x unified")

fig.show()

In [None]:
# overall riding trend through the years with season
# Monthly ride counts for rows with year > 2010, split by season and drawn
# as bars coloured by season.
recent_trips = tripdata[tripdata.year > 2010]
monthly_by_season = recent_trips.groupby(
    [pd.Grouper(key="starttime", freq="M"), 'season']
)['tripduration'].count()
df = monthly_by_season.reset_index().rename(
    columns={'tripduration': 'ride_num', 'starttime': 'time'}
)
fig = px.bar(
    df,
    x="time",
    y="ride_num",
    color='season',
    title="Demand Trend",
)
fig.update_layout(hovermode="x unified")

fig.show()