## Run analysis on station summary data for questions 1 & 2

In [None]:
import pandas as pd
import numpy as np
from station import Station
import seaborn as sns
import matplotlib.pyplot as plt
import mpld3

%load_ext autoreload
%autoreload

In [None]:
# Load the two CTA CSV exports: daily station-entry totals and the list of stops
ride_df = pd.read_csv(
    'data/CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv'
)
map_df = pd.read_csv(
    'data/CTA_-_System_Information_-_List_of__L__Stops.csv'
)

# Rewrite four STATION_NAME values in the stops table
# (presumably so they match the names used in the ridership data -- confirm)
station_map_names = {
    'Lake': 'Lake/State',
    'Jackson': 'Jackson/State',
    'Washington': 'Washington/Dearborn',
    'Morgan': 'Morgan-Lake',
}
map_df = map_df.replace({"STATION_NAME": station_map_names})

In [None]:
# Load the station summary table used by every cell below
station_summary = pd.read_csv(filepath_or_buffer='data/station_summary.csv')

In [None]:
# Disabled clean-up step: drop the first row whose Sat_mean is 0.
# to_remove = station_summary[station_summary['Sat_mean'] == 0].index[0]
# station_summary.drop([to_remove], inplace=True)

# Add two columns: weekday mean minus the Saturday mean, and weekday mean
# minus the Sunday/holiday mean.
station_summary['weekday-sat'] = station_summary['Weekday_mean'].sub(
    station_summary['Sat_mean'])
station_summary['weekday-sun'] = station_summary['Weekday_mean'].sub(
    station_summary['Sun/Hol_mean'])

In [None]:
# Display the full row of the station with the largest daily_mean
busiest_idx = station_summary['daily_mean'].idxmax()
station_summary.loc[busiest_idx]

In [None]:
# Ten rows with the largest daily_mean, in descending order
high_means = station_summary.sort_values('daily_mean', ascending=False).head(10)

In [None]:
# Bar chart of daily_mean for the ten stations held in `high_means`
fig, ax = plt.subplots()
sns.barplot(data=high_means, x='station', y='daily_mean', ax=ax)
ax.set_title('Top 10 Stations with Highest Daily Ridership', fontsize=18)
ax.set_xlabel('Station', fontsize=14)
ax.set_ylabel('Average Daily Rides', fontsize=14)
# Rotate the station names 45 degrees and right-align them under the bars
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
# plt.savefig('figs/dailyrides_mean.png')

In [None]:
# Display the summary row(s) for Washington/Wabash, to read off its daily_std
is_wash_wabash = station_summary['station'] == 'Washington/Wabash'
station_summary.loc[is_wash_wabash]

In [None]:
# Display pandas' default descriptive statistics for the summary table
station_summary.describe()

### Plot stations with high standard deviations by day of the week & season

In [None]:
# Ten rows with the largest daily_std, in descending order
high_stds = station_summary.sort_values('daily_std', ascending=False).head(10)

# Indicator column on the full table: 1 for those ten rows, 0 for the rest
station_summary['high_std'] = 0
station_summary.loc[high_stds.index, 'high_std'] = 1

# Keep the ten station names as a list, then index the top-ten table by name
high_stations = high_stds['station'].tolist()
high_stds = high_stds.set_index('station')

In [None]:
# Build a Station for each high-std station name and call its
# make_layered_hist with 'daytype' (presumably one layer per day type --
# see the Station class for the exact behaviour).
for station_name in high_stations:
    stat = Station(station_name, ride_df, map_df)
    stat.make_layered_hist('daytype')

In [None]:
# Build a Station for each high-std station name and call its
# make_layered_hist with 'season' (presumably one layer per season --
# see the Station class for the exact behaviour).
for station_name in high_stations:
    stat = Station(station_name, ride_df, map_df)
    stat.make_layered_hist('season')

In [None]:
# Scatter of daily_std against the weekday-minus-Saturday gap: every station
# first, then the ten high-std stations drawn again in red on the same axes.
fig, ax = plt.subplots()
sns.scatterplot(data=station_summary, x="daily_std", y="weekday-sat", ax=ax)
sns.scatterplot(data=high_stds, x="daily_std", y="weekday-sat",
                color='red', ax=ax)

ax.set_title('Weekday/Sat Differences vs. Daily STD', fontsize=18)
ax.set_xlabel('Daily Standard Deviation', fontsize=16)
ax.set_ylabel('Weekday - Sat Mean Rides', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

fig.tight_layout()
fig.savefig('figs/weekday_sat_std_scatter.png')

In [None]:
# Scatter of daily_std against daily_mean: every station first, then the ten
# high-std stations drawn again in red on the same axes.
fig, ax = plt.subplots()
sns.scatterplot(data=station_summary, x="daily_std", y="daily_mean", ax=ax)
sns.scatterplot(data=high_stds, x="daily_std", y="daily_mean",
                color='red', ax=ax)

In [None]:
# Interactive (mpld3) scatter of daily_std vs daily_mean with the station name
# shown as a tooltip on each point.
df = station_summary
x = 'daily_std'
y = 'daily_mean'
color = 'high_std'

fig, ax = plt.subplots(subplot_kw={'facecolor': '#EEEEEE'})

# Points are coloured by the 0/1 high_std flag; the flag is normalised against
# vmin=0 / vmax=6 before the Set2 colormap lookup.
scatter = ax.scatter(
    np.asarray(df[x], dtype=float),
    np.asarray(df[y], dtype=float),
    c=np.asarray(df[color], dtype=float),
    cmap='Set2',
    vmin=0,
    vmax=6,
    alpha=0.9,
)

ax.grid(color='white', linestyle='solid')

ax.set_title(f'{x} vs {y}', size=20)
ax.set_xlabel('Mean Daily Standard Deviation', fontsize=16)
ax.set_ylabel('Mean Daily Rides', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# One tooltip label per point, in the same row order as the plotted data
labels = df['station'].tolist()
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)

mpld3.display()


In [None]:
# Distribution of daily_std across stations: density histogram with a KDE
# curve overlaid.
# `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14, so use
# `histplot` when it exists (the keyword arguments follow seaborn's migration
# guidance for approximating distplot's defaults) and fall back to `distplot`
# only on older seaborn installs.
if hasattr(sns, 'histplot'):
    sns.histplot(station_summary['daily_std'], kde=True, stat='density',
                 kde_kws=dict(cut=3))
else:
    sns.distplot(station_summary['daily_std'])