In [21]:
import pandas as pd
import numpy as np

In [22]:
# Load the TTC delay records from the cleaned CSV into a DataFrame.
# The path is relative, so the notebook must be run from the directory holding the file.
ttc = pd.read_csv('ttc_delays_cleaned.csv')

In [23]:
# Pull the station-name column out of the delay data and preview its first two values.
ttc_stations = ttc.loc[:, 'Station']
ttc_stations.head(n=2)

0        HIGH PARK
1    SHEPPARD - YU
Name: Station, dtype: object

In [24]:
# Read the per-station ridership/grade table and upper-case the station names
# so they use the same casing as the names in the delay data.
stations = (
    pd.read_csv('csv_originals/station_ridership_grade.csv')
    .assign(Station=lambda frame: frame['Station'].str.upper())
)
stations.head(2)

Unnamed: 0,Station,Grade,2015 ridership
0,COLLEGE,Underground,47790
1,DAVISVILLE,Surface,25330


In [25]:
# Re-index the ridership table by station name. Despite its name, `station_index`
# is the re-indexed DataFrame; the station labels themselves are `station_index.index`.
station_index = stations.set_index('Station')
station_index.index

Index(['COLLEGE', 'DAVISVILLE', 'DUNDAS', 'DUPONT', 'EGLINTON',
       'EGLINTON WEST', 'FINCH', 'GLENCAIRN', 'KING', 'LAWRENCE',
       'LAWRENCE WEST', 'MUSEUM', 'NORTH YORK CENTRE', 'OSGOODE', 'QUEEN',
       'QUEEN'S PARK', 'ROSEDALE', 'SHEPPARD WEST', 'ST. ANDREW', 'ST. CLAIR',
       'ST. CLAIR WEST', 'ST. PATRICK', 'SUMMERHILL', 'UNION', 'WELLESLEY',
       'WILSON', 'YORK MILLS', 'YORKDALE', 'BLOOR-YONGE - BD',
       'BLOOR-YONGE - YU', 'SPADINA - YU', 'SPADINA - BD', 'ST. GEORGE - YU',
       'ST. GEORGE - BD', 'SHEPPARD - SHP', 'SHEPPARD - YU', 'BATHURST', 'BAY',
       'BROADVIEW', 'CASTLE FRANK', 'CHESTER', 'CHRISTIE', 'COXWELL',
       'DONLANDS', 'DUFFERIN', 'DUNDAS WEST', 'GREENWOOD', 'HIGH PARK',
       'ISLINGTON', 'JANE', 'KEELE', 'KIPLING', 'LANSDOWNE', 'MAIN STREET',
       'OLD MILL', 'OSSINGTON', 'PAPE', 'ROYAL YORK', 'RUNNYMEDE',
       'SHERBOURNE', 'VICTORIA PARK', 'WARDEN', 'WOODBINE', 'KENNEDY - SRT',
       'KENNEDY - BD', 'ELLESMERE', 'LAWRENCE EAST', 'MCC

In [26]:
# Distinct station names that occur in the delay data, wrapped in a pandas Index
# so Index set operations (e.g. `.difference`) can be applied to them.
ttc_stations_index = pd.Index(pd.unique(ttc_stations))
ttc_stations_index

Index([         'HIGH PARK',      'SHEPPARD - YU',          'LANSDOWNE',
         'BLOOR-YONGE - YU',           'DUFFERIN',  'NORTH YORK CENTRE',
                'RUNNYMEDE',              'QUEEN',         'ST. ANDREW',
                'WELLESLEY',            'KIPLING',        'ST. PATRICK',
             'KENNEDY - BD',               'JANE',           'OLD MILL',
                   'WILSON',          'DOWNSVIEW',                'BAY',
                    'FINCH',            'COXWELL',     'ST. CLAIR WEST',
                   'DUPONT',         'YORK MILLS',        'DUNDAS WEST',
                  'OSGOODE',            'COLLEGE',   'BLOOR-YONGE - BD',
            'VICTORIA PARK',    'ST. GEORGE - YU',     'SHEPPARD - SHP',
                'BROADVIEW',           'EGLINTON',           'ROSEDALE',
                     'PAPE',          'GREENWOOD',          'ISLINGTON',
                   'WARDEN',         'DAVISVILLE',           'LAWRENCE',
            'EGLINTON WEST',              'BLOOR', 

Let's find which of the stations don't match

In [27]:
# Station names present in the ridership table but never seen in the delay data.
station_index.index.difference(ttc_stations_index)

Index(['MAIN STREET', 'SHEPPARD WEST'], dtype='object')

In [28]:
# Station names seen in the delay data that have no row in the ridership table
# (this also surfaces missing station values, which show up as nan).
ttc_stations_index.difference(station_index.index)

Index([         nan,      'BLOOR',  'DOWNSVIEW',    'KENNEDY',       'MAIN',
         'SHEPPARD',    'SPADINA', 'ST. GEORGE',      'YONGE'],
      dtype='object')

Looks like we need to fix:
BLOOR–YONGE --> BLOOR and YONGE?
MAIN STREET --> MAIN
SHEPPARD WEST --> DOWNSVIEW
SHEPPARD–YONGE --> SHEPPARD

In [46]:
# Remove the STREET from main street like we did in the other file
stations.loc[stations['Station'] == 'MAIN STREET', 'Station'] = 'MAIN'

# Rename Sheppard West to DOWNSVIEW because this data predates 2017
stations.loc[stations['Station'] == 'SHEPPARD WEST', 'Station'] = 'DOWNSVIEW'

In [47]:
# Rebuild both lookups from the current data, then list the ridership stations
# that still have no counterpart in the delay data (an empty Index means none).
ttc_stations_index = pd.Index(ttc_stations.unique())
station_index = stations.set_index('Station')
station_index.index.difference(ttc_stations_index)

Index([], dtype='object')

In [48]:
# Delay-data station names that still have no row in the ridership table.
ttc_stations_index.difference(station_index.index)

Index([nan, 'BLOOR', 'KENNEDY', 'SHEPPARD', 'SPADINA', 'ST. GEORGE', 'YONGE'], dtype='object')

In [49]:
# Keep only the delay rows recorded under one of these bare station names,
# which did not match any station in the ridership table.
exchange_station_names = ['BLOOR', 'SHEPPARD', 'YONGE']
is_exchange_row = ttc['Station'].isin(exchange_station_names)
ttc_exchange_stations = ttc[is_exchange_row]

In [50]:
# Distinct (Station, Line) combinations among the selected rows, ordered by station.
(
    ttc_exchange_stations
    .loc[:, ['Station', 'Line']]
    .drop_duplicates()
    .sort_values(by='Station')
)

Unnamed: 0,Station,Line
69,BLOOR,
15880,SHEPPARD,
20421,SHEPPARD,Bloor-Danforth
1178,YONGE,


In [51]:
# Count delay rows per (Station, Line) pair.
# NOTE(review): by default groupby leaves out rows whose group key is missing, so
# delays with no recorded Line are not counted here -- the result understates
# the number of rows for these stations.
ttc_exchange_stations.groupby(['Station', 'Line']).size()

Station   Line          
SHEPPARD  Bloor-Danforth    37
dtype: int64

In [55]:
# Build a numeric 'Ridership' column from the raw '2015 ridership' text.
# The conversion must be assigned back: previously it was only displayed, so
# 'Ridership' stayed an unparsed copy of the raw column and that copy is what
# got written to the cleaned CSV.
#   * astype(str) keeps the cell re-runnable even if the column is already numeric
#     or contains real missing values (a per-element x.replace would raise there).
#   * ',' is stripped before parsing, as in the original conversion.
#   * errors='coerce' turns anything that still is not a number into NaN.
stations['Ridership'] = pd.to_numeric(
    stations['2015 ridership'].astype(str).str.replace(',', ''),
    errors='coerce',
)
stations['Ridership']

0      47790.0
1      25330.0
2      81330.0
3      16140.0
4      72750.0
5      16830.0
6     100820.0
7       5720.0
8      55810.0
9      24560.0
10     19950.0
11      8100.0
12     24220.0
13     22490.0
14     48010.0
15     48070.0
16      5980.0
17         NaN
18     56120.0
19     36320.0
20     28110.0
21     30250.0
22      5780.0
23    118450.0
24     23140.0
25     22810.0
26     28150.0
27     19150.0
28    183240.0
29    216190.0
        ...   
44     29940.0
45     29620.0
46      9040.0
47     10390.0
48     37410.0
49     19820.0
50     15240.0
51     52930.0
52     19250.0
53         NaN
54      6620.0
55     31610.0
56     28710.0
57     20230.0
58     17040.0
59     25730.0
60     28610.0
61     29740.0
62     14380.0
63         NaN
64     69790.0
65      1560.0
66      8130.0
67      4620.0
68      2410.0
69     24630.0
70      8990.0
71      3050.0
72     33760.0
73      6600.0
Name: Ridership, Length: 74, dtype: float64

In [56]:
# Write the stations table to disk; index=False leaves the row index out of the file.
stations.to_csv('stations_cleaned.csv', index=False)