In [3]:
########################
## Original work with ##
## minor additions    ##
########################

import pandas as pd
import numpy as np
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype

In [2]:
#Format days of the week to not appear alphabetically on visuals
cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cat_type = CategoricalDtype(categories=cats, ordered=True)

In [13]:
# Source: http://web.mta.info/developers/turnstile.html
df = pd.concat(map(pd.read_csv, (['http://web.mta.info/developers/data/nyct/turnstile/turnstile_190504.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190511.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190518.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190525.txt',
                    'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190601.txt'])),ignore_index=True)

df.shape

(1026784, 11)

In [14]:
df.rename(columns = {
    'C/A': 'area',
    'UNIT': 'unit',
    'SCP': 'scp',
    'STATION': 'station',
    'DATE': 'date',
    'TIME': 'time',
    'ENTRIES': 'entries',
    'EXITS                                                               ': 'exits'},
    inplace = True)

In [15]:
df.sort_values(['station','area','unit','scp','date','time'], inplace=True)
df.head(30)

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800
31054,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,20:00:00,REGULAR,14647395,16373826
31055,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,00:00:00,REGULAR,14647395,16373849
31056,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,04:00:00,REGULAR,14647395,16373865
31057,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,08:00:00,REGULAR,14647395,16373883
31058,H007,R248,00-00-00,1 AV,L,BMT,04/28/2019,12:00:00,REGULAR,14647395,16373909


In [16]:
#We can combine all three to create a unique ID for any turnstile
df['unit_id'] = df.scp+' '+df.area +' '+df.unit

# Create datetime variable
df['DateTime'] = pd.to_datetime(df.date+' '+df.time)

#Record what day of the week each entry is from
df['Day'] = df.DateTime.dt.day_name()

#Calculate the entries/exits in a four hour period by finding the difference between rows
df['four hour entries'] = df.groupby('unit_id').entries.diff().values
df['four hour exits'] = df.groupby('unit_id').exits.diff().values

#Replace NaN, negative values, and impossibly large values by the average of the turnstile
df['four hour entries'].fillna(-1, inplace=True)
df['four hour exits'].fillna(-1, inplace=True)
df['four hour entries'] = df.groupby(['unit_id','area'])['four hour entries'].transform(
        lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))
df['four hour exits'] = df.groupby(['unit_id','area'])['four hour exits'].transform(
        lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))
df.head()

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits,unit_id,DateTime,Day,four hour entries,four hour exits
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694,00-00-00 H007 R248,2019-04-27 00:00:00,Saturday,378.810573,407.942731
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709,00-00-00 H007 R248,2019-04-27 04:00:00,Saturday,0.0,15.0
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732,00-00-00 H007 R248,2019-04-27 08:00:00,Saturday,1.0,23.0
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766,00-00-00 H007 R248,2019-04-27 12:00:00,Saturday,0.0,34.0
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800,00-00-00 H007 R248,2019-04-27 16:00:00,Saturday,0.0,34.0


In [17]:
# Find total flow over 4 hour blocks
df['total flow'] = df['four hour entries']+df['four hour exits']
df.head()

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits,unit_id,DateTime,Day,four hour entries,four hour exits,total flow
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694,00-00-00 H007 R248,2019-04-27 00:00:00,Saturday,378.810573,407.942731,786.753304
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709,00-00-00 H007 R248,2019-04-27 04:00:00,Saturday,0.0,15.0,15.0
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732,00-00-00 H007 R248,2019-04-27 08:00:00,Saturday,1.0,23.0,24.0
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766,00-00-00 H007 R248,2019-04-27 12:00:00,Saturday,0.0,34.0,34.0
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800,00-00-00 H007 R248,2019-04-27 16:00:00,Saturday,0.0,34.0,34.0


In [18]:
#Find daily sum for each turnstile for each day of the week
df_daily = df.groupby(['unit_id', 'station','date'])
df_daily.head(50)

Unnamed: 0,area,unit,scp,station,LINENAME,DIVISION,date,time,DESC,entries,exits,unit_id,DateTime,Day,four hour entries,four hour exits,total flow
31049,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,00:00:00,REGULAR,14647394,16373694,00-00-00 H007 R248,2019-04-27 00:00:00,Saturday,378.810573,407.942731,786.753304
31050,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,04:00:00,REGULAR,14647394,16373709,00-00-00 H007 R248,2019-04-27 04:00:00,Saturday,0.000000,15.000000,15.000000
31051,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,08:00:00,REGULAR,14647395,16373732,00-00-00 H007 R248,2019-04-27 08:00:00,Saturday,1.000000,23.000000,24.000000
31052,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,12:00:00,REGULAR,14647395,16373766,00-00-00 H007 R248,2019-04-27 12:00:00,Saturday,0.000000,34.000000,34.000000
31053,H007,R248,00-00-00,1 AV,L,BMT,04/27/2019,16:00:00,REGULAR,14647395,16373800,00-00-00 H007 R248,2019-04-27 16:00:00,Saturday,0.000000,34.000000,34.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002164,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,05:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 05:00:00,Friday,0.000000,0.000000,0.000000
1002165,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,09:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 09:00:00,Friday,0.000000,0.000000,0.000000
1002166,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,13:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 13:00:00,Friday,0.000000,0.000000,0.000000
1002167,R419,R326,00-05-01,ZEREGA AV,6,IRT,05/31/2019,17:00:00,REGULAR,39,148,00-05-01 R419 R326,2019-05-31 17:00:00,Friday,0.000000,0.000000,0.000000


In [19]:
#Find daily sum for each STATION for each day of the week
df_daily = df.groupby(['station','date'])['total flow'].sum()
df_daily.head(20)

station  date      
1 AV     04/27/2019    16904.392871
         04/28/2019    12444.000000
         04/29/2019    36454.000000
         04/30/2019    39009.000000
         05/01/2019    40345.000000
         05/02/2019    40741.000000
         05/03/2019    42397.000000
         05/04/2019    17728.000000
         05/05/2019    13728.000000
         05/06/2019    36691.000000
         05/07/2019    39146.000000
         05/08/2019    40615.000000
         05/09/2019    41254.000000
         05/10/2019    41024.000000
         05/11/2019    18131.000000
         05/12/2019    11859.000000
         05/13/2019    36908.000000
         05/14/2019    40150.000000
         05/15/2019    41294.000000
         05/16/2019    40763.000000
Name: total flow, dtype: float64

In [20]:
#################################
##      Location & Mapping     ##
## Data Collection & Cleaning  ##
#################################

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", 200)

In [21]:
# Revisit groupby analysis to confirm listing of top ridership stations (by total flow)
ridership = df.groupby(['station'])['total flow'].sum()
df_ridership = ridership.sort_values(ascending=False).reset_index().round(0)
df_ridership

Unnamed: 0,station,total flow
0,34 ST-PENN STA,9764486.0
1,GRD CNTRL-42 ST,8228594.0
2,34 ST-HERALD SQ,6902750.0
3,14 ST-UNION SQ,6174981.0
4,TIMES SQ-42 ST,5997518.0
5,23 ST,5912547.0
6,FULTON ST,5341551.0
7,42 ST-PORT AUTH,5320527.0
8,86 ST,4878903.0
9,125 ST,4311430.0


In [22]:
# Confirm total to do sanity check on data. In 2019, ~5.5M riders per day were logged. 
# For 5 week period, this is ~190M riders. By exits and entrances, 
# a count of ~380M for total flow would be expected
ridership_total = df_ridership['total flow'].sum().round(0)
print(f"{ridership_total:,}")

314,097,433.0


In [23]:
print(type(df_ridership))
print(df_ridership.shape)

<class 'pandas.core.frame.DataFrame'>
(378, 2)


In [24]:
# Add in necessary packages for zip code identification
import geopy
import pandas as pd
from geopy.point import Point

In [25]:
# Collect data from MTA on subway locations
df_stations = pd.read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')
df_stations.sort_values(by='Stop Name',inplace = True)
df_stations

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label,ADA,ADA Notes
118,119,119,L06,BMT,Canarsie,1 Av,M,L,Subway,40.730953,-73.981628,8 Av,Brooklyn,1,
395,395,395,624,IRT,Lexington Av,103 St,M,6,Subway,40.7906,-73.947478,Uptown & The Bronx,Downtown,0,
155,156,156,A18,IND,8th Av - Fulton St,103 St,M,B C,Subway,40.796092,-73.961454,Uptown & The Bronx,Downtown & Brooklyn,0,
309,309,309,119,IRT,Broadway - 7Av,103 St,M,1,Subway,40.799446,-73.968379,Uptown & The Bronx,Downtown,0,
450,450,450,706,IRT,Flushing,103 St-Corona Plaza,Q,7,Elevated,40.749865,-73.8627,Flushing,Manhattan,0,
81,82,82,J14,BMT,Jamaica,104 St,Q,J Z,Elevated,40.695178,-73.84433,Jamaica,Manhattan,0,
193,193,193,A63,IND,Liberty Av,104 St,Q,A,Elevated,40.681711,-73.837683,Manhattan,Lefferts Blvd,0,
394,394,394,623,IRT,Lexington Av,110 St,M,6,Subway,40.79502,-73.94425,Uptown & The Bronx,Downtown,0,
194,194,194,A64,IND,Liberty Av,111 St,Q,A,Elevated,40.684331,-73.832163,Manhattan,Lefferts Blvd,0,
449,449,449,705,IRT,Flushing,111 St,Q,7,Elevated,40.75173,-73.855334,Flushing,Manhattan,0,


In [7]:
# Correct aspects of the df_stations data frame
df_stations.rename(columns = {'GTFS Latitude':'Lat','GTFS Longitude':'Lon'}, inplace = True) 
df_stations['Stop Name'] = df_stations['Stop Name'].str.upper() 
df_stations

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes
118,119,119,L06,BMT,Canarsie,1 AV,M,L,Subway,40.730953,-73.981628,8 Av,Brooklyn,1,
395,395,395,624,IRT,Lexington Av,103 ST,M,6,Subway,40.7906,-73.947478,Uptown & The Bronx,Downtown,0,
155,156,156,A18,IND,8th Av - Fulton St,103 ST,M,B C,Subway,40.796092,-73.961454,Uptown & The Bronx,Downtown & Brooklyn,0,
309,309,309,119,IRT,Broadway - 7Av,103 ST,M,1,Subway,40.799446,-73.968379,Uptown & The Bronx,Downtown,0,
450,450,450,706,IRT,Flushing,103 ST-CORONA PLAZA,Q,7,Elevated,40.749865,-73.8627,Flushing,Manhattan,0,
81,82,82,J14,BMT,Jamaica,104 ST,Q,J Z,Elevated,40.695178,-73.84433,Jamaica,Manhattan,0,
193,193,193,A63,IND,Liberty Av,104 ST,Q,A,Elevated,40.681711,-73.837683,Manhattan,Lefferts Blvd,0,
394,394,394,623,IRT,Lexington Av,110 ST,M,6,Subway,40.79502,-73.94425,Uptown & The Bronx,Downtown,0,
194,194,194,A64,IND,Liberty Av,111 ST,Q,A,Elevated,40.684331,-73.832163,Manhattan,Lefferts Blvd,0,
449,449,449,705,IRT,Flushing,111 ST,Q,7,Elevated,40.75173,-73.855334,Flushing,Manhattan,0,


In [8]:
df_stations.shape

(496, 15)

In [9]:
# Create function to extract zip codes using geopy, then append to df_stations
def get_zipcode(df, geolocator, lat_field, lon_field):
    location = geolocator.reverse((df[lat_field], df[lon_field]))
    try:
        return location.raw['address']['postcode']
    except Exception:
        print("Error raised")
        print(df['Stop Name'])


geolocator = geopy.Nominatim(user_agent='rhys.carter86@gmail.com')

In [10]:
zipcodes = df_stations.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Lat', lon_field='Lon')
#print(zipcodes)

Error raised
72 ST
Error raised
PARKSIDE AV


In [27]:
print(type(zipcodes))
print(zipcodes.shape)

<class 'pandas.core.series.Series'>
(496,)


In [29]:
df_stations['Zipcode']=zipcodes
df_stations['STATION']=df_stations['Stop Name']
df_stations.head(50)

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode,STATION
118,119,119,L06,BMT,Canarsie,1 AV,M,L,Subway,40.730953,-73.981628,8 Av,Brooklyn,1,,10009,1 AV
395,395,395,624,IRT,Lexington Av,103 ST,M,6,Subway,40.7906,-73.947478,Uptown & The Bronx,Downtown,0,,10029,103 ST
155,156,156,A18,IND,8th Av - Fulton St,103 ST,M,B C,Subway,40.796092,-73.961454,Uptown & The Bronx,Downtown & Brooklyn,0,,10025-4403,103 ST
309,309,309,119,IRT,Broadway - 7Av,103 ST,M,1,Subway,40.799446,-73.968379,Uptown & The Bronx,Downtown,0,,10025,103 ST
450,450,450,706,IRT,Flushing,103 ST-CORONA PLAZA,Q,7,Elevated,40.749865,-73.8627,Flushing,Manhattan,0,,11368,103 ST-CORONA PLAZA
81,82,82,J14,BMT,Jamaica,104 ST,Q,J Z,Elevated,40.695178,-73.84433,Jamaica,Manhattan,0,,11418,104 ST
193,193,193,A63,IND,Liberty Av,104 ST,Q,A,Elevated,40.681711,-73.837683,Manhattan,Lefferts Blvd,0,,11419,104 ST
394,394,394,623,IRT,Lexington Av,110 ST,M,6,Subway,40.79502,-73.94425,Uptown & The Bronx,Downtown,0,,10029,110 ST
194,194,194,A64,IND,Liberty Av,111 ST,Q,A,Elevated,40.684331,-73.832163,Manhattan,Lefferts Blvd,0,,11419,111 ST
449,449,449,705,IRT,Flushing,111 ST,Q,7,Elevated,40.75173,-73.855334,Flushing,Manhattan,0,,11368,111 ST


In [49]:
# For spotchecking data, assess status of complexes having multiple stop names (stations)
df_complexes = df_stations.groupby(['Complex ID','Stop Name'])['Stop Name'].count()
df_complexes_sort = df_complexes.sort_values(ascending=False)
#print(type(df_complexes))
df_complexes_sort


Complex ID  Stop Name                            
623         CANAL ST                                 4
628         FULTON ST                                4
611         TIMES SQ-42 ST                           4
610         GRAND CENTRAL-42 ST                      3
617         ATLANTIC AV-BARCLAYS CTR                 3
621         BROADWAY JUNCTION                        3
608         4 AV-9 ST                                2
167         W 4 ST-WASH SQ                           2
614         59 ST-COLUMBUS CIRCLE                    2
606         COURT SQ                                 2
607         34 ST-HERALD SQ                          2
604         161 ST-YANKEE STADIUM                    2
603         149 ST-GRAND CONCOURSE                   2
602         14 ST-UNION SQ                           2
601         14 ST                                    2
461         QUEENSBORO PLAZA                         2
620         BOROUGH HALL                             2
636         JAY

In [51]:
# Converge data onto one stop name for use in merging station and ridership data
df_stations_unique_name = df_stations.drop_duplicates(subset = ["Stop Name"])
df_stations_unique_name

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode,STATION
118,119,119,L06,BMT,Canarsie,1 AV,M,L,Subway,40.730953,-73.981628,8 Av,Brooklyn,1,,10009,1 AV
395,395,395,624,IRT,Lexington Av,103 ST,M,6,Subway,40.7906,-73.947478,Uptown & The Bronx,Downtown,0,,10029,103 ST
450,450,450,706,IRT,Flushing,103 ST-CORONA PLAZA,Q,7,Elevated,40.749865,-73.8627,Flushing,Manhattan,0,,11368,103 ST-CORONA PLAZA
81,82,82,J14,BMT,Jamaica,104 ST,Q,J Z,Elevated,40.695178,-73.84433,Jamaica,Manhattan,0,,11418,104 ST
394,394,394,623,IRT,Lexington Av,110 ST,M,6,Subway,40.79502,-73.94425,Uptown & The Bronx,Downtown,0,,10029,110 ST
194,194,194,A64,IND,Liberty Av,111 ST,Q,A,Elevated,40.684331,-73.832163,Manhattan,Lefferts Blvd,0,,11419,111 ST
153,154,154,A16,IND,8th Av - Fulton St,116 ST,M,B C,Subway,40.805085,-73.954882,Uptown & The Bronx,Downtown & Brooklyn,0,,10026,116 ST
307,307,307,117,IRT,Broadway - 7Av,116 ST-COLUMBIA UNIVERSITY,M,1,Subway,40.807722,-73.96411,Uptown & The Bronx,Downtown,0,,10027,116 ST-COLUMBIA UNIVERSITY
79,80,80,J12,BMT,Jamaica,121 ST,Q,J Z,Elevated,40.700492,-73.828294,Jamaica,Manhattan,0,,11418,121 ST
306,306,306,116,IRT,Broadway - 7Av,125 ST,M,1,Elevated,40.815581,-73.958372,Uptown & The Bronx,Downtown,0,,10027,125 ST


In [53]:
# Ensure linking column name is consistent across both data frames
df_stations_unique_name.rename(columns = {'STATION':'station'}, inplace = True)
df_stations_unique_name.head()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode,station
118,119,119,L06,BMT,Canarsie,1 AV,M,L,Subway,40.730953,-73.981628,8 Av,Brooklyn,1,,10009,1 AV
395,395,395,624,IRT,Lexington Av,103 ST,M,6,Subway,40.7906,-73.947478,Uptown & The Bronx,Downtown,0,,10029,103 ST
450,450,450,706,IRT,Flushing,103 ST-CORONA PLAZA,Q,7,Elevated,40.749865,-73.8627,Flushing,Manhattan,0,,11368,103 ST-CORONA PLAZA
81,82,82,J14,BMT,Jamaica,104 ST,Q,J Z,Elevated,40.695178,-73.84433,Jamaica,Manhattan,0,,11418,104 ST
394,394,394,623,IRT,Lexington Av,110 ST,M,6,Subway,40.79502,-73.94425,Uptown & The Bronx,Downtown,0,,10029,110 ST


In [65]:
# Manually correct station names causing link issues between the df_stations_unique_name 
# and the df_ridership dataframes

df_stations_unique_name.at[318, 'station'] = '34 ST-PENN STA'
df_stations_unique_name.at[402, 'station'] = 'GRD CNTRL-42 ST'
df_stations_unique_name.at[162, 'station'] = '42 ST-PORT AUTH'
df_stations_unique_name.at[160, 'station'] = '59 ST COLUMBUS'
df_stations_unique_name.at[225, 'station'] = '47-50 STS ROCK'
df_stations_unique_name.at[447, 'station'] = 'FLUSHING-MAIN'
df_stations_unique_name.at[171, 'station'] = 'PATH NEW WTC'
df_stations_unique_name.at[267, 'station'] = 'JKSN HT-ROOSVLT'
df_stations_unique_name.at[338, 'station'] = 'ATL AV-BARCLAY'
df_stations_unique_name.at[24, 'station'] = 'JAY ST-METROTEC'
df_stations_unique_name.at[275, 'station'] = 'LEXINGTON AV/53'
df_stations_unique_name.at[389, 'station'] = '161/YANKEE STAD'
df_stations_unique_name.at[278, 'station'] = 'JAMAICA CENTER'
df_stations_unique_name.at[313, 'station'] = '72 ST'

df_stations_unique_name


Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode,station
118,119,119,L06,BMT,Canarsie,1 AV,M,L,Subway,40.730953,-73.981628,8 Av,Brooklyn,1,,10009,1 AV
395,395,395,624,IRT,Lexington Av,103 ST,M,6,Subway,40.7906,-73.947478,Uptown & The Bronx,Downtown,0,,10029,103 ST
450,450,450,706,IRT,Flushing,103 ST-CORONA PLAZA,Q,7,Elevated,40.749865,-73.8627,Flushing,Manhattan,0,,11368,103 ST-CORONA PLAZA
81,82,82,J14,BMT,Jamaica,104 ST,Q,J Z,Elevated,40.695178,-73.84433,Jamaica,Manhattan,0,,11418,104 ST
394,394,394,623,IRT,Lexington Av,110 ST,M,6,Subway,40.79502,-73.94425,Uptown & The Bronx,Downtown,0,,10029,110 ST
194,194,194,A64,IND,Liberty Av,111 ST,Q,A,Elevated,40.684331,-73.832163,Manhattan,Lefferts Blvd,0,,11419,111 ST
153,154,154,A16,IND,8th Av - Fulton St,116 ST,M,B C,Subway,40.805085,-73.954882,Uptown & The Bronx,Downtown & Brooklyn,0,,10026,116 ST
307,307,307,117,IRT,Broadway - 7Av,116 ST-COLUMBIA UNIVERSITY,M,1,Subway,40.807722,-73.96411,Uptown & The Bronx,Downtown,0,,10027,116 ST-COLUMBIA UNIVERSITY
79,80,80,J12,BMT,Jamaica,121 ST,Q,J Z,Elevated,40.700492,-73.828294,Jamaica,Manhattan,0,,11418,121 ST
306,306,306,116,IRT,Broadway - 7Av,125 ST,M,1,Elevated,40.815581,-73.958372,Uptown & The Bronx,Downtown,0,,10027,125 ST


In [66]:
# Address a Zipcode +4 creating a link issue between the df_stations_unique_name and the df_ridership dataframes
df_stations_unique_name.at[447, 'Zipcode'] = '11354' # is 111354 in data
df_stations_unique_name

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode,station
118,119,119,L06,BMT,Canarsie,1 AV,M,L,Subway,40.730953,-73.981628,8 Av,Brooklyn,1,,10009,1 AV
395,395,395,624,IRT,Lexington Av,103 ST,M,6,Subway,40.7906,-73.947478,Uptown & The Bronx,Downtown,0,,10029,103 ST
450,450,450,706,IRT,Flushing,103 ST-CORONA PLAZA,Q,7,Elevated,40.749865,-73.8627,Flushing,Manhattan,0,,11368,103 ST-CORONA PLAZA
81,82,82,J14,BMT,Jamaica,104 ST,Q,J Z,Elevated,40.695178,-73.84433,Jamaica,Manhattan,0,,11418,104 ST
394,394,394,623,IRT,Lexington Av,110 ST,M,6,Subway,40.79502,-73.94425,Uptown & The Bronx,Downtown,0,,10029,110 ST
194,194,194,A64,IND,Liberty Av,111 ST,Q,A,Elevated,40.684331,-73.832163,Manhattan,Lefferts Blvd,0,,11419,111 ST
153,154,154,A16,IND,8th Av - Fulton St,116 ST,M,B C,Subway,40.805085,-73.954882,Uptown & The Bronx,Downtown & Brooklyn,0,,10026,116 ST
307,307,307,117,IRT,Broadway - 7Av,116 ST-COLUMBIA UNIVERSITY,M,1,Subway,40.807722,-73.96411,Uptown & The Bronx,Downtown,0,,10027,116 ST-COLUMBIA UNIVERSITY
79,80,80,J12,BMT,Jamaica,121 ST,Q,J Z,Elevated,40.700492,-73.828294,Jamaica,Manhattan,0,,11418,121 ST
306,306,306,116,IRT,Broadway - 7Av,125 ST,M,1,Elevated,40.815581,-73.958372,Uptown & The Bronx,Downtown,0,,10027,125 ST


In [67]:
# Merge the df_stations_unique_name and the df_ridership dataframes to assign zip codes to station locations
# Note that further clean up is needed for those stations outside of the top 30 stations (> 2M in total volume, excluding Yankees Stadium, 
# which was specifically targeted for high total flow in the Bronx)

df_stations_with_zip = pd.merge(df_ridership,df_stations_unique_name, on='station',how='left')
df_stations_with_zip.sort_values(by='total flow',ascending=False).reset_index().round(0)

Unnamed: 0,index,station,total flow,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode
0,0,34 ST-PENN STA,9764486.0,318.0,318.0,128,IRT,Broadway - 7Av,34 ST-PENN STATION,M,1 2 3,Subway,41.0,-74.0,Uptown & The Bronx,Downtown & Brooklyn,1.0,,10018
1,1,GRD CNTRL-42 ST,8228594.0,402.0,610.0,631,IRT,Lexington Av,GRAND CENTRAL-42 ST,M,4 5 6,Subway,41.0,-74.0,Uptown & The Bronx,Downtown & Brooklyn,1.0,,10017
2,2,34 ST-HERALD SQ,6902750.0,227.0,607.0,D17,IND,6th Av - Culver,34 ST-HERALD SQ,M,B D F M,Subway,41.0,-74.0,Uptown & The Bronx - Queens,Downtown & Brooklyn,1.0,,10019
3,3,14 ST-UNION SQ,6174981.0,15.0,602.0,R20,BMT,Broadway - Brighton,14 ST-UNION SQ,M,N Q R W,Subway,41.0,-74.0,Uptown & Queens,Downtown & Brooklyn,1.0,,10003
4,4,TIMES SQ-42 ST,5997518.0,11.0,611.0,R16,BMT,Broadway - Brighton,TIMES SQ-42 ST,M,N Q R W,Subway,41.0,-74.0,Uptown & Queens,Downtown & Brooklyn,1.0,,10036
5,5,23 ST,5912547.0,14.0,14.0,R19,BMT,Broadway - Brighton,23 ST,M,R W,Subway,41.0,-74.0,Uptown & Queens,Downtown & Brooklyn,0.0,,10010
6,6,FULTON ST,5341551.0,106.0,628.0,M22,BMT,Jamaica,FULTON ST,M,J Z,Subway,41.0,-74.0,Brooklyn,Broad St,1.0,,10005
7,7,42 ST-PORT AUTH,5320527.0,163.0,611.0,A27,IND,8th Av - Fulton St,42 ST-PORT AUTHORITY BUS TERMINAL,M,A C E,Subway,41.0,-74.0,Uptown - Queens,Downtown & Brooklyn,1.0,,10036
8,8,86 ST,4878903.0,311.0,311.0,121,IRT,Broadway - 7Av,86 ST,M,1,Subway,41.0,-74.0,Uptown & The Bronx,Downtown,0.0,,10024
9,9,125 ST,4311430.0,306.0,306.0,116,IRT,Broadway - 7Av,125 ST,M,1,Elevated,41.0,-74.0,Uptown & The Bronx,Downtown,0.0,,10027


In [68]:
print(df_stations_with_zip.shape)

(378, 18)


In [70]:
#################################
## Location & Mapping Analysis ##
##     and Viz Development     ##
#################################

import folium
import matplotlib.pyplot as plt

In [71]:
import geopandas as gp

In [73]:
# Import zipcode data for New York City. Source for initial zip code layer visualization:
# https://github.com/fedhere/PUI2015_EC/blob/master/mam1612_EC/nyc-zip-code-tabulation-areas-polygons.geojson

NYC_zipcodes = gp.read_file('/Users/arcarter/Downloads/nyc-zip-code-tabulation-areas-polygons.geojson')
NYC_zipcodes['postalCode'] = NYC_zipcodes_json['postalCode'].astype(str)
NYC_zipcodes.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   OBJECTID        262 non-null    int64   
 1   postalCode      262 non-null    object  
 2   PO_NAME         262 non-null    object  
 3   STATE           262 non-null    object  
 4   borough         262 non-null    object  
 5   ST_FIPS         262 non-null    object  
 6   CTY_FIPS        262 non-null    object  
 7   BLDGpostalCode  262 non-null    int64   
 8   Shape_Leng      262 non-null    float64 
 9   Shape_Area      262 non-null    float64 
 10  @id             262 non-null    object  
 11  geometry        262 non-null    geometry
dtypes: float64(2), geometry(1), int64(2), object(7)
memory usage: 24.7+ KB


In [47]:
NYC_zipcodes

Unnamed: 0,OBJECTID,postalCode,PO_NAME,STATE,borough,ST_FIPS,CTY_FIPS,BLDGpostalCode,Shape_Leng,Shape_Area,@id,geometry
0,1,11372,Jackson Heights,NY,Queens,36,81,0,20624.692317,20163280.0,http://nyc.pediacities.com/Resource/PostalCode/11372,"POLYGON ((-73.86942 40.74916, -73.89507 40.74647, -73.89619 40.74851, -73.89584 40.74855, -73.89525 40.74831, -73.89654 40.75054, -73.89580 40.75062, -73.89652 40.75439, -73.87222 40.75694, -73.87..."
1,2,11004,Glen Oaks,NY,Queens,36,81,0,23002.816039,22606530.0,http://nyc.pediacities.com/Resource/PostalCode/11004,"POLYGON ((-73.71068 40.75004, -73.70869 40.74876, -73.70713 40.74958, -73.70420 40.75017, -73.70219 40.74474, -73.70309 40.74455, -73.70098 40.73890, -73.71371 40.73617, -73.71568 40.73963, -73.71..."
2,3,11040,New Hyde Park,NY,Queens,36,81,0,15749.161511,6269333.0,http://nyc.pediacities.com/Resource/PostalCode/11040,"POLYGON ((-73.70098 40.73890, -73.70309 40.74455, -73.70219 40.74474, -73.70420 40.75017, -73.70713 40.74958, -73.70869 40.74876, -73.71068 40.75004, -73.70750 40.75269, -73.70478 40.75527, -73.70..."
3,4,11426,Bellerose,NY,Queens,36,81,0,35932.810639,49418360.0,http://nyc.pediacities.com/Resource/PostalCode/11426,"POLYGON ((-73.72270 40.75373, -73.72251 40.75336, -73.72118 40.75232, -73.72083 40.75117, -73.71966 40.74895, -73.71867 40.74631, -73.71568 40.73963, -73.71474 40.73783, -73.71148 40.73250, -73.71..."
4,5,11365,Fresh Meadows,NY,Queens,36,81,0,38693.565676,69385870.0,http://nyc.pediacities.com/Resource/PostalCode/11365,"POLYGON ((-73.81089 40.72717, -73.81116 40.72831, -73.81088 40.73044, -73.81039 40.73833, -73.81497 40.73857, -73.81510 40.73923, -73.81321 40.73899, -73.81036 40.73887, -73.81013 40.74230, -73.81..."
5,6,11373,Elmhurst,NY,Queens,36,81,0,33755.870988,42659400.0,http://nyc.pediacities.com/Resource/PostalCode/11373,"POLYGON ((-73.88722 40.72753, -73.88723 40.72893, -73.88752 40.73008, -73.88756 40.73020, -73.88806 40.73009, -73.88852 40.73453, -73.88938 40.73586, -73.89139 40.73601, -73.88869 40.73655, -73.89..."
6,7,11001,Floral Park,NY,Queens,36,81,0,13594.924549,9155180.0,http://nyc.pediacities.com/Resource/PostalCode/11001,"POLYGON ((-73.70098 40.73890, -73.69960 40.73959, -73.69988 40.73837, -73.70086 40.73588, -73.70655 40.72747, -73.70906 40.72664, -73.71048 40.72733, -73.71075 40.73059, -73.71148 40.73250, -73.71..."
7,8,11375,Forest Hills,NY,Queens,36,81,0,36277.076839,55587770.0,http://nyc.pediacities.com/Resource/PostalCode/11375,"POLYGON ((-73.85625 40.73672, -73.85205 40.73768, -73.85183 40.73726, -73.85134 40.73740, -73.85004 40.73781, -73.84558 40.73985, -73.84499 40.73576, -73.84414 40.73373, -73.84266 40.73163, -73.84..."
8,9,11427,Queens Village,NY,Queens,36,81,0,31231.968593,39568340.0,http://nyc.pediacities.com/Resource/PostalCode/11427,"POLYGON ((-73.74169 40.73682, -73.73568 40.73813, -73.73377 40.73890, -73.73453 40.73544, -73.73481 40.73516, -73.73545 40.73346, -73.73117 40.73268, -73.73193 40.73092, -73.73292 40.73088, -73.73..."
9,10,11374,Rego Park,NY,Queens,36,81,0,26323.994393,25203460.0,http://nyc.pediacities.com/Resource/PostalCode/11374,"POLYGON ((-73.86451 40.73407, -73.85872 40.73580, -73.85882 40.73595, -73.85625 40.73672, -73.85645 40.73625, -73.85470 40.73283, -73.85434 40.73297, -73.85401 40.73265, -73.85167 40.72839, -73.85..."


In [77]:
# Render base map layer, with features to then add including markers for specific train stations team will be targeting,
# and zip code information for high-traffic stations
NYC_map = folium.Map(location=[40.730610,-73.935242],zoom_start=10)
NYC_map

In [78]:
# Create map with zip codes and station traffic for top 30 (and Yankees Statidum foot-traffic)
# All other stations with low foot traffic or unable to be matched are grayed out

NYC_map = folium.Map(location=[40.730610,-73.935242],zoom_start=10)

folium.Choropleth(
     geo_data=NYC_zipcodes,
     data=df_stations_with_zip,
     columns=['Zipcode','total flow'],
     key_on='feature.properties.postalCode',
     fill_color='YlOrRd',nan_fill_color='gray'
).add_to(NYC_map)

"""
def get_zipcode(df, geolocator, lat_field, lon_field):
    location = geolocator.reverse((df[lat_field], df[lon_field]))
    try:
        return location.raw['address']['postcode']
    except Exception:
        print("Error raised")


geolocator = geopy.Nominatim(user_agent='rhys.carter86@gmail.com')


zipcodes = df_stations.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Lat', lon_field='Lon')
#print(zipcodes)
"""


    
folium.Marker([40.827994,-73.925831],
             popup='<strong>Yankee Stadium</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='orange',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.684359, -73.977666],
             popup='<strong>Barclays Center</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='lightblue',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.692180,-73.985942],
             popup='<strong>JAY ST-METROTECH</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='lightblue',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.735736,-73.990568],
             popup='<strong>14th Street</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='pink',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.759600,-73.830030],
             popup='<strong>Flushing Main</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='darkblue',icon="train", prefix='fa')).add_to(NYC_map)

"""
40.749719,-73.987823 '34 ST-HERALD SQ'
40.735736,-73.990568 '14 ST-UNION SQ'
40.754672,-73.986754 'TIMES SQ-42 ST'
"""
NYC_map.save('map.html')
NYC_map

In [79]:
######################################
## Transition to reusable functions ##
##     for Map Rendering            ##
######################################
# *Note: still in progress

NYC_map_template = folium.Map(location=[40.730610,-73.935242],zoom_start=10)

tooltip = 'Click for Station Name'
def folium_marker(df,stop_name,lat_field,lon_field):
    map_highlight = folium.Marker(df[lat_field],df[lon_field],
                                 popup='<strong>{}</strong>'.format(stop_name),
                                 tooltip=tooltip,
                                 icon=folium.Icon(color='red',icon='train',prefix='fa')).add_to(NYC_map_template)

In [81]:
df_stations_marker_test1 = df_stations_with_zip.sample(10)
df_stations_marker_test1

Unnamed: 0,station,total flow,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode
269,WOODLAWN,264890.0,378.0,378.0,401,IRT,Jerome Av,WOODLAWN,Bx,4,Elevated,40.886037,-73.878751,,Manhattan,0.0,,10468.0
141,116 ST-COLUMBIA,634068.0,,,,,,,,,,,,,,,,
373,BEACH 98 ST,32076.0,201.0,201.0,H13,IND,Rockaway,BEACH 98 ST,Q,A S,Viaduct,40.585307,-73.820558,Manhattan,Rockaway Park,0.0,,11693.0
186,3 AV 138 ST,460857.0,,,,,,,,,,,,,,,,
318,75 ST-ELDERTS,175996.0,,,,,,,,,,,,,,,,
213,NEW LOTS AV,357378.0,352.0,352.0,257,IRT,Eastern Pky,NEW LOTS AV,Bk,3,Elevated,40.666235,-73.884079,Manhattan,,0.0,,11207.0
25,42 ST-BRYANT PK,2470930.0,226.0,609.0,D16,IND,6th Av - Culver,42 ST-BRYANT PK,M,B D F M,Subway,40.754222,-73.984569,Uptown & The Bronx - Queens,Downtown & Brooklyn,0.0,,10019.0
356,BUSHWICK AV,107210.0,,,,,,,,,,,,,,,,
111,137 ST CITY COL,809425.0,,,,,,,,,,,,,,,,
216,MORISN AV/SNDVW,352571.0,,,,,,,,,,,,,,,,


In [82]:
df_stations_marker_test2 = df_stations_with_zip.head(10)
df_stations_marker_test2

Unnamed: 0,station,total flow,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,Lat,Lon,North Direction Label,South Direction Label,ADA,ADA Notes,Zipcode
0,34 ST-PENN STA,9764486.0,318.0,318.0,128,IRT,Broadway - 7Av,34 ST-PENN STATION,M,1 2 3,Subway,40.750373,-73.991057,Uptown & The Bronx,Downtown & Brooklyn,1.0,,10018
1,GRD CNTRL-42 ST,8228594.0,402.0,610.0,631,IRT,Lexington Av,GRAND CENTRAL-42 ST,M,4 5 6,Subway,40.751776,-73.976848,Uptown & The Bronx,Downtown & Brooklyn,1.0,,10017
2,34 ST-HERALD SQ,6902750.0,227.0,607.0,D17,IND,6th Av - Culver,34 ST-HERALD SQ,M,B D F M,Subway,40.749719,-73.987823,Uptown & The Bronx - Queens,Downtown & Brooklyn,1.0,,10019
3,14 ST-UNION SQ,6174981.0,15.0,602.0,R20,BMT,Broadway - Brighton,14 ST-UNION SQ,M,N Q R W,Subway,40.735736,-73.990568,Uptown & Queens,Downtown & Brooklyn,1.0,,10003
4,TIMES SQ-42 ST,5997518.0,11.0,611.0,R16,BMT,Broadway - Brighton,TIMES SQ-42 ST,M,N Q R W,Subway,40.754672,-73.986754,Uptown & Queens,Downtown & Brooklyn,1.0,,10036
5,23 ST,5912547.0,14.0,14.0,R19,BMT,Broadway - Brighton,23 ST,M,R W,Subway,40.741303,-73.989344,Uptown & Queens,Downtown & Brooklyn,0.0,,10010
6,FULTON ST,5341551.0,106.0,628.0,M22,BMT,Jamaica,FULTON ST,M,J Z,Subway,40.710374,-74.007582,Brooklyn,Broad St,1.0,,10005
7,42 ST-PORT AUTH,5320527.0,163.0,611.0,A27,IND,8th Av - Fulton St,42 ST-PORT AUTHORITY BUS TERMINAL,M,A C E,Subway,40.757308,-73.989735,Uptown - Queens,Downtown & Brooklyn,1.0,,10036
8,86 ST,4878903.0,311.0,311.0,121,IRT,Broadway - 7Av,86 ST,M,1,Subway,40.788644,-73.976218,Uptown & The Bronx,Downtown,0.0,,10024
9,125 ST,4311430.0,306.0,306.0,116,IRT,Broadway - 7Av,125 ST,M,1,Elevated,40.815581,-73.958372,Uptown & The Bronx,Downtown,0.0,,10027


In [85]:
#def folium_marker(df,stop_name,lat_field,lon_field)
#zipcodes = df_stations.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Lat', lon_field='Lon')
marker_test = df_stations_marker_test2.apply(folium_marker,axis=1,stop_name='Stop Name',lat_field='Lat', lon_field='Lon')

In [11]:
###############################
## Alternative solution for  ##
## mapping station zip codes ##
###############################
# *Note: this was not presented in the original presentation. 
# This alternative solution uses the Google Map API instead. 
# This is modified from a Python 2-based response to this prompt available here:
# https://github.com/galenballew/Metis-Submissions/blob/master/projects/01%20Benson/.ipynb_checkpoints/zip_codes-checkpoint.ipynb

import pandas as pd
import urllib
import json
import urllib.parse
import urllib.request

In [26]:
# Establish unique listing of stations from the earlier merged 
mta_stations = list(df['station'].unique())
stations_zip = pd.DataFrame(columns = ['STATION', 'ZIP'])

In [27]:
df.shape
len(mta_stations)

378

In [28]:
mta_stations_header = mta_stations[0:19]
print(mta_stations_header)

['1 AV', '103 ST', '103 ST-CORONA', '104 ST', '110 ST', '111 ST', '116 ST', '116 ST-COLUMBIA', '121 ST', '125 ST', '135 ST', '137 ST CITY COL', '138/GRAND CONC', '14 ST', '14 ST-UNION SQ', '145 ST', '149/GRAND CONC', '14TH STREET', '15 ST-PROSPECT']


In [29]:
# Create a new data framre using the Google Map API
zips = pd.DataFrame(data=None)
gmaps_base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
api_key = 'AIzaSyBrJ9fMjk5oHryg83GbKFZPZRMJcfyZskg'
scontext = None

In [30]:
for station in mta_stations:
    try:
        search_criteria = {'address': station + ' station, New York, NY',
                           'key' : api_key
                           }
        #print(search_criteria)
        #print('\n')
        url = gmaps_base_url + urllib.parse.urlencode(search_criteria)
        #print(url)
        #print('\n')
        uh = urllib.request.urlopen(url, context=scontext)
        #print(uh)
        #print('\n')
        data = uh.read()
        #print(data)
        js = json.loads(data)
        #print(js)
        #request = requests.get("http://api.roblox.com/Marketplace/ProductInfo?assetId=1834225941").text
        #a = json.loads(request)
        #print(a['Sales'])
        dicts = js['results'][0]['address_components']
        zip_dict = (item for item in dicts if item["types"] == [ "postal_code" ]).__next__()
        zip_code = zip_dict['long_name']
        #print(station,zip_code)
        zips = zips.append(pd.Series((station, zip_code)), ignore_index=True)
    except:
        print("Didn't work: "+station)

Didn't work: BROADWAY JCT
Didn't work: E 149 ST
Didn't work: HOWARD BCH JFK
Didn't work: JAMAICA CENTER
Didn't work: KEW GARDENS
Didn't work: MYRTLE AV
Didn't work: NEREID AV


In [31]:
zips

Unnamed: 0,0,1
0,1 AV,10009
1,103 ST,10025
2,103 ST-CORONA,11368
3,104 ST,11418
4,110 ST,10029
5,111 ST,11418
6,116 ST,10035
7,116 ST-COLUMBIA,10027
8,121 ST,11418
9,125 ST,10035


In [32]:
# Address the stations which could not be found. Manually investigate and enter
errors = {'BROADWAY JCT': 11233,
'E 149 ST': 10455,
'HOWARD BCH JFK': 11414,
'JAMAICA CENTER': 11435,
'KEW GARDENS': 11415,
'MYRTLE AV': 11206,
'NEREID AV': 10466}

for station, zip_code in errors.items():
    zips = zips.append(pd.Series((station, zip_code)), ignore_index=True)

In [33]:
print(type(zips))
zips

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0,1
0,1 AV,10009
1,103 ST,10025
2,103 ST-CORONA,11368
3,104 ST,11418
4,110 ST,10029
5,111 ST,11418
6,116 ST,10035
7,116 ST-COLUMBIA,10027
8,121 ST,11418
9,125 ST,10035


In [34]:
zips.rename(columns = {0: 'station', 1: 'zip_code'},
           inplace= True)
zips.head()

Unnamed: 0,station,zip_code
0,1 AV,10009
1,103 ST,10025
2,103 ST-CORONA,11368
3,104 ST,11418
4,110 ST,10029


In [35]:
df_stations_with_zip_v2 = pd.merge(df_ridership,zips, on='station',how='left')
df_stations_with_zip_v2.head(20)

Unnamed: 0,station,total flow,zip_code
0,34 ST-PENN STA,9764486.0,10001
1,GRD CNTRL-42 ST,8228594.0,10017
2,34 ST-HERALD SQ,6902750.0,10001
3,14 ST-UNION SQ,6174981.0,10003
4,TIMES SQ-42 ST,5997518.0,10018
5,23 ST,5912547.0,10011
6,FULTON ST,5341551.0,10038
7,42 ST-PORT AUTH,5320527.0,10036
8,86 ST,4878903.0,10024
9,125 ST,4311430.0,10035


In [40]:
df_zips_ridership = df_stations_with_zip_v2.groupby(['zip_code'])['total flow'].sum().sort_values(ascending=False)
print(df_zips_ridership)

zip_code
10001    18846364.0
10011    13441745.0
10003     9694088.0
10038     8732997.0
10018     8468448.0
10007     8401155.0
10017     8356375.0
10023     8232544.0
10019     7348019.0
11201     7135598.0
10022     6779262.0
10024     6614341.0
10035     5975083.0
10016     5499252.0
11226     5409864.0
10036     5320527.0
11372     5309949.0
11101     5288400.0
10029     5249281.0
10065     4904403.0
11217     4815905.0
10013     4588965.0
10012     4470414.0
10002     4408151.0
11211     3954113.0
10014     3650427.0
10004     3599594.0
10020     3572542.0
11354     3176610.0
11213     3047539.0
11225     3021229.0
10021     2988359.0
10031     2936305.0
11368     2920305.0
10025     2588101.0
11435     2560613.0
10468     2474522.0
11207     2377582.0
07302     2313821.0
11237     2240701.0
11377     2217023.0
10451     2179185.0
11206     2128216.0
11215     2065759.0
11373     2010862.0
10033     1991198.0
10452     1969419.0
10472     1934727.0
11435     1905280.0
10106     1

In [41]:
df_zips_ridership = df_zips_ridership.to_frame()
df_zips_ridership.reset_index(inplace=True)
df_zips_ridership

Unnamed: 0,zip_code,total flow
0,10001,18846364.0
1,10011,13441745.0
2,10003,9694088.0
3,10038,8732997.0
4,10018,8468448.0
5,10007,8401155.0
6,10017,8356375.0
7,10023,8232544.0
8,10019,7348019.0
9,11201,7135598.0


In [45]:
# Render base map layer, with features to then add including markers for specific train stations team will be targeting,
# and zip code information for high-traffic stations
import folium
import matplotlib.pyplot as plt
import geopandas as gp

NYC_map = folium.Map(location=[40.730610,-73.935242],zoom_start=10)
NYC_map

In [47]:
# Import zipcode data for New York City. Source for initial zip code layer visualization:
# https://github.com/fedhere/PUI2015_EC/blob/master/mam1612_EC/nyc-zip-code-tabulation-areas-polygons.geojson

NYC_zipcodes = gp.read_file('/Users/arcarter/Downloads/nyc-zip-code-tabulation-areas-polygons.geojson')
NYC_zipcodes['postalCode'] = NYC_zipcodes['postalCode'].astype(str)
NYC_zipcodes.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   OBJECTID        262 non-null    int64   
 1   postalCode      262 non-null    object  
 2   PO_NAME         262 non-null    object  
 3   STATE           262 non-null    object  
 4   borough         262 non-null    object  
 5   ST_FIPS         262 non-null    object  
 6   CTY_FIPS        262 non-null    object  
 7   BLDGpostalCode  262 non-null    int64   
 8   Shape_Leng      262 non-null    float64 
 9   Shape_Area      262 non-null    float64 
 10  @id             262 non-null    object  
 11  geometry        262 non-null    geometry
dtypes: float64(2), geometry(1), int64(2), object(7)
memory usage: 24.7+ KB


In [49]:
# Create map with zip codes for all matched stations. 
# Note that the coverage for this map differs slightly from the other method, 
# based off of lat long to zip code conversion. Further investigation needed 
# to confirm which approach is more accurate

NYC_map = folium.Map(location=[40.730610,-73.935242],zoom_start=10)

folium.Choropleth(
     geo_data=NYC_zipcodes,
     data=df_zips_ridership,
     columns=['zip_code','total flow'],
     key_on='feature.properties.postalCode',
     fill_color='YlOrRd',nan_fill_color='gray'
).add_to(NYC_map)

"""
def get_zipcode(df, geolocator, lat_field, lon_field):
    location = geolocator.reverse((df[lat_field], df[lon_field]))
    try:
        return location.raw['address']['postcode']
    except Exception:
        print("Error raised")


geolocator = geopy.Nominatim(user_agent='rhys.carter86@gmail.com')


zipcodes = df_stations.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Lat', lon_field='Lon')
#print(zipcodes)
"""


tooltip = 'Click for Station Name'    
folium.Marker([40.827994,-73.925831],
             popup='<strong>Yankee Stadium</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='orange',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.684359, -73.977666],
             popup='<strong>Barclays Center</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='lightblue',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.692180,-73.985942],
             popup='<strong>JAY ST-METROTECH</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='lightblue',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.735736,-73.990568],
             popup='<strong>14th Street</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='pink',icon="train", prefix='fa')).add_to(NYC_map)

folium.Marker([40.759600,-73.830030],
             popup='<strong>Flushing Main</strong>',
             tooltip=tooltip,
             icon=folium.Icon(color='darkblue',icon="train", prefix='fa')).add_to(NYC_map)

"""
40.749719,-73.987823 '34 ST-HERALD SQ'
40.735736,-73.990568 '14 ST-UNION SQ'
40.754672,-73.986754 'TIMES SQ-42 ST'
"""
NYC_map.save('map.html')
NYC_map