In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from IPython.display import Image

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline


In [90]:
location_df = pd.read_csv('http://web.mta.info/developers/data/nyct/subway/Stations.csv')
location_df.sample(5)

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
182,182,182,A49,IND,8th Av - Fulton St,Ralph Av,Bk,C,Subway,40.678822,-73.920786
188,188,188,A55,IND,8th Av - Fulton St,Euclid Av,Bk,A C,Subway,40.675377,-73.872106
443,442,442,501,IRT,Dyre Av,Eastchester - Dyre Av,Bx,5,At Grade,40.8883,-73.830834
464,462,606,719,IRT,Flushing,Court Sq,Q,7,Elevated,40.747023,-73.945264
262,261,261,G08,IND,Queens Blvd,Forest Hills - 71 Av,Q,E F M R,Subway,40.721691,-73.844521


In [91]:
df_size=3
sample=location_df.sample(df_size).reset_index()
sample.drop(sample.columns[0], axis=1, inplace=True)
sample

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude
0,459,459,715,IRT,Flushing,40 St,Q,7,Elevated,40.743781,-73.924016
1,359,359,247,IRT,Nostrand,Flatbush Av - Brooklyn College,Bk,2 5,Subway,40.632836,-73.947642
2,369,369,611,IRT,Pelham,Elder Av,Bx,6,Elevated,40.828584,-73.879159


In [57]:
geolocator = Nominatim(user_agent="location code")

address_list=[]

for i in range(df_size):
    lat = sample.iat[i, 9]
    lon = sample.iat[i, 10]
    lat_lon = str(lat) + "," + str(lon)
    address = geolocator.reverse(lat_lon)
    address_list.append(address)
sample['Address']=address_list
    
#print(address_list)   
sample


Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,Address
0,380,380,405,IRT,Jerome Av,Bedford Park Blvd - Lehman College,Bx,4,Elevated,40.873412,-73.890064,"(2947, Jerome Avenue, Bedford Park, Bronx, Bro..."
1,269,269,G16,IND,Queens Blvd,Northern Blvd,Q,M R,Subway,40.752885,-73.906006,"(55-02, Broadway, Woodside, Queens County, NYC..."
2,59,59,B12,BMT,West End,9 Av,Bk,D,Open Cut,40.646292,-73.994324,"(9th Avenue Station, 9th Avenue, Borough Park,..."


In [63]:
import requests
import json

inputs=[]

for i in range(df_size):
    lat = sample.iat[i, 9]
    lon = sample.iat[i, 10]
    lat_lon_input = 'lat='+str(lat) + '&lon=' + str(lon)
    inputs.append(lat_lon_input)


walk_score=[]
    
for i in inputs:
    url = f'http://api.walkscore.com/score?format=json&{i}&wsapikey=24c9469a038f55c85cbbba8abeadcfcb'
    page = requests.get(url)
    walkscore_dict = json.loads(page.text)    
    walk_score.append(int(walkscore_dict['walkscore']))

sample['WalkScore']=walk_score

max_walk_score=max(walk_score)

normalized_walk_score=[]

for i in walk_score:
    norm=i/max_walk_score
    normalized_walk_score.append(norm)    

sample['Normalized WalkScore']=normalized_walk_score    
sample

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,Address,WalkScore,Normalized WalkScore
0,380,380,405,IRT,Jerome Av,Bedford Park Blvd - Lehman College,Bx,4,Elevated,40.873412,-73.890064,"(2947, Jerome Avenue, Bedford Park, Bronx, Bro...",93,0.989362
1,269,269,G16,IND,Queens Blvd,Northern Blvd,Q,M R,Subway,40.752885,-73.906006,"(55-02, Broadway, Woodside, Queens County, NYC...",94,1.0
2,59,59,B12,BMT,West End,9 Av,Bk,D,Open Cut,40.646292,-73.994324,"(9th Avenue Station, 9th Avenue, Borough Park,...",89,0.946809


In [92]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
MTA_df = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180922.txt')
MTA_df.head(3)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,00:00:00,REGULAR,6759219,2291425
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,04:00:00,REGULAR,6759234,2291429
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/15/2018,08:00:00,REGULAR,6759251,2291453


In [85]:
MTA_station_names=MTA_df.STATION.unique()
MTA_station_lines=MTA_df['LINENAME'].values
location_lines=location_df['Daytime Routes'].values

def stationname(stop_name):
    if stop_name in MTA_station_names: 
        return stop_name, 100
    
    mta_name, score = process.extractOne(stop_name, MTA_station_names)
    
    if score < 90:
        #need to add if statement that checks if line names match
        #if MTA_station_lines[i]==
        return stop_name, score
    else:
        return mta_name, score

matches=[]
mta_names = sample['Stop Name'].values

for stop in mta_names:
    matches.append(stationname(stop))
print(matches)

[('Bedford Park Blvd - Lehman College', 86), ('NORTHERN BLVD', 100), ('9 AV', 100)]


In [159]:
MTA_station_names=MTA_df.STATION.unique()
#MTA_station_lines=MTA_df['LINENAME'].values
#location_lines=location_df['Daytime Routes'].values

def stationname(stop_name):
    if stop_name in MTA_station_names: 
        return stop_name, 100
    else:
        mta_name, score = process.extractOne(stop_name, MTA_station_names)
    
        if score < 90:
            # need to add if statement that checks if line names match
            # e.g. if MTA_station_lines[i]==
            return stop_name, score
        else:
            return mta_name, score

matches=[]
mta_names = location_df['Stop Name'].values

for stop in mta_names:
    matches.append(stationname(stop))

    
df = pd.DataFrame(matches,columns=('Best Match MTA Station','Fuzzy Score'))
df

Unnamed: 0,Best Match MTA Station,Fuzzy Score
0,ASTORIA DITMARS,95
1,ASTORIA BLVD,100
2,30 AV,100
3,BROADWAY,100
4,36 AV,100
5,39 Av,89
6,59 ST,90
7,5 AV/59 ST,100
8,57 ST-7 AV,95
9,49 ST,100


In [98]:
MTA_station_names=MTA_df.STATION.unique()
#MTA_station_lines=MTA_df['LINENAME'].values
#location_lines=location_df['Daytime Routes'].values

def countissues(stop_list):
    count=0
    for stop in stop_list:
        if stop.upper() in MTA_station_names: 
            count+=0
        else:
            mta_name, score = process.extractOne(stop, MTA_station_names)
            if score < 90:
                count+=1
    return count

location_names = location_df['Stop Name'].values

countissues(location_names)

64

In [83]:
location_names = location_df['Stop Name'].values
count=0
for stop in location_names:
    if stop.upper() in MTA_station_names: 
        count+=1
print(count)

336


In [163]:
def CanYouSpell(mylist,word):
    for char in word:
        if char in mylist:
            list(mylist).remove(char)
        else:
            return False
    return True
CanYouSpell('NQR456W','NWR')

True

In [200]:
MTA_station_names=MTA_df.STATION.unique()


def exact_matches(stop_name):
    mta_name, score = process.extractOne(stop_name, MTA_station_names)
    new_line = MTA_df.loc[MTA_df['STATION'] == mta_name.upper(), 'LINENAME'].iloc[0]
    old_line = location_df.loc[location_df['Stop Name'] == stop_name, 'Daytime Routes'].iloc[0].replace(" ","")
    
    if stop_name.upper() in MTA_station_names: 
        if CanYouSpell(new_line,old_line):
            return "Match!", stop_name, mta_name, score, old_line, new_line
        else:
            return "Lines don't match", stop_name, mta_name, score, old_line, new_line
    else:
        return "Names don't match", stop_name, mta_name, score, old_line, new_line

exactmatches=[]
mta_names = location_df['Stop Name'].values

for stop in mta_names:
    exactmatches.append(exact_matches(stop))
    
df = pd.DataFrame(exactmatches,columns=('Match','Stop Name','MTA Station','Fuzzy Score','old line','new line'))
df



Unnamed: 0,Match,Stop Name,MTA Station,Fuzzy Score,old line,new line
0,Names don't match,Astoria - Ditmars Blvd,ASTORIA DITMARS,95,NW,NQW
1,Match!,Astoria Blvd,ASTORIA BLVD,100,NW,NQW
2,Match!,30 Av,30 AV,100,NW,NQW
3,Lines don't match,Broadway,BROADWAY,100,NW,G
4,Match!,36 Av,36 AV,100,NW,NQW
5,Names don't match,39 Av,9 AV,89,NW,D
6,Names don't match,Lexington Av/59 St,59 ST,90,NWR,NQR456W
7,Match!,5 Av/59 St,5 AV/59 ST,100,NWR,NQRW
8,Names don't match,57 St - 7 Av,57 ST-7 AV,95,NQRW,NQRW
9,Match!,49 St,49 ST,100,NRW,NQRW


In [208]:
unique_df=df.drop_duplicates()

In [212]:
new_df = pd.merge(location_df,unique_df, on="Stop Name")

df_filtered = new_df[new_df['Match'] == "Match!"]
df_filtered.sample(5)

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,Match,MTA Station,Fuzzy Score,old line,new line
403,369,369,611,IRT,Pelham,Elder Av,Bx,6,Elevated,40.828584,-73.879159,Match!,ELDER AV,100,6,6
106,312,312,122,IRT,Broadway - 7Av,79 St,M,1,Subway,40.783934,-73.979917,Match!,79 ST,100,D,D
218,439,439,225,IRT,Lenox - White Plains Rd,125 St,M,2 3,Subway,40.807754,-73.945495,Match!,125 ST,100,ABCD,ACBD
125,193,193,A63,IND,Liberty Av,104 St,Q,A,Elevated,40.681711,-73.837683,Match!,104 ST,100,JZ,JZ
28,17,17,R22,BMT,Broadway - Brighton,Prince St,M,R W,Subway,40.724329,-73.997702,Match!,PRINCE ST,100,RW,NRW


In [238]:
trimmed_mapping=df_filtered[['GTFS Latitude', 'GTFS Longitude','MTA Station']]
trimmed_mapping=trimmed_mapping.reset_index()
trimmed_mapping.drop(trimmed_mapping.columns[0], axis=1, inplace=True)
trimmed_mapping

Unnamed: 0,GTFS Latitude,GTFS Longitude,MTA Station
0,40.770258,-73.917843,ASTORIA BLVD
1,40.766779,-73.921479,30 AV
2,40.756804,-73.929575,36 AV
3,40.764811,-73.973347,5 AV/59 ST
4,40.759901,-73.984139,49 ST
5,40.745494,-73.988691,28 ST
6,40.747215,-73.993365,28 ST
7,40.743070,-73.984264,28 ST
8,40.741303,-73.989344,23 ST
9,40.745906,-73.998041,23 ST


In [254]:
def appendaddress(datafm,starti,stopi):
    address_list=[]
    for i in range(stopi-starti):
        lat = datafm.loc[i+starti, 'GTFS Latitude']
        lon = datafm.loc[i+starti, 'GTFS Longitude']
        lat_lon = str(lat) + "," + str(lon)
        address = geolocator.reverse(lat_lon)
        address_list.append(address)
    datafm['Address']=address_list
    return datafm


In [255]:
first_30=trimmed_mapping.iloc[0:30]
first=appendaddress(first_30,0,30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [257]:
second_30=trimmed_mapping.iloc[30:60]
second=appendaddress(second_30,30,60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [263]:
third_30=trimmed_mapping.iloc[60:90]
third=appendaddress(third_30,60,90)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [264]:
fourth_30=trimmed_mapping.iloc[90:120]
fourth=appendaddress(fourth_30,90,120)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [267]:
fifth_30=trimmed_mapping.iloc[120:150]
fifth=appendaddress(fifth_30,120,150)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [266]:
sixth_30=trimmed_mapping.iloc[150:180]
sixth=appendaddress(sixth_30,150,180)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [271]:
seventh_30=trimmed_mapping.iloc[180:210]
seventh=appendaddress(seventh_30,180,210)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [281]:
eighth_30=trimmed_mapping.iloc[210:240]
eighth=appendaddress(eighth_30,210,240)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [274]:
ninth_30=trimmed_mapping.iloc[240:270]
ninth=appendaddress(ninth_30,240,270)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [276]:
tenth_30=trimmed_mapping.iloc[270:300]
tenth=appendaddress(tenth_30,270,300)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [277]:
final_12=trimmed_mapping.iloc[300:312]
final=appendaddress(final_12,300,312)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [282]:
frames = [first, second, third, fourth, fifth, sixth, seventh, eigth, ninth, tenth, final]
final_mapping = pd.concat(frames)

In [284]:
final_mapping.tail(5)

Unnamed: 0,GTFS Latitude,GTFS Longitude,MTA Station,Address
307,40.878663,-73.838591,BAYCHESTER AV,"(3355, Edson Avenue, Co-Op City, Bronx, Bronx ..."
308,40.854364,-73.860495,MORRIS PARK,"(Morris Park Station House, Esplanade, Morris ..."
309,40.749145,-73.869527,JUNCTION BLVD,"(95-54, Roosevelt Avenue, Corona, Queens Count..."
310,40.746325,-73.896403,69 ST,"(Iglesia Universal del Reino de Dios, Roosevel..."
311,40.744149,-73.912549,52 ST,"(51-30, Roosevelt Avenue, Sunnyside Gardens, Q..."


In [286]:
inputs=[]

for i in range(312):
    lat = final_mapping.iat[i, 0]
    lon = final_mapping.iat[i, 1]
    lat_lon_input = 'lat='+str(lat) + '&lon=' + str(lon)
    inputs.append(lat_lon_input)


walk_score=[]
    
for i in inputs:
    url = f'http://api.walkscore.com/score?format=json&{i}&wsapikey=24c9469a038f55c85cbbba8abeadcfcb'
    page = requests.get(url)
    walkscore_dict = json.loads(page.text)    
    walk_score.append(int(walkscore_dict['walkscore']))

final_mapping['WalkScore']=walk_score

max_walk_score=max(walk_score)

normalized_walk_score=[]

for i in walk_score:
    norm=i/max_walk_score
    normalized_walk_score.append(norm)    

final_mapping['Normalized WalkScore']=normalized_walk_score    
final_mapping.tail(5)

Unnamed: 0,GTFS Latitude,GTFS Longitude,MTA Station,Address,WalkScore,Normalized WalkScore
307,40.878663,-73.838591,BAYCHESTER AV,"(3355, Edson Avenue, Co-Op City, Bronx, Bronx ...",78,0.78
308,40.854364,-73.860495,MORRIS PARK,"(Morris Park Station House, Esplanade, Morris ...",88,0.88
309,40.749145,-73.869527,JUNCTION BLVD,"(95-54, Roosevelt Avenue, Corona, Queens Count...",97,0.97
310,40.746325,-73.896403,69 ST,"(Iglesia Universal del Reino de Dios, Roosevel...",95,0.95
311,40.744149,-73.912549,52 ST,"(51-30, Roosevelt Avenue, Sunnyside Gardens, Q...",97,0.97
