import nltk
import pandas as pd
# ^^^ pyforest auto-imports - don't write above this line
# Imports

In [476]:
import folium
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [477]:
stations = pd.read_csv("./gtfs_data/stops.txt")

<IPython.core.display.Javascript object>

# Algorithms to use 

- steiner tree problem (basically getting the distance matrix) https://www.geeksforgeeks.org/steiner-tree/
- floyd-warshall algorithm https://en.wikipedia.org/wiki/Floyd–Warshall_algorithm
- dijkstra's algorithm https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Description

# Mapping Stations

In [478]:
len(stations)

1503

In [479]:
null_columns=stations.columns[stations.isnull().any()]
stations[null_columns].isnull().sum()

stop_code         1503
stop_desc         1503
zone_id           1503
stop_url          1503
parent_station     501
dtype: int64

In [480]:
stations.drop(columns = null_columns[:-1], inplace=True) # dropping if there are 1503 null values in a col

### Unique coordinates of Stations

In [481]:
mta_stations = stations[stations['stop_lon'] > -74.03]

In [482]:
unique_coords = list(set(zip(mta_stations['stop_lat'], mta_stations['stop_lon'])))

In [483]:
counter = 0
for x in unique_coords:
    if x[1] > -74.03:
        counter += 1
print(counter)

472


### Map

In [484]:
base_map = folium.folium.Map([40.7128, -74.0061], zoom_start=10, tiles='cartodbpositron') # city hall coords

In [485]:
for coord in unique_coords:
    folium.Circle(
            location = (coord[0], coord[1]),
            radius = 50, 
            popup = coord,
            color='crimson',
            fill=False,
        ).add_to(base_map)

In [486]:
base_map

#### Saving Map

In [487]:
# base_map.save("./saved_data/Only_stations.html")

### Actually stations

In [488]:
mta_actual_stations = mta_stations[mta_stations['location_type'] == 1]

#### Saving Actually stations

In [489]:
# mta_actual_stations.to_csv("./saved_data/non_unique_mta_stations.csv")

### Adding Station_id column and new df

In [490]:
mta_unique_station_locations = mta_actual_stations.copy()

In [491]:
mta_unique_station_locations['station_id'] = mta_actual_stations['stop_id']

In [492]:
mta_actual_stations.reset_index(drop=True, inplace=True)

### Stations with nearly the same latitude

In [493]:
# https://stackoverflow.com/questions/19006095/finding-non-unique-elements-in-list-not-working
duplicate_stations = [k for k, v in Counter(mta_actual_stations.stop_lat).items() if v > 1]

In [494]:
duplicate_stations

[40.750582, 40.824783000000004, 40.732338, 40.577422, 40.668234000000005]

In [495]:
duplicate_latitudes = mta_actual_stations[mta_actual_stations['stop_lat'].isin(duplicate_stations)]

In [496]:
# duplicate_latitudes.sort_values(by = 'stop_lat')

### Fixing L stations

In [497]:
mta_unique_station_locations.reset_index(drop=True, inplace=True)

In [498]:
L_stations = mta_unique_station_locations[mta_unique_station_locations['stop_id'].str.contains("L")]

In [499]:
rounded_L_station_lats = [round(x,3) for x in L_stations['stop_lat']]
rounded_L_station_lons = [round(x,3) for x in L_stations['stop_lon']]

In [500]:
# rounded_L_station_lats

In [501]:
L_near_match_list = []
for idx, x in enumerate(mta_unique_station_locations['stop_lat']):
    if round(x,3) in rounded_L_station_lats and \
    round(mta_unique_station_locations['stop_lon'][idx],3) in rounded_L_station_lons:
        L_near_match_list.append(idx)

In [502]:
near_matches = mta_unique_station_locations.iloc[L_near_match_list]

#### Making Dictionary to replace station_id

In [503]:
replacement_station_id_dict = {'140': '140_142', "142":'140_142', 'H04':'H04_H19', "H19":'H04_H19'}

In [504]:
replacement_station_id_dict.update({'631':'631_723_901', "723":'631_723_901', '901':'631_723_901'})

In [505]:
replacement_station_id_dict.update({'127':'127_725_902_R16', "725":'127_725_902_R16', 
                                    '902':'127_725_902_R16', "R16":'127_725_902_R16'})

In [506]:
# cases with the same latitude
replacement_station_id_dict.update({'718': '718_R09', "R09":'718_R09', "A12":'A12_D13', "D13":'A12_D13',
                                    "D43":'D43_N12', 'N12':'D43_N12', "H01":'H01_H02', "H02":'H01_H02',
                                    "A32":'A32_D20', "D20":'A32_D20'})

In [507]:
# L Stations
replacement_station_id_dict.update({'L22': "L22_J27", "J27": "L22_J27", "M12": "M12_G31", "G31":"M12_G31",
                                   'L01': "L01_A31", "A31":'L01_A31',
                                   "R20":"635_L03_R20", "L03":"635_L03_R20", "635":"635_L03_R20",
                                   "L17":"L17_M08", "M08":"L17_M08"})

In [508]:
# found through near exact coords and same name
replacement_station_id_dict.update({"Q01": "Q01_R23", "R23": "Q01_R23", "A38": "A38_M22", "M22":"A38_M22",
                                   'A45': 'A45_S01', "S01":"A45_S01", "235" : "235_D24_R31", "D24": "235_D24_R31",
                                   "R31":"235_D24_R31", "232":"232_423", "423":"232_423", 
                                   "719":"719_F09_G22", "F09":"719_F09_G22", "G22":"719_F09_G22",
                                   "A41":"A41_R29", "R29":"A41_R29", "D17": 'D17_R17', "R17":'D17_R17',
                                   '112': "112_A09", "A09":"112_A09", "125": "125_A24", "A24": "125_A24",
                                   "222":'222_415', "415":'222_415', "414":'414_D11', "D11":'414_D11',
                                   "710":"710_G14", "G14":"710_G14"})

## Finding Stations with the Same Name and Nearly Identical Coordinates

In [509]:
mta_unique_station_locations.reset_index(drop=True, inplace=True)

In [510]:
## gather the list with almost the same lat and lon, and then i'll manually check names 
nearly_the_same_coords = []
for x in range(len(mta_unique_station_locations)):
    orig_lat = round(mta_unique_station_locations['stop_lat'][x], 3)
    orig_lon = round(mta_unique_station_locations['stop_lon'][x], 3)
    for y in range(x+1, len(mta_unique_station_locations)):
        new_lat = round(mta_unique_station_locations['stop_lat'][y], 3)
        new_lon = round(mta_unique_station_locations['stop_lon'][y], 3)        
        if orig_lat == new_lat and orig_lon == new_lon:
            nearly_the_same_coords.append((x, y))

In [511]:
# just adding these entries to replacement_station_id_dict
new_names = []
for x in nearly_the_same_coords:
    name1 = mta_unique_station_locations['stop_id'][x[0]]
    name2 = mta_unique_station_locations['stop_id'][x[1]]
    if name1 not in replacement_station_id_dict.keys() and name2 not in replacement_station_id_dict.keys():
        new_names.append(name1+'_'+name2)

In [512]:
new_names

[]

In [513]:
already_covered_stations = []
for name in new_names:
    split_names = name.split("_")
    for station in split_names:
        if station in replacement_station_id_dict.keys():
            already_covered_stations.append(station)

In [514]:
nearly_the_same_coords[index][0]

105

In [515]:
# changing the station_id of each of the nearly_the_same stations
for index, x in enumerate(new_names):
    split_list = new_names[index].split("_")
    name1 = split_list[0]
    name2 = split_list[1]
    if name1 not in already_covered_stations and name2 not in already_covered_stations:
        idx1 = nearly_the_same_coords[index][0]
        idx2 = nearly_the_same_coords[index][1]
        mta_unique_station_locations['station_id'][idx1] = x
        mta_unique_station_locations['station_id'][idx2] = x

In [516]:
mta_unique_station_locations.shape

(477, 7)

In [517]:
# mta_unique_station_locations[mta_unique_station_locations.station_id.isnull() == True]

In [518]:
mta_unique_station_locations.replace(to_replace=replacement_station_id_dict, inplace=True)

In [519]:
mta_unique_station_locations.drop_duplicates(subset='station_id', inplace=True)

In [520]:
mta_unique_station_locations.shape

(444, 7)

In [521]:
mta_unique_station_locations.station_id.nunique()

444

In [522]:
mta_unique_station_locations[mta_unique_station_locations['station_id'].str.contains('R09')]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,station_id
169,718_R09,Queensboro Plaza,40.750582,-73.940202,1,,718_R09


 #### Saving df

In [523]:
# mta_unique_station_locations.to_csv("./saved_data/final_station_df.csv")

## Adding Connections Between Stations on the Map

In [524]:
shapes_df = pd.read_csv("./gtfs_data/shapes.txt")

<IPython.core.display.Javascript object>

In [525]:
# removing the staten island one 
shapes_df = shapes_df[shapes_df['shape_pt_lon'] > -74.03]

In [526]:
for x in list(shapes_df.shape_id.unique()):
    line_df = shapes_df[shapes_df['shape_id'] == x]
    list_of_coords = list(zip(line_df['shape_pt_lat'], line_df['shape_pt_lon']))
    folium.PolyLine(
        locations=list_of_coords,
        tooltip=x
    ).add_to(base_map)

In [527]:
# note: the mta data does not include the second avenue extension from lexington ave/63 street to 96th street
## https://en.wikipedia.org/wiki/Second_Avenue_Subway#/media/File:Second_Avenue_Subway_Map_vc.jpg
# Also note that I removed the staten island data
base_map

### Saving Map

In [528]:
# base_map.save('nyc_subway_w_connections.html')

## Creating Node_List To Use in post_man Problems

In [529]:
node_list_df = mta_unique_station_locations.copy()

In [530]:
node_list_df.shape

(444, 7)

In [531]:
node_list_df.drop_duplicates(subset='station_id', inplace=True)

In [532]:
node_list_df.set_index(keys = 'station_id', drop=True, inplace=True)

In [533]:
node_list_df.drop(columns = ['stop_id', 'stop_name', 'location_type', 'parent_station'], inplace=True)

In [534]:
node_list_df.columns = ['X', 'Y']

In [535]:
node_list_df.rename(index={'station_id': 'ID'}, inplace=True)

In [536]:
node_list_df.shape

(444, 2)

In [537]:
node_list_df

Unnamed: 0_level_0,X,Y
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1
101,40.889248,-73.898583
103,40.884667,-73.900870
104,40.878856,-73.904834
106,40.874561,-73.909831
107,40.869444,-73.915279
...,...,...
R42,40.634967,-74.023377
R43,40.629742,-74.025510
R44,40.622687,-74.028398
S03,40.674772,-73.957624


In [538]:
node_list_df.index.nunique()

444

#### Saving nodelist

In [549]:
# node_list_df.to_csv("./saved_data/nodelist_nyc_subway.csv")

## Matches between shapes and stations?

In [541]:
mta_unique_station_locations.iloc[-4:]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,station_id
472,R43,77 St,40.629742,-74.02551,1,,R43
473,R44,86 St,40.622687,-74.028398,1,,R44
475,S03,Park Pl,40.674772,-73.957624,1,,S03
476,S04,Botanic Garden,40.670343,-73.959245,1,,S04


In [542]:
station_lat_array = np.array(mta_unique_station_locations['stop_lat'])

In [543]:
shapes_lat = np.array((shapes_df['shape_pt_lat']))

In [544]:
# only 466 coordinates from the shapes latitudes match with the stations (beyond the 3 second ave stations)
np.intersect1d(station_lat_array, shapes_lat).shape

(439,)

In [545]:
shapes_df.iloc[89808:89811]

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
89808,FS.N01R,40.679989,-73.955849,35,
89809,FS.N01R,40.680596,-73.955827,36,
89810,FS.S01R,40.680596,-73.955827,0,
