from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.manifold import TSNE
import pandas as pd
from pyspark import SparkContext
import statistics
import altair as alt
# ^^^ pyforest auto-imports - don't write above this line
# Imports

In [141]:
import folium
from collections import Counter

In [1]:
stations = pd.read_csv("./gtfs_data/stops.txt")

<IPython.core.display.Javascript object>

# Algorithms to use 

- steiner tree problem (basically getting the distance matrix) https://www.geeksforgeeks.org/steiner-tree/
- floyd-warshall algorithm https://en.wikipedia.org/wiki/Floyd–Warshall_algorithm
- dijkstra's algorithm https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Description

# Mapping Stations

In [3]:
len(stations)

1503

In [4]:
null_columns=stations.columns[stations.isnull().any()]
stations[null_columns].isnull().sum()

stop_code         1503
stop_desc         1503
zone_id           1503
stop_url          1503
parent_station     501
dtype: int64

In [5]:
stations.drop(columns = null_columns[:-1], inplace=True) # dropping if there are 1503 null values in a col

In [6]:
stations.stop_lat.nunique() # 20 staten island stations

496

In [26]:
stations.parent_station.nunique()

501

In [27]:
stations.stop_id.nunique()

1503

In [40]:
# https://stackoverflow.com/questions/28679930/how-to-drop-rows-from-pandas-data-frame-that-contains-a-particular-string-in-a-p
## this approach DOES NOT WORK b/c "nassau" is included even though it's in BK
# stations_no_SI = stations[~stations.stop_id.str.contains("S3")]
# stations_no_SI = stations_no_SI[~stations_no_SI.stop_id.str.contains("S2")]

### Unique coordinates of Stations

In [79]:
unique_coords = list(set(zip(stations['stop_lat'], stations['stop_lon'])))

In [81]:
len(unique_coords)

496

In [82]:
unique_coords[0:4]

[(40.717304, -73.956872),
 (40.738228, -73.996209),
 (40.726523, -73.852719),
 (40.608670000000004, -73.957734)]

In [89]:
counter = 0
for x in unique_coords:
    if x[1] > -74.03:
        counter += 1
print(counter)

472


In [90]:
mta_stations = stations[stations['stop_lon'] > -74.03]

In [95]:
mta_stations.parent_station.nunique()

477

In [96]:
mta_stations.stop_lon.nunique()

472

In [101]:
unique_coords = list(set(zip(mta_stations['stop_lat'], mta_stations['stop_lon'])))

In [110]:
len(unique_coords)

472

### Map

In [199]:
base_map = folium.folium.Map([40.7128, -74.0061], zoom_start=10, tiles='cartodbpositron') # city hall coords

In [200]:
for coord in unique_coords:
    folium.Circle(
            location = (coord[0], coord[1]),
            radius = 50, 
            popup = coord,
            color='crimson',
            fill=False,
        ).add_to(base_map)

In [201]:
base_map

### Naming All Stations

In [111]:
mta_stations

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station
0,101,Van Cortlandt Park - 242 St,40.889248,-73.898583,1,
1,101N,Van Cortlandt Park - 242 St,40.889248,-73.898583,0,101
2,101S,Van Cortlandt Park - 242 St,40.889248,-73.898583,0,101
3,103,238 St,40.884667,-73.900870,1,
4,103N,238 St,40.884667,-73.900870,0,103
...,...,...,...,...,...,...
1429,S03N,Park Pl,40.674772,-73.957624,0,S03
1430,S03S,Park Pl,40.674772,-73.957624,0,S03
1431,S04,Botanic Garden,40.670343,-73.959245,1,
1432,S04N,Botanic Garden,40.670343,-73.959245,0,S04


In [117]:
hopefully_parent_stations = []
for x in mta_stations['stop_id']:
    if x[-1] not in ['S', 'N']:
        hopefully_parent_stations.append(x)

In [122]:
mta_stations.stop_name.nunique()

357

In [123]:
mta_stations.parent_station.nunique()

477

In [124]:
mta_stations.location_type.sum()

477

In [118]:
len(hopefully_parent_stations)

477

In [128]:
# hopefully_parent_stations

### Actually stations

In [125]:
mta_actual_stations = mta_stations[mta_stations['location_type'] == 1]

In [127]:
mta_actual_stations.stop_lat.nunique()

472

In [136]:
# which stations in mta_actual_stations have the same lat? 
lat_and_station = list(zip(mta_actual_stations.stop_lat, mta_actual_stations.location_type))

In [137]:
lat_and_station[0:4]

[(40.889247999999995, 1), (40.884667, 1), (40.878856, 1), (40.874561, 1)]

In [140]:
len(lat_and_station)

477

In [144]:
# https://stackoverflow.com/questions/19006095/finding-non-unique-elements-in-list-not-working
duplicate_stations = [k for k, v in Counter(mta_actual_stations.stop_lat).items() if v > 1]

In [145]:
duplicate_stations

[40.750582, 40.824783000000004, 40.732338, 40.577422, 40.668234000000005]

In [146]:
duplicate_latitudes = mta_actual_stations[mta_actual_stations['stop_lat'].isin(duplicate_stations)]

In [233]:
# duplicate_latitudes.sort_values(by = 'stop_lat')

In [164]:
duplicate_latitudes['stop_lat'][774]

40.824783000000004

In [157]:
duplicate_stations_w_station_id = {key:None for key in duplicate_stations}

In [162]:
duplicate_stations_w_station_id

{40.750582: None,
 40.824783000000004: None,
 40.732338: None,
 40.577422: None,
 40.668234000000005: None}

In [169]:
duplicate_stations_w_station_id[40.750582] = '718_R09'
duplicate_stations_w_station_id[40.824783000000004] = 'A13_D13'
duplicate_stations_w_station_id[40.577422] = 'D43_N12'
duplicate_stations_w_station_id[40.668234000000005] = 'H01_H02'
duplicate_stations_w_station_id[40.732338] = 'A32_D20'

### Adding Station_id column

In [170]:
mta_actual_stations['station_id'] = mta_actual_stations['stop_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [171]:
mta_actual_stations.reset_index(drop=True, inplace=True)

In [172]:
# making new unique identifiers for each station at a single coordinate
for idx, x in enumerate(mta_actual_stations['stop_lat']):
    if x in duplicate_stations:
        mta_actual_stations['station_id'][idx] = duplicate_stations_w_station_id[x]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [173]:
mta_actual_stations['station_id'].nunique()

472

In [174]:
mta_unique_station_locations = mta_actual_stations.drop_duplicates(subset=['station_id'])

In [175]:
mta_unique_station_locations.shape

(472, 7)

In [178]:
mta_unique_station_locations.stop_name.nunique()

355

## Adding Connections Between Stations on the Map

In [202]:
shapes_df = pd.read_csv("./gtfs_data/shapes.txt")

<IPython.core.display.Javascript object>

In [203]:
shapes_df.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1..N03R,40.702068,-74.013664,0,
1,1..N03R,40.703199,-74.014792,1,
2,1..N03R,40.703226,-74.01482,2,
3,1..N03R,40.703253,-74.014846,3,
4,1..N03R,40.70328,-74.01487,4,


In [204]:
shapes_df.shape_id.nunique()

224

In [205]:
shapes_df.shape

(124672, 5)

In [206]:
# removing the staten island one 
shapes_df = shapes_df[shapes_df['shape_pt_lon'] > -74.03]
# stations[stations['stop_lon'] > -74.03]

In [207]:
shapes_df.shape

(117752, 5)

In [208]:
for x in list(shapes_df.shape_id.unique()):
    line_df = shapes_df[shapes_df['shape_id'] == x]
    list_of_coords = list(zip(line_df['shape_pt_lat'], line_df['shape_pt_lon']))
    folium.PolyLine(
        locations=list_of_coords,
        tooltip=x
    ).add_to(base_map)

In [209]:
# note: the mta data does not include the second avenue extension from lexington ave/63 street to 96th street
## https://en.wikipedia.org/wiki/Second_Avenue_Subway#/media/File:Second_Avenue_Subway_Map_vc.jpg
# Also note that I removed the staten island data
base_map

### Saving Map

In [210]:
base_map.save('nyc_subway_w_connections.html')

## Creating Node_List To Use in post_man Problems

In [211]:
node_list_df = mta_unique_station_locations.copy()

In [213]:
node_list_df.set_index(keys = 'station_id', drop=True, inplace=True)

In [216]:
node_list_df.drop(columns = ['stop_id', 'stop_name', 'location_type', 'parent_station'], inplace=True)

In [221]:
node_list_df.columns = ['X', 'Y']

In [224]:
node_list_df.rename(index={'station_id': 'ID'}, inplace=True)

#### Saving nodelist

In [227]:
# node_list_df.to_csv("./saved_data/nodelist_nyc_subway.csv")

## Matches between shapes and stations?

In [243]:
mta_unique_station_locations.iloc[-4:]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,station_id
473,R44,86 St,40.622687,-74.028398,1,,R44
474,S01,Franklin Av,40.680596,-73.955827,1,,S01
475,S03,Park Pl,40.674772,-73.957624,1,,S03
476,S04,Botanic Garden,40.670343,-73.959245,1,,S04


In [242]:
"42 St" in mta_unique_station_locations['stop_name']

False

In [244]:
test_lat = mta_unique_station_locations['stop_lat'][474]

In [280]:
station_lat_array = np.array(mta_unique_station_locations['stop_lat'])

In [273]:
shapes_df.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1..N03R,40.702068,-74.013664,0,
1,1..N03R,40.703199,-74.014792,1,
2,1..N03R,40.703226,-74.01482,2,
3,1..N03R,40.703253,-74.014846,3,
4,1..N03R,40.70328,-74.01487,4,


In [279]:
shapes_lat = np.array((shapes_df['shape_pt_lat']))

In [275]:
shapes_lat.shape

(117752,)

In [276]:
match_of_shapes_and_test_station = np.where(station_lat_array == shapes_lat, 1, 0)

  """Entry point for launching an IPython kernel.


In [277]:
print(np.sum(match_of_shapes_and_test_station))
list(match_of_shapes_and_test_station).index(1, 89810)

0


TypeError: iteration over a 0-d array

In [281]:
# only 466 coordinates from the shapes latitudes match with the stations (beyond the 3 second ave stations)
np.intersect1d(station_lat_array, shapes_lat).shape

(466,)

In [257]:

shapes_df.iloc[89808:89811]

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
89808,FS.N01R,40.679989,-73.955849,35,
89809,FS.N01R,40.680596,-73.955827,36,
89810,FS.S01R,40.680596,-73.955827,0,


# Finding Routes

In [292]:
stop_times_df = pd.read_csv("./gtfs_data/stop_times.txt")

<IPython.core.display.Javascript object>

In [293]:
stop_times_df.shape

(535099, 9)

In [294]:
# I think I need to use the stop_sequence and the stop_id to map them out 
## A lot of suspiciously round numbers
stop_times_df.head(20)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:06:00,00:06:00,101S,1,,0,0,
1,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:07:30,00:07:30,103S,2,,0,0,
2,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:09:00,00:09:00,104S,3,,0,0,
3,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:10:30,00:10:30,106S,4,,0,0,
4,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:12:00,00:12:00,107S,5,,0,0,
5,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:13:00,00:13:00,108S,6,,0,0,
6,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:14:30,00:14:30,109S,7,,0,0,
7,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:16:00,00:16:00,110S,8,,0,0,
8,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:17:30,00:17:30,111S,9,,0,0,
9,AFA19GEN-1037-Sunday-00_000600_1..S03R,00:19:30,00:19:30,112S,10,,1,1,


In [295]:
stop_times_df.iloc[25000:25050]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
25000,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:13:30,11:13:30,101S,1,,0,0,
25001,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:15:00,11:15:00,103S,2,,0,0,
25002,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:16:30,11:16:30,104S,3,,0,0,
25003,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:18:00,11:18:00,106S,4,,0,0,
25004,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:19:30,11:19:30,107S,5,,0,0,
25005,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:20:30,11:20:30,108S,6,,0,0,
25006,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:22:00,11:23:00,109S,7,,0,0,
25007,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:24:30,11:24:30,110S,8,,0,0,
25008,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:26:00,11:26:00,111S,9,,0,0,
25009,AFA19GEN-1087-Weekday-00_067350_1..S03R,11:28:00,11:29:00,112S,10,,1,1,
