import pandas as pd
# ^^^ pyforest auto-imports - don't write above this line
# Imports

In [141]:
import folium
from collections import Counter

In [1]:
stations = pd.read_csv("./gtfs_data/stops.txt")

<IPython.core.display.Javascript object>

# Algorithms to use 

- steiner tree problem (basically getting the distance matrix) https://www.geeksforgeeks.org/steiner-tree/
- floyd-warshall algorithm https://en.wikipedia.org/wiki/Floyd–Warshall_algorithm
- dijkstra's algorithm https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm#Description

# Mapping Stations

In [3]:
len(stations)

1503

In [4]:
null_columns=stations.columns[stations.isnull().any()]
stations[null_columns].isnull().sum()

stop_code         1503
stop_desc         1503
zone_id           1503
stop_url          1503
parent_station     501
dtype: int64

In [5]:
stations.drop(columns = null_columns[:-1], inplace=True) # dropping if there are 1503 null values in a col

In [6]:
stations.stop_lat.nunique() # 20 staten island stations

496

In [26]:
stations.parent_station.nunique()

501

In [27]:
stations.stop_id.nunique()

1503

In [40]:
# https://stackoverflow.com/questions/28679930/how-to-drop-rows-from-pandas-data-frame-that-contains-a-particular-string-in-a-p
## this approach DOES NOT WORK b/c "nassau" is included even though it's in BK
# stations_no_SI = stations[~stations.stop_id.str.contains("S3")]
# stations_no_SI = stations_no_SI[~stations_no_SI.stop_id.str.contains("S2")]

### Unique coordinates of Stations

In [79]:
unique_coords = list(set(zip(stations['stop_lat'], stations['stop_lon'])))

In [81]:
len(unique_coords)

496

In [82]:
unique_coords[0:4]

[(40.717304, -73.956872),
 (40.738228, -73.996209),
 (40.726523, -73.852719),
 (40.608670000000004, -73.957734)]

In [89]:
counter = 0
for x in unique_coords:
    if x[1] > -74.03:
        counter += 1
print(counter)

472


In [90]:
mta_stations = stations[stations['stop_lon'] > -74.03]

In [95]:
mta_stations.parent_station.nunique()

477

In [96]:
mta_stations.stop_lon.nunique()

472

In [101]:
unique_coords = list(set(zip(mta_stations['stop_lat'], mta_stations['stop_lon'])))

In [110]:
len(unique_coords)

472

### Map

In [107]:
base_map = folium.folium.Map([40.7128, -74.0061], zoom_start=10, tiles='cartodbpositron') # city hall coords

In [108]:
for coord in unique_coords:
    folium.Circle(
            location = (coord[0], coord[1]),
            radius = 50, 
            popup = coord,
            color='crimson',
            fill=False,
        ).add_to(base_map)

In [109]:
base_map

### Naming All Stations

In [111]:
mta_stations

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station
0,101,Van Cortlandt Park - 242 St,40.889248,-73.898583,1,
1,101N,Van Cortlandt Park - 242 St,40.889248,-73.898583,0,101
2,101S,Van Cortlandt Park - 242 St,40.889248,-73.898583,0,101
3,103,238 St,40.884667,-73.900870,1,
4,103N,238 St,40.884667,-73.900870,0,103
...,...,...,...,...,...,...
1429,S03N,Park Pl,40.674772,-73.957624,0,S03
1430,S03S,Park Pl,40.674772,-73.957624,0,S03
1431,S04,Botanic Garden,40.670343,-73.959245,1,
1432,S04N,Botanic Garden,40.670343,-73.959245,0,S04


In [117]:
hopefully_parent_stations = []
for x in mta_stations['stop_id']:
    if x[-1] not in ['S', 'N']:
        hopefully_parent_stations.append(x)

In [122]:
mta_stations.stop_name.nunique()

357

In [123]:
mta_stations.parent_station.nunique()

477

In [124]:
mta_stations.location_type.sum()

477

In [118]:
len(hopefully_parent_stations)

477

In [128]:
# hopefully_parent_stations

### Actually stations

In [125]:
mta_actual_stations = mta_stations[mta_stations['location_type'] == 1]

In [127]:
mta_actual_stations.stop_lat.nunique()

472

In [136]:
# which stations in mta_actual_stations have the same lat? 
lat_and_station = list(zip(mta_actual_stations.stop_lat, mta_actual_stations.location_type))

In [137]:
lat_and_station[0:4]

[(40.889247999999995, 1), (40.884667, 1), (40.878856, 1), (40.874561, 1)]

In [140]:
len(lat_and_station)

477

In [144]:
# https://stackoverflow.com/questions/19006095/finding-non-unique-elements-in-list-not-working
duplicate_stations = [k for k, v in Counter(mta_actual_stations.stop_lat).items() if v > 1]

In [145]:
duplicate_stations

[40.750582, 40.824783000000004, 40.732338, 40.577422, 40.668234000000005]

In [146]:
duplicate_latitudes = mta_actual_stations[mta_actual_stations['stop_lat'].isin(duplicate_stations)]

In [151]:
duplicate_latitudes.sort_values(by = 'stop_lat')

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station
858,D43,Coney Island - Stillwell Av,40.577422,-73.981233,1,
1293,N12,S.B. Coney Island,40.577422,-73.981233,1,
1044,H01,Aqueduct Racetrack,40.668234,-73.834058,1,
1047,H02,Aqueduct - N Conduit Av,40.668234,-73.834058,1,
609,A32,W 4 St,40.732338,-74.000495,1,
795,D20,W 4 St,40.732338,-74.000495,1,
507,718,Queensboro Plaza,40.750582,-73.940202,1,
1326,R09,Queensboro Plaza,40.750582,-73.940202,1,
561,A12,145 St,40.824783,-73.944216,1,
774,D13,145 St,40.824783,-73.944216,1,


In [164]:
duplicate_latitudes['stop_lat'][774]

40.824783000000004

In [157]:
duplicate_stations_w_station_id = {key:None for key in duplicate_stations}

In [162]:
duplicate_stations_w_station_id

{40.750582: None,
 40.824783000000004: None,
 40.732338: None,
 40.577422: None,
 40.668234000000005: None}

In [169]:
duplicate_stations_w_station_id[40.750582] = '718_R09'
duplicate_stations_w_station_id[40.824783000000004] = 'A13_D13'
duplicate_stations_w_station_id[40.577422] = 'D43_N12'
duplicate_stations_w_station_id[40.668234000000005] = 'H01_H02'
duplicate_stations_w_station_id[40.732338] = 'A32_D20'

### Adding Station_id column

In [170]:
mta_actual_stations['station_id'] = mta_actual_stations['stop_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [171]:
mta_actual_stations.reset_index(drop=True, inplace=True)

In [172]:
# making new unique identifiers for each station at a single coordinate
for idx, x in enumerate(mta_actual_stations['stop_lat']):
    if x in duplicate_stations:
        mta_actual_stations['station_id'][idx] = duplicate_stations_w_station_id[x]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [173]:
mta_actual_stations['station_id'].nunique()

472

In [174]:
mta_unique_station_locations = mta_actual_stations.drop_duplicates(subset=['station_id'])

In [175]:
mta_unique_station_locations.shape

(472, 7)

In [178]:
mta_unique_station_locations.stop_name.nunique()

355

# Adding Connections Between Stations