# Calculation of inter-station paths for the visualization

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import LineString
import plotly.express as px

### Load the geojson data for Amtrak Northeast Corridor Stations with geopandas
* The `geometry` feature gives the (longitude, latitude) coordinate for each station of interest along the route of the Northeast Corridor
* The `STNCODE` feature gives the Amtrak station abbreviation

In [2]:
# Read the Northeast Corridor station points and key the frame by Amtrak station code
geo_stations = gpd.read_file('./data/geo/Amtrak_Project_Stations.geojson').set_index('STNCODE')

In [3]:
# Display the station table (indexed by STNCODE) to inspect its columns and geometry
geo_stations

Unnamed: 0_level_0,OBJECTID,STNNAME,CITY2,STATE,STFIPS,urban,geometry
STNCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BOS,5,"Boston (South Station), Massachusetts",Boston,MA,25,YES,POINT (-71.05517 42.35164)
BBY,17,"Boston (Back Bay), Massachusetts",Boston,MA,25,YES,POINT (-71.07583 42.34732)
RTE,24,"Westwood, Route 128 Station, Massachusetts",Route 128,MA,25,YES,POINT (-71.14789 42.21024)
PVD,10,"Providence, Rhode Island",Providence,RI,44,YES,POINT (-71.41348 41.82949)
KIN,60,"West Kingston, Rhode Island",Kingston,RI,44,,POINT (-71.56060 41.48396)
NLC,63,"New London, Connecticut",New London,CT,9,YES,POINT (-72.09322 41.35427)
NHV,16,"New Haven, Connecticut",New Haven,CT,9,YES,POINT (-72.92667 41.29771)
STM,27,"Stamford, Connecticut",Stamford,CT,9,YES,POINT (-73.54216 41.04713)
NYP,1,"New York (Penn Station), New York",New York,NY,36,YES,POINT (-73.99446 40.75033)
NWK,14,"Newark (Penn Station), New Jersey",Newark,NJ,34,YES,POINT (-74.16475 40.73471)


### Define stations and mile markers for each station going in the Northbound and Southbound directions

In [4]:
# Station codes in the order they appear in the station index
amtrak_stations = list(geo_stations.index)

# Mile markers for each station along route starting in Boston and heading to DC
SB = [0, 1, 11, 43, 70, 105, 156, 195, 231, 241, 289, 322, 347, 416, 427, 448, 457]

# Mile markers for each station along route starting in DC and heading to Boston
NB = [457, 456, 446, 414, 387, 352, 301, 262, 226, 216, 168, 135, 110, 41, 30, 9, 0]

# zip() stops at the shortest input, so a length mismatch would silently leave
# some stations without a mile marker. Fail loudly instead.
if not (len(amtrak_stations) == len(NB) == len(SB)):
    raise ValueError(
        'Expected one NB and one SB mile marker per station, got '
        '{} stations, {} NB markers and {} SB markers'.format(len(amtrak_stations), len(NB), len(SB))
    )

# Map station code -> mile marker for each direction, then build the data frame
# (rows are station codes, columns are NB_MILE and SB_MILE)
mile_markers = {'NB_MILE': dict(zip(amtrak_stations, NB)),
                'SB_MILE': dict(zip(amtrak_stations, SB))}

mile_cols = pd.DataFrame.from_dict(mile_markers, orient='columns')

### Create data frame of longitude/latitude values, indexed by Amtrak station code

In [5]:
# Pull the point coordinates out of the geometry column: x is longitude, y is latitude
station_lons = geo_stations.geometry.x
station_lats = geo_stations.geometry.y
# One row per station, columns LON and LAT, indexed by station code
lonlat = pd.DataFrame(
    np.array([station_lons, station_lats]).T,
    columns = ['LON', 'LAT'],
    index = geo_stations.index,
)

### Create a weather-location name column indexed by Amtrak station code

In [6]:
# One weather location name per station, listed in the same order as the station index.
# The three stops in MA all resolve to Boston weather stations, so 'Boston, MA' is repeated
# three times; every other station has its own weather location.
weather_location_names = (
    ['Boston, MA'] * 3
    + ['Providence, RI', 'Kingston, RI', 'New London, CT', 'New Haven, CT']
    + ['Stamford, CT', 'Manhattan, NY', 'Newark, NJ', 'Trenton, NJ']
    + ['Philadelphia, PA', 'Wilmington, DE', 'Baltimore, MD']
    + ['Baltimore BWI Airport, MD', 'New Carrollton, MD', 'Washington, DC']
)
weather_stations = pd.Series(
    weather_location_names,
    index = geo_stations.index,
    name = 'WEATHER_LOC',
)

### Create new data frame with all relevant station linking information, indexed by Amtrak station codes

In [7]:
# Seed the frame with the station codes, held both as the index and as a STNCODE column
gdf = pd.DataFrame(geo_stations.index, index = geo_stations.index)
# Copy the descriptive columns across from geo_stations (CITY2 is stored as AMTRAK_LOC)
for target_col, source_col in [('STNNAME', 'STNNAME'), ('STATE', 'STATE'), ('AMTRAK_LOC', 'CITY2')]:
    gdf[target_col] = geo_stations[source_col]
# Append the weather location, longitude/latitude and mile marker columns side by side
station_parts = [gdf, weather_stations, lonlat, mile_cols]
gdf = pd.concat(station_parts, axis = 1)
gdf

Unnamed: 0,STNCODE,STNNAME,STATE,AMTRAK_LOC,WEATHER_LOC,LON,LAT,NB_MILE,SB_MILE
BOS,BOS,"Boston (South Station), Massachusetts",MA,Boston,"Boston, MA",-71.05517,42.351642,457,0
BBY,BBY,"Boston (Back Bay), Massachusetts",MA,Boston,"Boston, MA",-71.075828,42.347317,456,1
RTE,RTE,"Westwood, Route 128 Station, Massachusetts",MA,Route 128,"Boston, MA",-71.147894,42.210242,446,11
PVD,PVD,"Providence, Rhode Island",RI,Providence,"Providence, RI",-71.413478,41.82949,414,43
KIN,KIN,"West Kingston, Rhode Island",RI,Kingston,"Kingston, RI",-71.560597,41.483959,387,70
NLC,NLC,"New London, Connecticut",CT,New London,"New London, CT",-72.093225,41.354267,352,105
NHV,NHV,"New Haven, Connecticut",CT,New Haven,"New Haven, CT",-72.92667,41.297714,301,156
STM,STM,"Stamford, Connecticut",CT,Stamford,"Stamford, CT",-73.54216,41.04713,262,195
NYP,NYP,"New York (Penn Station), New York",NY,New York,"Manhattan, NY",-73.994459,40.750327,226,231
NWK,NWK,"Newark (Penn Station), New Jersey",NJ,Newark,"Newark, NJ",-74.16475,40.734706,216,241


### Process to determine the station paths
* Store info in a dictionary

In [8]:
# Extract latitude and longitude columns and the station index
lat = gdf['LAT']
lon = gdf['LON']
stations = gdf.index

# One record per station. This cell fills in x, y, prev and next; the remaining
# keys start as None.
geoloc_dict = {station: {'x': None, 'y': None, 'prev': None,
                         'next': None, 'path2next': None,
                         'path2prev': None, 'index_where': None,
                         'station_orient_SB': None, 'station_orient_NB': None}
               for station in stations}

# For each station, set the x and y coordinates to the longitude and latitude values
# Also set the previous and next stations along the route (using Southbound-oriented indexing)
prev_stat = None
last_position = len(stations) - 1
for i, station in enumerate(stations):
    geoloc_dict[station]['x'] = lon[station]
    geoloc_dict[station]['y'] = lat[station]
    geoloc_dict[station]['prev'] = prev_stat
    # The final station has no successor. Detect it by position rather than by a
    # hard-coded station code so the cell still works if the station list changes.
    if i < last_position:
        geoloc_dict[station]['next'] = stations[i+1]
    prev_stat = station
    print(station, 'lon:', geoloc_dict[station]['x'], 'lat:', geoloc_dict[station]['y'],
          'prev:', geoloc_dict[station]['prev'], 'next:', geoloc_dict[station]['next'])

BOS lon: -71.0551698912163 lat: 42.35164247995449 prev: None next: BBY
BBY lon: -71.07582800026684 lat: 42.34731700021211 prev: BOS next: RTE
RTE lon: -71.14789400005567 lat: 42.21024199995967 prev: BBY next: PVD
PVD lon: -71.4134779996708 lat: 41.829490000162025 prev: RTE next: KIN
KIN lon: -71.5605969999761 lat: 41.483959000033835 prev: PVD next: NLC
NLC lon: -72.09322499959666 lat: 41.35426700011465 prev: KIN next: NHV
NHV lon: -72.92666999955892 lat: 41.29771399982176 prev: NLC next: STM
STM lon: -73.54215999966355 lat: 41.047130000267344 prev: NHV next: NYP
NYP lon: -73.99445899996624 lat: 40.75032699989306 prev: STM next: NWK
NWK lon: -74.16474999990626 lat: 40.73470599995154 prev: NYP next: TRE
TRE lon: -74.75443999967372 lat: 40.21901100002319 prev: NWK next: PHL
PHL lon: -75.18104099974647 lat: 39.95561500001248 prev: TRE next: WIL
WIL lon: -75.55109500023144 lat: 39.737262999896686 prev: PHL next: BAL
BAL lon: -76.61568799967372 lat: 39.30730200018028 prev: WIL next: BWI
BWI 

### Creating a list of coordinates for latitude and longitude values
* Will convert to CSV format rather than GeoJSON

In [9]:
# Load the route geometry file and display it to inspect its columns
geo_route = gpd.read_file('./data/geo/Amtrak_Project_Routes.geojson')
geo_route

Unnamed: 0,OBJECTID,NAME,Shape_Leng,Shape_Le_1,Shape_Length,geometry
0,29,Regional,1041187.0,1041187.0,1358052.0,"MULTILINESTRING ((-76.45187 37.02302, -76.4527..."


### Read in the BOS-BBY data
* The geo_route file does not include the segment between BOS and BBY (reason unknown)
* I manually collected the coordinates between BBY and BOS from Google Maps

In [10]:
# Load the BBY-BOS coordinate file (Longitude and Latitude columns)
bos_bby = pd.read_csv('./data/facts/BBY-BOS.csv')
# Preview the first rows
bos_bby.head()

Unnamed: 0,Longitude,Latitude
0,-71.055112,42.351438
1,-71.055155,42.351094
2,-71.055263,42.350636
3,-71.055519,42.350146
4,-71.056269,42.348733


In [11]:
# Preview the last rows of the BOS-BBY segment
bos_bby.tail()

Unnamed: 0,Longitude,Latitude
37,-71.072185,42.347442
38,-71.072975,42.347488
39,-71.07337,42.347411
40,-71.074535,42.347319
41,-71.075117,42.347304


### Add all latitude/longitude coordinates to individual lists

In [12]:
# Gather every coordinate of every line string into plain lists first and convert
# to arrays once at the end. Calling np.append inside the loop re-copies the whole
# growing array on every iteration.
lat_values = []
lon_values = []
for mlstring in geo_route.geometry:
    for linestring in mlstring.geoms:
        # .xy yields the x (longitude) and y (latitude) coordinate sequences
        x, y = linestring.xy
        lon_values.extend(x)
        lat_values.extend(y)
# dtype=float keeps the result a float array even when no coordinates were found
lats = np.array(lat_values, dtype=float)
lons = np.array(lon_values, dtype=float)
print(len(lats), len(lons))

13258 13258


### Reverse them to Southbound orientation

In [13]:
# Flip both coordinate sequences end-to-end so they run in the opposite direction.
# NOTE: this reassigns lons/lats in place, so running the cell a second time
# flips them back again. Run it exactly once after the extraction cell.
lons, lats = lons[::-1], lats[::-1]

### Add BOS-BBY to lats/lons

In [14]:
# Copy the BOS-BBY latitude and longitude columns out of the data frame as arrays
lats_bos_bby, lons_bos_bby = (
    np.array(bos_bby[column]) for column in ('Latitude', 'Longitude')
)

In [15]:
# Put the BOS-BBY points in front of the route points so the combined arrays
# start with the BOS-BBY segment
lats_full, lons_full = (
    np.concatenate(segments)
    for segments in ((lats_bos_bby, lats), (lons_bos_bby, lons))
)

### Extract only Boston to Washington (exclude Virginia coordinates)

In [16]:
# Cut-off coordinates: the route is truncated at the first point lying both west of
# CUTOFF_LON and south of CUTOFF_LAT.
# NOTE(review): presumably a point at or near the Washington, DC station — confirm.
CUTOFF_LON = -77.006422
CUTOFF_LAT = 38.896993

# Find the position of the first point past the cut-off, then slice once. Growing the
# arrays with np.append for every point re-copies them on each iteration.
cutoff_index = min(len(lons_full), len(lats_full))
for position, (x, y) in enumerate(zip(lons_full, lats_full)):
    if x < CUTOFF_LON and y < CUTOFF_LAT:
        cutoff_index = position
        break

# np.array(...) copies the slice so the results do not share memory with the full arrays
lats_bos_to_was = np.array(lats_full[:cutoff_index])
lons_bos_to_was = np.array(lons_full[:cutoff_index])
print(len(lons_bos_to_was))
print(len(lats_bos_to_was))

9592
9592


### Convert to numpy array and then data frame for later use

In [17]:
# Pair up the coordinates: one row per route point, longitude first, latitude second
stacked_rows = np.array([lons_bos_to_was, lats_bos_to_was])
lonlat_bos_to_was = stacked_rows.transpose()
print(lonlat_bos_to_was.shape)
# Wrap in a data frame with named columns and preview the end of the path
lonlat_df = pd.DataFrame(data = lonlat_bos_to_was, columns = ["Longitude","Latitude"])
lonlat_df.tail()

(9592, 2)


Unnamed: 0,Longitude,Latitude
9587,-77.005867,38.886143
9588,-77.006019,38.885972
9589,-77.00611,38.8859
9590,-77.006242,38.885795
9591,-77.006305,38.885746


In [18]:
# Preview the start of the path; these rows should match the head of the BOS-BBY file
lonlat_df.head()

Unnamed: 0,Longitude,Latitude
0,-71.055112,42.351438
1,-71.055155,42.351094
2,-71.055263,42.350636
3,-71.055519,42.350146
4,-71.056269,42.348733


### Calculate the indices where the longitude and latitude are within a small tolerance distance of the station
* Coordinates from the original file do not include Boston-South Station, they only have data to/from Boston-Back Bay
* The tolerance is relatively large, but a tighter tolerance would leave some stations with zero matching route points.

In [19]:
# Absolute tolerance (in coordinate units) used to decide that a route point lies at a station.
# NOTE(review): np.isclose also applies its default relative tolerance on top of atol,
# so the effective window is slightly wider than this value — confirm that is intended.
STATION_MATCH_ATOL = 0.002

for station in amtrak_stations:
    station_x, station_y = geoloc_dict[station]['x'], geoloc_dict[station]['y']
    # A route point matches when BOTH its longitude and latitude are within tolerance
    near_x = np.isclose(lons_full, station_x, atol = STATION_MATCH_ATOL)
    near_y = np.isclose(lats_full, station_y, atol = STATION_MATCH_ATOL)
    # Positions (ascending) of the matching points in the full coordinate arrays
    intersect = np.argwhere(near_x & near_y).ravel()
    geoloc_dict[station]['index_where'] = intersect
    print(station, intersect)

BOS [0 1 2 3]
BBY [39 40 41 42 43 44 45 46 47 48 49 50 51]
RTE [194 195 196 197 198 199 200 201 202 203 204 205 206]
PVD [690 691 692 693 694 695 696 697 698 699 700 701 702 703 704]
KIN [1066 1067 1068 1069 1070 1071 1072]
NLC [1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861
 1862 1863 1864 1865]
NHV [3223 3224 3225 3226 3227 3228]
STM [4098 4099 4100 4101]
NYP [5108 5109 5110 5111 5112 5113]
NWK [5329 5330 5331 5332 5333 5334 5335 5353 5354 5355 5356 5357 5358 5359
 5360 5361]
TRE [5988 5989 5990 5991 5992 5993 5994]
PHL [6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719
 6720 6721 6722]
WIL [7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334
 7335 7336 7337]
BAL [8620 8621 8622 8623 8624 8625 8626 8627]
BWI [8942 8943 8944 8945 8946]
NCR [9282 9283 9284 9285 9286]
WAS [9552 9553 9554 9555 9556 9557 9558]


In [20]:
# Trim the path so it ends just before the first route point matched to the last
# station. The cut position is read from geoloc_dict instead of being typed in by
# hand, so it stays correct if the route data or the matching tolerance changes.
terminus = amtrak_stations[-1]
terminus_start = int(geoloc_dict[terminus]['index_where'][0])
lonlat_df = lonlat_df[0:terminus_start]
lonlat_df.shape[0]

9552

### Calculate inter-station paths to find "path groups" for the visualization separators

In [21]:
# Walk the stations in order. Every route point between the first matched point of a
# station and the first matched point of the next station gets the same group number
# and the same 'CURR-NEXT' label.
between_station_paths = []
group_num = 0
groups_path = []
for curr_stat in amtrak_stations:
    record = geoloc_dict[curr_stat]
    prev_stat, next_stat = record['prev'], record['next']
    if next_stat is not None:
        curr_stat_index_where = record['index_where'][0]
        next_stat_index_where = geoloc_dict[next_stat]['index_where'][0]
        # Number of route points on this leg (a non-positive count yields empty lists)
        n_points = int(next_stat_index_where - curr_stat_index_where)
        leg_label = '{}-{}'.format(curr_stat, next_stat)
        path2next = [group_num] * n_points
        between_station_paths.extend([leg_label] * n_points)
        groups_path.extend(path2next)
        record['path2next'] = path2next
        group_num += 1
    if prev_stat is not None:
        # The leg arriving at this station is the previous station's outgoing leg
        record['path2prev'] = geoloc_dict[prev_stat]['path2next']
print(len(groups_path))

9552


In [22]:
# The group labels are joined to the route points column-wise later on, so there must
# be exactly one label per route point. Print both counts and enforce the match.
print(len(groups_path))
print(lonlat_df.shape[0])
assert len(groups_path) == lonlat_df.shape[0], (
    'path group labels ({}) and route points ({}) differ in length'.format(
        len(groups_path), lonlat_df.shape[0])
)

9552
9552


### Create series to add to dataframe

In [23]:
# Wrap the per-point label lists as named series so they can become data frame columns.
# Note that group_num is rebound here from the integer counter to a Series.
group_text = pd.Series(between_station_paths, name = 'Connecting Path')
group_num = pd.Series(groups_path, name = 'Group')
# Report the number of entries in each series (text first, then number)
for labelled_series in (group_text, group_num):
    print(labelled_series.shape[0])

9552
9552


### Create southbound and northbound path groups

In [24]:
# Every station except the last one in the list
stations_orient_southbound = amtrak_stations[:-1]
print(stations_orient_southbound)

['BOS', 'BBY', 'RTE', 'PVD', 'KIN', 'NLC', 'NHV', 'STM', 'NYP', 'NWK', 'TRE', 'PHL', 'WIL', 'BAL', 'BWI', 'NCR']


In [25]:
# Every station except the first one in the list
stations_orient_northbound = amtrak_stations[1:len(amtrak_stations)]
print(stations_orient_northbound)

['BBY', 'RTE', 'PVD', 'KIN', 'NLC', 'NHV', 'STM', 'NYP', 'NWK', 'TRE', 'PHL', 'WIL', 'BAL', 'BWI', 'NCR', 'WAS']


In [26]:
# Per-point station labels for each direction, initially unfilled
sb_path_group = pd.Series(index=group_num.index, name = 'Station_SB', dtype='object')
nb_path_group = pd.Series(index=group_num.index,  name = 'Station_NB', dtype='object')
for num in group_num.unique():
    # Group number k covers the leg between station k and station k+1 in list order
    nb_station = stations_orient_northbound[num]
    sb_station = stations_orient_southbound[num]
    print(num, '----', 'NB Station:', nb_station, 'SB Station:', sb_station)
    # Select the rows of this group with a boolean mask and label-based .loc.
    # Passing index labels to .iloc only works while the index is a default 0..n-1 range.
    in_group = group_num == num
    nb_path_group.loc[in_group] = nb_station
    sb_path_group.loc[in_group] = sb_station

0 ---- NB Station: BBY SB Station: BOS
1 ---- NB Station: RTE SB Station: BBY
2 ---- NB Station: PVD SB Station: RTE
3 ---- NB Station: KIN SB Station: PVD
4 ---- NB Station: NLC SB Station: KIN
5 ---- NB Station: NHV SB Station: NLC
6 ---- NB Station: STM SB Station: NHV
7 ---- NB Station: NYP SB Station: STM
8 ---- NB Station: NWK SB Station: NYP
9 ---- NB Station: TRE SB Station: NWK
10 ---- NB Station: PHL SB Station: TRE
11 ---- NB Station: WIL SB Station: PHL
12 ---- NB Station: BAL SB Station: WIL
13 ---- NB Station: BWI SB Station: BAL
14 ---- NB Station: NCR SB Station: BWI
15 ---- NB Station: WAS SB Station: NCR


In [27]:
# Join the route points with their group number, leg label and NB/SB station labels,
# side by side
path_columns = [lonlat_df, group_num, group_text, nb_path_group, sb_path_group]
lonlat_with_path_groups = pd.concat(path_columns, axis = 1)

In [28]:
# Preview the first 50 rows. A bare expression lets the notebook render the frame as a
# table instead of printing its plain-text form.
lonlat_with_path_groups.head(50)

    Longitude   Latitude  Group Connecting Path Station_NB Station_SB
0  -71.055112  42.351438      0         BOS-BBY        BBY        BOS
1  -71.055155  42.351094      0         BOS-BBY        BBY        BOS
2  -71.055263  42.350636      0         BOS-BBY        BBY        BOS
3  -71.055519  42.350146      0         BOS-BBY        BBY        BOS
4  -71.056269  42.348733      0         BOS-BBY        BBY        BOS
5  -71.056772  42.347865      0         BOS-BBY        BBY        BOS
6  -71.056806  42.347741      0         BOS-BBY        BBY        BOS
7  -71.057108  42.347493      0         BOS-BBY        BBY        BOS
8  -71.057242  42.347253      0         BOS-BBY        BBY        BOS
9  -71.057410  42.346955      0         BOS-BBY        BBY        BOS
10 -71.057566  42.346757      0         BOS-BBY        BBY        BOS
11 -71.057689  42.346617      0         BOS-BBY        BBY        BOS
12 -71.057823  42.346443      0         BOS-BBY        BBY        BOS
13 -71.058137  42.34

### Export everything to CSV

In [29]:
# Write the route points with their path-group columns; index = False omits the row index
lonlat_with_path_groups.to_csv("./data/facts/NE_regional_lonlat.csv", index = False)

In [30]:
# Display the station information table once more before exporting it
gdf

Unnamed: 0,STNCODE,STNNAME,STATE,AMTRAK_LOC,WEATHER_LOC,LON,LAT,NB_MILE,SB_MILE
BOS,BOS,"Boston (South Station), Massachusetts",MA,Boston,"Boston, MA",-71.05517,42.351642,457,0
BBY,BBY,"Boston (Back Bay), Massachusetts",MA,Boston,"Boston, MA",-71.075828,42.347317,456,1
RTE,RTE,"Westwood, Route 128 Station, Massachusetts",MA,Route 128,"Boston, MA",-71.147894,42.210242,446,11
PVD,PVD,"Providence, Rhode Island",RI,Providence,"Providence, RI",-71.413478,41.82949,414,43
KIN,KIN,"West Kingston, Rhode Island",RI,Kingston,"Kingston, RI",-71.560597,41.483959,387,70
NLC,NLC,"New London, Connecticut",CT,New London,"New London, CT",-72.093225,41.354267,352,105
NHV,NHV,"New Haven, Connecticut",CT,New Haven,"New Haven, CT",-72.92667,41.297714,301,156
STM,STM,"Stamford, Connecticut",CT,Stamford,"Stamford, CT",-73.54216,41.04713,262,195
NYP,NYP,"New York (Penn Station), New York",NY,New York,"Manhattan, NY",-73.994459,40.750327,226,231
NWK,NWK,"Newark (Penn Station), New Jersey",NJ,Newark,"Newark, NJ",-74.16475,40.734706,216,241


In [31]:
# Write the station table. index = False drops the station-code index, but the codes
# are still written because gdf also carries them in its STNCODE column.
gdf.to_csv('./data/facts/geo_stations_info.csv', index = False)