# Transit Time Data

**Drew Honson**

**7 November 2023**

In [1]:
# Tabular data
import pandas as pd
import numpy as np
from itertools import combinations
from datetime import datetime
from ast import literal_eval

# Set up googlemaps API
import googlemaps 
with open('data/gmaps_api.key') as f: # the API key is stored in a git ignored folder
    API_key = f.read()
gmaps = googlemaps.Client(key=API_key)

# Dataviz
import holoviews as hv
import bokeh.io

hv.extension('bokeh')
bokeh.io.output_notebook()

# Functions
def haversine(a,b):
    '''Calculates distance between two coordinates using the Haversine
    formula. a and b are tuples of the two coordinates in the form 
    (latitude, longitude)'''
    R = 6371 #km
    
    # Convert degrees to radians
    phi_a = a[0] * np.pi / 180
    phi_b = b[0] * np.pi / 180
    deltaphi = phi_a - phi_b
    
    deltalamb = (a[1] - b[1]) * np.pi / 180
    
    # Calculate Haversine values
    hav_a = np.sin(deltaphi / 2)**2 + np.cos(phi_a) * np.cos(phi_b) * np.sin(deltalamb / 2)**2
    hav_c = 2 * np.arctan2(np.sqrt(hav_a), np.sqrt(1-hav_a))
    
    return R * hav_c

def transit_penalty(test,
                   stop_penalty = 0.01,
                   dist_penalty = 0.05,
                   leg_penalty = 0.2,
                   paris_penalty = 0):
    '''Uses a gmaps directions output to calculate penalties for steps in a public
    transit journey'''
    
    # Set a dictionary to track parameters
    penalty_tracker = {'n_stops':[],
                       'bsc_dist':[],
                       'n_legs':0,
                       'paris':[],
                       'unhandled':[]}
    
    # Loop through journey steps
    for i in range(len(test[0]['legs'][0]['steps'])):
        # Set base for indexing
        base = test[0]['legs'][0]['steps'][i]

        # Extract travel mode
        travel_mode = test[0]['legs'][0]['steps'][i]['travel_mode']

        # Skip walking steps; usually a trivial distance
        if travel_mode == 'WALKING':
            pass

        # Handle public transit
        elif travel_mode == 'TRANSIT':
            # Add a leg to the journey
            penalty_tracker['n_legs'] += 1

            # Extract vehicle and stop information
            vehicle = base['transit_details']['line']['vehicle']['name']
            stop = base['transit_details']['arrival_stop']['name']

            # Add stop to check for Paris at end
            penalty_tracker['paris'].append(stop)

            # Add train stops
            if vehicle == 'Train':
                n_stops = base['transit_details']['num_stops']
                penalty_tracker['n_stops'].append(n_stops)

            # Add bus and subway distances 
            elif vehicle == 'Bus' or 'Subway':
                dist_km = base['distance']['value'] / 1000
                penalty_tracker['bsc_dist'].append(dist_km)

        # Add driving distance  
        elif travel_mode == 'DRIVING':
            penalty_tracker['n_legs'] += 1
            dist_km = base['distance']['value'] / 1000
            penalty_tracker['bsc_dist'].append(dist_km)

        # If unrecognized travel mode, add to unhandled
        else:
            penalty_tracker['unhandled'].append(travel_mode)
            
    # Calculate penalties
    penalties = {}

    # Number of heavy rail stops
    n_stops = np.sum(penalty_tracker['n_stops'])
    penalties['n_stops'] = stop_penalty * n_stops

    # Total vehicle distance
    bsc_dist = np.sum(penalty_tracker['bsc_dist'])
    penalties['bsc_dist'] = dist_penalty * bsc_dist

    # Number of legs
    penalties['n_legs'] = leg_penalty * penalty_tracker['n_legs']

    # Paris
    paris_exact = ['Paris' in i for i in penalty_tracker['paris']]
    paris_stations = [i in stations for i in penalty_tracker['paris']]
    paris_all = paris_exact + paris_stations

    if any(paris_all):
        penalties['paris'] = paris_penalty
    else:
        penalties['paris'] = 0

    total_penalty = 1 + sum(penalties.values())

    return total_penalty, penalty_tracker
    

# Town labels
town_dict = {'(48.6326, 3.4845)':['Louan-Villegruis','LO','France'],
             '(44.3203, 3.0658)':['Sévérac-le-Château',' MC','France'],
             '(47.4532, 0.5949)':['Angers','AN','France'],
             '(46.3764, 6.1202)':['Divonne les Bains','DB','France'],
             '(43.3529, 1.447)':['Espelette','ES','France'],
             '(48.659, 6.1415)':['Nancy','NA','France'],
             '(48.7922, 8.6354)':['Langenbrand','SL','Germany'],
             '(51.0017, 7.0383)':['Cologne-Bonn','CB','Germany']}

# Major Paris railway stations
stations = ["Gare d'Austerlitz",
           "Gare de Bercy",
           "Gare de l'Est",
           "Gare Montparnasse",
           "Gare du Nord",
           "Gare Saint-Lazare"]

## Importing data and demonstrating functions

First, the data generated from pre-processing is imported with Pandas. This includes the sample, the location where it was collected, the taxonomic data of cecal microbes, and the normalized number of reads per taxon.

In [2]:
alldata = pd.read_csv('data/filtData.csv')

alldata.head()

Unnamed: 0,taxonomy,run,reads,low tax,geo_loc_name,latitude,longitude,sample,coord,Phylum
0,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000481,o__Bacteroidales,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
1,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.080886,g__Bacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
2,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.073182,s__acidifaciens,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
3,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.155513,s__uniformis,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
4,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000963,g__Parabacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes


To make this dataframe more compatible with Google Maps, a standard "city, country" column is added based on the latitude and longitude already present in the dataframe.

In [3]:
for i in alldata.index:
    alldata.loc[i,'city'] = town_dict[alldata.loc[i,'coord']][0]
    alldata.loc[i,'citycountry'] = ', '.join(alldata.loc[i,['city','geo_loc_name']])
    
alldata.head()

  alldata.loc[i,'city'] = town_dict[alldata.loc[i,'coord']][0]
  alldata.loc[i,'citycountry'] = ', '.join(alldata.loc[i,['city','geo_loc_name']])


Unnamed: 0,taxonomy,run,reads,low tax,geo_loc_name,latitude,longitude,sample,coord,Phylum,city,citycountry
0,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000481,o__Bacteroidales,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes,Angers,"Angers, France"
1,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.080886,g__Bacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes,Angers,"Angers, France"
2,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.073182,s__acidifaciens,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes,Angers,"Angers, France"
3,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.155513,s__uniformis,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes,Angers,"Angers, France"
4,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000963,g__Parabacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes,Angers,"Angers, France"


To make sure that the API key for Google Maps is working, we can run a single pair of locations.

In [4]:
# Demonstration of function
test = gmaps.directions('Angers, France', 'Louan-Villegruis, France',mode='transit',transit_mode='rail',
                       departure_time=datetime(2023,11,6,9,0,0))

The output contains a large amount of information about the journey. For example:

In [5]:
# The distance of each step of the journey
print('Distance of step: ' + test[0]['legs'][0]['steps'][0]['distance']['text'])

# Mode of transit
print('Mode of Transit: ' + test[0]['legs'][0]['steps'][0]['transit_details']['line']['vehicle']['type'])

# Total number of legs
print('Number of Legs: ' + str(len(test[0]['legs'][0]['steps'])))

Distance of step: 108 km
Mode of Transit: HEAVY_RAIL
Number of Legs: 10


To parse these data into a relative distance measurement between two places, I wrote a function to calculate the number of heavy rail stops, the distance that must be travelled by bus, subway, or car, and the number of legs. The distance penalty for each of these can be set in the function. Arbitrarily, I've set the penalty per stop to 1% distance, the penalty for each kilometer travelled off of heavy rail to 4%, and the penalty per leg to 15%. As mentioned in the update summary, I plan to simplify this to only extract the total percentage of the journey covered by heavy rail.

In [6]:
p, d = transit_penalty(test,
                stop_penalty = 0.01,
                dist_penalty = 0.04,
                leg_penalty = 0.15,)

print('Compiled Data: ')
print(d)

print('Total Distance Penalty: ' + str(p))

Compiled Data: 
{'n_stops': [2, 1, 7], 'bsc_dist': [5.241, 4.476, 47.869], 'n_legs': 6, 'paris': ['Saint-Pierre-Des-Corps', 'Paris Montparnasse Hall 1 - 2', "Gare de l'Est", 'Provins', 'Ville Haute', 'Louan'], 'unhandled': []}
Total Distance Penalty: 4.30344


Next, the geographic distance and the distance penalty are calculated for all location pairs.

In [7]:
# Collect all city pairs
citycountry = np.unique(alldata['citycountry'])
pairs = []

for i in citycountry:
    for j in citycountry:
        pairs.append((i,j))
        
pairs = list(set(pairs))

for p in pairs:
    if p[0] == p[1]:
        pairs.remove(p)
    else:
        pass     

# Collect city-coordinate pairs
citycoords = {}
for i in town_dict:
    location = ', '.join([town_dict[i][0],town_dict[i][-1]])
    citycoords[location] = literal_eval(i)
    
citycoords

{'Louan-Villegruis, France': (48.6326, 3.4845),
 'Sévérac-le-Château, France': (44.3203, 3.0658),
 'Angers, France': (47.4532, 0.5949),
 'Divonne les Bains, France': (46.3764, 6.1202),
 'Espelette, France': (43.3529, 1.447),
 'Nancy, France': (48.659, 6.1415),
 'Langenbrand, Germany': (48.7922, 8.6354),
 'Cologne-Bonn, Germany': (51.0017, 7.0383)}

In [8]:
# Calculate distances and penalties
distdata = {}

for c in pairs:
    a = citycoords[c[0]]
    b = citycoords[c[1]]
    crowflies = haversine(a,b)
    
    gmap_dir = gmaps.directions(c[0],c[1],mode='transit',transit_mode='rail',
                       departure_time=datetime(2023,11,6,9,0,0))
    
    distdata[c] = {'crow':crowflies,
                   'gmaps':gmap_dir}
    
print('Done')

Done


Between some locations, Google Maps cannot find any path that primarily relies on public transit. These are flagged for the relative distance calculations down the line.

In [9]:
penalty_list = []

for i in distdata:
    d = distdata[i]
    if len(d['gmaps']) == 0:
        penalty_list.append((i,'gmaps error'))
    else:
        p, pdata = transit_penalty(d['gmaps'])
        penalty_list.append((i,p,pdata))
        
penalty_list[0]

(('Angers, France', 'Espelette, France'),
 2.70595,
 {'n_stops': [3, 3, 14, 4],
  'bsc_dist': [9.319],
  'n_legs': 5,
  'paris': ['Saint-Pierre-Des-Corps',
   'Saint-Jean',
   'Gare de Bayonne',
   'Gare de Cambo-les-Bains',
   'Mendi Alde Bourg'],
  'unhandled': []})

The relative distances are calculated by the geographic distance times the penalty. For now, pairs with errors are flagged and excluded from analysis. 

In [29]:
for i in penalty_list:
    location = i[0]
    if type(i[1]) == str:
        pass
        
    else:
        distdata[location]['adjdist'] = distdata[location]['crow'] * i[1]

ls_df = []

for i in distdata:
    if type(distdata[i]['adjdist']) != np.float64:
        pass
    else:
        ls_df.append((i[0],i[1],distdata[i]['crow'],distdata[i]['adjdist']))
    
df = pd.DataFrame(ls_df,columns=['start','end','dist (km)', 'adjdist (au)'])

df.head()

Unnamed: 0,start,end,dist (km),adjdist (au)
0,"Angers, France","Espelette, France",460.751877,1246.77154
1,"Cologne-Bonn, Germany","Langenbrand, Germany",270.996044,4564.792866
2,"Sévérac-le-Château, France","Angers, France",397.355563,1755.099654
3,"Louan-Villegruis, France","Angers, France",251.659635,975.998978
4,"Langenbrand, Germany","Espelette, France",819.82737,3578.382507


We can now see that adjusted distance is correlated with geographic distance. However, two populations of location pairs exist. In the population with higher adjusted differences, transit does little to reduce the effective distance between two places. For the lower adjusted differences, transit allows the relative distance to remain relatively constant over a wide range of geographic distances.

In [37]:
hv.Scatter(df,kdims=['dist (km)'],vdims=['adjdist (au)','start','end']
          ).opts(height=400,
                 width=400,
                 size=4,
                 alpha=0.5,
                 color='tomato',
                 tools=['hover'])

Up until this point, all calculations have been done for pairs going in both directions. For example, for the location pair Espelette and Angers relative distances were calculated both for journeys from Espelette to Angers and from Angers to Espelette. To reduce this to a single number, the average relative distance for both journeys is taken. This value is used for the remaining calculations.

In [42]:
sepair = []

for i in df.index:
    se = np.sort(df.loc[i,['start','end']])
    se = tuple(se)
    sepair.append(se)
    
df['se_pair'] = sepair

df.head()

Unnamed: 0,start,end,dist (km),adjdist (au),se_pair
0,"Angers, France","Espelette, France",460.751877,1246.77154,"(Angers, France, Espelette, France)"
1,"Cologne-Bonn, Germany","Langenbrand, Germany",270.996044,4564.792866,"(Cologne-Bonn, Germany, Langenbrand, Germany)"
2,"Sévérac-le-Château, France","Angers, France",397.355563,1755.099654,"(Angers, France, Sévérac-le-Château, France)"
3,"Louan-Villegruis, France","Angers, France",251.659635,975.998978,"(Angers, France, Louan-Villegruis, France)"
4,"Langenbrand, Germany","Espelette, France",819.82737,3578.382507,"(Espelette, France, Langenbrand, Germany)"


In [43]:
se_ls = []

for i in list(set(df['se_pair'])):
    subdf = df[df['se_pair'] == i]
    
    avg_adist = np.mean(subdf['adjdist (au)'])
    se_ls.append((i[0],i[1],subdf.iloc[0]['dist (km)'],avg_adist))
    
se_df = pd.DataFrame(se_ls,columns=['loc1','loc2','dist (km)','adj dist (au)'])

se_df.head()

Unnamed: 0,loc1,loc2,dist (km),adj dist (au)
0,"Angers, France","Espelette, France",460.751877,1246.77154
1,"Cologne-Bonn, Germany","Langenbrand, Germany",270.996044,4094.58763
2,"Angers, France","Louan-Villegruis, France",251.659635,1139.709862
3,"Cologne-Bonn, Germany","Sévérac-le-Château, France",799.990157,25814.522367
4,"Langenbrand, Germany","Louan-Villegruis, France",378.26948,1652.25272


In [44]:
se_df.to_csv('data/transittimes.csv',index=False)

In [45]:
%load_ext watermark
%watermark -v --iversions

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.15.0

pandas    : 2.1.1
googlemaps: 4.10.0
holoviews : 1.18.0
numpy     : 1.24.3
bokeh     : 3.2.1

