# Biogeme Test - SMTO Data

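This notebook adapts the Biogeme Swissmetro example to university choice modelling with the SMTO data set. Here we try to predict __mode__ choice (auto, transit, or active) from level-of-service variables between home and campus: travel time, walking distance, and travel cost.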

In [1]:
import pandas as pd

import biogeme.biogeme as bio
import biogeme.database as db
import biogeme.messaging as msg
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable

# Raw SMTO 2019 survey tables (households and respondents); both are read with
# low_memory=False so pandas infers each column's dtype from the whole file.
hh_df, ps_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../../../Data/SMTO_2019/SMTO_2019_Households.csv',
        '../../../Data/SMTO_2019/SMTO_2019_Respondents.csv',
    )
)

# Campus table: one row per campus with its name ('Campus') and zone ('Zone').
zone_df = pd.read_csv('../../../Data/SMTO_2019/CampusZones.csv')

In [2]:
# NOTE(review): the next cell rebuilds `df` and `campus_name_to_num` from scratch,
# so this cell is redundant and is a candidate for deletion.

# One row per respondent: main campus, institution, home zone and typical fall mode.
df = (
    ps_df[['pscampusmain']]
    .join(ps_df['psinstitution'])
    .join(hh_df[['HmTTS2006']])
    .join(ps_df[['psmainmodefalltypical']])
    .rename(columns={'HmTTS2006': 'HomeZone', 'pscampusmain': 'Campus', 'psmainmodefalltypical': 'Mode'})
)

# Campus name -> its row position in zone_df (a repeated name keeps its last position).
campus_name_to_num = {name: num for num, name in enumerate(zone_df['Campus'])}


In [3]:
# One row per respondent: main campus, institution, home zone and typical fall mode.
# NOTE(review): the joins align on the row index, so this assumes the household and
# respondent files list respondents in the same order - TODO confirm.
df = (
    ps_df[['pscampusmain']]
    .join(ps_df['psinstitution'])
    .join(hh_df[['HmTTS2006']])
    .join(ps_df[['psmainmodefalltypical']])
    .rename(columns={'HmTTS2006': 'HomeZone', 'pscampusmain': 'Campus', 'psmainmodefalltypical': 'Mode'})
)

# Campus name -> campus number, numbered by row position in zone_df.
campus_name_to_num = {name: num for num, name in enumerate(zone_df['Campus'])}

# Numeric alternative codes. They must be numbers (not labels) because the rows are
# filtered on 0/1/2 below, and they must match the keys of the utility dictionary
# V = {0: ..., 1: ..., 2: ...} used by the model.
AUTO, TRANSIT, ACTIVE = 0, 1, 2
mode_name_to_num = {
    "Drive alone": AUTO,
    "Drive with passenger(s) (household members only)": AUTO,
    "Drive with passenger(s) (including non-household members)": AUTO,
    "Auto passenger (driver is a household member)": AUTO,
    "Auto passenger (driver is a non-household member)": AUTO,
    "Ride-hailing alone (UberX, Lyft etc.)": AUTO,
    "Ride-hailing with other passengers (Uberpool, Lyftpool etc.)": AUTO,
    "Taxi": AUTO,
    "Transit Bus": TRANSIT,
    "Streetcar": TRANSIT,
    "Subway/RT": TRANSIT,
    "GO Bus": TRANSIT,
    "GO Train": TRANSIT,
    "Walk": ACTIVE,
    "Bicycle": ACTIVE,
    "Bikeshare": ACTIVE,
}

# For these two institutions the campus is overwritten with the institution name.
df.loc[df['psinstitution'] == 'Ryerson University', 'Campus'] = 'Ryerson University'
df.loc[df['psinstitution'] == 'OCAD University', 'Campus'] = 'OCAD University'

# Convert the Campus and Mode columns to numeric codes.
df = df.replace({'Campus': campus_name_to_num})
# Modes that are not in the dictionary ("other" modes, missing answers) become NaN.
df['Mode'] = df['Mode'].map(mode_name_to_num)
print(df.shape)

# Remove "Other" modes; .copy() so the column assignments below act on an
# independent frame rather than on a slice of the previous one.
df = df[df['Mode'].isin([AUTO, TRANSIT, ACTIVE])].copy()
print(df.shape)
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Mode'] = pd.to_numeric(df['Mode'], downcast='signed')
df['Campus'] = pd.to_numeric(df['Campus'], downcast='signed')
df = df.drop(columns='psinstitution')

df = df.dropna()  # Remove rows with missing data
print(df.shape)
df.head()

(19135, 4)
(10595, 4)
(10515, 3)


Unnamed: 0,Campus,HomeZone,Mode
0,2,3851.0,1
1,3,181.0,1
2,2,1039.0,1
5,2,544.0,1
11,2,548.0,1


In [4]:
# Dataframe with walk distances
df_path = pd.read_csv('../../../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../../../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../../../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

# Dataframe with TransitTravelTimes
df_ac = pd.read_csv('../../../../LoS/Auto_Cost.csv')
AutoCosts = list(df_ac['Data'])

# Dataframe with TransitTravelTimes
df_tc = pd.read_csv('../../../../LoS/Transit_Cost.csv')
TransitCosts = list(df_tc['Data'])

In [5]:
# Zone IDs that were looked up but do not appear in `origins`.
not_found = set()

# Row length used to index the flattened LoS lists (position = i * ZONES_PER_ROW + j).
# NOTE(review): presumably this should equal len(origins) - confirm against the LoS files.
ZONES_PER_ROW = 2392

# Cache of {zone ID: position in `origins`} together with the list it was built from.
_zone_position_cache = {"zones": None, "position": {}}


def _zone_positions():
    """Return {zone ID: position in `origins`}, rebuilt whenever `origins` is a new list.

    The identity check catches `origins` being re-created (e.g. the loading cell is
    re-run); in-place edits of the same list object are not detected.
    """
    if _zone_position_cache["zones"] is not origins:
        position = {}
        for pos, zone in enumerate(origins):
            # setdefault keeps the first occurrence, like list.index does.
            position.setdefault(zone, pos)
        _zone_position_cache["zones"] = origins
        _zone_position_cache["position"] = position
    return _zone_position_cache["position"]


# Function for distance/AutoTravelTime/TransitTravelTime lookup
def find_value(origin, destination, mode_num):
    """Look up one level-of-service value for an origin/destination zone pair.

    Args:
        origin: zone ID of the trip origin.
        destination: zone ID of the trip destination.
        mode_num: which table to read:
            0 - Auto Travel Time
            1 - Transit Travel Time
            2 - Walking Distance (scaled by 15/1000)
            3 - Auto Cost
            4 - Transit Cost

    Returns:
        The table value for the pair. Returns 0 when either zone is missing from
        `origins` (the zone is then recorded in `not_found`) or when `mode_num`
        is not one of 0-4 (an error message is printed).
    """
    positions = _zone_positions()  # dict lookup instead of two linear list scans

    i = positions.get(origin)
    if i is None:
        not_found.add(origin)
        return 0
    j = positions.get(destination)
    if j is None:
        not_found.add(destination)
        return 0

    flat = i * ZONES_PER_ROW + j
    if mode_num == 0:
        return AutoTravelTimes[flat]
    elif mode_num == 1:
        return TransitTravelTimes[flat]
    elif mode_num == 2:
        # NOTE(review): 15/1000 presumably turns metres into walking minutes
        # (15 min per km) - confirm the units of Walk_Distances.csv.
        return dists[flat] * 15/1000
    elif mode_num == 3:
        return AutoCosts[flat]
    elif mode_num == 4:
        return TransitCosts[flat]
    else:
        print("ERROR: Enter correct mode_num!")
        return 0

In [6]:
# List of campus' TTS zones from Joven's MOE data
campus_zones = list(zone_df['Zone'])
# Add column with campus zones
df['CampusZone'] = df['Campus'].apply(lambda x: campus_zones[x])

# Add columns for Distance, Transit Travel Time, Auto Travel Time
df['AIVTT'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 0), axis=1)
df['TPTT'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 1), axis=1)
df['Dist'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 2), axis=1)
df['AuCost'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 3), axis=1)
df['TrCost'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 4), axis=1)

In [7]:
# Inspect the frame after the campus-zone and level-of-service columns were added.
print(df.shape)
df.head()

(10515, 9)


Unnamed: 0,Campus,HomeZone,Mode,CampusZone,AIVTT,TPTT,Dist,AuCost,TrCost
0,2,3851.0,1,493,74.84418,140.74206,805.06845,3.943282,13.32598
1,3,181.0,1,564,20.2813,97.424699,325.7691,1.515612,2.22
2,2,1039.0,1,493,24.65909,96.60153,267.78,1.229752,6.822792
5,2,544.0,1,493,11.94488,56.045654,144.11502,0.706237,2.22
11,2,548.0,1,493,8.111495,37.1335,93.384255,0.423913,2.22


In [8]:
# Wrap the prepared frame in a Biogeme database
database = db.Database("SMTO", df)

# New database column: every mode is treated as available to every student
Available = DefineVariable('Available', 1, database)

# Expose the database variables as module-level names (Dist, TPTT, AIVTT, Mode, ...)
globals().update(database.variables)

# Remove unknown values: find_value returns 0 when a zone is missing from the
# level-of-service tables.
# NOTE(review): the 0.1e20 threshold presumably catches a "no path" sentinel in the
# auto travel times - confirm.
for unknown_value in (Dist == 0.0, TPTT == 0.0, AIVTT == 0.0, AIVTT > 0.1e20):
    database.remove(unknown_value)

In [9]:
# Model specification: parameters to estimate, explanatory variables, and one
# utility function per alternative (0, 1, 2).

# Beta initialization: (name, value, lowerbound, upperbound, status, desc='')
# Status 0 if estimated, 1 if maintained - reference choice should be 1
# ASC_AU is held fixed at 0, so the other two constants are measured relative to it.
ASC_AU = Beta('ASC_AU', 0, None, None, 1)
ASC_TR = Beta('ASC_TR', 0, None, None, 0)
ASC_AC = Beta('ASC_AC', 0, None, None, 0)
# One time coefficient shared by all three utility functions below.
B_TIME = Beta('B_TIME', -0.005, None, None, 0)
# Earlier variants, currently disabled:
#B_TPTT = Beta('B_TPTT', -0.05, None, None, 0)
#B_DIST = Beta('B_DIST', -0.0005, None, None, 0)
B_AUCOST = Beta('B_AUCOST', -0.0005, None, None, 0)
B_TRCOST = Beta('B_TRCOST', -0.0005, None, None, 0)

# Variables: from columns in database
AV = DefineVariable('AV', Available, database)
AU_AIVTT = DefineVariable('AU_AIVTT', AIVTT, database)
TR_TPTT = DefineVariable('TR_TPTT', TPTT, database)
# The Dist column is the walk distance already scaled by 15/1000 in find_value.
# NOTE(review): presumably that makes it a walking time, which would explain why it
# is multiplied by B_TIME below - confirm the units.
AC_WT = DefineVariable('AC_WT', Dist, database)
AU_COST = DefineVariable('AU_COST', AuCost, database)
TR_COST = DefineVariable('TR_COST', TrCost, database)

# Mode Choice Utility Functions: ASC_AU is 0
V0 = ASC_AU + B_TIME * AU_AIVTT + B_AUCOST * AU_COST
V1 = ASC_TR + B_TIME * TR_TPTT + B_TRCOST * TR_COST
V2 = ASC_AC + B_TIME * AC_WT


# Utility and availability of each alternative, keyed by the code stored in Mode.
V  = {0: V0, 1: V1, 2: V2}
av = {0: AV, 1: AV, 2: AV}

In [10]:
# Preview the database columns, including those added by DefineVariable;
# head() keeps the displayed output bounded.
database.data.head(10)

Unnamed: 0,Campus,HomeZone,Mode,CampusZone,AIVTT,TPTT,Dist,AuCost,TrCost,Available,AV,AU_AIVTT,TR_TPTT,AC_WT,AU_COST,TR_COST
0,2,3851.0,1,493,74.844180,140.742060,805.068450,3.943282,13.325980,1,1.0,74.844180,140.742060,805.068450,3.943282,13.325980
1,3,181.0,1,564,20.281300,97.424699,325.769100,1.515612,2.220000,1,1.0,20.281300,97.424699,325.769100,1.515612,2.220000
2,2,1039.0,1,493,24.659090,96.601530,267.780000,1.229752,6.822792,1,1.0,24.659090,96.601530,267.780000,1.229752,6.822792
5,2,544.0,1,493,11.944880,56.045654,144.115020,0.706237,2.220000,1,1.0,11.944880,56.045654,144.115020,0.706237,2.220000
11,2,548.0,1,493,8.111495,37.133500,93.384255,0.423913,2.220000,1,1.0,8.111495,37.133500,93.384255,0.423913,2.220000
14,1,2611.0,1,526,80.103930,247.004720,798.352500,4.053303,13.421110,1,1.0,80.103930,247.004720,798.352500,4.053303,13.421110
19,3,615.0,1,564,7.260223,40.354967,85.159965,0.390787,2.220000,1,1.0,7.260223,40.354967,85.159965,0.390787,2.220000
21,0,249.0,1,282,13.301790,46.786401,84.497340,0.382239,2.220000,1,1.0,13.301790,46.786401,84.497340,0.382239,2.220000
23,2,4084.0,1,493,100.708900,341.983850,1165.680300,5.422310,14.287340,1,1.0,100.708900,341.983850,1165.680300,5.422310,14.287340
27,2,446.0,1,493,24.871310,83.488038,290.364150,1.399990,2.220000,1,1.0,24.871310,83.488038,290.364150,1.399990,2.220000


In [11]:
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V, av, Mode)

# Define level of verbosity (`msg` is biogeme.messaging, imported in the first cell).
# Debug prints every parameter value at every iteration; the quieter levels are
# setWarning(), setGeneral() and setDetailed().
logger = msg.bioMessage()
logger.setDebug()

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob, numberOfThreads=1)
biogeme.modelName = "SMTO_2019_ModeChoice_withCost"

# Estimate the parameters
results = biogeme.estimate(saveIterations=True)
biogeme.createLogFile()

# Print the estimated values
betas = results.getBetaValues()
for k, v in betas.items():
    print(f"{k:10}=\t{v:.3g}")

# Get the results in a pandas table; as the last expression of the cell it is
# rendered as a rich table.
pandasResults = results.getEstimatedParameters()
pandasResults

[17:16:37] < General >   Remove 9 unused variables from the database as only 7 are used.
[17:16:37] < General >   Log likelihood (N=9667):  -17338.93
[17:16:37] < General >   Minimize with tol 1e-07
[17:16:37] < Debug >     ASC_AC:          0
[17:16:37] < Debug >     ASC_TR:          0
[17:16:37] < Debug >     B_AUCOST:    -0.0005
[17:16:37] < Debug >     B_TIME:     -0.005
[17:16:37] < Debug >     B_TRCOST:    -0.0005
[17:16:37] < General >   Log likelihood (N=9667):  -17338.93 Gradient norm:      2e+06  
[17:16:37] < Debug >     ASC_AC: 0.0001796421
[17:16:37] < Debug >     ASC_TR: 0.00191143
[17:16:37] < Debug >     B_AUCOST: -0.01109138
[17:16:37] < Debug >     B_TIME:  0.9948373
[17:16:37] < Debug >     B_TRCOST: 0.01397189
[17:16:37] < General >   Log likelihood (N=9667):   -2624920 Gradient norm:      3e+06  
[17:16:37] < Debug >     ASC_AC: 1.956481e-05
[17:16:37] < Debug >     ASC_TR: 0.0002081737
[17:16:37] < Debug >     B_AUCOST: -0.001653506
[17:16:37] < Debug >     B_TIME:

[17:16:38] < Debug >     ASC_TR:  0.3625981
[17:16:38] < Debug >     B_AUCOST:  0.2106904
[17:16:38] < Debug >     B_TIME: 0.001010027
[17:16:38] < Debug >     B_TRCOST:   0.135186
[17:16:38] < General >   Log likelihood (N=9667):  -9030.707 Gradient norm:      1e+05  
[17:16:38] < Debug >     ASC_AC: -0.1088581
[17:16:38] < Debug >     ASC_TR:  0.5667136
[17:16:38] < Debug >     B_AUCOST:  0.3995462
[17:16:38] < Debug >     B_TIME: 0.009180853
[17:16:38] < Debug >     B_TRCOST: 0.03143834
[17:16:38] < General >   Log likelihood (N=9667):  -22840.21 Gradient norm:      2e+06  
[17:16:38] < Debug >     ASC_AC: -0.07339608
[17:16:38] < Debug >     ASC_TR:  0.3641714
[17:16:38] < Debug >     B_AUCOST:  0.2121461
[17:16:38] < Debug >     B_TIME: 0.001073007
[17:16:38] < Debug >     B_TRCOST:  0.1343863
[17:16:38] < General >   Log likelihood (N=9667):  -9029.038 Gradient norm:      9e+04  
[17:16:38] < Debug >     ASC_AC: -0.08783983
[17:16:38] < Debug >     ASC_TR:  0.4358736
[17:16:38] <

[17:16:39] < Debug >     ASC_AC: -0.04242974
[17:16:39] < Debug >     ASC_TR:   1.230071
[17:16:39] < Debug >     B_AUCOST: 0.02582382
[17:16:39] < Debug >     B_TIME: -5.22776e-05
[17:16:39] < Debug >     B_TRCOST: 0.01861268
[17:16:39] < General >   Log likelihood (N=9667):   -8542.68 Gradient norm:      9e+04  
[17:16:39] < Debug >     ASC_AC: 0.02616552
[17:16:39] < Debug >     ASC_TR:   1.243882
[17:16:39] < Debug >     B_AUCOST: 0.04306049
[17:16:39] < Debug >     B_TIME: -3.8933e-05
[17:16:39] < Debug >     B_TRCOST: 0.02670523
[17:16:39] < General >   Log likelihood (N=9667):  -8529.032 Gradient norm:      1e+05  
[17:16:39] < Debug >     ASC_AC:  0.1723267
[17:16:39] < Debug >     ASC_TR:   1.292106
[17:16:39] < Debug >     B_AUCOST: 0.05938568
[17:16:39] < Debug >     B_TIME: -6.291867e-05
[17:16:39] < Debug >     B_TRCOST: 0.03577687
[17:16:39] < General >   Log likelihood (N=9667):  -8501.834 Gradient norm:      2e+05  
[17:16:39] < Debug >     ASC_AC:  0.3843659
[17:16:39]

[17:16:40] < Debug >     ASC_AC:  0.5232947
[17:16:40] < Debug >     ASC_TR:   1.555416
[17:16:40] < Debug >     B_AUCOST: 0.01869548
[17:16:40] < Debug >     B_TIME: -0.0002651738
[17:16:40] < Debug >     B_TRCOST: 0.01492614
[17:16:40] < General >   Log likelihood (N=9667):  -8437.848 Gradient norm:      0.009  
[17:16:40] < Debug >     ASC_AC:  0.5232947
[17:16:40] < Debug >     ASC_TR:   1.555416
[17:16:40] < Debug >     B_AUCOST: 0.01869547
[17:16:40] < Debug >     B_TIME: -0.0002651738
[17:16:40] < Debug >     B_TRCOST: 0.01492614
[17:16:40] < General >   Log likelihood (N=9667):  -8437.848 Gradient norm:      1e-05  
[17:16:40] < Debug >     ASC_AC:  0.5232947
[17:16:40] < Debug >     ASC_TR:   1.555416
[17:16:40] < Debug >     B_AUCOST: 0.01869547
[17:16:40] < Debug >     B_TIME: -0.0002651738
[17:16:40] < Debug >     B_TRCOST: 0.01492614
[17:16:40] < General >   Log likelihood (N=9667):  -8437.848 Gradient norm:      1e-05 Hessian norm:       9e+08 BHHH norm:       1e+09
[17:1

Running the estimation cell above produces several output files. The most important is the HTML file, which reports the estimation results.

### Resources
UT Austin example: https://www.youtube.com/watch?v=QeJgyBIaXMQ  
Biogeme example: https://www.youtube.com/watch?v=OiM94B8WayA  
Nested logit example: https://www.youtube.com/watch?v=vEhvf54IKvs