# Biogeme Test - SMTO Data

This notebook adapts the Biogeme Swissmetro example to the university choice context using the SMTO data set. Here we try to predict __mode__ choice (auto, transit or active) using distance, travel time and travel cost.

In [4]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.messaging as msg
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd

# Raw SMTO 2015 survey tables: one for households, one for respondents
households_csv = '../../../Data/SMTO_2015/SMTO_2015_Households.csv'
respondents_csv = '../../../Data/SMTO_2015/SMTO_2015_Respondents.csv'
hh_df = pd.read_csv(households_csv)
ps_df = pd.read_csv(respondents_csv)

In [5]:
# Load relevant columns
# NOTE(review): DataFrame.join aligns on the row index, so this assumes row i of the
# households file belongs to the respondent in row i of the respondents file — confirm.
df = ps_df[['pscampusattend']]
df = df.join(hh_df[['HmTTS2006']])
df = df.join(ps_df[['psmainmodefalltypical']])
df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'psmainmodefalltypical': 'Mode'})
df = df.dropna() # Remove rows with missing data

# Convert Campus and Mode column to numerical column
campus_name_to_num = {"Downtown Toronto (St. George)": 0, "Scarborough (UTSC)": 1, "Mississauga (UTM)": 2,
                      "Keele": 3, "Glendon": 4, "RyersonU": 5, "OCADu": 6}
# Detailed modes are collapsed into three alternatives: 0 = auto, 1 = transit, 2 = walk/bicycle
mode_name_to_num = {"Car - Driver alone": 0, "Car - Driver with passenger(s)": 0, "Car - Passenger": 0, "Taxi": 0, "Transit Bus": 1, "Streetcar": 1, "Subway/RT": 1, "GO Bus": 1, "GO Train": 1, "Walk": 2, "Bicycle": 2}

# One non-mutating replace: look up each column's values in its own mapping
df = df.replace({'Campus': campus_name_to_num, 'Mode': mode_name_to_num})
print(df.shape)
# Remove "Other" Modes (anything the mapping above left as text).
# .copy() makes the filtered frame independent, so the column assignments below
# do not trigger SettingWithCopyWarning.
df = df[df['Mode'].isin([0, 1, 2])].copy()
print(df.shape)
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Mode'] = pd.to_numeric(df['Mode'], downcast='signed')

df.head()

(15064, 3)
(14915, 3)


Unnamed: 0,Campus,HomeZone,Mode
0,1,261,1
1,0,71,2
2,0,3714,1
3,0,74,2
4,0,71,2


In [12]:
# Dataframe with walk distances
df_path = pd.read_csv('../../../../LoS/Walk_Distances.csv')
# find_value uses a zone's position in `origins` as its row/column in the flattened
# 'Data' columns, so the order of this list matters. A set has no guaranteed
# iteration order; keep the zones in the order they first appear in the file instead.
# NOTE(review): assumes the CSVs are laid out origin-major with destinations in the
# same zone order — confirm against the LoS files.
origins = df_path['Origin'].drop_duplicates().tolist()
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../../../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../../../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

# Dataframe with AutoCosts
df_ac = pd.read_csv('../../../../LoS/Auto_Cost.csv')
AutoCosts = list(df_ac['Data'])

# Dataframe with TransitCosts
df_tc = pd.read_csv('../../../../LoS/Transit_Cost.csv')
TransitCosts = list(df_tc['Data'])

In [16]:
not_found = set()  # zones that were requested but are not present in `origins`

# Row stride of the flattened level-of-service lists.
# NOTE(review): presumably the number of zones in each LoS matrix — confirm it equals len(origins).
ZONES_PER_ROW = 2392

# Lazily built zone -> position index, remembered together with the `origins`
# list it was built from so that re-running the loading cell invalidates it.
_zone_index_cache = {'source': None, 'size': -1, 'index': {}}


def _zone_position(zone):
    """Return the position of `zone` in `origins`, or None when it is absent."""
    cache = _zone_index_cache
    if cache['source'] is not origins or cache['size'] != len(origins):
        index = {}
        for position, known_zone in enumerate(origins):
            # First occurrence wins, matching list.index
            index.setdefault(known_zone, position)
        cache.update(source=origins, size=len(origins), index=index)
    return cache['index'].get(zone)


# Function for distance/AutoTravelTime/TransitTravelTime lookup
def find_value(origin, destination, mode_num):
    """Look up one level-of-service value for an origin/destination zone pair.

    Parameters:
        origin: origin zone, matched against the entries of `origins`.
        destination: destination zone, matched against the entries of `origins`.
        mode_num: which value to return
            0 - Auto Travel Time
            1 - Transit Travel Time
            2 - Walking Distance (stored value divided by 1000)
            3 - Auto Cost
            4 - Transit Cost

    Returns:
        The requested value, or 0 when a zone is unknown or mode_num is invalid.
        An unknown zone is also recorded in `not_found`; the origin is checked
        first, so the destination is not examined when the origin is unknown.
    """
    i = _zone_position(origin)
    if i is None:
        not_found.add(origin)
        return 0
    j = _zone_position(destination)
    if j is None:
        not_found.add(destination)
        return 0

    # Position of the (origin, destination) cell in the flattened lists
    cell = i * ZONES_PER_ROW + j
    if mode_num == 0:
        return AutoTravelTimes[cell]
    elif mode_num == 1:
        return TransitTravelTimes[cell]
    elif mode_num == 2:
        return dists[cell] / 1000
    elif mode_num == 3:
        return AutoCosts[cell]
    elif mode_num == 4:
        return TransitCosts[cell]
    else:
        print("ERROR: Enter correct mode_num!")
        return 0

In [17]:
# TTS zone of each campus, indexed by the numeric campus code (from Joven's MOE data)
campus_zones = [69, 566, 3631, 391, 225, 38, 67]

# Attach the zone of the campus each respondent attends
df['CampusZone'] = df['Campus'].map(lambda campus: campus_zones[campus])

# Level-of-service columns, each paired with the find_value mode_num that fills it:
# auto travel time, transit travel time, distance, auto cost, transit cost
los_columns = {'AIVTT': 0, 'TPTT': 1, 'Dist': 2, 'AuCost': 3, 'TrCost': 4}
for column, mode_num in los_columns.items():
    df[column] = df.apply(
        lambda row, mode_num=mode_num: find_value(row.HomeZone, row.CampusZone, mode_num),
        axis=1,
    )

In [18]:
# Preview the table with the campus zone and level-of-service columns added
df.head()

Unnamed: 0,Campus,HomeZone,Mode,CampusZone,AIVTT,TPTT,Dist,AuCost,TrCost
0,1,261,1,566,17.47422,75.468478,14.88098,1.00664,2.22
1,0,71,2,69,2.924953,24.128386,1.132351,0.105459,2.22
2,0,3714,1,69,50.17188,155.55117,23.31923,1.751424,4.415403
3,0,74,2,69,1.049121,16.675728,0.699414,0.047282,2.22
4,0,71,2,69,2.924953,24.128386,1.132351,0.105459,2.22


In [19]:
# Wrap the prepared table in a Biogeme database
database = db.Database("SMTO", df)

# New database column: every mode is treated as available to every student
Available = DefineVariable('Available', 1, database)

# Expose the database variables as global names
globals().update(database.variables)

# Drop observations with unknown values (find_value returns 0 for unknown zones)
for unknown_los in (Dist, TPTT, AIVTT):
    database.remove(unknown_los == 0.0)

In [20]:
# Preview df after building the database: the 'Available' column defined through
# DefineVariable shows up here (presumably the Database holds df itself rather than a copy — confirm)
df.head()

Unnamed: 0,Campus,HomeZone,Mode,CampusZone,AIVTT,TPTT,Dist,AuCost,TrCost,Available
0,1,261,1,566,17.47422,75.468478,14.88098,1.00664,2.22,1
1,0,71,2,69,2.924953,24.128386,1.132351,0.105459,2.22,1
2,0,3714,1,69,50.17188,155.55117,23.31923,1.751424,4.415403,1
3,0,74,2,69,1.049121,16.675728,0.699414,0.047282,2.22,1
4,0,71,2,69,2.924953,24.128386,1.132351,0.105459,2.22,1


In [11]:
# Optional export of the prepared mode-choice table (currently disabled)
#df.to_csv('../../../../../../../ModeChoice_Input.csv', index=False)

In [21]:
# Beta arguments: (name, value, lowerbound, upperbound, status, desc='')
# status 0 = estimated, status 1 = maintained; the reference choice is maintained
def _unbounded_beta(name, start, status=0):
    """Build a Beta with neither a lower nor an upper bound."""
    return Beta(name, start, None, None, status)


# Alternative-specific constants (auto is the reference and stays at 0)
ASC_AU = _unbounded_beta('ASC_AU', 0, status=1)
ASC_TR = _unbounded_beta('ASC_TR', 0)
ASC_AC = _unbounded_beta('ASC_AC', 0)

# Coefficients on travel time, distance and cost
B_AIVTT = _unbounded_beta('B_AIVTT', -0.05)
B_TPTT = _unbounded_beta('B_TPTT', -0.05)
B_DIST = _unbounded_beta('B_DIST', -0.0005)
B_AUCOST = _unbounded_beta('B_AUCOST', -0.0005)
B_TRCOST = _unbounded_beta('B_TRCOST', -0.0005)

# Model variables, each defined from an existing database column
AV = DefineVariable('AV', Available, database)
AU_AIVTT = DefineVariable('AU_AIVTT', AIVTT, database)
TR_TPTT = DefineVariable('TR_TPTT', TPTT, database)
AC_DIST = DefineVariable('AC_DIST', Dist, database)
AU_COST = DefineVariable('AU_COST', AuCost, database)
TR_COST = DefineVariable('TR_COST', TrCost, database)

# Utility of each mode, keyed like the Mode column: 0 = auto, 1 = transit, 2 = active
V0 = ASC_AU + B_AIVTT * AU_AIVTT + B_AUCOST * AU_COST
V1 = ASC_TR + B_TPTT * TR_TPTT + B_TRCOST * TR_COST
V2 = ASC_AC + B_DIST * AC_DIST

V = {0: V0, 1: V1, 2: V2}
# The same availability variable applies to every alternative
av = dict.fromkeys(V, AV)

In [22]:
# Preview the database table, including the columns added by DefineVariable,
# without dumping the whole frame into the notebook
database.data.head(10)

Unnamed: 0,Campus,HomeZone,Mode,CampusZone,AIVTT,TPTT,Dist,AuCost,TrCost,Available,AV,AU_AIVTT,TR_TPTT,AC_DIST,AU_COST,TR_COST
0,1,261,1,566,17.474220,75.468478,14.880980,1.006640,2.220000,1,1.0,17.474220,75.468478,14.880980,1.006640,2.220000
1,0,71,2,69,2.924953,24.128386,1.132351,0.105459,2.220000,1,1.0,2.924953,24.128386,1.132351,0.105459,2.220000
2,0,3714,1,69,50.171880,155.551170,23.319230,1.751424,4.415403,1,1.0,50.171880,155.551170,23.319230,1.751424,4.415403
3,0,74,2,69,1.049121,16.675728,0.699414,0.047282,2.220000,1,1.0,1.049121,16.675728,0.699414,0.047282,2.220000
4,0,71,2,69,2.924953,24.128386,1.132351,0.105459,2.220000,1,1.0,2.924953,24.128386,1.132351,0.105459,2.220000
5,0,72,2,69,3.068723,25.742024,1.595176,0.110414,2.220000,1,1.0,3.068723,25.742024,1.595176,0.110414,2.220000
6,1,600,1,566,14.542720,53.745301,11.499560,0.865712,2.220000,1,1.0,14.542720,53.745301,11.499560,0.865712,2.220000
7,1,3420,0,566,60.657420,213.667410,46.280060,3.538100,6.546093,1,1.0,60.657420,213.667410,46.280060,3.538100,6.546093
8,0,113,2,69,15.759180,42.637287,6.260033,0.432415,4.439915,1,1.0,15.759180,42.637287,6.260033,0.432415,4.439915
9,0,1031,1,69,66.121790,117.968371,29.734180,2.325896,9.416795,1,1.0,66.121790,117.968371,29.734180,2.325896,9.416795


In [23]:
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V, av, Mode)

# Define level of verbosity. Debug logs the parameter values at every iteration;
# the logger also offers setWarning(), setGeneral() and setDetailed().
logger = msg.bioMessage()
logger.setDebug()

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob, numberOfThreads=1)
biogeme.modelName = "SMTO_Logit_ModeChoice_withCost"

# Estimate the parameters
results = biogeme.estimate(saveIterations=True)
biogeme.createLogFile()

# Print the estimated values
betas = results.getBetaValues()
for name, value in betas.items():
    print(f"{name:10}=\t{value:.3g}")

# Get the results in a pandas table; as the last expression of the cell it is
# rendered with the notebook's rich DataFrame display
pandasResults = results.getEstimatedParameters()
pandasResults

[15:49:00] < General >   Remove 9 unused variables from the database as only 7 are used.
[15:49:01] < General >   Log likelihood (N=14260):  -54983.42
[15:49:01] < General >   Minimize with tol 1e-07
[15:49:01] < Debug >     ASC_AC:          0
[15:49:01] < Debug >     ASC_TR:          0
[15:49:01] < Debug >     B_AIVTT:      -0.05
[15:49:01] < Debug >     B_AUCOST:    -0.0005
[15:49:01] < Debug >     B_DIST:    -0.0005
[15:49:01] < Debug >     B_TPTT:      -0.05
[15:49:01] < Debug >     B_TRCOST:    -0.0005
[15:49:01] < General >   Log likelihood (N=14260):  -54983.42 Gradient norm:      9e+05  
[15:49:01] < Debug >     ASC_AC: -0.007244672
[15:49:01] < Debug >     ASC_TR: 0.009253443
[15:49:01] < Debug >     B_AIVTT: -0.04998992
[15:49:01] < Debug >     B_AUCOST: 0.0002643762
[15:49:01] < Debug >     B_DIST: -0.1981219
[15:49:01] < Debug >     B_TPTT:   0.929046
[15:49:01] < Debug >     B_TRCOST: 0.04720488
[15:49:01] < General >   Log likelihood (N=14260):  -236072.3 Gradient norm:  

[15:49:02] < Debug >     ASC_AC:  0.3775184
[15:49:02] < Debug >     ASC_TR: -0.01996554
[15:49:02] < Debug >     B_AIVTT: -0.1027025
[15:49:02] < Debug >     B_AUCOST:  0.1288686
[15:49:02] < Debug >     B_DIST: -0.2672687
[15:49:02] < Debug >     B_TPTT: 0.002151368
[15:49:02] < Debug >     B_TRCOST: -0.3410836
[15:49:02] < General >   Log likelihood (N=14260):  -9680.098 Gradient norm:      9e+03  
[15:49:02] < Debug >     ASC_AC:  0.5658014
[15:49:02] < Debug >     ASC_TR: -0.008004075
[15:49:02] < Debug >     B_AIVTT: -0.08467143
[15:49:02] < Debug >     B_AUCOST:   0.145551
[15:49:02] < Debug >     B_DIST: -0.2980052
[15:49:02] < Debug >     B_TPTT: 0.0007083313
[15:49:02] < Debug >     B_TRCOST: -0.4475895
[15:49:02] < General >   Log likelihood (N=14260):  -10654.52 Gradient norm:      2e+05  
[15:49:02] < Debug >     ASC_AC:  0.4034143
[15:49:02] < Debug >     ASC_TR: -0.0183204
[15:49:02] < Debug >     B_AIVTT: -0.1002226
[15:49:02] < Debug >     B_AUCOST:  0.1311631
[15:49:0

[15:49:03] < Debug >     ASC_AC:   2.711366
[15:49:03] < Debug >     ASC_TR:  0.8158872
[15:49:03] < Debug >     B_AIVTT: -0.02724993
[15:49:03] < Debug >     B_AUCOST:  0.1447332
[15:49:03] < Debug >     B_DIST:  -0.397413
[15:49:03] < Debug >     B_TPTT: 0.003489579
[15:49:03] < Debug >     B_TRCOST: -0.01248031
[15:49:03] < General >   Log likelihood (N=14260):  -7931.137 Gradient norm:      7e+03  
[15:49:03] < Debug >     ASC_AC:   2.993233
[15:49:03] < Debug >     ASC_TR:   0.877692
[15:49:03] < Debug >     B_AIVTT: -0.0283443
[15:49:03] < Debug >     B_AUCOST:  0.1525287
[15:49:03] < Debug >     B_DIST: -0.4016364
[15:49:03] < Debug >     B_TPTT: 0.005976515
[15:49:03] < Debug >     B_TRCOST: -0.06995101
[15:49:03] < General >   Log likelihood (N=14260):  -7885.493 Gradient norm:      6e+03  
[15:49:03] < Debug >     ASC_AC:   3.357219
[15:49:03] < Debug >     ASC_TR:  0.9444094
[15:49:03] < Debug >     B_AIVTT: -0.005165836
[15:49:03] < Debug >     B_AUCOST:  0.1945411
[15:49:0

[15:49:03] < General >   Log likelihood (N=14260):  -7769.597 Gradient norm:      7e+03  
[15:49:03] < Debug >     ASC_AC:    3.61479
[15:49:03] < Debug >     ASC_TR:    1.70313
[15:49:03] < Debug >     B_AIVTT: -0.0247905
[15:49:03] < Debug >     B_AUCOST: 0.05695249
[15:49:03] < Debug >     B_DIST: -0.4206164
[15:49:03] < Debug >     B_TPTT: 5.428171e-06
[15:49:03] < Debug >     B_TRCOST: -0.1136168
[15:49:03] < General >   Log likelihood (N=14260):  -7708.025 Gradient norm:      4e+03  
[15:49:04] < Debug >     ASC_AC:   4.029913
[15:49:04] < Debug >     ASC_TR:   2.215084
[15:49:04] < Debug >     B_AIVTT: -0.03282115
[15:49:04] < Debug >     B_AUCOST: 0.07903066
[15:49:04] < Debug >     B_DIST: -0.5008341
[15:49:04] < Debug >     B_TPTT: -0.004449466
[15:49:04] < Debug >     B_TRCOST: -0.1606866
[15:49:04] < General >   Log likelihood (N=14260):  -7641.751 Gradient norm:      7e+03  
[15:49:04] < Debug >     ASC_AC:   4.541346
[15:49:04] < Debug >     ASC_TR:   2.733049
[15:49:04] 

[15:49:04] < Debug >     ASC_AC:   3.713621
[15:49:04] < Debug >     ASC_TR:    2.22692
[15:49:04] < Debug >     B_AIVTT: -0.04215612
[15:49:04] < Debug >     B_AUCOST: -0.002886971
[15:49:04] < Debug >     B_DIST:  -0.504949
[15:49:04] < Debug >     B_TPTT: -0.005818874
[15:49:04] < Debug >     B_TRCOST:  -0.235726
[15:49:04] < General >   Log likelihood (N=14260):  -7611.473 Gradient norm:       0.01  
[15:49:04] < Debug >     ASC_AC:   3.713616
[15:49:04] < Debug >     ASC_TR:   2.226917
[15:49:04] < Debug >     B_AIVTT: -0.04215599
[15:49:04] < Debug >     B_AUCOST: -0.002886496
[15:49:04] < Debug >     B_DIST: -0.5049477
[15:49:04] < Debug >     B_TPTT: -0.005818829
[15:49:04] < Debug >     B_TRCOST: -0.2357252
[15:49:04] < General >   Log likelihood (N=14260):  -7611.473 Gradient norm:      0.002  
[15:49:04] < Debug >     ASC_AC:   3.713612
[15:49:04] < Debug >     ASC_TR:   2.226914
[15:49:04] < Debug >     B_AIVTT: -0.04215631
[15:49:04] < Debug >     B_AUCOST: -0.002885626
[1

[15:49:05] < Debug >     ASC_AC:   3.713616
[15:49:05] < Debug >     ASC_TR:   2.226917
[15:49:05] < Debug >     B_AIVTT: -0.04215599
[15:49:05] < Debug >     B_AUCOST: -0.002886476
[15:49:05] < Debug >     B_DIST: -0.5049477
[15:49:05] < Debug >     B_TPTT: -0.005818828
[15:49:05] < Debug >     B_TRCOST: -0.2357252
[15:49:05] < General >   Log likelihood (N=14260):  -7611.473 Gradient norm:     0.0002  
[15:49:05] < Debug >     ASC_AC:   3.713616
[15:49:05] < Debug >     ASC_TR:   2.226917
[15:49:05] < Debug >     B_AIVTT: -0.04215599
[15:49:05] < Debug >     B_AUCOST: -0.002886492
[15:49:05] < Debug >     B_DIST: -0.5049477
[15:49:05] < Debug >     B_TPTT: -0.005818829
[15:49:05] < Debug >     B_TRCOST: -0.2357252
[15:49:05] < General >   Log likelihood (N=14260):  -7611.473 Gradient norm:      0.002  
[15:49:05] < Debug >     ASC_AC:   3.713616
[15:49:05] < Debug >     ASC_TR:   2.226917
[15:49:05] < Debug >     B_AIVTT: -0.04215599
[15:49:05] < Debug >     B_AUCOST: -0.002886492
[1

Upon running, this produces several output files. The most important is the html file.

### Resources
UT Austin example: https://www.youtube.com/watch?v=QeJgyBIaXMQ  
Biogeme example: https://www.youtube.com/watch?v=OiM94B8WayA  
Nested logit example: https://www.youtube.com/watch?v=vEhvf54IKvs