# Biogeme Test - SMTO Data

Gonna try to adapt the Biogeme Swissmetro example to university choice using SMTO data set. We'll try to predict campus choice using distance.

In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd

hh_df = pd.read_csv('../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../Data/SMTO_2015/SMTO_2015_Respondents.csv')

In [2]:
# Load relevant columns
df = ps_df[['pscampusattend']]
df = df.join(hh_df[['HmTTS2006']])
df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus'})
df = df.dropna() # Remove rows with missing data

# Convert Campus column to numerical column
campus_name_to_num = {"Downtown Toronto (St. George)": 0, "Scarborough (UTSC)": 1, "Mississauga (UTM)": 2,
                      "Keele": 3, "Glendon": 4, "RyersonU": 5, "OCADu": 6}
df.replace({'Campus': campus_name_to_num}, inplace=True)
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')

df

Unnamed: 0,Campus,HomeZone
0,1,261
1,0,71
2,0,3714
3,0,74
4,0,71
...,...,...
15221,3,212
15222,3,233
15223,3,95
15224,3,2221


In [3]:
# Dataframe with walk distances
df_path = pd.read_csv('../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

In [4]:
not_found = set()
# Function for distance lookup
def find_distance(origin, destination):
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return 0
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return 0
    return dists[i*2392 + j] / 1000

In [5]:
# List of campus' TTS zones from Joven's MOE data
campus_zones = [69, 566, 3631, 391, 225, 38, 67]

# Load distances in dataframe
for i in range(7):
    df["Dist" + str(i)] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i]))
len(not_found)

129

In [6]:
df

Unnamed: 0,Campus,HomeZone,Dist0,Dist1,Dist2,Dist3,Dist4,Dist5,Dist6
0,1,261,10.256060,14.88098,29.20657,22.59214,9.218413,9.580635,11.241730
1,0,71,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838
2,0,3714,23.319230,45.63271,4.51742,28.58045,32.555200,24.964000,23.686150
3,0,74,0.699414,24.11954,19.43932,16.81186,12.830410,2.314008,1.541276
4,0,71,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838
...,...,...,...,...,...,...,...,...,...
15221,3,212,7.120260,19.01731,23.76046,14.19530,5.732956,6.553092,8.193741
15222,3,233,15.917590,12.03644,32.96591,17.68772,6.019180,15.242170,16.903260
15223,3,95,2.783940,25.09743,17.84462,15.53600,12.829600,4.733398,3.979057
15224,3,2221,23.379880,26.15476,37.35434,13.30458,15.379040,23.250580,24.518920


In [7]:
# Load data into Biogeme database
database = db.Database("SMTO", df)

# Definition of new variables: adding columns to the database 
Available =  DefineVariable('Available', 1, database) # All schools available to all students

# Make variable names global
globals().update(database.variables)

# Remove unknown zones
database.remove(Dist0 == 0)
database.data

Unnamed: 0,Campus,HomeZone,Dist0,Dist1,Dist2,Dist3,Dist4,Dist5,Dist6,Available
0,1,261,10.256060,14.88098,29.20657,22.59214,9.218413,9.580635,11.241730,1
1,0,71,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838,1
2,0,3714,23.319230,45.63271,4.51742,28.58045,32.555200,24.964000,23.686150,1
3,0,74,0.699414,24.11954,19.43932,16.81186,12.830410,2.314008,1.541276,1
4,0,71,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838,1
...,...,...,...,...,...,...,...,...,...,...
15221,3,212,7.120260,19.01731,23.76046,14.19530,5.732956,6.553092,8.193741,1
15222,3,233,15.917590,12.03644,32.96591,17.68772,6.019180,15.242170,16.903260,1
15223,3,95,2.783940,25.09743,17.84462,15.53600,12.829600,4.733398,3.979057,1
15224,3,2221,23.379880,26.15476,37.35434,13.30458,15.379040,23.250580,24.518920,1


In [8]:
"""
# Here we use the "biogeme" way for backward compatibility
exclude = HomeToMainCampusKM > 130
database.remove(exclude)
database.data

# Parameters to be estimated
ASC_CAR = Beta('ASC_CAR',0,None,None,0)
ASC_TRAIN = Beta('ASC_TRAIN',0,None,None,0)
ASC_SM = Beta('ASC_SM',0,None,None,1)
B_TIME = Beta('B_TIME',0,None,None,0)
B_COST = Beta('B_COST',0,None,None,0)

# Definition of new variables
SM_COST =  SM_CO   * (  GA   ==  0  ) 
TRAIN_COST =  TRAIN_CO   * (  GA   ==  0  )

# Definition of new variables: adding columns to the database 
CAR_AV_SP =  DefineVariable('CAR_AV_SP',CAR_AV  * (  SP   !=  0  ),database)
TRAIN_AV_SP =  DefineVariable('TRAIN_AV_SP',TRAIN_AV  * (  SP   !=  0  ),database)
TRAIN_TT_SCALED = DefineVariable('TRAIN_TT_SCALED',\
                                 TRAIN_TT / 100.0,database)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED',\
                                   TRAIN_COST / 100,database)
SM_TT_SCALED = DefineVariable('SM_TT_SCALED', SM_TT / 100.0,database)
SM_COST_SCALED = DefineVariable('SM_COST_SCALED', SM_COST / 100,database)
CAR_TT_SCALED = DefineVariable('CAR_TT_SCALED', CAR_TT / 100,database)
CAR_CO_SCALED = DefineVariable('CAR_CO_SCALED', CAR_CO / 100,database)

# Definition of the utility functions
V1 = ASC_TRAIN + \
     B_TIME * TRAIN_TT_SCALED + \
     B_COST * TRAIN_COST_SCALED
V2 = ASC_SM + \
     B_TIME * SM_TT_SCALED + \
     B_COST * SM_COST_SCALED
V3 = ASC_CAR + \
     B_TIME * CAR_TT_SCALED + \
     B_COST * CAR_CO_SCALED

# Associate utility functions with the numbering of alternatives
V = {1: V1,
     2: V2,
     3: V3}

# Associate the availability conditions with the alternatives
av = {1: TRAIN_AV_SP,
      2: SM_AV,
      3: CAR_AV_SP}
      
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V,av,CHOICE)

"""


# Beta initialization: (name, value, lowerbound, upperbound, status, desc='')
# Status 0 if estimated, 1 if maintained - reference choice should be 1
ASC_SG = Beta('ASC_SG', 0, None, None, 0)
ASC_SC = Beta('ASC_SC', 0, None, None, 0)
ASC_MI = Beta('ASC_MI', 0, None, None, 0)
ASC_YK = Beta('ASC_YK', 0, None, None, 0)
ASC_YG = Beta('ASC_YG', 0, None, None, 1)
ASC_RY = Beta('ASC_RY', 0, None, None, 0)
ASC_OC = Beta('ASC_OC', 0, None, None, 0)
B_DIST = Beta('B_DIST', 0, None, None, 0)

# Variables: from columns in database
AV = DefineVariable('AV', Available, database)
SG_DIST = DefineVariable('SG_DIST', Dist0, database)
SC_DIST = DefineVariable('SC_DIST', Dist1, database)
MI_DIST = DefineVariable('MI_DIST', Dist2, database)
YK_DIST = DefineVariable('YK_DIST', Dist3, database)
YG_DIST = DefineVariable('YG_DIST', Dist4, database)
RY_DIST = DefineVariable('RY_DIST', Dist5, database)
OC_DIST = DefineVariable('OC_DIST', Dist6, database)

# Utility Functions: note ASC_SG is 0
V0 = ASC_SG + B_DIST * SG_DIST
V1 = ASC_SC + B_DIST * SC_DIST
V2 = ASC_MI + B_DIST * MI_DIST
V3 = ASC_YK + B_DIST * YK_DIST
V4 = ASC_YG + B_DIST * YG_DIST
V5 = ASC_RY + B_DIST * RY_DIST
V6 = ASC_OC + B_DIST * OC_DIST

V  = {0: V0, 1: V1, 2: V2, 3: V3, 4: V4, 5: V5, 6: V6}
av = {0: AV, 1: AV, 2: AV, 3: AV, 4: AV, 5: AV, 6: AV}

In [9]:
database.data

Unnamed: 0,Campus,HomeZone,Dist0,Dist1,Dist2,Dist3,Dist4,Dist5,Dist6,Available,AV,SG_DIST,SC_DIST,MI_DIST,YK_DIST,YG_DIST,RY_DIST,OC_DIST
0,1,261,10.256060,14.88098,29.20657,22.59214,9.218413,9.580635,11.241730,1,1.0,10.256060,14.88098,29.20657,22.59214,9.218413,9.580635,11.241730
1,0,71,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838,1,1.0,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838
2,0,3714,23.319230,45.63271,4.51742,28.58045,32.555200,24.964000,23.686150,1,1.0,23.319230,45.63271,4.51742,28.58045,32.555200,24.964000,23.686150
3,0,74,0.699414,24.11954,19.43932,16.81186,12.830410,2.314008,1.541276,1,1.0,0.699414,24.11954,19.43932,16.81186,12.830410,2.314008,1.541276
4,0,71,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838,1,1.0,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15221,3,212,7.120260,19.01731,23.76046,14.19530,5.732956,6.553092,8.193741,1,1.0,7.120260,19.01731,23.76046,14.19530,5.732956,6.553092,8.193741
15222,3,233,15.917590,12.03644,32.96591,17.68772,6.019180,15.242170,16.903260,1,1.0,15.917590,12.03644,32.96591,17.68772,6.019180,15.242170,16.903260
15223,3,95,2.783940,25.09743,17.84462,15.53600,12.829600,4.733398,3.979057,1,1.0,2.783940,25.09743,17.84462,15.53600,12.829600,4.733398,3.979057
15224,3,2221,23.379880,26.15476,37.35434,13.30458,15.379040,23.250580,24.518920,1,1.0,23.379880,26.15476,37.35434,13.30458,15.379040,23.250580,24.518920


In [10]:
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V, av, Campus)

# Define level of verbosity
#logger.setWarning()
#logger.setGeneral()
#logger.setDetailed()

# Create the Biogeme object
biogeme  = bio.BIOGEME(database,logprob,numberOfThreads=1)
biogeme.modelName = "SMTO_Logit"

# Estimate the parameters
results = biogeme.estimate(saveIterations=True)
biogeme.createLogFile()

# Print the estimated values
betas = results.getBetaValues()
for k,v in betas.items():
    print(f"{k:10}=\t{v:.3g}")

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
print(pandasResults)

[15:35:35] < General >   Remove 9 unused variables from the database as only 9 are used.
[15:35:36] < General >   Log likelihood (N=14741):  -28684.66
[15:35:36] < General >   Minimize with tol 1e-07
[15:35:36] < Debug >     ASC_MI:          0
[15:35:36] < Debug >     ASC_OC:          0
[15:35:36] < Debug >     ASC_RY:          0
[15:35:36] < Debug >     ASC_SC:          0
[15:35:36] < Debug >     ASC_SG:          0
[15:35:36] < Debug >     ASC_YK:          0
[15:35:36] < Debug >     B_DIST:          0
[15:35:36] < General >   Log likelihood (N=14741):  -28684.66 Gradient norm:      8e+04  
[15:35:36] < Debug >     ASC_MI: -0.01372364
[15:35:36] < Debug >     ASC_OC: -0.01945924
[15:35:36] < Debug >     ASC_RY: 0.009277788
[15:35:36] < Debug >     ASC_SC:  -0.012036
[15:35:36] < Debug >     ASC_SG: 0.04470636
[15:35:36] < Debug >     ASC_YK: 0.01218099
[15:35:36] < Debug >     B_DIST: -0.9985264
[15:35:36] < General >   Log likelihood (N=14741):   -99530.1 Gradient norm:      9e+04  
[

[15:35:38] < Debug >     ASC_MI: 0.02601807
[15:35:38] < Debug >     ASC_OC:  -1.073984
[15:35:38] < Debug >     ASC_RY:   1.142948
[15:35:38] < Debug >     ASC_SC:  0.6004886
[15:35:38] < Debug >     ASC_SG:   1.951252
[15:35:38] < Debug >     ASC_YK:   1.540501
[15:35:38] < Debug >     B_DIST: -0.08743076
[15:35:38] < General >   Log likelihood (N=14741):  -20642.19 Gradient norm:      3e+03  
[15:35:38] < Debug >     ASC_MI:  0.1132659
[15:35:38] < Debug >     ASC_OC:  -1.075394
[15:35:38] < Debug >     ASC_RY:    1.27513
[15:35:38] < Debug >     ASC_SC:   0.704669
[15:35:38] < Debug >     ASC_SG:   2.070842
[15:35:38] < Debug >     ASC_YK:   1.572183
[15:35:38] < Debug >     B_DIST: -0.08762991
[15:35:38] < General >   Log likelihood (N=14741):   -20610.2 Gradient norm:      3e+03  
[15:35:38] < Debug >     ASC_MI:   0.392262
[15:35:38] < Debug >     ASC_OC: -0.9660005
[15:35:38] < Debug >     ASC_RY:   1.368069
[15:35:38] < Debug >     ASC_SC:  0.9821294
[15:35:38] < Debug >     A

[15:35:40] < Debug >     ASC_MI:   1.019392
[15:35:40] < Debug >     ASC_OC:  0.2857076
[15:35:40] < Debug >     ASC_RY:   2.115908
[15:35:40] < Debug >     ASC_SC:   1.356476
[15:35:40] < Debug >     ASC_SG:   2.786689
[15:35:40] < Debug >     ASC_YK:   2.289607
[15:35:40] < Debug >     B_DIST: -0.09042879
[15:35:40] < General >   Log likelihood (N=14741):   -20404.1 Gradient norm:      7e+01  
[15:35:40] < Debug >     ASC_MI:  0.9870221
[15:35:40] < Debug >     ASC_OC:  0.2492653
[15:35:40] < Debug >     ASC_RY:   2.088453
[15:35:40] < Debug >     ASC_SC:   1.311235
[15:35:40] < Debug >     ASC_SG:   2.766838
[15:35:40] < Debug >     ASC_YK:    2.26822
[15:35:40] < Debug >     B_DIST: -0.09081065
[15:35:40] < General >   Log likelihood (N=14741):  -20403.67 Gradient norm:      3e+01  
[15:35:40] < Debug >     ASC_MI:  0.9888065
[15:35:40] < Debug >     ASC_OC:  0.2449995
[15:35:40] < Debug >     ASC_RY:   2.086677
[15:35:40] < Debug >     ASC_SC:   1.309853
[15:35:40] < Debug >     A

[15:35:42] < Debug >     ASC_MI:  0.9862364
[15:35:42] < Debug >     ASC_OC:   0.244394
[15:35:42] < Debug >     ASC_RY:   2.082492
[15:35:42] < Debug >     ASC_SC:   1.307626
[15:35:42] < Debug >     ASC_SG:   2.762055
[15:35:42] < Debug >     ASC_YK:   2.262923
[15:35:42] < Debug >     B_DIST: -0.09071292
[15:35:42] < General >   Log likelihood (N=14741):  -20403.66 Gradient norm:      0.003  
[15:35:42] < Debug >     ASC_MI:  0.9862364
[15:35:42] < Debug >     ASC_OC:  0.2443938
[15:35:42] < Debug >     ASC_RY:   2.082492
[15:35:42] < Debug >     ASC_SC:   1.307626
[15:35:42] < Debug >     ASC_SG:   2.762055
[15:35:42] < Debug >     ASC_YK:   2.262923
[15:35:42] < Debug >     B_DIST: -0.09071292
[15:35:42] < General >   Log likelihood (N=14741):  -20403.66 Gradient norm:     0.0009  
[15:35:42] < Debug >     ASC_MI:  0.9862364
[15:35:42] < Debug >     ASC_OC:  0.2443939
[15:35:42] < Debug >     ASC_RY:   2.082492
[15:35:42] < Debug >     ASC_SC:   1.307626
[15:35:42] < Debug >     A

[15:35:44] < Debug >     ASC_MI:  0.9862323
[15:35:44] < Debug >     ASC_OC:  0.2443899
[15:35:44] < Debug >     ASC_RY:   2.082487
[15:35:44] < Debug >     ASC_SC:   1.307623
[15:35:44] < Debug >     ASC_SG:   2.762051
[15:35:44] < Debug >     ASC_YK:   2.262918
[15:35:44] < Debug >     B_DIST: -0.09071291
[15:35:44] < General >   Log likelihood (N=14741):  -20403.66 Gradient norm:      0.008  
[15:35:45] < Debug >     ASC_MI:  0.9862358
[15:35:45] < Debug >     ASC_OC:  0.2443934
[15:35:45] < Debug >     ASC_RY:   2.082491
[15:35:45] < Debug >     ASC_SC:   1.307626
[15:35:45] < Debug >     ASC_SG:   2.762054
[15:35:45] < Debug >     ASC_YK:   2.262922
[15:35:45] < Debug >     B_DIST: -0.09071292
[15:35:45] < General >   Log likelihood (N=14741):  -20403.66 Gradient norm:      0.003  
[15:35:45] < Debug >     ASC_MI:  0.9862363
[15:35:45] < Debug >     ASC_OC:  0.2443939
[15:35:45] < Debug >     ASC_RY:   2.082492
[15:35:45] < Debug >     ASC_SC:   1.307626
[15:35:45] < Debug >     A

[15:35:46] < Debug >     ASC_MI:  0.9862364
[15:35:46] < Debug >     ASC_OC:   0.244394
[15:35:46] < Debug >     ASC_RY:   2.082492
[15:35:46] < Debug >     ASC_SC:   1.307626
[15:35:46] < Debug >     ASC_SG:   2.762055
[15:35:46] < Debug >     ASC_YK:   2.262923
[15:35:46] < Debug >     B_DIST: -0.09071292
[15:35:46] < General >   Log likelihood (N=14741):  -20403.66 Gradient norm:      0.003  
[15:35:47] < Debug >     ASC_MI:  0.9862364
[15:35:47] < Debug >     ASC_OC:   0.244394
[15:35:47] < Debug >     ASC_RY:   2.082492
[15:35:47] < Debug >     ASC_SC:   1.307626
[15:35:47] < Debug >     ASC_SG:   2.762055
[15:35:47] < Debug >     ASC_YK:   2.262923
[15:35:47] < Debug >     B_DIST: -0.09071292
[15:35:47] < General >   Log likelihood (N=14741):  -20403.66 Gradient norm:      0.003  
[15:35:47] < Debug >     ASC_MI:  0.9862364
[15:35:47] < Debug >     ASC_OC:   0.244394
[15:35:47] < Debug >     ASC_RY:   2.082492
[15:35:47] < Debug >     ASC_SC:   1.307626
[15:35:47] < Debug >     A

Upon running, this produces several output files. The most important is the html file.

### Resources
UT Austin example: https://www.youtube.com/watch?v=QeJgyBIaXMQ  
Biogeme example: https://www.youtube.com/watch?v=OiM94B8WayA  
Nested logit example: https://www.youtube.com/watch?v=vEhvf54IKvs