In [1]:
# default_exp MNL_benchmark

# MNL benchmark

> Benchmark with Swissmetro dataset => Chang utility function same as in TasteNet paper

In [3]:
#hide
from nbdev.showdoc import *

In [5]:
#export
import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

#### Data preparation

In [6]:
# read data
df = pd.read_csv('./data/swissmetro_train.csv')
df.shape

(7484, 28)

In [7]:
df.describe()

Unnamed: 0,GROUP,SURVEY,SP,ID,PURPOSE,FIRST,TICKET,WHO,LUGGAGE,AGE,...,TRAIN_TT,TRAIN_CO,TRAIN_HE,SM_TT,SM_CO,SM_HE,SM_SEATS,CAR_TT,CAR_CO,CHOICE
count,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,...,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0,7484.0
mean,2.631614,0.631614,1.0,597.088856,2.899653,0.473544,2.898049,1.497595,0.675975,2.898985,...,167.127338,512.706975,69.857028,87.760289,669.774452,20.061464,0.117451,124.15326,79.018306,2.155933
std,0.482399,0.482399,0.0,343.772433,1.110953,0.499333,2.191708,0.710073,0.599483,1.027071,...,78.343306,1083.101925,37.295564,55.210542,1436.019973,8.164189,0.321978,89.711416,55.083844,0.62926
min,2.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,31.0,8.0,30.0,8.0,10.0,10.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,1.0,299.0,2.0,0.0,1.0,1.0,0.0,2.0,...,110.0,58.0,30.0,54.0,71.0,10.0,0.0,70.0,40.0,2.0
50%,3.0,1.0,1.0,599.0,3.0,0.0,3.0,1.0,1.0,3.0,...,159.0,94.0,60.0,79.0,112.0,20.0,0.0,120.0,76.0,2.0
75%,3.0,1.0,1.0,893.0,3.0,1.0,3.0,2.0,1.0,4.0,...,209.0,173.0,120.0,110.0,210.0,30.0,0.0,176.0,112.0,3.0
max,3.0,1.0,1.0,1192.0,8.0,1.0,10.0,3.0,3.0,5.0,...,1049.0,5040.0,120.0,796.0,6720.0,30.0,1.0,1560.0,520.0,3.0


In [9]:
df.columns

Index(['GROUP', 'SURVEY', 'SP', 'ID', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',
       'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST', 'TRAIN_AV',
       'CAR_AV', 'SM_AV', 'TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE', 'SM_TT', 'SM_CO',
       'SM_HE', 'SM_SEATS', 'CAR_TT', 'CAR_CO', 'CHOICE'],
      dtype='object')

In [20]:
df.AGE.value_counts()

3    2682
2    2321
4    1425
5     563
1     493
Name: AGE, dtype: int64

### 1. Biogeme

In [80]:
database = db.Database('swissmetro', df)
globals().update(database.variables)

In [81]:
# Define parameters
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0, None, None, 0)
ASC_CAR = Beta('ASC_CAR',0, None, None, 1) # Constant

B_TIME_TRAIN = Beta('B_TIME_TRAIN', 0, None, None, 0)
B_TIME_SM = Beta('B_TIME_SM', 0, None, None, 0)
B_TIME_CAR = Beta('B_TIME_CAR', 0, None, None, 0)

B_HEADWAY_TRAIN = Beta('B_HEADWAY_TRAIN', 0, None, None, 0)
B_HEADWAY_SM = Beta('B_HEADWAY_SM', 0, None, None, 0)

B_COST_TRAIN = Beta('B_COST_TRAIN', -1, None, None, 1) # Fixed
B_COST_SM = Beta('B_COST_SM', -1, None, None, 1) # Fixed
B_COST_CAR = Beta('B_COST_CAR', -1, None, None, 1) # Fixed
# B_COST_TRAIN = Beta('B_COST_TRAIN', 0, None, None, 0) # Fixed
# B_COST_SM = Beta('B_COST_SM', 0, None, None, 0) # Fixed
# B_COST_CAR = Beta('B_COST_CAR', 0, None, None, 0) # Fixed

In [82]:
# Definition of new variables
SM_CO = SM_CO * (GA == 0)
TRAIN_CO = TRAIN_CO * (GA == 0)

CAR_AV = CAR_AV * (SP != 0)
TRAIN_AV = TRAIN_AV * (SP != 0)

In [83]:
## Definition of the utility functions
V1 = ASC_TRAIN + B_TIME_TRAIN * TRAIN_TT + B_HEADWAY_TRAIN * TRAIN_HE + B_COST_TRAIN * TRAIN_CO
V2 = ASC_SM + B_TIME_SM * SM_TT + B_HEADWAY_SM * SM_HE + B_COST_SM * SM_CO
V3 = ASC_CAR + B_TIME_CAR * CAR_TT + B_COST_CAR * CAR_CO 

In [84]:
# Associate utility functions with the numbering of alternatives
V = {1: V1,
     2: V2,
     3: V3}

# Associate the availability conditions with the alternatives
av = {1: TRAIN_AV,
      2: SM_AV,
      3: CAR_AV}

In [85]:
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V, av, CHOICE)

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob)
# biogeme.modelName = '01logit'
biogeme.modelName = 'MNL_A'

In [86]:
# Estimate the parameters
results = biogeme.estimate()
# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
print(pandasResults)

                    Value   Std err     t-test       p-value  Rob. Std err  \
ASC_SM           0.729244  0.098901   7.373464  1.663114e-13      0.109824   
ASC_TRAIN        0.956208  0.129172   7.402585  1.334488e-13      0.135637   
B_COST_CAR      -0.006672  0.000943  -7.071452  1.533218e-12      0.001259   
B_COST_SM       -0.007911  0.000446 -17.734670  0.000000e+00      0.000627   
B_COST_TRAIN    -0.018745  0.000951 -19.703251  0.000000e+00      0.001515   
B_HEADWAY_SM    -0.007075  0.003051  -2.319051  2.039228e-02      0.003102   
B_HEADWAY_TRAIN -0.006277  0.001002  -6.265781  3.709619e-10      0.000994   
B_TIME_CAR      -0.009673  0.000690 -14.020815  0.000000e+00      0.001241   
B_TIME_SM       -0.013123  0.000730 -17.978813  0.000000e+00      0.001307   
B_TIME_TRAIN    -0.012936  0.000768 -16.836669  0.000000e+00      0.001071   

                 Rob. t-test  Rob. p-value  
ASC_SM              6.640130  3.134049e-11  
ASC_TRAIN           7.049757  1.792344e-12  
B_COST

In [87]:
pandasResults['Value']

ASC_SM             0.729244
ASC_TRAIN          0.956208
B_COST_CAR        -0.006672
B_COST_SM         -0.007911
B_COST_TRAIN      -0.018745
B_HEADWAY_SM      -0.007075
B_HEADWAY_TRAIN   -0.006277
B_TIME_CAR        -0.009673
B_TIME_SM         -0.013123
B_TIME_TRAIN      -0.012936
Name: Value, dtype: float64

In [30]:
# The validation consists in organizing the data into several slices
# of about the same size, randomly defined. Each slice is considered
# as a validation dataset. The model is then re-estimated using all
# the data except the slice, and the estimated model is applied on the
# validation set (i.e. the slice). The value of the log likelihood for
# each observation in the validation set is reported in a
# dataframe. As this is done for each slice, the output is a list of
# dataframes, each corresponding to one of these exercises.

validation_results = biogeme.validate(results,5)

for slide in validation_results:
    print(f'Log likelihood for {slide.shape[0]} validation data: {slide["Loglikelihood"].sum()}')
    

Log likelihood for 1497 validation data: -22072.582684325316
Log likelihood for 1497 validation data: -19786.512933319646
Log likelihood for 1497 validation data: -22761.069770956114
Log likelihood for 1497 validation data: -22437.29734738394
Log likelihood for 1496 validation data: -18503.098702511194


In [31]:
slices = 5

In [32]:
validationData = database.split(slices)

In [27]:
type(validationData)

zip

In [28]:
for v in validationData:
    print(type(v))

<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>


In [29]:
type(v), len(v)

(tuple, 2)

In [30]:
type(v[0]), type(v[1]), v[0].shape, v[1].shape

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 (8554, 28),
 (2138, 28))

In [31]:
keepDatabase = database
type(keepDatabase)

biogeme.database.Database

In [32]:
estimationResults = results

In [33]:
allSimulationResults = []
for v in validationData:
    # v[0] is the estimation data set
    database = db.Database('Estimation data', v[0])
    biogeme.loglike.changeInitValues(estimationResults.getBetaValues())
    results = biogeme.estimate()
    simulate = {'Loglikelihood': biogeme.loglike}
#     simBiogeme = BIOGEME(db.Database('Validation data', v[1]),
#                          simulate)
#     simResult = simBiogeme.simulate(results.getBetaValues())
    simResult = biogeme.simulate(results.getBetaValues())
    allSimulationResults.append(simResult)
biogeme.database = keepDatabase

In [34]:
allSimulationResults

[]

In [35]:
# Get accuracy
type(slide), slide.shape

(pandas.core.frame.DataFrame, (2138, 1))

In [36]:
results

<biogeme.results.bioResults at 0x7f612150fc10>

### PyLogit

In [95]:
#export
from collections import OrderedDict
import pandas as pd
import numpy as np
import pylogit as pl

In [96]:
# wide_sm = pd.read_csv('./data/swissmetro_train.csv')

# Load the raw swiss metro data
# Note the .dat files are tab delimited text files
swissmetro_wide = pd.read_csv("./data/swissmetro.dat", sep='\t')

In [97]:
# Select obervations whose choice is known (i.e. CHOICE != 0)
# **AND** whose PURPOSE is either 1 or 3
include_criteria = (swissmetro_wide.PURPOSE.isin([1, 3]) &
                    (swissmetro_wide.CHOICE != 0))

# Use ".copy()" so that later on, we avoid performing operations 
# on a view of a dataframe as opposed to on an actual dataframe
clean_sm_wide = swissmetro_wide.loc[include_criteria].copy()

# Look at how many observations we have after removing unwanted
# observations
final_num_obs = clean_sm_wide.shape[0]
num_obs_statement = "The cleaned number of observations is {:,.0f}."
print (num_obs_statement.format(final_num_obs))

The cleaned number of observations is 6,768.


In [98]:
# Create a custom id column that ignores the fact that this is a 
# panel/repeated-observations dataset, and start the "custom_id" from 1
clean_sm_wide["custom_id"] = np.arange(clean_sm_wide.shape[0], dtype=int) + 1

# Create the list of individual specific variables
ind_variables = clean_sm_wide.columns.tolist()[:15]

# Specify the variables that vary across individuals **AND** 
# across some or all alternatives
alt_varying_variables = {u'travel_time': dict([(1, 'TRAIN_TT'),
                                               (2, 'SM_TT'),
                                               (3, 'CAR_TT')]),
                          u'travel_cost': dict([(1, 'TRAIN_CO'),
                                                (2, 'SM_CO'),
                                                (3, 'CAR_CO')]),
                          u'headway': dict([(1, 'TRAIN_HE'),
                                            (2, 'SM_HE')]),
                          u'seat_configuration': dict([(2, "SM_SEATS")])}

# Specify the availability variables
availability_variables = dict(zip(range(1, 4), ['TRAIN_AV', 'SM_AV', 'CAR_AV']))

# Determine the columns that will denote the
# new column of alternative ids, and the columns
# that denote the custom observation ids and the 
# choice column
new_alt_id = "mode_id"
obs_id_column = "custom_id"
choice_column = "CHOICE"

In [99]:
# Perform the desired conversion
long_swiss_metro = pl.convert_wide_to_long(clean_sm_wide, 
                                           ind_variables, 
                                           alt_varying_variables, 
                                           availability_variables, 
                                           obs_id_column, 
                                           choice_column,
                                           new_alt_id_name=new_alt_id)


In [100]:
# Scale both the travel time and travel cost by 100
long_swiss_metro["travel_time_hundredth"] = (long_swiss_metro["travel_time"] /
                                             100.0)

# Figure out which rows correspond to train or swiss metro 
# alternatives for individuals with GA passes. These individuals face no 
# marginal costs for a trip
train_pass_train_alt = ((long_swiss_metro["GA"] == 1) *
                        (long_swiss_metro["mode_id"].isin([1, 2]))).astype(int)
# Note that the (train_pass_train_alt == 0) term accounts for the
# fact that those with a GA pass have no marginal cost for the trip
long_swiss_metro["travel_cost_hundredth"] = (long_swiss_metro["travel_cost"] *
                                             (train_pass_train_alt == 0) /
                                             100.0)

  f"evaluating in Python space because the {repr(op_str)} "


In [105]:
# Provide the module with the needed input arguments to create
# an instance of the MNL model class
example_mnl = pl.create_choice_model(data=long_swiss_metro,
                                     alt_id_col=new_alt_id,
                                     obs_id_col=obs_id_column,
                                     choice_col=choice_column,
                                     specification=example_specification,
                                     model_type="MNL",
                                     names=example_names)

# Start the model estimation from initial values of all zeros
# i.e. 4 zeros for the 4 coefficients being estimated
example_mnl.fit_mle(np.zeros(4))

Log-likelihood at zero: -6,964.6630
Initial Log-likelihood: -6,964.6630
Estimation Time for Point Estimation: 0.03 seconds.
Final log-likelihood: -5,331.2520




In [106]:
# Look at the estimated coefficients and goodness-of-fit statistics
example_mnl.get_statsmodels_summary()

0,1,2,3
Dep. Variable:,CHOICE,No. Observations:,6768.0
Model:,Multinomial Logit Model,Df Residuals:,6764.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 10 Mar 2021",Pseudo R-squ.:,0.235
Time:,19:34:53,Pseudo R-bar-squ.:,0.234
AIC:,10670.504,Log-Likelihood:,-5331.252
BIC:,10697.784,LL-Null:,-6964.663

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ASC Car,-0.1546,0.043,-3.577,0.000,-0.239,-0.070
ASC Train,-0.7012,0.055,-12.778,0.000,-0.809,-0.594
B_COST,-1.0838,0.052,-20.910,0.000,-1.185,-0.982
B_TIME,-1.2779,0.057,-22.465,0.000,-1.389,-1.166
