In [None]:
from collections import OrderedDict    # For recording the model specification 
import pickle
import pandas as pd                    # For file input/output
import numpy as np                     # For vectorized math operations

import pylogit as pl                   # For MNL model estimation
                                       # To convert from wide to long format


# 1. Load the Swissmetro Dataset

In [None]:
# Load the pre-cleaned Swissmetro data from a pickle file.
# The raw tab-delimited file could be read instead with:
#     pd.read_table("../data/swissmetro.dat", sep='\t')
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
# The context manager guarantees the file handle is closed once the
# object has been loaded (a bare open() would leave it open).
with open('../data/swissmetro_clean.pkl', "rb") as pickle_file:
    swissmetro_wide = pickle.load(pickle_file)

# 2. Clean the dataset

Note that the 01Logit.py file provided is an example from Python Biogeme (see: <a href="http://biogeme.epfl.ch/examples_swissmetro.html">http://biogeme.epfl.ch/examples_swissmetro.html</a>). See http://www.strc.ch/conferences/2001/bierlaire1.pdf for a detailed explanation of the variables. The 01Logit.py file excludes observations meeting the following criteria:
<pre>
exclude = (( PURPOSE != 1 ) * (  PURPOSE   !=  3  ) + ( CHOICE == 0 )) > 0
</pre>
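As a result, their dataset has 6,768 observations. Note that the cell below does not re-apply these exclusions: it copies the pre-cleaned data loaded above as-is, which leaves 10,692 observations.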

In [None]:
# Work on an independent copy of the loaded dataframe so that columns
# added later are written to a real dataframe rather than to a view.
# No rows are filtered out here; the data is used exactly as loaded.
clean_sm_wide = swissmetro_wide.copy()

# Report how many observations (rows) the working dataframe contains
final_num_obs = len(clean_sm_wide.index)
num_obs_statement = "The cleaned number of observations is {:,.0f}."
print(num_obs_statement.format(final_num_obs))

The cleaned number of observations is 10,692.


# 3. Create an id column that ignores the repeat observations per individual

In the simple example given on the Python Biogeme website for 01Logit.py, the repeated observations per individual are treated as separate and independent observations. We will do the same.

In [None]:
# Give every row its own observation id (1, 2, ..., number of rows),
# deliberately ignoring that several rows belong to the same respondent
# in this panel/repeated-observations dataset
clean_sm_wide["custom_id"] = np.arange(1, clean_sm_wide.shape[0] + 1, dtype=int)

# 4. Convert the data from 'wide' format to 'long' format

## 4a. Determine the 'type' of each column in the dataset.

In [None]:
# Look at the columns of the swissmetro data (now including the
# "custom_id" column) to decide which role each column plays in the
# wide-to-long conversion
clean_sm_wide.columns

Index(['GROUP', 'SURVEY', 'SP', 'ID', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',
       'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST', 'TRAIN_AV',
       'CAR_AV', 'SM_AV', 'TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE', 'SM_TT', 'SM_CO',
       'SM_HE', 'SM_SEATS', 'CAR_TT', 'CAR_CO', 'CHOICE', 'custom_id'],
      dtype='object')

In [None]:
# Individual specific variables: the first 15 columns of the wide
# dataframe, which do not vary across alternatives
ind_variables = clean_sm_wide.columns[:15].tolist()

# Variables that vary across individuals **AND** across some or all
# alternatives. Each long-format column name maps to a dictionary of
# {alternative id: wide-format column holding that alternative's value}.
alt_varying_variables = {
    'travel_time': {1: 'TRAIN_TT', 2: 'SM_TT', 3: 'CAR_TT'},
    'travel_cost': {1: 'TRAIN_CO', 2: 'SM_CO', 3: 'CAR_CO'},
    'headway': {1: 'TRAIN_HE', 2: 'SM_HE'},
    'seat_configuration': {2: "SM_SEATS"},
}

# Availability variables, keyed by alternative id
availability_variables = {1: 'TRAIN_AV', 2: 'SM_AV', 3: 'CAR_AV'}

# Name of the alternative id column to be created in the long-format
# dataframe
new_alt_id = "mode_id"
# Column holding the custom observation ids
obs_id_column = "custom_id"
# Column holding the observed choice
choice_column = "CHOICE"

## 4b. Actually perform the conversion from wide to long formats

In [None]:
# Reshape the wide dataframe (one row per observation) into the long
# format used for estimation, using the column roles defined earlier
long_swiss_metro = pl.convert_wide_to_long(
    clean_sm_wide,
    ind_variables,
    alt_varying_variables,
    availability_variables,
    obs_id_column,
    choice_column,
    new_alt_id_name=new_alt_id,
)

# Show the first 9 long-format rows, transposed so that each variable
# is displayed on its own line
long_swiss_metro.head(9).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
custom_id,1.0,1.0,1.0,2.0,2.0,2.0,3.0,3.0,3.0
mode_id,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0
CHOICE,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
GROUP,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
SURVEY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SP,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ID,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PURPOSE,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FIRST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TICKET,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# 5. Create the variables used in the Python Biogeme Logit Model Example

In 01Logit.py, the travel time and travel cost variables are scaled for ease of numeric optimization. We will do the same such that our estimated coefficients are comparable.

In [None]:
# Travel time expressed in hundreds of the original unit
long_swiss_metro["travel_time_hundredth"] = long_swiss_metro["travel_time"].div(100.0)

# Flag (1/0) the rows that are a train (1) or swiss metro (2) alternative
# for an individual holding a GA pass. These individuals face no
# marginal cost for such a trip.
train_pass_train_alt = (
    (long_swiss_metro["GA"] == 1) & long_swiss_metro["mode_id"].isin([1, 2])
).astype(int)

# Travel cost expressed in hundreds of the original unit. Multiplying by
# (train_pass_train_alt == 0) sets the cost to zero on the flagged rows.
long_swiss_metro["travel_cost_hundredth"] = (
    long_swiss_metro["travel_cost"] * (train_pass_train_alt == 0) / 100.0
)



# 6. Specify and Estimate the Python Biogeme Logit Model Example

## 6a. Specify the Model

In [None]:
# Build the specification dictionary and the matching dictionary of
# coefficient names.
# NOTE: - Every key except "intercept" should be a column of the long
#         format dataframe.
#       - Specification values are lists, or lists of lists, of
#         alternative ids: the (inner-most) list names the alternatives
#         whose utility equations the variable enters.
#       - Insertion order is preserved, and both dictionaries list their
#         keys in the same order.

# "intercept": alternative specific constants for the Car (id 3) and the
# Train (id 1), listed in that order so the summary dataframe lines up
# with the HTML output of the python biogeme example.
# "travel_cost_hundredth" / "travel_time_hundredth": one coefficient each,
# shared by alternatives 1, 2 and 3; the names B_COST and B_TIME are used
# simply for consistency with the Python Biogeme example.
example_specification = OrderedDict([
    ("intercept", [3, 1]),
    ("travel_cost_hundredth", [[1, 2, 3]]),
    ("travel_time_hundredth", [[1, 2, 3]]),
])

example_names = OrderedDict([
    ("intercept", ['ASC Car', 'ASC Train']),
    ("travel_cost_hundredth", ['B_COST']),
    ("travel_time_hundredth", ['B_TIME']),
])

## 6b. Estimate the model

In [None]:
# Build the MNL model object from the long-format data, the model
# specification / coefficient names, and the columns that identify
# observations, alternatives and choices
example_mnl = pl.create_choice_model(
    data=long_swiss_metro,
    model_type="MNL",
    specification=example_specification,
    names=example_names,
    obs_id_col=obs_id_column,
    alt_id_col=new_alt_id,
    choice_col=choice_column,
)

# Estimate by maximum likelihood, starting every coefficient at zero:
# 4 zeros for the 4 coefficients (two ASCs, B_COST and B_TIME)
example_mnl.fit_mle(np.zeros(4))

Log-likelihood at zero: -11,071.2632
Initial Log-likelihood: -11,071.2632
Estimation Time for Point Estimation: 0.16 seconds.
Final log-likelihood: -8,647.8792


  warn('Method %s does not use Hessian information (hess).' % method,


## 6c. Compare the model output with that of Python Biogeme

In [None]:
# Look at the estimated coefficients and goodness-of-fit statistics
# of the fitted pylogit model, displayed as a statsmodels-style summary
example_mnl.get_statsmodels_summary()

0,1,2,3
Dep. Variable:,CHOICE,No. Observations:,10692.0
Model:,Multinomial Logit Model,Df Residuals:,10688.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 02 Feb 2021",Pseudo R-squ.:,0.219
Time:,09:30:25,Pseudo R-bar-squ.:,0.219
AIC:,17303.758,Log-Likelihood:,-8647.879
BIC:,17332.867,LL-Null:,-11071.263

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ASC Car,0.0169,0.031,0.537,0.591,-0.045,0.078
ASC Train,-0.6564,0.042,-15.659,0.000,-0.739,-0.574
B_COST,-0.7891,0.036,-21.726,0.000,-0.860,-0.718
B_TIME,-1.2773,0.043,-29.962,0.000,-1.361,-1.194


##### Get accuracy

In [None]:
def model_pred(data, model, alt_id_col, obs_id_col, choice_col):
    """Return the observed and the predicted alternative of each observation.

    The predicted alternative of an observation is the row with the
    highest value of ``model.predict(data)`` among that observation's rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data with one row per (observation, alternative).
        It is modified in place: the columns ``'predicted'`` and
        ``'predicted_choice'`` are added or overwritten. Its index labels
        must be unique, because the per-observation winners are located by
        index label.
    model : object
        Any object whose ``predict(data)`` returns one value per row of
        ``data``.
    alt_id_col : str
        Column of ``data`` holding the alternative ids.
    obs_id_col : str
        Column of ``data`` holding the observation ids.
    choice_col : str
        Column of ``data`` that equals 1 on the chosen row.

    Returns
    -------
    actual : pandas.Series
        Alternative ids of the rows where ``choice_col`` equals 1.
    pred : pandas.Series
        Alternative ids of the rows with the highest prediction within
        each observation.

    Notes
    -----
    The two series keep the row labels of ``data`` and therefore have
    different indexes. Comparing them position by position assumes each
    observation has exactly one chosen row and that the rows of ``data``
    are grouped in a consistent observation order -- verify this for new
    datasets.
    """
    data['predicted'] = model.predict(data)

    # Index label of the highest-scoring row within each observation
    is_chosen = data.groupby(obs_id_col)['predicted'].idxmax()
    data['predicted_choice'] = 0
    data.loc[is_chosen.values, 'predicted_choice'] = 1

    # Use the column names passed in as arguments instead of relying on
    # module-level globals that may not exist or may name other columns.
    actual = data.loc[data[choice_col] == 1, alt_id_col]
    pred = data.loc[data['predicted_choice'] == 1, alt_id_col]
    return actual, pred


from sklearn.metrics import classification_report

# Names of the alternative id, observation id and choice columns of the
# long-format dataframe
alt_id_column, obs_id_column, choice_column = "mode_id", "custom_id", "CHOICE"

# Observed and predicted alternative of every observation
actual, predict = model_pred(
    long_swiss_metro, example_mnl, new_alt_id, obs_id_column, choice_column
)

# Precision / recall / f1-score per alternative, plus overall accuracy
print(classification_report(actual, predict))

              precision    recall  f1-score   support

           1       0.68      0.01      0.02      1413
           2       0.66      0.90      0.76      6199
           3       0.66      0.47      0.55      3080

    accuracy                           0.66     10692
   macro avg       0.67      0.46      0.44     10692
weighted avg       0.66      0.66      0.60     10692



In [None]:
# Look at robust p-values in case one wants to see them: the summary
# dataframe lists the conventional and the robust standard errors,
# t-statistics and p-values next to each estimated parameter
example_mnl.summary

Unnamed: 0,parameters,std_err,t_stats,p_values,robust_std_err,robust_t_stats,robust_p_values
ASC Car,0.016871,0.03141,0.537117,0.5911867,0.037121,0.454493,0.6494738
ASC Train,-0.656446,0.041921,-15.659217,2.874124e-55,0.05451,-12.042623,2.1210260000000002e-33
B_COST,-0.78914,0.036322,-21.726386,1.1554649999999999e-104,0.050933,-15.493837,3.818078e-54
B_TIME,-1.277273,0.04263,-29.961676,3.100099e-197,0.065605,-19.469057,2.0093560000000001e-84


### Summary
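As run above, my estimation results do not match those of Python Biogeme. <br>
The Python Biogeme log-likelihood is -5,331.252 and their estimated parameters are:
<pre>
ASC Car:    -0.155
ASC Train:  -0.701
B_COST:     -1.08
B_TIME:     -1.28
</pre>

As shown above, my log-likelihood is -8,647.879, and my estimated parameters are:
<pre>
ASC Car:     0.0169
ASC Train:  -0.6564
B_COST:     -0.7891
B_TIME:     -1.2773
</pre>

The pylogit model above was estimated on all 10,692 observations, whereas the Biogeme example excludes observations (see Section 2); this is the most likely source of the difference. For reference, this summary previously listed a log-likelihood of -5,331.252 with ASC Car -0.1546, ASC Train -0.7012, B_COST -1.0838 and B_TIME -1.2779; those values are not reproduced by the run shown in this notebook.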

### Test with Biogeme

In [None]:
#%%
"""File 01logit.py

:author: Michel Bierlaire, EPFL
:date: Thu Sep  6 15:14:39 2018

 Example of a logit model.
 Three alternatives: Train, Car and Swissmetro
 SP data
"""

import pandas as pd
import pickle
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

# Read the data from the pre-cleaned pickle file.
# The raw tab-delimited file could be read instead with:
#     pd.read_table("../data/swissmetro.dat", sep='\t')
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
# The context manager guarantees the file handle is closed once the
# object has been loaded (a bare open() would leave it open).
with open('../data/swissmetro_clean.pkl', "rb") as pickle_file:
    df = pickle.load(pickle_file)

# Wrap the dataframe in a Biogeme database named 'swissmetro'
database = db.Database('swissmetro', df)

# The following statement allows you to use the names of the variable
# as Python variable.
# It copies every entry of `database.variables` into the global
# namespace; this is where names such as PURPOSE, CHOICE, GA, SP and
# SM_CO used below come from.
# NOTE(review): this can silently overwrite an existing global that
# shares a name with a data column -- confirm there are no clashes.
globals().update(database.variables)

# Removing some observations
# A row is excluded when (PURPOSE is neither 1 nor 3) or (CHOICE == 0):
# the product of the two PURPOSE comparisons plays the role of "and",
# the sum followed by "> 0" plays the role of "or" (assuming the
# comparison terms evaluate to 0/1).
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

# Parameters to be estimated
# Each Beta is given a name, the value 0, two None arguments and a final
# 0/1 argument. ASC_SM is the only one whose final argument is 1;
# presumably this keeps it fixed at 0 as the reference alternative (it
# does not appear in the estimated-parameter table) -- TODO confirm
# against the Biogeme Beta documentation.
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0, None, None, 1)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)


# Definition of new variables
# Swissmetro and train costs are multiplied by (GA == 0), so they are
# kept only for rows where GA equals 0 and become zero otherwise.
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
# Car and train availability are multiplied by (SP != 0).
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
# Travel times and costs are divided by 100 before entering the
# utility functions.
TRAIN_TT_SCALED = TRAIN_TT / 100.0
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_TT_SCALED = SM_TT / 100.0
SM_COST_SCALED = SM_COST / 100
CAR_TT_SCALED = CAR_TT / 100
CAR_CO_SCALED = CAR_CO / 100

# Utility function of each alternative: its alternative specific
# constant plus the shared time and cost coefficients applied to that
# alternative's scaled travel time and scaled cost
V1 = (ASC_TRAIN
      + B_TIME * TRAIN_TT_SCALED
      + B_COST * TRAIN_COST_SCALED)
V2 = (ASC_SM
      + B_TIME * SM_TT_SCALED
      + B_COST * SM_COST_SCALED)
V3 = (ASC_CAR
      + B_TIME * CAR_TT_SCALED
      + B_COST * CAR_CO_SCALED)

# Map each alternative number to its utility function
V = {1: V1, 2: V2, 3: V3}

# Map each alternative number to its availability condition
av = {1: TRAIN_AV_SP, 2: SM_AV, 3: CAR_AV_SP}

# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
# It is built from the utility dictionary V, the availability
# dictionary av and the observed CHOICE.
logprob = models.loglogit(V, av, CHOICE)

# Create the Biogeme object
# It combines the database (after the exclusions) with the
# log-likelihood expression.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = '01logit'

# Estimate the parameters
results = biogeme.estimate()

# Get the results in a pandas table
# The bare expression on the last line displays the table as the cell
# output.
pandasResults = results.getEstimatedParameters()
pandasResults

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.160781,0.043247,-3.717742,0.000201,0.058178,-2.763588,0.005717
ASC_TRAIN,-0.716768,0.055016,-13.028317,0.0,0.082651,-8.67218,0.0
B_COST,-1.083953,0.051807,-20.922856,0.0,0.068156,-15.903978,0.0
B_TIME,-1.269009,0.056857,-22.319395,0.0,0.104124,-12.187436,0.0


In [None]:
# From Pylogit
# Display the pylogit summary dataframe again so it can be compared
# with the Biogeme estimates shown in the previous output
example_mnl.summary

Unnamed: 0,parameters,std_err,t_stats,p_values,robust_std_err,robust_t_stats,robust_p_values
ASC Car,0.016871,0.03141,0.537117,0.5911867,0.037121,0.454493,0.6494738
ASC Train,-0.656446,0.041921,-15.659217,2.874124e-55,0.05451,-12.042623,2.1210260000000002e-33
B_COST,-0.78914,0.036322,-21.726386,1.1554649999999999e-104,0.050933,-15.493837,3.818078e-54
B_TIME,-1.277273,0.04263,-29.961676,3.100099e-197,0.065605,-19.469057,2.0093560000000001e-84
