In [1]:
# Source code from: https://github.com/timothyb0912/pylogit

In [10]:
from collections import OrderedDict
import numpy as np
import pandas as pd
import pylogit as pl

### Load Swiss metro data

In [11]:
# Read the long-format Swissmetro CSV and display its (rows, columns) shape.
swissmetro_csv = "./data/swissmetro_long.csv"
sm_long = pd.read_csv(swissmetro_csv)
sm_long.shape

(19143, 22)

### Create the model specification
The model specification being used in this example is the following:
$$
\begin{aligned}
V_{i, \textrm{Train}} &= \textrm{ASC Train} + \\
&\quad \beta _{ \textrm{tt_train} } \textrm{Travel Time} _{ \textrm{Train}} + \\
&\quad \beta _{ \textrm{tc_train} } \textrm{Travel Cost}_{\textrm{Train}} * \left( GA == 0 \right) + \\
&\quad \beta _{ \textrm{headway_train} } \textrm{Headway} _{\textrm{Train}} \\
\\
V_{i, \textrm{Swissmetro}} &= \textrm{ASC Swissmetro} + \\
&\quad \beta _{ \textrm{tt_sm} } \textrm{Travel Time} _{ \textrm{Swissmetro}} + \\
&\quad \beta _{ \textrm{tc_sm} } \textrm{Travel Cost}_{\textrm{Swissmetro}} * \left( GA == 0 \right) + \\
&\quad \beta _{ \textrm{headway_sm} } \textrm{Headway} _{\textrm{Swissmetro}} \\
% &\quad \beta _{ \textrm{seat} } \left( \textrm{Seat Configuration} == 1 \right) \\
% &\quad \beta _{ \textrm{survey} } \left( \textrm{Train Survey} == 1 \right) \\
% &\quad \beta _{ \textrm{first_class} } \left( \textrm{First Class} == 0 \right) \\
\\
V_{i, \textrm{Car}} &= \beta _{ \textrm{tt_car} } \textrm{Travel Time} _{ \textrm{Car}} + \\
&\quad \beta _{ \textrm{tc_car}} \textrm{Travel Cost}_{\textrm{Car}} \\
% &\quad \beta _{\textrm{luggage}=1} \left( \textrm{Luggage} == 1 \right) + \\
% &\quad \beta _{\textrm{luggage}>1} \left( \textrm{Luggage} > 1 \right)
\end{aligned}
$$

In [12]:
# Travel time is used as loaded: no rescaling is applied for this model.

# A ticket counts as free when GA == 1 or WHO / 100 == 2, and only on
# alternatives 1 and 2.
# NOTE(review): WHO / 100.0 == 2 is true only where WHO equals 200 -- confirm
# this matches how WHO is coded in the CSV.
holds_free_pass = (sm_long["GA"] == 1) | (sm_long["WHO"] / 100.0 == 2)
on_mode_1_or_2 = sm_long["mode_id"].isin([1, 2])
sm_long["free_ticket"] = (holds_free_pass & on_mode_1_or_2).astype(int)

# Zero out the travel cost on rows where the ticket is free.
ticket_is_paid = sm_long["free_ticket"] == 0
sm_long["travel_cost"] = sm_long["travel_cost"] * ticket_is_paid

In [13]:
# Utility specification: each variable maps to the alternative ids that get
# their own coefficient for it (insertion order fixes the parameter order).
spec = OrderedDict(
    [
        ("intercept", [1, 2]),
        ("travel_time", [1, 2, 3]),
        ("travel_cost", [1, 2, 3]),
        ("headway", [1, 2]),
    ]
)

In [14]:
# Build the multinomial logit (MNL) model from the long-format data and spec.
mnl_settings = dict(
    data=sm_long,
    alt_id_col="mode_id",
    obs_id_col="custom_id",
    choice_col="CHOICE",
    specification=spec,
    model_type="MNL",
)
sm_mnl = pl.create_choice_model(**mnl_settings)

# Start the optimisation from an all-zero vector of 10 coefficients
# (2 intercepts + 3 travel time + 3 travel cost + 2 headway in `spec`).
start_values = np.zeros(10)
sm_mnl.fit_mle(start_values)

# Show the estimation results table.
sm_mnl.get_statsmodels_summary()

Log-likelihood at zero: -6,964.6630
Initial Log-likelihood: -6,964.6630
Estimation Time for Point Estimation: 0.07 seconds.
Final log-likelihood: -5,060.4258




0,1,2,3
Dep. Variable:,CHOICE,No. Observations:,6768.0
Model:,Multinomial Logit Model,Df Residuals:,6758.0
Method:,MLE,Df Model:,10.0
Date:,"Thu, 11 Mar 2021",Pseudo R-squ.:,0.273
Time:,18:12:33,Pseudo R-bar-squ.:,0.272
AIC:,10140.852,Log-Likelihood:,-5060.426
BIC:,10209.051,LL-Null:,-6964.663

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept_1,0.6291,0.143,4.408,0.000,0.349,0.909
intercept_2,0.5003,0.110,4.567,0.000,0.286,0.715
travel_time_1,-0.0088,0.001,-10.608,0.000,-0.010,-0.007
travel_time_2,-0.0111,0.001,-12.739,0.000,-0.013,-0.009
travel_time_3,-0.0129,0.001,-15.943,0.000,-0.014,-0.011
travel_cost_1,-0.0309,0.001,-25.817,0.000,-0.033,-0.029
travel_cost_2,-0.0112,0.001,-20.613,0.000,-0.012,-0.010
travel_cost_3,-0.0080,0.001,-7.499,0.000,-0.010,-0.006
headway_1,-0.0052,0.001,-4.958,0.000,-0.007,-0.003


### Evaluation metrics

In [15]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [16]:
def model_pred(data, model, alt_id_col, obs_id_col, choice_col):
    """Return the actual and predicted alternative for every observation.

    The predicted alternative of an observation is the row with the highest
    value returned by ``model.predict`` within that observation's group.

    Side effect: adds (or overwrites) the columns ``'predicted'`` and
    ``'predicted_choice'`` on ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data. Its index labels are assumed to be unique, because
        the labels returned by ``idxmax`` are used to flag rows via ``.loc``.
    model : object
        Anything with a ``predict(data)`` method; the result is assumed to
        hold one value per row of ``data``.
    alt_id_col : str
        Column holding the alternative id of each row.
    obs_id_col : str
        Column identifying the observation each row belongs to.
    choice_col : str
        Column equal to 1 on the row of the alternative actually chosen.

    Returns
    -------
    actual : pandas.Series
        Alternative ids of the rows where ``choice_col == 1``.
    pred : pandas.Series
        Alternative ids of the rows flagged as the predicted choice.
    """
    data['predicted'] = model.predict(data)

    # Index label of the highest-scoring row within each observation.
    is_chosen = data.groupby([obs_id_col])['predicted'].idxmax()
    data['predicted_choice'] = 0
    data.loc[is_chosen.values, 'predicted_choice'] = 1

    # Use the column names passed in as arguments, not notebook-level globals.
    actual = data.loc[data[choice_col] == 1, alt_id_col]
    pred = data.loc[data['predicted_choice'] == 1, alt_id_col]
    return actual, pred

def nll():
    # Unimplemented placeholder: takes no arguments and does nothing.
    # Presumably intended for a negative log-likelihood metric -- TODO confirm
    # and implement, or remove.
    pass

# Column names of the long-format data passed to the evaluation helper.
alt_id_column, obs_id_column, choice_column = "mode_id", "custom_id", "CHOICE"

##### MNL model results

In [17]:
# Actual vs. predicted alternative per observation, then the per-class report.
actual, predict = model_pred(
    data=sm_long,
    model=sm_mnl,
    alt_id_col=alt_id_column,
    obs_id_col=obs_id_column,
    choice_col=choice_column,
)
print(classification_report(actual, predict))

              precision    recall  f1-score   support

           1       0.60      0.04      0.08       908
           2       0.67      0.92      0.78      4090
           3       0.69      0.42      0.52      1770

    accuracy                           0.67      6768
   macro avg       0.65      0.46      0.46      6768
weighted avg       0.67      0.67      0.62      6768



In [18]:
# Overall accuracy and support-weighted F1 of the predicted choices.
# No trailing comma after accuracy_score(...): a trailing comma would turn
# `acc` into a 1-tuple and the value would print as a bracketed array.
acc = accuracy_score(actual, predict)
f1 = f1_score(actual, predict, average="weighted")
print(f"Accuracy = {np.round(acc,5)}, F1_score = {np.round(f1,5)}")

Accuracy = [0.67317], F1_score = 0.61607


########################################################################################

## Simple MNL as in 1.logit Biogeme

The model specification being used in this example is the following:
$$
\begin{aligned}
V_{i, \textrm{Train}} &= \textrm{ASC Train}  + \\
&\quad \beta _{ \textrm{tt} } \textrm{Travel Time} _{ \textrm{Train}} + \\
&\quad \beta _{ \textrm{tc} } \textrm{Travel Cost}_{\textrm{Train}} * \left( GA == 0 \right) \\
\\
V_{i, \textrm{Swissmetro}} &= \quad \beta _{ \textrm{tt} } \textrm{Travel Time} _{ \textrm{Swissmetro}} + \\
&\quad \beta _{ \textrm{tc} } \textrm{Travel Cost}_{\textrm{Swissmetro}} * \left( GA == 0 \right) \\
\\
V_{i, \textrm{Car}} &= \textrm{ASC Car} + \\
&\quad \beta _{ \textrm{tt} } \textrm{Travel Time} _{ \textrm{Car}} + \\
&\quad \beta _{ \textrm{tc}} \textrm{Travel Cost}_{\textrm{Car}} \\
\end{aligned}
$$

In [26]:
# Reload the CSV so the transformations below start from the raw columns.
sm_long = pd.read_csv("./data/swissmetro_long.csv")

# Rescale travel time by 1/100 (a plain rescaling, not a minutes-to-hours
# conversion).
sm_long["travel_time"] = sm_long["travel_time"].div(100)

# A ticket counts as free when GA == 1 or WHO / 100 == 2, and only on
# alternatives 1 and 2.
# NOTE(review): WHO / 100.0 == 2 is true only where WHO equals 200 -- confirm
# this matches how WHO is coded in the CSV.
holds_free_pass = (sm_long["GA"] == 1) | (sm_long["WHO"] / 100.0 == 2)
on_mode_1_or_2 = sm_long["mode_id"].isin([1, 2])
sm_long["free_ticket"] = (holds_free_pass & on_mode_1_or_2).astype(int)

# Zero out the cost of free tickets, then rescale cost by 1/100 as well.
paid_cost = sm_long["travel_cost"] * (sm_long["free_ticket"] == 0)
sm_long["travel_cost"] = paid_cost / 100

In [27]:
# Utility specification: separate intercepts for alternatives 3 and 1, and a
# single shared coefficient across alternatives 1-3 for cost and for time
# (the nested list groups the alternatives under one parameter).
spec = OrderedDict(
    [
        ("intercept", [3, 1]),
        ("travel_cost", [[1, 2, 3]]),
        ("travel_time", [[1, 2, 3]]),
    ]
)

In [28]:
# Build the multinomial logit (MNL) model from the long-format data and spec.
mnl_settings = dict(
    data=sm_long,
    alt_id_col="mode_id",
    obs_id_col="custom_id",
    choice_col="CHOICE",
    specification=spec,
    model_type="MNL",
)
sm_mnl = pl.create_choice_model(**mnl_settings)

# Start the optimisation from an all-zero vector of 4 coefficients
# (2 intercepts + 1 shared travel cost + 1 shared travel time in `spec`).
start_values = np.zeros(4)
sm_mnl.fit_mle(start_values)

# Show the estimation results table.
sm_mnl.get_statsmodels_summary()

Log-likelihood at zero: -6,964.6630
Initial Log-likelihood: -6,964.6630
Estimation Time for Point Estimation: 0.03 seconds.
Final log-likelihood: -5,331.2520




0,1,2,3
Dep. Variable:,CHOICE,No. Observations:,6768.0
Model:,Multinomial Logit Model,Df Residuals:,6764.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 11 Mar 2021",Pseudo R-squ.:,0.235
Time:,18:37:20,Pseudo R-bar-squ.:,0.234
AIC:,10670.504,Log-Likelihood:,-5331.252
BIC:,10697.784,LL-Null:,-6964.663

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept_3,-0.1546,0.043,-3.577,0.000,-0.239,-0.070
intercept_1,-0.7012,0.055,-12.778,0.000,-0.809,-0.594
"travel_cost_[1, 2, 3]",-1.0838,0.052,-20.910,0.000,-1.185,-0.982
"travel_time_[1, 2, 3]",-1.2779,0.057,-22.465,0.000,-1.389,-1.166


In [29]:
# Actual vs. predicted alternative per observation, then the per-class report.
actual, predict = model_pred(
    data=sm_long,
    model=sm_mnl,
    alt_id_col=alt_id_column,
    obs_id_col=obs_id_column,
    choice_col=choice_column,
)
print(classification_report(actual, predict))

              precision    recall  f1-score   support

           1       0.83      0.01      0.01       908
           2       0.68      0.92      0.78      4090
           3       0.68      0.46      0.55      1770

    accuracy                           0.68      6768
   macro avg       0.73      0.46      0.45      6768
weighted avg       0.70      0.68      0.62      6768



In [30]:
# Overall accuracy and support-weighted F1 of the predicted choices.
# No trailing comma after accuracy_score(...): a trailing comma would turn
# `acc` into a 1-tuple and the value would print as a bracketed array.
acc = accuracy_score(actual, predict)
f1 = f1_score(actual, predict, average="weighted")
print(f"Accuracy = {np.round(acc,5)}, F1_score = {np.round(f1,5)}")

Accuracy = [0.67642], F1_score = 0.61537
