In [None]:
# default_exp MNL_swissmetro

# MNL with Swissmetro dataset

> API details for the multinomial logit (MNL) model estimated on the Swissmetro dataset.

In [None]:
#hide
from nbdev.showdoc import *

#### Data preparation

In [None]:
#export
import pickle
import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

In [None]:
# Convert .dat to .pkl
# Load the raw, tab-separated Swissmetro file.
df = pd.read_csv('./data/swissmetro.dat', sep='\t')
# Keep only usable observations, using a single boolean mask.
# (A bare `df['AGE'].value_counts` used to sit here; without call parentheses
# it only referenced the method and did nothing, so it has been dropped.)
valid_rows = (
    (df['AGE'] != 6)         # remove unknown age (6)
    & (df['PURPOSE'] != 9)   # remove "other" trip purpose (9)
    & (df['CHOICE'] != 0)    # remove "unknown" choice (0)
)
df = df[valid_rows]
df.shape

(10692, 28)

In [None]:
# Save the cleaned DataFrame `df` to a pickle file so that later cells can
# reload it from './data/swissmetro_clean.pkl'.
df.to_pickle('./data/swissmetro_clean.pkl')

#### Data loading

In [None]:
# Reload the cleaned data from disk. `pd.read_pickle` is the counterpart of
# `DataFrame.to_pickle` and closes the file itself, whereas
# `pickle.load(open(...))` left the file handle open.
# NOTE: unpickling can execute arbitrary code; only load files you created.
data = pd.read_pickle('./data/swissmetro_clean.pkl')
data.head()

Unnamed: 0,GROUP,SURVEY,SP,ID,PURPOSE,FIRST,TICKET,WHO,LUGGAGE,AGE,...,TRAIN_TT,TRAIN_CO,TRAIN_HE,SM_TT,SM_CO,SM_HE,SM_SEATS,CAR_TT,CAR_CO,CHOICE
0,2,0,1,1,1,0,1,1,0,3,...,112,48,120,63,52,20,0,117,65,2
1,2,0,1,1,1,0,1,1,0,3,...,103,48,30,60,49,10,0,117,84,2
2,2,0,1,1,1,0,1,1,0,3,...,130,48,60,67,58,30,0,117,52,2
3,2,0,1,1,1,0,1,1,0,3,...,103,40,30,63,52,20,0,72,52,2
4,2,0,1,1,1,0,1,1,0,3,...,130,36,60,63,42,20,0,90,84,2


### Biogeme

In [None]:
# Read the data
# Load the cleaned observations with `pd.read_pickle`, which closes the file
# itself (the previous `pickle.load(open(...))` leaked the handle), then wrap
# them in a Biogeme database named 'swissmetro'.
# NOTE: unpickling can execute arbitrary code; only load files you created.
df = pd.read_pickle('./data/swissmetro_clean.pkl')
database = db.Database('swissmetro', df)

In [None]:
# Sanity check: (rows, columns) of the DataFrame handed to the Biogeme database.
df.shape

(10692, 28)

In [None]:
# The following statement allows you to use the names of the variables
# as Python variables.
# It copies every entry of `database.variables` into the notebook's global
# namespace; this is why names such as SM_CO, GA or CHOICE can be used in the
# cells below without being defined explicitly. Beware that it silently
# overwrites any existing global with the same name.
globals().update(database.variables)

In [None]:
# Removing some observations
# Left disabled: in this notebook rows are filtered with pandas in the data
# preparation cell instead (note that the criteria there differ from the
# expression below).
# exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
# database.remove(exclude)

# Parameters to be estimated
# All parameters start at 0 with both bounds set to None.
# ASC_SM is the only one whose last argument is 1 and it does not appear in
# the table of estimated parameters printed further down, i.e. it is held
# fixed (at 0) and serves as the reference alternative.
# NOTE(review): positional meaning of the Beta arguments
# (name, start value, lower bound, upper bound, fixed flag) — confirm against
# the Biogeme documentation.
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0, None, None, 1)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)

In [None]:
# Definition of new variables
# Train and Swissmetro costs are multiplied by (GA == 0), so they count only
# for observations where GA is 0 and become 0 otherwise.
# NOTE(review): presumably GA flags travellers holding an annual pass — confirm.
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
# Car and train availabilities are kept only where SP is non-zero.
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
# Travel times and costs are divided by 100 before entering the utilities.
TRAIN_TT_SCALED = TRAIN_TT / 100.0
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_TT_SCALED = SM_TT / 100.0
SM_COST_SCALED = SM_COST / 100
CAR_TT_SCALED = CAR_TT / 100
CAR_CO_SCALED = CAR_CO / 100

In [None]:
# Utility functions: every alternative combines its own constant with the
# shared time and cost coefficients applied to its scaled attributes.
# V1 = train, V2 = Swissmetro, V3 = car.
V1 = (
    ASC_TRAIN
    + B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
)
V2 = (
    ASC_SM
    + B_TIME * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
)
V3 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)

In [None]:
# Utility function of each alternative, keyed by the alternative number.
V = {
    1: V1,  # train
    2: V2,  # Swissmetro
    3: V3,  # car
}

# Availability condition of each alternative, keyed the same way.
av = {
    1: TRAIN_AV_SP,
    2: SM_AV,
    3: CAR_AV_SP,
}

In [None]:
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
# `V` holds the utilities, `av` the availabilities, and CHOICE the chosen
# alternative.
logprob = models.loglogit(V, av, CHOICE)

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob)
# biogeme.modelName = '01logit'
# NOTE(review): 'anything' is a placeholder; presumably Biogeme uses this name
# for its output files — confirm and pick a descriptive name.
biogeme.modelName = 'anything'

In [None]:
# Estimate the parameters
# `results` is reused by the validation cells below.
results = biogeme.estimate()

# Get the results in a pandas table
# The table is indexed by parameter name and has a 'Value' column holding the
# estimates (both are used for lookups in the next cells).
pandasResults = results.getEstimatedParameters()
print(pandasResults)

              Value   Std err     t-test   p-value  Rob. Std err  Rob. t-test  \
ASC_CAR    0.016930  0.031410   0.538992  0.589892      0.037120     0.456079   
ASC_TRAIN -0.656391  0.041921 -15.657964  0.000000      0.054510   -12.041692   
B_COST    -0.789108  0.036321 -21.725702  0.000000      0.050931   -15.493548   
B_TIME    -1.277293  0.042630 -29.962201  0.000000      0.065605   -19.469462   

           Rob. p-value  
ASC_CAR        0.648333  
ASC_TRAIN      0.000000  
B_COST         0.000000  
B_TIME         0.000000  


In [None]:
# Point estimates only: the 'Value' column, indexed by parameter name.
pandasResults['Value']

ASC_CAR      0.016930
ASC_TRAIN   -0.656391
B_COST      -0.789108
B_TIME      -1.277293
Name: Value, dtype: float64

In [None]:
# Scalar lookup of a single estimate by (parameter name, column).
pandasResults.at['ASC_CAR','Value']

0.01692988840776121

In [None]:
# Cross-validation via `biogeme.validate`.
# The data are organised into several randomly defined slices of about the
# same size, and each slice serves once as the validation set: the model is
# re-estimated on all the data except that slice and then applied to the
# slice. The log likelihood of every observation in the validation set is
# reported in a dataframe, so the call returns a list of dataframes, one per
# slice.

validation_results = biogeme.validate(results, 5)

# `slide` stays bound to the last dataframe after the loop; a later cell
# inspects it.
for slide in validation_results:
    n_obs = slide.shape[0]
    total_loglike = slide["Loglikelihood"].sum()
    print(f'Log likelihood for {n_obs} validation data: {total_loglike}')
    

Log likelihood for 2139 validation data: -1725.6245127576863
Log likelihood for 2139 validation data: -1757.9792027381814
Log likelihood for 2138 validation data: -1708.3622443500494
Log likelihood for 2138 validation data: -1739.8047807640314
Log likelihood for 2138 validation data: -1716.1084410042763


In [None]:
# Number of slices requested from `database.split` for the manual validation.
slices = 5

In [None]:
# `database.split` returns a one-shot `zip` of (estimation, validation)
# DataFrame pairs. Materialise it as a list so that the inspection cells
# below can iterate over it without leaving it empty for the estimation loop
# further down.
validationData = list(database.split(slices))

In [None]:
# Check what kind of container holds the folds.
type(validationData)

zip

In [None]:
# Print the type of every fold. `v` stays bound to the last fold afterwards
# and is examined by the following cells.
# NOTE: if `validationData` is a one-shot iterator (e.g. a `zip`), this loop
# exhausts it and any later loop over it will see no folds.
for v in validationData:
    print(type(v))

<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>


In [None]:
# `v` is the last fold left over from the loop above.
type(v), len(v)

(tuple, 2)

In [None]:
# Types and shapes of the two parts of the last fold: v[0] is the estimation
# data set, v[1] the validation slice.
type(v[0]), type(v[1]), v[0].shape, v[1].shape

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 (8554, 28),
 (2138, 28))

In [None]:
# Scratch arithmetic: the total equals the 8554 rows of the estimation part
# of the last fold shown above.
# NOTE(review): where 1711 and 1710 come from is not visible here — confirm
# or delete this cell.
1711*4 + 1710

8554

In [None]:
# Scratch arithmetic, same total (8554) written as two terms.
# NOTE(review): purpose unclear — confirm or delete this cell.
6844 + 1710

8554

In [None]:
# Keep a reference to the full-sample database for use by the manual
# cross-validation loop further down.
keepDatabase = database
type(keepDatabase)

biogeme.database.Database

In [None]:
# Alias for the estimation results; the manual cross-validation loop below
# takes its starting parameter values from it.
estimationResults = results

In [None]:
# Manual k-fold validation: re-estimate the model on each estimation set and
# evaluate the log likelihood on the corresponding held-out slice.
#
# Fixes compared with the previous version of this cell:
#   * The folds are split here from `keepDatabase` instead of reusing
#     `validationData`, which may be a one-shot iterator already exhausted by
#     the inspection cells above (the loop then never ran and the result list
#     stayed empty).
#   * The per-fold estimation database is now actually used: a dedicated
#     BIOGEME object is built on it, instead of re-estimating on whatever
#     database `biogeme` already held.
#   * The fitted parameters are simulated on the validation slice (v[1]), not
#     on the estimation object.
#   * The globals `database` and `results` are no longer rebound, and
#     `biogeme` is not mutated, so nothing needs restoring afterwards.
folds = list(keepDatabase.split(slices))

allSimulationResults = []
for fold_no, (estimation_df, validation_df) in enumerate(folds, start=1):
    # Start every fold from the full-sample estimates.
    biogeme.loglike.changeInitValues(estimationResults.getBetaValues())

    # Estimate on the estimation part of the fold.
    estimation_biogeme = bio.BIOGEME(
        db.Database('Estimation data', estimation_df), biogeme.loglike
    )
    estimation_biogeme.modelName = f'{biogeme.modelName}_fold{fold_no}'
    fold_results = estimation_biogeme.estimate()

    # Apply the fitted parameters to the held-out slice.
    simulate = {'Loglikelihood': biogeme.loglike}
    validation_biogeme = bio.BIOGEME(
        db.Database('Validation data', validation_df), simulate
    )
    simResult = validation_biogeme.simulate(fold_results.getBetaValues())
    allSimulationResults.append(simResult)

In [None]:
# Inspect the list of per-fold simulation results built by the loop above.
allSimulationResults

[]

In [None]:
# Inspect the last dataframe left in `slide` by the `biogeme.validate` loop.
# This only shows its type and shape; no accuracy is computed here.
# TODO: compute the accuracy.
type(slide), slide.shape

(pandas.core.frame.DataFrame, (2138, 1))

In [None]:
# Show the object currently bound to `results`.
results

<biogeme.results.bioResults at 0x7f6feceff1f0>

### PyLogit

In [None]:
#export
#load package...