In [1]:
import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

In [2]:
# Read data
df = pd.read_csv("../data/swissmetro.dat",'\t')
df.shape

(10728, 28)

In [3]:
# Convert to biogeme database
database = db.Database('swissmetro',df)
type(database)

biogeme.database.Database

In [4]:
globals().update(database.variables)

In [5]:
# Remove some observations
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

In [6]:
database.data.shape

(6768, 28)

In [7]:
# Paramaters
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0, None, None, 1)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)
MU     = Beta('MU', 1, 1, 10, 0)

In [8]:
database.data.columns

Index(['GROUP', 'SURVEY', 'SP', 'ID', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',
       'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST', 'TRAIN_AV',
       'CAR_AV', 'SM_AV', 'TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE', 'SM_TT', 'SM_CO',
       'SM_HE', 'SM_SEATS', 'CAR_TT', 'CAR_CO', 'CHOICE'],
      dtype='object')

In [9]:
database.data[['TRAIN_TT', "TRAIN_CO", 'SM_TT', "SM_CO", "CAR_TT", "CAR_CO"]].describe()

Unnamed: 0,TRAIN_TT,TRAIN_CO,SM_TT,SM_CO,CAR_TT,CAR_CO
count,6768.0,6768.0,6768.0,6768.0,6768.0,6768.0
mean,166.077423,490.885195,84.507388,641.066489,123.154846,78.655881
std,69.795646,1062.593533,47.11314,1411.658237,91.718406,55.921803
min,35.0,9.0,12.0,11.0,0.0,0.0
25%,112.0,60.0,55.0,74.0,70.0,40.0
50%,159.0,94.0,77.0,112.0,120.0,76.0
75%,206.0,166.0,105.0,196.0,176.0,115.0
max,1022.0,5040.0,796.0,6720.0,1560.0,520.0


In [10]:
# Define new variables
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
TRAIN_TT_SCALED = TRAIN_TT / 100.0
TRAIN_CO_SCALED = TRAIN_COST / 100.0
SM_TT_SCALED = SM_TT / 100.0
SM_CO_SCALED = SM_COST / 100.0
CAR_TT_SCALED = CAR_TT / 100.0
CAR_CO_SCALED = CAR_CO / 100.0

In [11]:
# Define Utility functions
V1 = (ASC_TRAIN + B_TIME * TRAIN_TT_SCALED + \
                  B_COST * TRAIN_CO_SCALED)
V2 = (ASC_SM  + B_TIME * SM_TT_SCALED + \
                B_COST * SM_CO_SCALED)
V3 = (ASC_CAR + B_TIME * CAR_TT_SCALED + \
                B_COST * CAR_CO_SCALED)

In [12]:
# Availability 
av = {1:TRAIN_AV_SP, 2:SM_AV, 3: CAR_AV_SP}
V = {1: V1, 2: V2, 3: V3}


In [13]:
# Nest definition
existing = MU, [1, 3]
future = 1.0, [2]
nest = existing, future

In [14]:
nest

((MU(1), [1, 3]), (1.0, [2]))

In [15]:
# Set up nested-logit model
logprob = models.lognested(V, av, nest, CHOICE)
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = "02_nested_logit_sm"

In [16]:
logprob.embedExpression

<bound method Expression.embedExpression of _bioLogLogit(1:(((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`))) + (((MU(1) - `1.0`) * ((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`)))) + (((`1.0` / MU(1)) - `1.0`) * log(bioMultSum({{0:`0.0`, 1:exp((MU(1) * ((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`)))))}[((TRAIN_AV * (SP != `0`)) != `0`)], {{0:`0.0`, 1:exp((MU(1) * ((ASC_CAR(0) + (B_TIME(0) * (CAR_TT / `100.0`))) + (B_COST(0) * (CAR_CO / `100.0`)))))}[((CAR_AV * (SP != `0`)) != `0`)]))))), 2:(((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST(0) * ((SM_CO * (GA == `0`)) / `100.0`))) + ((`0.0` * ((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST(0) * ((SM_CO * (GA == `0`)) / `100.0`)))) + (`0.0` * log(bioMultSum({{0:`0.0`, 1:exp((`1.0` * ((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST(0) * ((SM_C

In [17]:
results = biogeme.estimate(saveIterations=False)

In [18]:
results.getEstimatedParameters()

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.167198,0.037136,-4.502351,7e-06,0.054529,-3.066216,0.002167863
ASC_TRAIN,-0.511973,0.045178,-11.332242,0.0,0.079113,-6.47141,9.709256e-11
B_COST,-0.856664,0.046275,-18.51265,0.0,0.060039,-14.268469,0.0
B_TIME,-0.898596,0.056991,-15.767254,0.0,0.107116,-8.389037,0.0
MU,2.054249,0.117727,17.449238,0.0,0.164248,12.506998,0.0


In [19]:
results.getGeneralStatistics()

{'Number of estimated parameters': (5, ''),
 'Sample size': (6768, ''),
 'Excluded observations': (3960, ''),
 'Init log likelihood': (-6964.662979192372, '.7g'),
 'Final log likelihood': (-5236.90001542563, '.7g'),
 'Likelihood ratio test for the init. model': (3455.5259275334847, '.7g'),
 'Rho-square for the init. model': (0.24807560235557802, '.3g'),
 'Rho-square-bar for the init. model': (0.24735769252778905, '.3g'),
 'Akaike Information Criterion': (10483.80003085126, '.7g'),
 'Bayesian Information Criterion': (10517.89983535769, '.7g'),
 'Final gradient norm': (0.0290355708572915, '.4E'),
 'Nbr of threads': (4, '')}

In [20]:
results.getBetaValues()

{'ASC_CAR': -0.1671984945299977,
 'ASC_TRAIN': -0.5119729189945217,
 'B_COST': -0.8566640180954533,
 'B_TIME': -0.8985964324916943,
 'MU': 2.054249486211504}

In [21]:
biogeme.simulate(results.getBetaValues())

Unnamed: 0,loglike
0,-0.475055
1,-0.439247
2,-0.514670
3,-0.642297
4,-0.452943
...,...
8446,-1.599996
8447,-1.762829
8448,-1.935206
8449,-1.907229


### Biogeme loglikelihood

In [24]:
params = results.getBetaValues()

In [33]:
params

{'ASC_CAR': -0.1671984945299977,
 'ASC_TRAIN': -0.5119729189945217,
 'B_COST': -0.8566640180954533,
 'B_TIME': -0.8985964324916943,
 'MU': 2.054249486211504}

In [25]:
V1 = (params["ASC_TRAIN"] + params["B_TIME"] * TRAIN_TT_SCALED + params["B_COST"] * TRAIN_CO_SCALED)
V2 = (0 + params["B_TIME"] * SM_TT_SCALED +  params["B_COST"] * SM_CO_SCALED)
V3 = (params["ASC_CAR"] + params["B_TIME"] * CAR_TT_SCALED + params["B_COST"] * CAR_CO_SCALED)

In [34]:
df["V1"] = database.valuesFromDatabase(V1*params["MU"])
df["V2"] = database.valuesFromDatabase(V2)
df["V3"] = database.valuesFromDatabase(V3*params["MU"])
df["PRED"] = np.argmax(df[["V1","V2","V3"]].values, axis=1) + 1

In [35]:
df.columns

Index(['GROUP', 'SURVEY', 'SP', 'ID', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',
       'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST', 'TRAIN_AV',
       'CAR_AV', 'SM_AV', 'TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE', 'SM_TT', 'SM_CO',
       'SM_HE', 'SM_SEATS', 'CAR_TT', 'CAR_CO', 'CHOICE', 'V1', 'V2', 'V3',
       'PRED'],
      dtype='object')

In [36]:
cols = ['ID', 'PURPOSE', 'CHOICE', 'V1', 'V2', 'V3', 'PRED']

In [37]:
df[cols].head()

Unnamed: 0,ID,PURPOSE,CHOICE,V1,V2,V3,PRED
0,1,1,2,-3.963879,-1.011581,-3.64709,2
1,1,1,2,-3.797744,-0.958923,-3.981452,2
2,1,1,2,-4.296149,-1.098925,-3.418316,2
3,1,1,2,-3.65696,-1.011581,-2.587642,2
4,1,1,2,-4.084972,-0.925915,-3.483048,2


In [38]:
def accuracy(actual, pred):
    res = np.sum(np.array(actual) == np.array(pred)) / len(actual)
    return res

In [39]:
accuracy(df["CHOICE"], df["PRED"])

0.5029550827423168