In [283]:
import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

In [284]:
# Read data
df = pd.read_csv("../data/swissmetro.dat",'\t')
df.shape

(10728, 28)

In [285]:
df.head()

Unnamed: 0,GROUP,SURVEY,SP,ID,PURPOSE,FIRST,TICKET,WHO,LUGGAGE,AGE,...,TRAIN_TT,TRAIN_CO,TRAIN_HE,SM_TT,SM_CO,SM_HE,SM_SEATS,CAR_TT,CAR_CO,CHOICE
0,2,0,1,1,1,0,1,1,0,3,...,112,48,120,63,52,20,0,117,65,2
1,2,0,1,1,1,0,1,1,0,3,...,103,48,30,60,49,10,0,117,84,2
2,2,0,1,1,1,0,1,1,0,3,...,130,48,60,67,58,30,0,117,52,2
3,2,0,1,1,1,0,1,1,0,3,...,103,40,30,63,52,20,0,72,52,2
4,2,0,1,1,1,0,1,1,0,3,...,130,36,60,63,42,20,0,90,84,2


In [286]:
(df.AGE > 65).value_counts()

False    10728
Name: AGE, dtype: int64

In [287]:
# Convert to biogeme database
database = db.Database('swissmetro',df)
type(database)

biogeme.database.Database

In [288]:
globals().update(database.variables)

In [289]:
# Remove some observations
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

In [290]:
database.data.shape

(6768, 28)

In [291]:
# Paramaters
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0, None, None, 1)
B_TIME = Beta('B_TIME', 0, None, None, 0)
# B_COST = Beta('B_COST', 0, None, None, 0)
B_COST_CAR = Beta('B_COST_CAR', 0, None, None, 0)
B_COST_TRAIN = Beta('B_COST_TRAIN', 0, None, None, 0)
B_COST_SM = Beta('B_COST_SM', 0, None, None, 0)

In [292]:
database.data.columns

Index(['GROUP', 'SURVEY', 'SP', 'ID', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',
       'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST', 'TRAIN_AV',
       'CAR_AV', 'SM_AV', 'TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE', 'SM_TT', 'SM_CO',
       'SM_HE', 'SM_SEATS', 'CAR_TT', 'CAR_CO', 'CHOICE'],
      dtype='object')

In [293]:
database.data[['TRAIN_TT', "TRAIN_CO", 'SM_TT', "SM_CO", "CAR_TT", "CAR_CO"]].describe()

Unnamed: 0,TRAIN_TT,TRAIN_CO,SM_TT,SM_CO,CAR_TT,CAR_CO
count,6768.0,6768.0,6768.0,6768.0,6768.0,6768.0
mean,166.077423,490.885195,84.507388,641.066489,123.154846,78.655881
std,69.795646,1062.593533,47.11314,1411.658237,91.718406,55.921803
min,35.0,9.0,12.0,11.0,0.0,0.0
25%,112.0,60.0,55.0,74.0,70.0,40.0
50%,159.0,94.0,77.0,112.0,120.0,76.0
75%,206.0,166.0,105.0,196.0,176.0,115.0
max,1022.0,5040.0,796.0,6720.0,1560.0,520.0


In [294]:
# Define new variables
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
TRAIN_TT_SCALED = TRAIN_TT / 100.0
TRAIN_CO_SCALED = TRAIN_COST / 100.0
SM_TT_SCALED = SM_TT / 100.0
SM_CO_SCALED = SM_COST / 100.0
CAR_TT_SCALED = CAR_TT / 100.0
CAR_CO_SCALED = CAR_CO / 100.0

In [295]:
# Define Utility functions
# V1 = (ASC_TRAIN + B_TIME * TRAIN_TT_SCALED + B_COST * TRAIN_CO_SCALED)
# V2 = (ASC_SM  + B_TIME * SM_TT_SCALED + B_COST * SM_CO_SCALED)
# V3 = (ASC_CAR + B_TIME * CAR_TT_SCALED + B_COST * CAR_CO_SCALED)
V1 = (ASC_TRAIN + B_TIME * TRAIN_TT_SCALED + B_COST_TRAIN * TRAIN_CO_SCALED)
V2 = (ASC_SM  + B_TIME * SM_TT_SCALED + B_COST_SM * SM_CO_SCALED)
V3 = (ASC_CAR + B_TIME * CAR_TT_SCALED + B_COST_CAR * CAR_CO_SCALED)

In [296]:
# Availability and loglikelihood
av = {1:TRAIN_AV_SP, 2:SM_AV, 3: CAR_AV_SP}
V = {1: V1, 2: V2, 3: V3}
logprob = models.loglogit(V, av, CHOICE)

In [297]:
logprob

_bioLogLogit(1:((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST_TRAIN(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST_SM(0) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(0) + (B_TIME(0) * (CAR_TT / `100.0`))) + (B_COST_CAR(0) * (CAR_CO / `100.0`))))

In [298]:
# Create biogeme object
mlogit = bio.BIOGEME(database, logprob)
mlogit.modelName = "01_logit_sm"
type(mlogit)

biogeme.biogeme.BIOGEME

In [299]:
logprob

_bioLogLogit(1:((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST_TRAIN(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST_SM(0) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(0) + (B_TIME(0) * (CAR_TT / `100.0`))) + (B_COST_CAR(0) * (CAR_CO / `100.0`))))

In [300]:
results = biogeme.estimate(saveIterations=True)

In [301]:
results.getEstimatedParameters()

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.427275,0.076908,-5.555681,2.765322e-08,0.077052,-5.545292,2.934636e-08
ASC_TRAIN,0.189393,0.065512,2.890947,0.003840825,0.09214,2.055491,0.03983165
B_COST_CAR,-0.938943,0.090345,-10.392857,0.0,0.11585,-8.104806,4.440892e-16
B_COST_SM,-1.090544,0.052632,-20.720204,0.0,0.070374,-15.496406,0.0
B_COST_TRAIN,-2.929038,0.1122,-26.105571,0.0,0.168586,-17.374129,0.0
B_TIME,-1.116365,0.060939,-18.319493,0.0,0.120079,-9.296889,0.0


In [302]:
results.getGeneralStatistics()

{'Number of estimated parameters': (6, ''),
 'Sample size': (6768, ''),
 'Excluded observations': (3960, ''),
 'Init log likelihood': (-6964.662979192372, '.7g'),
 'Final log likelihood': (-5083.4999380817335, '.7g'),
 'Likelihood ratio test for the init. model': (3762.326082221278, '.7g'),
 'Rho-square for the init. model': (0.2701010869773314, '.3g'),
 'Rho-square-bar for the init. model': (0.26923959518398466, '.3g'),
 'Akaike Information Criterion': (10178.999876163467, '.7g'),
 'Bayesian Information Criterion': (10219.919641571183, '.7g'),
 'Final gradient norm': (0.0162235191236585, '.4E'),
 'Nbr of threads': (4, '')}

In [303]:
results.getBetaValues()

{'ASC_CAR': -0.4272747701232511,
 'ASC_TRAIN': 0.18939275267183409,
 'B_COST_CAR': -0.938942608489664,
 'B_COST_SM': -1.0905441342085622,
 'B_COST_TRAIN': -2.929038422579854,
 'B_TIME': -1.1163651683467066}

In [304]:
logprob

_bioLogLogit(1:((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST_TRAIN(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST_SM(0) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(0) + (B_TIME(0) * (CAR_TT / `100.0`))) + (B_COST_CAR(0) * (CAR_CO / `100.0`))))

In [305]:
mlogit.loadSavedIteration()
mlogit.loglike

_bioLogLogit(1:((ASC_TRAIN(0.18939275267183409) + (B_TIME(-1.1163651683467066) * (TRAIN_TT / `100.0`))) + (B_COST_TRAIN(-2.929038422579854) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(-1.1163651683467066) * (SM_TT / `100.0`))) + (B_COST_SM(-1.0905441342085622) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(-0.4272747701232511) + (B_TIME(-1.1163651683467066) * (CAR_TT / `100.0`))) + (B_COST_CAR(-0.938942608489664) * (CAR_CO / `100.0`))))

In [306]:
logprob

_bioLogLogit(1:((ASC_TRAIN(0.18939275267183409) + (B_TIME(-1.1163651683467066) * (TRAIN_TT / `100.0`))) + (B_COST_TRAIN(-2.929038422579854) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(-1.1163651683467066) * (SM_TT / `100.0`))) + (B_COST_SM(-1.0905441342085622) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(-0.4272747701232511) + (B_TIME(-1.1163651683467066) * (CAR_TT / `100.0`))) + (B_COST_CAR(-0.938942608489664) * (CAR_CO / `100.0`))))

In [307]:
# biogeme.simulate(results.getBetaValues())

### Biogeme loglikelihood

In [308]:
import biogeme.loglikelihood as ll

In [309]:
biogeme
loglike = ll.loglikelihood(logprob)

In [310]:
print(loglike)

log(_bioLogLogit(1:((ASC_TRAIN(0.18939275267183409) + (B_TIME(-1.1163651683467066) * (TRAIN_TT / `100.0`))) + (B_COST_TRAIN(-2.929038422579854) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(-1.1163651683467066) * (SM_TT / `100.0`))) + (B_COST_SM(-1.0905441342085622) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(-0.4272747701232511) + (B_TIME(-1.1163651683467066) * (CAR_TT / `100.0`))) + (B_COST_CAR(-0.938942608489664) * (CAR_CO / `100.0`)))))


In [311]:
V1

((ASC_TRAIN(0.18939275267183409) + (B_TIME(-1.1163651683467066) * (TRAIN_TT / `100.0`))) + (B_COST_TRAIN(-2.929038422579854) * ((TRAIN_CO * (GA == `0`)) / `100.0`)))

In [312]:
def add_utilities(db, V):
    df = db.data
    for idx, v in V.items():
        df[f'V_{idx}'] = db.valuesFromDatabase(v)
    Vs = [f'V_{idx}' for idx in V.keys()]
    df["PRED"] = np.argmax(df[Vs].values, axis=1) + 1
    return df

In [313]:
test = add_utilities(database, V)

In [314]:
test

Unnamed: 0,GROUP,SURVEY,SP,ID,PURPOSE,FIRST,TICKET,WHO,LUGGAGE,AGE,...,SM_CO,SM_HE,SM_SEATS,CAR_TT,CAR_CO,CHOICE,V_1,V_2,V_3,PRED
0,2,0,1,1,1,0,1,1,0,3,...,52,20,0,117,65,2,-2.466875,-1.270393,-2.343735,2
1,2,0,1,1,1,0,1,1,0,3,...,49,10,0,117,84,2,-2.366402,-1.204186,-2.522134,2
2,2,0,1,1,1,0,1,1,0,3,...,58,30,0,117,52,2,-2.667820,-1.380480,-2.221672,2
3,2,0,1,1,1,0,1,1,0,3,...,52,20,0,72,52,2,-2.132079,-1.270393,-1.719308,2
4,2,0,1,1,1,0,1,1,0,3,...,42,20,0,90,84,2,-2.316336,-1.161339,-2.220715,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8446,3,1,1,939,3,1,7,3,1,5,...,17,30,0,130,64,1,-1.397057,-0.743575,-2.479473,2
8447,3,1,1,939,3,1,7,3,1,5,...,16,10,0,80,80,1,-1.367766,-0.766161,-2.071521,2
8448,3,1,1,939,3,1,7,3,1,5,...,16,20,0,80,64,1,-1.484928,-0.732670,-1.921290,2
8449,3,1,1,939,3,1,7,3,1,5,...,17,30,0,80,104,1,-1.708201,-0.777066,-2.296867,2


In [315]:
# df["V1"] = database.valuesFromDatabase(V1)
# df["V2"] = database.valuesFromDatabase(V2)
# df["V3"] = database.valuesFromDatabase(V3)
# df["PRED"] = np.argmax(df[["V1","V2","V3"]].values, axis=1) + 1

In [273]:
# V1.getValue_c(database)

In [274]:
cols = ['ID', 'PURPOSE', 'CHOICE', 'V1', 'V2', 'V3', 'PRED']

In [275]:
df[cols].head()

Unnamed: 0,ID,PURPOSE,CHOICE,V1,V2,V3,PRED
0,1,1,2,-2.466875,-1.270393,-2.343735,2
1,1,1,2,-2.366402,-1.204186,-2.522134,2
2,1,1,2,-2.66782,-1.38048,-2.221672,2
3,1,1,2,-2.132079,-1.270393,-1.719308,2
4,1,1,2,-2.316336,-1.161339,-2.220715,2


In [276]:
def accuracy(actual, pred):
    res = np.sum(np.array(actual) == np.array(pred)) / len(actual)
    return res

In [277]:
accuracy(df["CHOICE"], df["PRED"])

0.5709219858156028

## TEST ANOTHER CASE

In [46]:
# Define Utility functions
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 1)
ASC_SM = Beta('ASC_SM', 0, None, None, 0)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)
B_HE = Beta('B_HE', 0, None, None, 0)

In [48]:
v_car = ASC_CAR + B_TIME * CAR_TT_SCALED + B_COST * CAR_CO_SCALED
v_trn = ASC_TRAIN + B_TIME * TRAIN_TT_SCALED + B_COST * TRAIN_CO_SCALED + B_HE * TRAIN_HE
v_sm  = ASC_SM + B_TIME * SM_TT_SCALED + B_COST * SM_CO_SCALED + B_HE * SM_HE

av = {1:TRAIN_AV_SP, 2:SM_AV, 3: CAR_AV_SP}
v = {1: v_trn, 2: v_sm, 3: v_car}
logprob = models.logit(v, av, CHOICE)

In [49]:
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = "01_logit_mit"

In [50]:
results = biogeme.estimate()

In [51]:
results

<biogeme.results.bioResults at 0x7f10e2320a30>

In [57]:
results.getEstimatedParameters()

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,17.89044,9.089397,1.968276,0.049036,3.849171,4.647868,3e-06
ASC_SM,40.80182,15.012395,2.717875,0.00657,8.669971,4.706108,3e-06
B_COST,-163.66466,60.312231,-2.713623,0.006655,41.420026,-3.951341,7.8e-05
B_HE,-1.111451,0.423329,-2.625502,0.008652,0.313539,-3.544856,0.000393
B_TIME,-188.087538,69.221641,-2.717178,0.006584,47.60176,-3.951273,7.8e-05


### OPTION 2

In [78]:
# Read data
df = pd.read_csv("../data/swissmetro.dat",'\t')
df.shape

(10728, 28)

In [79]:
df.head()

Unnamed: 0,GROUP,SURVEY,SP,ID,PURPOSE,FIRST,TICKET,WHO,LUGGAGE,AGE,...,TRAIN_TT,TRAIN_CO,TRAIN_HE,SM_TT,SM_CO,SM_HE,SM_SEATS,CAR_TT,CAR_CO,CHOICE
0,2,0,1,1,1,0,1,1,0,3,...,112,48,120,63,52,20,0,117,65,2
1,2,0,1,1,1,0,1,1,0,3,...,103,48,30,60,49,10,0,117,84,2
2,2,0,1,1,1,0,1,1,0,3,...,130,48,60,67,58,30,0,117,52,2
3,2,0,1,1,1,0,1,1,0,3,...,103,40,30,63,52,20,0,72,52,2
4,2,0,1,1,1,0,1,1,0,3,...,130,36,60,63,42,20,0,90,84,2


In [80]:
df["SENIOR"] = df.AGE > 65

In [81]:
df["SENIOR"] = df["SENIOR"].astype(int)

In [82]:
# Convert to biogeme database
database = db.Database('swissmetro',df)

In [83]:
# Remove some observations
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

In [84]:
# Define Utility functions
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 1)
ASC_SM = Beta('ASC_SM', 0, None, None, 0)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST_CAR = Beta('B_COST_CAR', 0, None, None, 0)
B_COST_TRAIN = Beta('B_COST_TRAIN', 0, None, None, 0)
B_COST_SM = Beta('B_COST_SM', 0, None, None, 0)
B_HE = Beta('B_HE', 0, None, None, 0)
B_GA = Beta('B_GA', 0, None, None, 0)

v_car = ASC_CAR + B_TIME * CAR_TT + B_COST_CAR * CAR_CO
v_trn = ASC_TRAIN + B_TIME * TRAIN_TT + B_COST_TRAIN * TRAIN_CO + B_HE * TRAIN_HE + B_GA * GA
v_sm  = ASC_SM + B_TIME * SM_TT + B_COST_SM * SM_CO + B_HE * SM_HE  + B_GA * GA

av = {1:TRAIN_AV_SP, 2:SM_AV, 3: CAR_AV_SP}
v = {1: v_trn, 2: v_sm, 3: v_car}
logprob = models.logit(v, av, CHOICE)

In [85]:
biogeme = bio.BIOGEME(database, logprob)

In [86]:
results = biogeme.estimate()

  x = self.project(x + delta_t * d)


In [87]:
results.getEstimatedParameters()

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-43.811665,24.12671,-1.815899,0.069386,5.781173,-7.578335,3.508305e-14
ASC_SM,32.005619,21.96239,1.457292,0.145036,6.569427,4.871904,1.105276e-06
B_COST_CAR,-17.334401,11.14207,-1.555761,0.119765,3.020679,-5.738578,9.54748e-09
B_COST_SM,-17.710067,11.35941,-1.559066,0.118981,3.059432,-5.788677,7.094284e-09
B_COST_TRAIN,-22.6325,14.51853,-1.55887,0.119027,3.910867,-5.78708,7.162031e-09
B_GA,0.353409,1.506707e-08,23455690.0,0.0,3.865332e-09,91430340.0,0.0
B_HE,-6.846326,4.359428,-1.570464,0.116307,1.1493,-5.956955,2.56981e-09
B_TIME,-22.739649,14.56489,-1.561264,0.118461,3.900223,-5.830345,5.531274e-09
