In [1]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

In [2]:
# Read data
df = pd.read_csv("../data/swissmetro.dat",'\t')
df.shape

(10728, 28)

In [3]:
# Convert to biogeme database
database = db.Database('swissmetro',df)
type(database)

biogeme.database.Database

In [4]:
globals().update(database.variables)

In [8]:
# Remove some observations
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

In [33]:
database.data.shape

(6768, 28)

In [34]:
# Paramaters
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0, None, None, 1)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)

In [35]:
database.data.columns

Index(['GROUP', 'SURVEY', 'SP', 'ID', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',
       'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST', 'TRAIN_AV',
       'CAR_AV', 'SM_AV', 'TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE', 'SM_TT', 'SM_CO',
       'SM_HE', 'SM_SEATS', 'CAR_TT', 'CAR_CO', 'CHOICE'],
      dtype='object')

In [36]:
database.data[['TRAIN_TT', "TRAIN_CO", 'SM_TT', "SM_CO", "CAR_TT", "CAR_CO"]].describe()

Unnamed: 0,TRAIN_TT,TRAIN_CO,SM_TT,SM_CO,CAR_TT,CAR_CO
count,6768.0,6768.0,6768.0,6768.0,6768.0,6768.0
mean,166.077423,490.885195,84.507388,641.066489,123.154846,78.655881
std,69.795646,1062.593533,47.11314,1411.658237,91.718406,55.921803
min,35.0,9.0,12.0,11.0,0.0,0.0
25%,112.0,60.0,55.0,74.0,70.0,40.0
50%,159.0,94.0,77.0,112.0,120.0,76.0
75%,206.0,166.0,105.0,196.0,176.0,115.0
max,1022.0,5040.0,796.0,6720.0,1560.0,520.0


In [41]:
# Define new variables
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
TRAIN_TT_SCALED = TRAIN_TT / 100.0
TRAIN_CO_SCALED = TRAIN_COST / 100.0
SM_TT_SCALED = SM_TT / 100.0
SM_CO_SCALED = SM_COST / 100.0
CAR_TT_SCALED = CAR_TT / 100.0
CAR_CO_SCALED = CAR_CO / 100.0

In [42]:
# Define Utility functions
V1 = (ASC_TRAIN + B_TIME * TRAIN_TT_SCALED + \
                  B_COST * TRAIN_CO_SCALED)
V2 = (ASC_SM  + B_TIME * SM_TT_SCALED + \
                B_COST * SM_CO_SCALED)
V3 = (ASC_CAR + B_TIME * CAR_TT_SCALED + \
                B_COST * CAR_CO_SCALED)

In [44]:
# Availability and loglikelihood
av = {1:TRAIN_AV_SP, 2:SM_AV, 3: CAR_AV_SP}
V = {1: V1, 2: V2, 3: V3}
logprob = models.loglogit(V, av, CHOICE)

In [45]:
type(logprob)

biogeme.expressions._bioLogLogit

In [50]:
# Create biogeme object
biogeme = bio.BIOGEME(database, logprob)
type(biogeme)

biogeme.biogeme.BIOGEME

In [59]:
logprob.embedExpression

<bound method Expression.embedExpression of _bioLogLogit(1:((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST(0) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(0) + (B_TIME(0) * (CAR_TT / `100.0`))) + (B_COST(0) * (CAR_CO / `100.0`))))>

In [51]:
dir(biogeme)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_audit',
 '_generateDraws',
 '_prepareDatabaseForFormula',
 '_prepareLiterals',
 'algoParameters',
 'algorithm',
 'allDraws',
 'allFixedBetas',
 'allFreeBetas',
 'allRandomVariables',
 'bestIteration',
 'betaIds',
 'betaInitValues',
 'bootstrap_results',
 'bootstrap_time',
 'bounds',
 'calculateInitLikelihood',
 'calculateLikelihood',
 'calculateLikelihoodAndDerivatives',
 'cfsqp',
 'cfsqp_default_bounds',
 'changeInitValues',
 'checkDerivatives',
 'columnForBatchSamplingWeights',
 'confidenceIntervals',
 'createLogFile',
 'database',
 'drawNames',
 'drawsProcessingTime',
 'elementaryExpressionIndex',
 'estimate',
 'file

In [95]:
biogeme.loglike

_bioLogLogit(1:((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST(0) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(0) + (B_TIME(0) * (CAR_TT / `100.0`))) + (B_COST(0) * (CAR_CO / `100.0`))))

In [106]:
biogeme.loadSavedIteration()
biogeme.loglike

_bioLogLogit(1:((ASC_TRAIN(-0.7011473794405245) + (B_TIME(-1.2778854818542016) * (TRAIN_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(-1.2778854818542016) * (SM_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(-0.15460289050933534) + (B_TIME(-1.2778854818542016) * (CAR_TT / `100.0`))) + (B_COST(-1.0837675514845984) * (CAR_CO / `100.0`))))

In [103]:
results = biogeme.estimate(saveIterations=True)

In [104]:
results.getEstimatedParameters()

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.154603,0.043235,-3.57584,0.000349,0.058163,-2.658079,0.007859
ASC_TRAIN,-0.701147,0.054874,-12.777443,0.0,0.082562,-8.492375,0.0
B_COST,-1.083768,0.05183,-20.910063,0.0,0.068224,-15.885339,0.0
B_TIME,-1.277885,0.056883,-22.464979,0.0,0.104255,-12.257328,0.0


In [105]:
biogeme.loglike

_bioLogLogit(1:((ASC_TRAIN(0) + (B_TIME(0) * (TRAIN_TT / `100.0`))) + (B_COST(0) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(0) * (SM_TT / `100.0`))) + (B_COST(0) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(0) + (B_TIME(0) * (CAR_TT / `100.0`))) + (B_COST(0) * (CAR_CO / `100.0`))))

In [100]:
results.getGeneralStatistics()

{'Number of estimated parameters': (4, ''),
 'Sample size': (6768, ''),
 'Excluded observations': (3960, ''),
 'Init log likelihood': (-6964.662979192372, '.7g'),
 'Final log likelihood': (-5331.252007298093, '.7g'),
 'Likelihood ratio test for the init. model': (3266.821943788558, '.7g'),
 'Rho-square for the init. model': (0.23452835790823734, '.3g'),
 'Rho-square-bar for the init. model': (0.23395403004600612, '.3g'),
 'Akaike Information Criterion': (10670.504014596187, '.7g'),
 'Bayesian Information Criterion': (10697.78385820133, '.7g'),
 'Final gradient norm': (0.015306327717452512, '.4E'),
 'Nbr of threads': (4, '')}

In [110]:
results.getBetaValues()

{'ASC_CAR': -0.15460289050933534,
 'ASC_TRAIN': -0.7011473794405245,
 'B_COST': -1.0837675514845984,
 'B_TIME': -1.2778854818542016}

In [114]:
biogeme.simulate(results.getBetaValues())

Unnamed: 0,loglike
0,-0.500879
1,-0.452628
2,-0.547979
3,-0.743131
4,-0.476071
...,...
8446,-1.758083
8447,-1.805149
8448,-1.901937
8449,-2.005439


In [108]:
biogeme.loglike

_bioLogLogit(1:((ASC_TRAIN(-0.7011473794405245) + (B_TIME(-1.2778854818542016) * (TRAIN_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(-1.2778854818542016) * (SM_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(-0.15460289050933534) + (B_TIME(-1.2778854818542016) * (CAR_TT / `100.0`))) + (B_COST(-1.0837675514845984) * (CAR_CO / `100.0`))))

In [115]:
formulas = [v.getSignature() for v in biogeme.formulas.values()]

In [119]:
[v for v in biogeme.formulas.values()]

[_bioLogLogit(1:((ASC_TRAIN(-0.7011473794405245) + (B_TIME(-1.2778854818542016) * (TRAIN_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(-1.2778854818542016) * (SM_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(-0.15460289050933534) + (B_TIME(-1.2778854818542016) * (CAR_TT / `100.0`))) + (B_COST(-1.0837675514845984) * (CAR_CO / `100.0`))))]

In [116]:
formulas

[[b'<Variable>{140392791882576}"CHOICE",32,27',
  b'<Beta>{140392755617744}"ASC_TRAIN"[0],1,1',
  b'<Beta>{140392755616880}"B_TIME"[0],3,3',
  b'<Variable>{140392791882624}"TRAIN_TT",23,18',
  b'<Numeric>{140392755213552},100.0',
  b'<Divide>{140392755213504}(2),140392791882624,140392755213552',
  b'<Times>{140392755215712}(2),140392755616880,140392755213504',
  b'<Plus>{140392755215760}(2),140392755617744,140392755215712',
  b'<Beta>{140392755617552}"B_COST"[0],2,2',
  b'<Variable>{140392791881856}"TRAIN_CO",24,19',
  b'<Variable>{140392791881664}"GA",17,12',
  b'<Numeric>{140392755212640},0',
  b'<Equal>{140392755212880}(2),140392791881664,140392755212640',
  b'<Times>{140392755213216}(2),140392791881856,140392755212880',
  b'<Numeric>{140392755213936},100.0',
  b'<Divide>{140392755213648}(2),140392755213216,140392755213936',
  b'<Times>{140392755215808}(2),140392755617552,140392755213648',
  b'<Plus>{140392755214800}(2),140392755215760,140392755215808',
  b'<Beta>{140392791688144}"A

### Biogeme.loglikelyhood

In [120]:
import biogeme.loglikelihood as ll

In [121]:
loglike = ll.loglikelihood(logprob)

In [122]:
print(loglike)

log(_bioLogLogit(1:((ASC_TRAIN(-0.7011473794405245) + (B_TIME(-1.2778854818542016) * (TRAIN_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((TRAIN_CO * (GA == `0`)) / `100.0`))), 2:((ASC_SM(0) + (B_TIME(-1.2778854818542016) * (SM_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((SM_CO * (GA == `0`)) / `100.0`))), 3:((ASC_CAR(-0.15460289050933534) + (B_TIME(-1.2778854818542016) * (CAR_TT / `100.0`))) + (B_COST(-1.0837675514845984) * (CAR_CO / `100.0`)))))


In [124]:
V1

((ASC_TRAIN(-0.7011473794405245) + (B_TIME(-1.2778854818542016) * (TRAIN_TT / `100.0`))) + (B_COST(-1.0837675514845984) * ((TRAIN_CO * (GA == `0`)) / `100.0`)))

In [123]:
database.valuesFromDatabase(V1)

0      -2.652588
1      -2.537578
2      -2.882607
3      -2.450876
4      -2.752555
          ...   
8446   -2.222153
8447   -2.211316
8448   -2.254667
8449   -2.510244
8450   -2.222153
Length: 6768, dtype: float64

In [126]:
database.valuesFromDatabase(V2)

0      -1.368627
1      -1.297777
2      -1.484768
3      -1.368627
4      -1.260250
          ...   
8446   -0.823183
8447   -0.850682
8448   -0.812346
8449   -0.861520
8450   -0.904870
Length: 6768, dtype: float64

In [128]:
df["V1"] = database.valuesFromDatabase(V1)
df["V2"] = database.valuesFromDatabase(V2)
df["V3"] = database.valuesFromDatabase(V3)

In [129]:
df.columns

Index(['GROUP', 'SURVEY', 'SP', 'ID', 'PURPOSE', 'FIRST', 'TICKET', 'WHO',
       'LUGGAGE', 'AGE', 'MALE', 'INCOME', 'GA', 'ORIGIN', 'DEST', 'TRAIN_AV',
       'CAR_AV', 'SM_AV', 'TRAIN_TT', 'TRAIN_CO', 'TRAIN_HE', 'SM_TT', 'SM_CO',
       'SM_HE', 'SM_SEATS', 'CAR_TT', 'CAR_CO', 'CHOICE', 'V1', 'V2', 'V3'],
      dtype='object')

In [130]:
cols = ['ID', 'PURPOSE', 'CHOICE', 'V1', 'V2', 'V3']

In [131]:
df[cols]

Unnamed: 0,ID,PURPOSE,CHOICE,V1,V2,V3
0,1,1,2,-2.652588,-1.368627,-2.354178
1,1,1,2,-2.537578,-1.297777,-2.560094
2,1,1,2,-2.882607,-1.484768,-2.213288
3,1,1,2,-2.450876,-1.368627,-1.638240
4,1,1,2,-2.752555,-1.260250,-2.215065
...,...,...,...,...,...,...
8446,939,3,1,-2.222153,-0.823183,-2.509465
8447,939,3,1,-2.211316,-0.850682,-2.043925
8448,939,3,1,-2.254667,-0.812346,-1.870523
8449,939,3,1,-2.510244,-0.861520,-2.304030


In [145]:
import numpy as np
a = np.arange(6).reshape(2,3) + 10
a

array([[10, 11, 12],
       [13, 14, 15]])

In [150]:
np.argmax(a, axis=1)

array([2, 2])

In [140]:
len(np.argmax(df[["V1","V2","V3"]].values, axis=1))

6768

In [151]:
df["PRED"] = np.argmax(df[["V1","V2","V3"]].values, axis=1) + 1

In [152]:
df[cols + ["PRED"]]

Unnamed: 0,ID,PURPOSE,CHOICE,V1,V2,V3,PRED
0,1,1,2,-2.652588,-1.368627,-2.354178,2
1,1,1,2,-2.537578,-1.297777,-2.560094,2
2,1,1,2,-2.882607,-1.484768,-2.213288,2
3,1,1,2,-2.450876,-1.368627,-1.638240,2
4,1,1,2,-2.752555,-1.260250,-2.215065,2
...,...,...,...,...,...,...,...
8446,939,3,1,-2.222153,-0.823183,-2.509465,2
8447,939,3,1,-2.211316,-0.850682,-2.043925,2
8448,939,3,1,-2.254667,-0.812346,-1.870523,2
8449,939,3,1,-2.510244,-0.861520,-2.304030,2


In [153]:
df.PRED.value_counts()

2    4414
3    2354
Name: PRED, dtype: int64

In [154]:
df.CHOICE.value_counts()

2    4090
3    1770
1     908
Name: CHOICE, dtype: int64

In [162]:
def accuracy(actual, pred):
    res = np.sum(np.array(actual) == np.array(pred)) / len(actual)
    return res

In [164]:
actual = [1, 2 , 2]
pred = [1, 2, 3]
accuracy(actual, pred)

0.6666666666666666

In [165]:
accuracy(df["CHOICE"], df["PRED"])

0.5701832151300237