# Biogeme Basics: Probit Model

In [1]:
import pandas  as pd
import numpy as np
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.optimization as opt
from biogeme.expressions import Beta, DefineVariable, bioNormalCdf, log, Elem
import seaborn as sns
import matplotlib.pyplot as plt

**Import Swissmetro data**

In [2]:
# Read the tab-separated Swissmetro file into a DataFrame.
# NOTE: the variable is called `pandas`; the library itself is reachable as `pd`.
pandas = pd.read_csv(
    "../Data/swissmetro.dat",
    sep='\t',
)

# Hand the DataFrame to Biogeme under the database name "data/swissmetro".
database = db.Database("data/swissmetro", pandas)

**Use column names as variables**

In [3]:
# Copy every entry of `database.variables` into the notebook's global namespace,
# so that later cells can refer to names such as CHOICE, GA or TRAIN_TT directly.
globals().update(database.variables)

**Exclude some unwanted entries**

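Remove observations where Swissmetro was chosen (`CHOICE == 2`) or where the choice is recorded as `CHOICE == 0`, observations where one of the two remaining alternatives (train, car) is not available, and observations whose `PURPOSE` is neither 1 nor 3.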

In [4]:
# Availability of car and train; the product is 0 whenever SP == 0.
CAR_AV_SP = DefineVariable(
    'CAR_AV_SP',
    CAR_AV * (SP != 0),
    database,
)
TRAIN_AV_SP = DefineVariable(
    'TRAIN_AV_SP',
    TRAIN_AV * (SP != 0),
    database,
)

# An observation is removed as soon as at least one term of the sum is non-zero:
#   - train or car is unavailable,
#   - alternative 2 was chosen,
#   - PURPOSE is neither 1 nor 3,
#   - CHOICE is 0.
exclude = (
    (TRAIN_AV_SP == 0)
    + (CAR_AV_SP == 0)
    + (CHOICE == 2)
    + ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0))
) > 0
database.remove(exclude)

**Define cost variables (the cost is set to zero whenever `GA` is not 0)**

In [5]:
# The (GA == 0) factor keeps the cost when GA is 0 and turns it into 0 otherwise.
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)

**Rescale some data**

In [6]:
# Every time and cost attribute is divided by 100 and stored in the database
# under a *_SCALED name.

# Train
TRAIN_TT_SCALED = DefineVariable('TRAIN_TT_SCALED', TRAIN_TT / 100.0, database)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED', TRAIN_COST / 100, database)

# Swissmetro
SM_TT_SCALED = DefineVariable('SM_TT_SCALED', SM_TT / 100.0, database)
SM_COST_SCALED = DefineVariable('SM_COST_SCALED', SM_COST / 100, database)

# Car
CAR_TT_SCALED = DefineVariable('CAR_TT_SCALED', CAR_TT / 100, database)
CAR_CO_SCALED = DefineVariable('CAR_CO_SCALED', CAR_CO / 100, database)

**Create parameters to be estimated**

`Beta`
1. name of parameter
2. default value for the parameter
3. lower bound
4. upper bound
5. status flag: `0` means the parameter is estimated, `1` means it is held fixed at its default value

In [7]:
# Arguments: name, starting value, lower bound, upper bound, status.
# Status 0 marks a parameter that is estimated; status 1 keeps it at its starting value.

# Alternative-specific constants.
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
# ASC_TRAIN has status 1 and is not referenced by the utility functions of this notebook.
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 1)

# Coefficients shared by the train and car utilities.
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)

**Define the utility functions**

\begin{align}
V_1 & = \beta_{time}X_{Train_{TT}} + \beta_{cost}X_{Train_{cost}}\\
V_3 & = \beta_{Car} + \beta_{time}X_{Car_{TT}} + \beta_{cost}X_{Car_{cost}}\\
\end{align}

In [8]:
# Train utility: scaled travel time and scaled cost, no constant.
V1 = (
    B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
)

# Car utility: constant plus scaled travel time and scaled cost.
V3 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)

**Associate choice probabilities with alternatives**

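Create a python dictionary that maps each alternative (1 = train, 3 = car) to its choice probability. In this binary probit model, the probability of an alternative is the standard normal CDF of the difference between its utility and the utility of the other alternative:

\begin{align}
P(1) & = \Phi(V_1 - V_3)\\
P(3) & = \Phi(V_3 - V_1)
\end{align}

No availability dictionary is created here: observations where train or car is unavailable were already removed from the database.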

In [9]:
# Choice probability of each alternative, keyed by the value of CHOICE.
P = {
    1: bioNormalCdf(V1 - V3),  # train
    3: bioNormalCdf(V3 - V1),  # car
}

**Define the model**

In [10]:
# Pick the entry of P whose key equals CHOICE and take its logarithm;
# this is the expression handed to Biogeme as the log likelihood.
logprob = log(Elem(P, CHOICE))

**Define the Biogeme object**

* Give the database with all variables
* Give the log likelihood model

In [11]:
# Bundle the filtered database with the log likelihood expression.
biogeme = bio.BIOGEME(database, logprob)
# The model name is reused for the generated report (swissmetro_probit_basic.html).
biogeme.modelName = "swissmetro_probit_basic"

**Estimate the model**

1. A `.html` file can be generated with a report of the results and can be opened with a browser
2. A `.pickle` file can also be generated with a snapshot of the results. This file can then be used in other scripts

In [12]:
# Output options: write the HTML report, skip the pickle snapshot.
biogeme.generateHtml = True
biogeme.generatePickle = False

# Run the estimation.
results = biogeme.estimate()

# Report the generated file names (the pickle name is None when no pickle is written).
print(f"HTML file:    {results.data.htmlFileName}")
print(f"Pickle file:  {results.data.pickleFileName }")

HTML file:    swissmetro_probit_basic.html
Pickle file:  None


**Print results**

In [13]:
# Estimated parameter values, printed one per line with three significant digits.
betas = results.getBetaValues()
for name, value in betas.items():
    print(f"{name:10}=\t{value:.3g}")

ASC_CAR   =	0.691
B_COST    =	-0.812
B_TIME    =	-0.297


**Get the general statistics**

In [14]:
# General statistics of the estimation; only the first element of each entry is printed.
gs = results.getGeneralStatistics()
for stat_name, stat in gs.items():
    print("{}= {}".format(stat_name.ljust(45), stat[0]))

Number of estimated parameters               = 3
Sample size                                  = 2232
Excluded observations                        = 8496
Init log likelihood                          = -1547.104507009801
Final log likelihood                         = -986.1887862987081
Likelihood ratio test for the init. model    = 1121.8314414221857
Rho-square for the init. model               = 0.3625583909617164
Rho-square-bar for the init. model           = 0.3606192847239623
Akaike Information Criterion                 = 1978.3775725974162
Bayesian Information Criterion               = 1995.50953256792
Final gradient norm                          = 0.007383834715391203
Nbr of threads                               = 8
