# Demo of Biogeme using simulated data for 500 individuals

For the sake of comparison, this notebook uses Biogeme to perform maximum simulated likelihood estimation (MSLE) on the same data.

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import logging
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fix random seed for reproducibility
np.random.seed(42)

# Generate fake data

In [2]:
from core.dcm_fakedata import generate_fake_data_wide

num_resp = 500
num_menus = 5
num_alternatives = 5

true_alpha = np.array([-0.8, 0.8, 1.2])
true_beta = np.array([-0.8, 0.8, 1.0, -0.8, 1.5])
# dynamic version of generating Omega
corr = 0.8
scale_factor = 1.0
true_Omega = corr*np.ones((len(true_beta),len(true_beta))) # off-diagonal values of cov matrix
true_Omega[np.arange(len(true_beta)), np.arange(len(true_beta))] = 1.0 # diagonal values of cov matrix
true_Omega *= scale_factor

df = generate_fake_data_wide(num_resp, num_menus, num_alternatives, true_alpha, true_beta, true_Omega)
df.head()

Generating fake data...
Error: 45.16


Unnamed: 0,ALT1_XF1,ALT1_XF2,ALT1_XF3,ALT1_XR1,ALT1_XR2,ALT1_XR3,ALT1_XR4,ALT1_XR5,ALT2_XF1,ALT2_XF2,...,ALT5_XR1,ALT5_XR2,ALT5_XR3,ALT5_XR4,ALT5_XR5,choice,indID,menuID,obsID,ones
0,0.37454,0.950714,0.731994,0.502929,0.47468,0.301972,0.539872,0.248391,0.598658,0.156019,...,0.237133,0.323644,0.292095,0.797652,0.17123,2,0,0,0,1
1,0.183405,0.304242,0.524756,0.608472,0.051355,0.549528,0.063914,0.175877,0.431945,0.291229,...,0.981368,0.299731,0.278902,0.973245,0.728234,2,0,1,1,1
2,0.607545,0.170524,0.065052,0.585298,0.699323,0.709927,0.763336,0.067189,0.948886,0.965632,...,0.839992,0.718722,0.500256,0.273207,0.877216,3,0,2,2,1
3,0.662522,0.311711,0.520068,0.776655,0.837907,0.140789,0.314813,0.424885,0.54671,0.184854,...,0.296403,0.507946,0.230841,0.514504,0.683332,3,0,3,3,1
4,0.388677,0.271349,0.828738,0.085521,0.870117,0.593746,0.989603,0.025592,0.356753,0.280935,...,0.584885,0.684462,0.256079,0.832849,0.44969,0,0,4,4,1


# Mixed Logit specification

In [3]:
import biogeme.biogeme as bio
from biogeme.expressions import Beta, bioLinearUtility, DefineVariable, Plus, Times, bioDraws, PanelLikelihoodTrajectory, MonteCarlo, log
import biogeme.models as models
import biogeme.database as db
import biogeme.messaging as msg
import biogeme.optimization as opt
import biogeme.results as res

database = db.Database('choiceset', df)

# They are organized as panel data. The variable ID identifies each individual.
database.panel("indID")

globals().update(database.variables)

In [4]:
# Parameters to be estimated
B_XF1 = Beta('B_XF1', 0, None, None, 0)
B_XF2 = Beta('B_XF2', 0, None, None, 0)
B_XF3 = Beta('B_XF3', 0, None, None, 0)

B_XR1 = Beta('B_XR1', 0, None, None, 0)
B_XR1_S = Beta('B_XR1_S', 1, None, None, 0)
B_XR1_RND = B_XR1 + B_XR1_S * bioDraws('B_XR1_RND', 'NORMAL_ANTI')

B_XR2 = Beta('B_XR2', 0, None, None, 0)
B_XR2_S = Beta('B_XR2_S', 1, None, None, 0)
B_XR2_RND = B_XR2 + B_XR2_S * bioDraws('B_XR2_RND', 'NORMAL_ANTI')

B_XR3 = Beta('B_XR3', 0, None, None, 0)
B_XR3_S = Beta('B_XR3_S', 1, None, None, 0)
B_XR3_RND = B_XR3 + B_XR3_S * bioDraws('B_XR3_RND', 'NORMAL_ANTI')

B_XR4 = Beta('B_XR4', 0, None, None, 0)
B_XR4_S = Beta('B_XR4_S', 1, None, None, 0)
B_XR4_RND = B_XR4 + B_XR4_S * bioDraws('B_XR4_RND', 'NORMAL_ANTI')

B_XR5 = Beta('B_XR5', 0, None, None, 0)
B_XR5_S = Beta('B_XR5_S', 1, None, None, 0)
B_XR5_RND = B_XR5 + B_XR5_S * bioDraws('B_XR5_RND', 'NORMAL_ANTI')

In [5]:
# Definition of the utility functions
V1 = B_XF1*ALT1_XF1 + B_XF2*ALT1_XF2 + B_XF3*ALT1_XF3 + B_XR1_RND*ALT1_XR1 + B_XR2_RND*ALT1_XR2 + B_XR3_RND*ALT1_XR3 + B_XR4_RND*ALT1_XR4 + B_XR5_RND*ALT1_XR5
V2 = B_XF1*ALT2_XF1 + B_XF2*ALT2_XF2 + B_XF3*ALT2_XF3 + B_XR1_RND*ALT2_XR1 + B_XR2_RND*ALT2_XR2 + B_XR3_RND*ALT2_XR3 + B_XR4_RND*ALT2_XR4 + B_XR5_RND*ALT2_XR5
V3 = B_XF1*ALT3_XF1 + B_XF2*ALT3_XF2 + B_XF3*ALT3_XF3 + B_XR1_RND*ALT3_XR1 + B_XR2_RND*ALT3_XR2 + B_XR3_RND*ALT3_XR3 + B_XR4_RND*ALT3_XR4 + B_XR5_RND*ALT3_XR5
V4 = B_XF1*ALT4_XF1 + B_XF2*ALT4_XF2 + B_XF3*ALT4_XF3 + B_XR1_RND*ALT4_XR1 + B_XR2_RND*ALT4_XR2 + B_XR3_RND*ALT4_XR3 + B_XR4_RND*ALT4_XR4 + B_XR5_RND*ALT4_XR5
V5 = B_XF1*ALT5_XF1 + B_XF2*ALT5_XF2 + B_XF3*ALT5_XF3 + B_XR1_RND*ALT5_XR1 + B_XR2_RND*ALT5_XR2 + B_XR3_RND*ALT5_XR3 + B_XR4_RND*ALT5_XR4 + B_XR5_RND*ALT5_XR5

# Associate utility functions with the numbering of alternatives
V = {0: V1, 1: V2, 2: V3, 3: V4, 4: V5}

# Associate the availability conditions with the alternatives
av = {0: ones, 1: ones, 2: ones, 3: ones, 4: ones}

# Mixed Logit model in Biogeme (MSLE)

In [6]:
# Conditional to the random parameters, the likelihood of one observation is
# given by the logit model (called the kernel)
obsprob = models.logit(V, av, choice)

# Conditional to the random parameters, the likelihood of all observations for
# one individual (the trajectory) is the product of the likelihood of
# each observation.
condprobIndiv = PanelLikelihoodTrajectory(obsprob)

# We integrate over the random parameters using Monte-Carlo
logprob = log(MonteCarlo(condprobIndiv))

In [7]:
%%time

# Define level of verbosity
logger = msg.bioMessage()
# logger.setSilent()
# logger.setWarning()
# logger.setGeneral()
logger.setDetailed()
# logger.setDebug()

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob, numberOfDraws=1000)
biogeme.modelName = 'fakeData'

# Estimate the parameters.
results = biogeme.estimate()
pandasResults = results.getEstimatedParameters()
print(pandasResults)

[16:27:45] < General >   Remove 3 unused variables from the database as only 43 are used.
[16:27:45] < Detailed >  It is suggested to scale the following variables.
[16:27:45] < Detailed >  Multiply indID by	0.001 because the largest (abs) value is	499.0
[16:27:45] < Detailed >  To remove this feature, set the parameter suggestScales to False when creating the BIOGEME object.
[16:27:46] < General >   *** Initial values of the parameters are obtained from the file __fakeData.iter
[16:27:46] < Detailed >  Log likelihood (N = 500):  -3962.825
[16:27:46] < Detailed >  ** Optimization: Newton with trust region for simple bounds
[16:27:51] < General >   Log likelihood (N = 500):  -3962.825 Gradient norm:      3e+02 Hessian norm:       4e+02 
[16:27:52] < Detailed >  Log likelihood (N = 500):  -3567.112
[16:27:57] < General >   Log likelihood (N = 500):  -3567.112 Gradient norm:      4e+01 Hessian norm:       4e+02 
[16:27:57] < Detailed >  1 f=  7.134223 projected rel. grad.=0.0084 rel. chan

In [8]:
print(results)


Results for model fakeData
Output file (HTML):			fakeData.html
Nbr of parameters:		13
Sample size:			500
Observations:			2500
Excluded data:			0
Init log likelihood:		-3962.825
Final log likelihood:		-3558.071
Likelihood ratio test (init):		809.5078
Rho square (init):			0.102
Rho bar square (init):			0.0989
Akaike Information Criterion:	7142.143
Bayesian Information Criterion:	7196.933
Final gradient norm:		0.002551651
B_XF1          : -0.753[0.0872 -8.63 0][0.0875 -8.6 0]
B_XF2          : 0.566[0.0867 6.53 6.61e-11][0.0827 6.85 7.28e-12]
B_XF3          : 1.18[0.0888 13.3 0][0.0925 12.8 0]
B_XR1          : -0.869[0.0978 -8.89 0][0.0979 -8.88 0]
B_XR1_S        : 0.913[0.179 5.1 3.43e-07][0.182 5.02 5.14e-07]
B_XR2          : 0.899[0.0948 9.49 0][0.0952 9.44 0]
B_XR2_S        : 0.735[0.202 3.64 0.000272][0.2 3.68 0.000236]
B_XR3          : 1.09[0.0944 11.6 0][0.0935 11.7 0]
B_XR3_S        : 0.584[0.247 2.36 0.0182][0.253 2.31 0.0208]
B_XR4          : -0.729[0.0987 -7.39 1.48e-13][0.0993