# How to use cross validation

In [1]:
import pandas  as pd
import numpy as np
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.optimization as opt
import biogeme.results as res
from biogeme.expressions import Beta, DefineVariable
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

**Import Swissmetro data**

In [2]:
pandas = pd.read_csv("../Data/swissmetro.dat",sep='\t')

**Divide data into train and test set**

In [3]:
df_train, df_test = train_test_split(pandas, test_size=0.4)

**Set biogeme train and test database**

In [4]:
database_train = db.Database("data/swissmetro", df_train)
database_test = db.Database("data/swissmetro", df_test)

**Use collumn names as variables**

In [5]:
globals().update(database_train.variables)

**Exclude some unwanted entries**

In [6]:
exclude = (( PURPOSE != 1 ) * ( PURPOSE != 3 ) + ( CHOICE == 0 )) > 0

database_train.remove(exclude)
database_test.remove(exclude)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[column] = self.data.apply(functionToApply, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


**Define some dummy variables**

In [7]:
SM_COST = SM_CO * ( GA == 0 )
TRAIN_COST = TRAIN_CO * ( GA == 0 )

CAR_AV_SP = DefineVariable ('CAR_AV_SP', CAR_AV * ( SP !=0 ), database_train)
TRAIN_AV_SP = DefineVariable ('TRAIN_AV_SP', TRAIN_AV * ( SP != 0 ), database_train)

CAR_AV_SP = DefineVariable ('CAR_AV_SP', CAR_AV * ( SP !=0 ), database_test)
TRAIN_AV_SP = DefineVariable ('TRAIN_AV_SP', TRAIN_AV * ( SP != 0 ), database_test)

**Rescale some data**

In [8]:
TRAIN_TT_SCALED   = DefineVariable('TRAIN_TT_SCALED',   TRAIN_TT / 100.0, database_train)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED', TRAIN_COST / 100, database_train)
SM_TT_SCALED      = DefineVariable('SM_TT_SCALED',      SM_TT / 100.0   , database_train)
SM_COST_SCALED    = DefineVariable('SM_COST_SCALED',    SM_COST / 100   , database_train)
CAR_TT_SCALED     = DefineVariable('CAR_TT_SCALED',     CAR_TT / 100    , database_train)
CAR_CO_SCALED     = DefineVariable('CAR_CO_SCALED',     CAR_CO / 100    , database_train)

TRAIN_TT_SCALED   = DefineVariable('TRAIN_TT_SCALED',   TRAIN_TT / 100.0, database_test)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED', TRAIN_COST / 100, database_test)
SM_TT_SCALED      = DefineVariable('SM_TT_SCALED',      SM_TT / 100.0   , database_test)
SM_COST_SCALED    = DefineVariable('SM_COST_SCALED',    SM_COST / 100   , database_test)
CAR_TT_SCALED     = DefineVariable('CAR_TT_SCALED',     CAR_TT / 100    , database_test)
CAR_CO_SCALED     = DefineVariable('CAR_CO_SCALED',     CAR_CO / 100    , database_test)

**Create parameters to be estimated**

In [9]:
ASC_CAR = Beta('ASC_CAR',0,None ,None ,0)
ASC_TRAIN = Beta('ASC_TRAIN',0,None ,None ,0)
ASC_SM = Beta('ASC_SM',0,None ,None ,1)
B_TIME = Beta('B_TIME',0,None ,None ,0)
B_COST = Beta('B_COST',0,None ,None ,0)

**Define the utility functions**

In [10]:
V1 = ASC_TRAIN + \
     B_TIME * TRAIN_TT_SCALED + \
     B_COST * TRAIN_COST_SCALED
V2 = ASC_SM + \
     B_TIME * SM_TT_SCALED + \
     B_COST * SM_COST_SCALED
V3 = ASC_CAR + \
     B_TIME * CAR_TT_SCALED + \
     B_COST * CAR_CO_SCALED

**Associate utility functions with alternatives and associate availability of alternatives**

In [11]:
V = {1: V1,
     2: V2,
     3: V3}

av = {1: TRAIN_AV_SP,
      2: SM_AV,
      3: CAR_AV_SP}

**Define the model**

In [12]:
logprob = models.loglogit(V, av, CHOICE)

**Define the Biogeme object**

In [13]:
biogeme  = bio.BIOGEME(database_train, logprob)

biogeme.modelName = "swissmetro_logit_train"

**Train the model using the training set**

In [14]:
biogeme.generateHtml = True
biogeme.generatePickle = True

results = biogeme.estimate()

print(f"HTML file:    {results.data.htmlFileName}")
print(f"Pickle file:  {results.data.pickleFileName }")

HTML file:    swissmetro_logit_train.html
Pickle file:  swissmetro_logit_train.pickle


**Let's look at results and compare with the model using all data to estimate the model**

In [15]:
betas = results.getBetaValues()
for k,v in betas.items():
    print(f"{k:10}=\t{v:.3g}")

ASC_CAR   =	-0.128
ASC_TRAIN =	-0.67
B_COST    =	-1.06
B_TIME    =	-1.32


**Coefficients using all data for estimating the model**

ASC_CAR   =	-0.155

ASC_TRAIN =	-0.701

B_COST    =	-1.08

B_TIME    =	-1.28

In [16]:
gs = results.getGeneralStatistics()

for k,v in gs.items():
    print("{}= {}".format(k.ljust(45),v[0]))

Number of estimated parameters               = 4
Sample size                                  = 4048
Excluded observations                        = 2388
Init log likelihood                          = -4174.7099918798185
Final log likelihood                         = -3190.69200341776
Likelihood ratio test for the init. model    = 1968.0359769241168
Rho-square for the init. model               = 0.23570930444894633
Rho-square-bar for the init. model           = 0.23475115406058877
Akaike Information Criterion                 = 6389.38400683552
Bayesian Information Criterion               = 6414.60791967939
Final gradient norm                          = 0.0047625546232616615
Nbr of threads                               = 8


**Model results all data for estimating the model**

Sample size                                  = 6768

Init log likelihood                          = -6964.662979192295

Final log likelihood                         = -5331.252007298093

Likelihood ratio test for the init. model    = 3266.8219437884036

Rho-square for the init. model               = 0.2345283579082288

Rho-square-bar for the init. model           = 0.23395403004599769

In [17]:
prob_train = models.logit(V, av, 1)
prob_SM = models.logit(V, av, 2)
prob_car = models.logit(V, av, 3)

In [18]:
simulate ={'Prob. SM':  prob_SM ,
           'Prob. train':  prob_train ,
           'Prob. car': prob_car ,}

In [19]:
biogeme = bio.BIOGEME(database_test, simulate)
biogeme.modelName = "swissmetro_logit_test"

In [20]:
betas = biogeme.freeBetaNames

print('Extracting the following variables:')
for k in betas:
    print('\t',k)

results = res.bioResults(pickleFile='swissmetro_logit_train.pickle')
betaValues = results.getBetaValues ()

Extracting the following variables:
	 ASC_CAR
	 ASC_TRAIN
	 B_COST
	 B_TIME


In [21]:
simulatedValues = biogeme.simulate(betaValues)
print(simulatedValues.head())

      Prob. SM  Prob. train  Prob. car
309   0.777874     0.222126   0.000000
406   0.775576     0.224424   0.000000
6623  0.754613     0.081452   0.163935
304   0.816346     0.183654   0.000000
2832  0.874595     0.125405   0.000000


In [22]:
prob_max = simulatedValues.idxmax(axis=1)
prob_max = prob_max.replace({'Prob. train': 1, 'Prob. SM': 2, 'Prob. car': 3})

In [23]:
data = {'y_Actual':    df_test['CHOICE'],
        'y_Predicted': prob_max
        }

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])

confusion_matrix

Predicted,1,2,3
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,343,24
2,0,1529,126
3,0,374,322


In [24]:
accuracy = np.diagonal(confusion_matrix.to_numpy()).sum()/confusion_matrix.to_numpy().sum()
print('Global accuracy of the model:', accuracy)

Global accuracy of the model: 0.68125
