In [2]:
# Import necessary packages
import pandas as pd
import csv

# Load data and remove Other students.
# .copy() makes the filtered frame an independent object, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df = pd.read_csv('../../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df = df[df['Level'] != 'Other'].copy()

# Convert school column to numeric: every school is replaced by its position
# in school_codes (order of first appearance in the data). A dict lookup
# replaces the per-row list.index() scan.
school_codes = df.School.unique().tolist()
school_positions = {code: position for position, code in enumerate(school_codes)}
df['School'] = df['School'].map(school_positions)

# Enrollment columns based on student's level: take UG.<code> for rows whose
# Level is 'UG' and Grad.<code> for every other row. Series.where does this
# column-wise instead of calling a Python lambda once per row per school.
is_undergrad = df['Level'] == 'UG'
for code in school_codes:
    df['Enrol.' + code] = df['UG.' + code].where(is_undergrad, df['Grad.' + code])

num_rows = df.shape[0]

In [3]:
# Function for confusion matrices
def get_cm(probs, hardmax, actual=None, codes=None):
    """Build a confusion matrix of actual school (rows) vs. predicted school (columns).

    Args:
        probs: DataFrame holding one 'Prob.<code>' column per school code,
            indexed consistently with ``actual``.
        hardmax: If truthy, each observation is counted once, in the column
            whose probability is largest. Otherwise the probabilities
            themselves are summed per column.
        actual: Series of integer school indices (positions in ``codes``) for
            each observation. Defaults to the notebook-level ``df['School']``.
        codes: List of school codes, in index order. Defaults to the
            notebook-level ``school_codes``.

    Returns:
        A list of rows, one per school in ``codes``. With ``hardmax`` the
        entries are counts; otherwise they are summed probabilities.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    if codes is None:
        codes = school_codes
    if actual is None:
        actual = df['School']

    prob_cols = ['Prob.' + code for code in codes]
    cm = []
    for school in range(len(codes)):
        # Probabilities of the observations whose actual choice is this school.
        school_probs = probs[actual == school][prob_cols]
        if hardmax:
            # Compute the winning column once per school, not once per column.
            predicted = school_probs.idxmax(axis=1)
            cm.append([(predicted == col).sum() for col in prob_cols])
        else:
            cm.append(school_probs.sum().values.tolist())
    return cm

# Function for accuracy from confusion matrix
def get_accuracy(cm):
    """Return the percentage of the confusion-matrix total that lies on the diagonal.

    Args:
        cm: Square confusion matrix given as a sequence of rows (lists, tuples
            or arrays), where cm[i][j] is the count or probability mass for
            row i and column j.

    Returns:
        100 * (sum of diagonal entries) / (sum of all entries), or NaN when
        the matrix is empty or its entries sum to zero.
    """
    correct = sum(cm[i][i] for i in range(len(cm)))
    # Sum the entries in flat row-major order. This keeps the same summation
    # order as flattening with sum(cm, []) but avoids the quadratic list
    # concatenation and also accepts rows that are not plain lists.
    total = sum(value for row in cm for value in row)
    if total == 0:
        # Accuracy is undefined without any observations.
        return float('nan')
    return correct / total * 100

In [9]:
# Import Biogeme modules
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

# Prepare dummy columns: a school keeps its "closest" flag only when its
# Dist.<code> value is also at most 2; the result is stored as 0/1.
# NOTE(review): the unit of the cutoff (2) is not stated here - confirm.
for code in school_codes:
    closest_flag = df['Closest.' + code]
    within_cutoff = df['Dist.' + code] <= 2
    df['Closest.' + code] = (closest_flag & within_cutoff) * 1
# Only the numeric columns of the frame are handed to Biogeme.
database = db.Database("SMTO", df.select_dtypes(include = 'number'))

# Specify model: four free parameters, all starting at 0 with no bounds.
B_CLOSEST = Beta('B_CLOSEST', 0, None, None, 0)
B_DIST = Beta('B_DIST', 0, None, None, 0)
B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)
B_ENROL = Beta('B_ENROL', 0, None, None, 0)

# One utility function per school, keyed by the school's integer index;
# every alternative is marked as available.
V, av = {}, {}
for i, code in enumerate(school_codes):
    total_term = B_ENROL * database.variables["Total." + code]
    closest_term = B_CLOSEST * database.variables["Closest." + code]
    # Enrol.<code> enters the utility directly, with no estimated coefficient.
    # NOTE(review): presumably an intentional fixed-coefficient term - confirm.
    enrol_term = database.variables["Enrol." + code]
    # The distance coefficient is shifted by B_FAM_DIST scaled by "Family".
    dist_term = database.variables['Dist.' + code] * (B_DIST + B_FAM_DIST * database.variables["Family"])
    V[i] = total_term + closest_term + enrol_term + dist_term
    av[i] = 1

# Run model: weighted log-likelihood estimation, weights from "Exp_Segment".
logprob = models.loglogit(V, av, database.variables["School"])
formulas = {'loglike': logprob, 'weight': database.variables["Exp_Segment"]}
biogeme = bio.BIOGEME(database, formulas)
estimation_results = biogeme.estimate()
betas = estimation_results.getBetaValues()

# Generate probabilities: one simulated 'Prob.<code>' column per school,
# re-indexed to match df so the rows line up with the observed choices.
simulate = {}
for i, code in enumerate(school_codes):
    simulate['Prob.' + code] = models.logit(V, av, i)
sim_biogeme = bio.BIOGEME(database, simulate)
probs = sim_biogeme.simulate(betas).set_index(df.index)

# Confusion matrices and metrics: accuracy from hard (argmax) assignments
# first, then from summed probabilities.
hard_cm = get_cm(probs, True)
soft_cm = get_cm(probs, False)
for matrix in (hard_cm, soft_cm):
    print(get_accuracy(matrix))

49.44743749136621
38.00183550035435


In [10]:
# Render the HTML estimation report inside the notebook.
# NOTE(review): the file name is hard-coded - confirm it is the report written
# by the most recent estimation run and not an older file with the same name.
from IPython.display import HTML
report_file = "biogemeModelDefaultName.html"
HTML(filename=report_file)

0,1
Report file:,biogemeModelDefaultName.html
Database name:,SMTO

0,1
Number of estimated parameters:,4
Sample size:,14478
Excluded observations:,0
Init log likelihood:,-21579.53
Final log likelihood:,-18321.45
Likelihood ratio test for the init. model:,6516.16
Rho-square for the init. model:,0.151
Rho-square-bar for the init. model:,0.151
Akaike Information Criterion:,36650.9
Bayesian Information Criterion:,36681.22

Name,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
B_CLOSEST,0.922,0.045,20.5,0.0,0.0419,22.0,0.0
B_DIST,-0.114,0.00273,-41.7,0.0,0.00294,-38.9,0.0
B_ENROL,-0.0619,0.013,-4.74,2.11e-06,0.0125,-4.94,7.98e-07
B_FAM_DIST,0.0476,0.00309,15.4,0.0,0.00325,14.6,0.0

Coefficient1,Coefficient2,Covariance,Correlation,t-test,p-value,Rob. cov.,Rob. corr.,Rob. t-test,Rob. p-value
B_DIST,B_CLOSEST,4.24e-05,0.345,-23.5,0.0,4.95e-05,0.403,-25.4,0.0
B_ENROL,B_CLOSEST,2.97e-05,0.0506,-21.3,0.0,2.36e-05,0.0449,-22.8,0.0
B_ENROL,B_DIST,-2.69e-07,-0.00754,3.9,9.42e-05,-1.62e-06,-0.0439,4.01,6.07e-05
B_FAM_DIST,B_CLOSEST,-3.85e-05,-0.277,-19.0,0.0,-4.64e-05,-0.341,-20.3,0.0
B_FAM_DIST,B_DIST,-7.37e-06,-0.872,28.7,0.0,-8.49e-06,-0.89,26.9,0.0
B_FAM_DIST,B_ENROL,-2.62e-06,-0.065,8.05,8.88e-16,-6.24e-07,-0.0153,8.42,0.0
