In [1]:
# Import necessary packages
import pandas as pd
from math import log

# Load the formatted SMTO 2015 survey records.
# NOTE(review): an earlier comment here said "Other" students are removed, but no rows
# are filtered in this cell -- presumably that happens upstream when Formatted.csv
# is produced; confirm.
df = pd.read_csv('../../Data/SMTO_2015/Formatted.csv')

# Replace every school code by its integer position in school_codes
# (codes are listed in order of first appearance in the data)
school_codes = df['School'].unique().tolist()
df['School'] = df['School'].apply(school_codes.index)


def _log_enrolment(row, code):
    """Return the log of the '<Level>.<code>' value of a row, where <Level> is the row's own Level."""
    return log(row[row.Level + '.' + code])


# One 'Enrol.<code>' column per school, picked according to each student's level
for code in school_codes:
    df['Enrol.' + code] = df.apply(_log_enrolment, axis=1, args=(code,))

num_rows = len(df.index)

In [2]:
# Function for confusion matrices
def get_cm(probs, hardmax):
    """Build a confusion matrix of observed school (rows) against predicted school (columns).

    Relies on the notebook-level globals ``df`` (whose ``School`` column holds the
    observed school as an integer position in ``school_codes``) and ``school_codes``.

    Args:
        probs: DataFrame sharing ``df``'s index, with one ``'Prob.<code>'`` column
            per entry of ``school_codes``.
        hardmax: if truthy, each row is counted once, in the column holding its
            largest value (``idxmax``); otherwise the ``'Prob.<code>'`` values are
            summed per column.

    Returns:
        A list with one row per school code; ``cm[i][j]`` is the count (hardmax)
        or summed probability (otherwise) assigned to school ``j`` among rows
        whose observed school is ``i``.
    """
    prob_cols = ['Prob.' + code for code in school_codes]
    cm = []
    for school in range(len(school_codes)):
        # Select the rows of this observed school once, instead of once per column
        school_probs = probs[df['School'] == school]
        if hardmax:
            # The arg-max column of each row is the same for every column being
            # counted, so compute it a single time per school
            predicted = school_probs.idxmax(axis=1)
            cm.append([(predicted == col).sum() for col in prob_cols])
        else:
            cm.append(school_probs[prob_cols].sum().values.tolist())
    return cm

# Function for accuracy from confusion matrix
def get_accuracy(cm):
    """Return the share of the confusion-matrix total that lies on the diagonal, as a percentage.

    Args:
        cm: square confusion matrix given as a list of row lists.

    Returns:
        ``100 * (sum of diagonal entries) / (sum of all entries)``.

    Raises:
        ZeroDivisionError: if the matrix is empty or all entries sum to zero.
    """
    diagonal_total = 0
    overall_total = 0
    # Single pass over the matrix, accumulating in row-major order
    for idx, row in enumerate(cm):
        diagonal_total += row[idx]
        for value in row:
            overall_total += value
    return diagonal_total / overall_total * 100

In [3]:
# Import Biogeme modules
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta

# Prepare dummy columns
# Each 'Closest.<code>' flag is overwritten in place with 1 only when the school was
# already flagged as closest AND its 'Dist.<code>' value is at most 2, else 0
# (distance units are not shown here -- the model name below suggests km; confirm)
for code in school_codes:
    df['Closest.' + code] = ((df['Closest.' + code]) & (df['Dist.' + code] <= 2)) * 1
# Only numeric columns are handed to Biogeme; this must run after the dummies above are rewritten
database = db.Database("SMTO", df.select_dtypes(include = 'number'))

# Specify model
# V maps alternative index -> utility expression, av maps alternative index -> availability
V, av = {}, {}
# Four coefficients shared by all alternatives, each starting at 0
# (presumably Beta(name, start value, lower bound, upper bound, status) with no bounds
# and status 0 = estimated -- confirm against the Biogeme documentation)
B_CLOSEST = Beta('B_CLOSEST', 0, None, None, 0)
B_DIST = Beta('B_DIST', 0, None, None, 0)
B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)
B_ENROL = Beta('B_ENROL', 0, None, None, 0)
for i in range(len(school_codes)):
    code = school_codes[i]
    # Utility of school i: log-enrolment term + closest-school dummy term + distance term,
    # where the distance coefficient is B_DIST shifted by B_FAM_DIST times the 'Family' column
    V[i] = B_ENROL * database.variables["Enrol." + code] + B_CLOSEST * database.variables["Closest." + code] + database.variables['Dist.' + code] * (B_DIST + B_FAM_DIST * database.variables["Family"])
    # Every school is marked as available to every observation
    av[i] = 1   

# Run model
# The chosen alternative is the integer-coded 'School' column; observations are weighted by 'Exp_Factor'
logprob = models.loglogit(V, av, database.variables["School"])
biogeme = bio.BIOGEME(database, {'loglike': logprob, 'weight': database.variables["Exp_Factor"]})
# The model name matches the 'Singly_Constrained_2km.html' report file displayed later in the notebook
biogeme.modelName = 'Singly_Constrained_2km'
betas = biogeme.estimate().getBetaValues()    

# Generate probabilities
# One 'Prob.<code>' expression per school, evaluated with the estimated coefficients
simulate = {'Prob.' + school_codes[i]: models.logit(V, av, i) for i in range(len(school_codes))}
sim_biogeme = bio.BIOGEME(database, simulate)
# Re-index the simulated probabilities with df's index so get_cm's boolean masks built from df line up
probs = sim_biogeme.simulate(betas).set_index(df.index)

# Confusion matrices and metrics
# hard: each row counted at its highest-probability school; soft: probabilities summed per school
hard_cm = get_cm(probs, True)
soft_cm = get_cm(probs, False)
print(get_accuracy(hard_cm))
print(get_accuracy(soft_cm))

49.88258046691532
36.49709641351159


In [4]:
from IPython.display import HTML
# Render the HTML estimation report from disk as this cell's output.
# NOTE(review): the file name is hard-coded; if Biogeme writes a re-run's report under a
# different (e.g. suffixed) name when this file already exists, this would show a stale
# report -- confirm against the Biogeme version in use.
HTML(filename= "Singly_Constrained_2km.html")

0,1
Report file:,Singly_Constrained_2km.html
Database name:,SMTO

0,1
Number of estimated parameters:,4
Sample size:,14478
Excluded observations:,0
Init log likelihood:,-28172.89
Final log likelihood:,-19507.5
Likelihood ratio test for the init. model:,17330.78
Rho-square for the init. model:,0.308
Rho-square-bar for the init. model:,0.307
Akaike Information Criterion:,39022.99
Bayesian Information Criterion:,39053.31

Name,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
B_CLOSEST,1.02,0.0448,22.8,0,0.0419,24.4,0
B_DIST,-0.095,0.00234,-40.5,0,0.00253,-37.5,0
B_ENROL,0.767,0.0103,74.5,0,0.00951,80.7,0
B_FAM_DIST,0.0315,0.00266,11.8,0,0.00281,11.2,0

Coefficient1,Coefficient2,Covariance,Correlation,t-test,p-value,Rob. cov.,Rob. corr.,Rob. t-test,Rob. p-value
B_DIST,B_CLOSEST,3.84e-05,0.365,-25.3,0.0,4.42e-05,0.416,-27.3,0.0
B_ENROL,B_CLOSEST,3.39e-05,0.0735,-5.59,2.21e-08,3.39e-05,0.0851,-6.01,1.9e-09
B_ENROL,B_DIST,3.92e-07,0.0162,81.9,0.0,-2.15e-06,-0.0893,85.8,0.0
B_FAM_DIST,B_CLOSEST,-3.58e-05,-0.3,-21.6,0.0,-4.24e-05,-0.361,-23.0,0.0
B_FAM_DIST,B_DIST,-5.44e-06,-0.873,26.1,0.0,-6.33e-06,-0.89,24.4,0.0
B_FAM_DIST,B_ENROL,-1.68e-06,-0.0614,-68.2,0.0,1.11e-06,0.0414,-75.1,0.0
