In [1]:
# Import packages
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv
import numpy as np

# Load data
full_df = pd.read_csv('../../../Data/SMTO_2019/SMTO_2019_Complete_Input.csv')
school_codes = full_df['School'].unique().tolist()
uni_codes = full_df[full_df['School_Type'] == 'University']['School'].unique().tolist()

# Convert School column to numeric
full_df['School'] = full_df['School'].apply(lambda x: school_codes.index(x))

# Remove rows with missing information
full_df = full_df.dropna(subset = ['Family'])
full_df['Family'] = (full_df['Family'] * 1).astype(int)

In [2]:
# Load enrollment data
enrol_df = pd.read_csv('../../../Data/School_Info_2019_Pred_Enrol.csv').set_index('Code')

def code_to_log_enrol(code):
    """
    Return natural logarithm of total enrollment of campus with given code
    If code is invalid, raise KeyError
    If no enrollment information available for that code, return np.nan
    """
    return math.log(enrol_df.loc[code]['Total'])

In [3]:
def get_accuracy(cm):
    """
    Given confusion matrix as 2D array, return accuracy
    """
    correct = sum([cm[i][i] for i in range(len(cm))])
    return correct/sum(sum(cm,[])) * 100

In [4]:
def get_cm(probs, hardmax):
    cm = []
    if hardmax:
        for school in range(len(school_codes)):
            cm.append([(probs[full_df['School'] == school][['Prob.' + i for i in school_codes]].idxmax(axis = 1) == 'Prob.' + j).sum() for j in school_codes])
    else:
        for school in range(len(school_codes)):
            cm.append((probs[full_df['School'] == school][['Prob.' + i for i in school_codes]].sum().values.tolist()))    
    return cm

def get_accuracy(cm):
    correct = sum([cm[i][i] for i in range(len(school_codes))])
    return correct/sum(sum(cm,[])) * 100

In [5]:
cols_to_keep = ['School', 'Family'] + ['Dist.' + code for code in school_codes]
database = db.Database("SMTO_2019", full_df[cols_to_keep])
unis, cols = [], []
V, av = {}, {}
B_DIST = Beta('B_DIST', 0, None, None, 0)
B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)
UNI = Beta('UNI', 3, 0.0000001, 10, 0)
COL = Beta('COL', 3, 0.0000001, 10, 0)

for i in range(len(school_codes)):
    code = school_codes[i]
    if code in uni_codes:
        unis.append(i)
    else:
        cols.append(i)
    V[i] = code_to_log_enrol(code) + database.variables['Dist.' + code] * (B_DIST + B_FAM_DIST * database.variables['Family'])
    av[i] = 1

UNIS = (UNI, unis)
COLS = (COL, cols)
nests = UNIS, COLS

In [6]:
import biogeme.messaging as msg
logprob = models.lognested(V, av, nests, database.variables["School"])
logger = msg.bioMessage()
logger.setGeneral()

biogeme = bio.BIOGEME(database, logprob)
results = biogeme.estimate()
pandasResults = results.getEstimatedParameters()
betas = results.getBetaValues()

[17:16:52] < General >   Remove 0 unused variables from the database as only 29 are used.
[17:16:53] < General >   Log likelihood (N=11005):  -27927.74
[17:16:53] < General >   Minimize with tol 1e-07
[17:16:54] < General >   Log likelihood (N=11005):  -27927.74 Gradient norm:      2e+05  
[17:17:01] < General >   Log likelihood (N=11005):    -319285 Gradient norm:      4e+05  
[17:17:07] < General >   Log likelihood (N=11005):  -48152.19 Gradient norm:      3e+05  
[17:17:13] < General >   Log likelihood (N=11005):  -27188.85 Gradient norm:      8e+04  
[17:17:18] < General >   Log likelihood (N=11005):  -26754.21 Gradient norm:      5e+04  
[17:17:23] < General >   Log likelihood (N=11005):  -26603.74 Gradient norm:      3e+04  
[17:17:29] < General >   Log likelihood (N=11005):  -26561.99 Gradient norm:      9e+03  
[17:17:35] < General >   Log likelihood (N=11005):  -26554.14 Gradient norm:      7e+03  
[17:17:41] < General >   Log likelihood (N=11005):   -26537.8 Gradient norm:   

[17:24:13] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:18] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:22] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:26] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:31] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:36] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:42] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:48] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:53] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002  
[17:24:59] < General >   Log likelihood (N=11005):  -26269.23 Gradient norm:      0.002 Hessian norm:       7e+06 BHHH norm:       2e+07
[17:25:00] < General >   Results saved in file biogem

In [7]:
pandasResults

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
B_DIST,-0.020911,0.000711,-29.413694,0.0,0.001404,-14.89398,0.0
B_FAM_DIST,0.002281,0.000881,2.589811,0.009603,0.001688,1.351673,0.17648
COL,1.001433,0.018886,53.025576,0.0,0.022359,44.788973,0.0
UNI,0.81311,0.010881,74.724932,0.0,0.011236,72.368137,0.0


In [8]:
simulate = {'Prob.' + school_codes[i]: models.logit(V, av, i) for i in range(len(school_codes))}
sim_biogeme = bio.BIOGEME(database, simulate)
probs = sim_biogeme.simulate(betas).set_index(full_df.index)
hard_cm = get_cm(probs, True)
soft_cm = get_cm(probs, False)
print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))

[17:25:00] < General >   Remove 1 unused variables from the database as only 28 are used.
Hardmax Accuracy: 25.50 %
Softmax Accuracy: 15.21 %
