In [3]:
# Import packages
import csv
import math

import numpy as np
import pandas as pd

import biogeme.biogeme as bio
import biogeme.database as db
import biogeme.messaging as msg
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable

# Read the complete SMTO 2019 input table
full_df = pd.read_csv('../../Data/SMTO_2019/SMTO_2019_Complete_Input.csv')

# Distinct school codes; a code's position in this list is used as its
# numeric identifier from here on
school_codes = full_df['School'].unique().tolist()
is_university = full_df['School_Type'] == 'University'
uni_codes = full_df.loc[is_university, 'School'].unique().tolist()

# Replace every school code by its position in school_codes
code_positions = {code: position for position, code in enumerate(school_codes)}
full_df['School'] = full_df['School'].map(code_positions)

# Discard rows without a 'Family' value, then store that column as integers
full_df = full_df.dropna(subset=['Family'])
full_df['Family'] = full_df['Family'].mul(1).astype(int)

In [4]:
# Load enrollment data, indexed by campus code
enrol_df = pd.read_csv('../../Data/School_Info_2019.csv').set_index('Code')

def code_to_log_enrol(code):
    """
    Return natural logarithm of total enrollment of campus with given code

    If code is invalid (not present in the index of enrol_df), raise KeyError
    If no usable enrollment figure is available for that code (the 'Total'
    entry is missing or not strictly positive), return np.nan
    """
    # One .loc lookup by (row label, column label) instead of chained indexing
    total = enrol_df.loc[code, 'Total']
    # math.log raises ValueError for values <= 0, so treat those the same way
    # as a missing entry rather than aborting the caller's loop
    if pd.isna(total) or total <= 0:
        return np.nan
    return math.log(total)

In [5]:
def get_accuracy(cm):
    """
    Given confusion matrix as 2D array, return accuracy as a percentage

    cm can be any square 2D container that supports len(cm), cm[i][i] and
    iteration over rows (list of lists, tuple of tuples, NumPy array, ...).
    The result is 100 * (sum of the diagonal) / (sum of all entries).
    With plain Python numbers, an empty or all-zero matrix raises
    ZeroDivisionError.
    """
    correct = sum(cm[i][i] for i in range(len(cm)))
    # Sum row by row: this avoids concatenating the rows into one list, which
    # only worked when every row was itself a list
    total = sum(sum(row) for row in cm)
    return correct / total * 100

In [6]:
# Keep the chosen school, the family flag and one distance column per school
dist_cols = ['Dist.' + code for code in school_codes]
cols_to_keep = ['School', 'Family'] + dist_cols
database = db.Database("SMTO_2019", full_df[cols_to_keep])

# Coefficient applied to the distance column of every alternative
B_DIST = Beta('B_DIST', 0, None, None, 0)
# Nest parameter of the university nest, started at 0.5 and bounded to
# [0.000001, 1.0].
# NOTE(review): Biogeme's published nested logit examples bound the nest
# parameter below by 1 - confirm that a (0, 1] range is the intended
# parameterisation for models.lognested.
MU = Beta('MU', 0.5, 0.000001, 1.0, 0)

ASCs = []  # per alternative: fixed log-enrollment or a Beta to estimate
V = {}     # utility expression of each alternative, keyed by its index
av = {}    # availability of each alternative (always 1 here)
unis = []  # indices of alternatives whose code is in uni_codes
cols = []  # indices of all remaining alternatives

for i, code in enumerate(school_codes):
    # Assign alternative i to exactly one of the two nests
    (unis if code in uni_codes else cols).append(i)

    enrollment = code_to_log_enrol(code)
    # With no enrollment information the constant is estimated as ASC_<code>;
    # otherwise the log-enrollment itself enters the utility as a fixed term
    ASCs.append(
        Beta('ASC_' + code, 0, None, None, 0) if np.isnan(enrollment) else enrollment
    )
    V[i] = ASCs[i] + B_DIST * database.variables['Dist.' + code]
    av[i] = 1

# Each nest is a (nest parameter, member alternatives) pair; the second
# nest's parameter is fixed to 1.0
UNIS = MU, unis
COLS = 1.0, cols
nests = (UNIS, COLS)

In [7]:
# Switch Biogeme's messaging to the "general" level before estimating
# (biogeme.messaging is imported as msg in the notebook's import cell)
logger = msg.bioMessage()
logger.setGeneral()

# Log-probability of the observed 'School' choice under the nested logit
# model defined by V, av and nests
logprob = models.lognested(V, av, nests, database.variables["School"])

# Estimate the model and collect the parameter estimates as a DataFrame
biogeme = bio.BIOGEME(database, logprob)
results = biogeme.estimate()
pandasResults = results.getEstimatedParameters()

[14:57:54] < General >   Remove 1 unused variables from the database as only 28 are used.
[14:57:58] < General >   Log likelihood (N=11005):  -34712.23
[14:57:58] < General >   Minimize with tol 1e-07
[14:58:04] < General >   Log likelihood (N=11005):  -34712.23 Gradient norm:      3e+04  
[14:58:10] < General >   Log likelihood (N=11005):  -172013.2 Gradient norm:      3e+05  
[14:58:15] < General >   Log likelihood (N=11005):  -34630.46 Gradient norm:      3e+04  
[14:58:20] < General >   Log likelihood (N=11005):  -34378.95 Gradient norm:      5e+03  
[14:58:24] < General >   Log likelihood (N=11005):  -34365.34 Gradient norm:      2e+03  
[14:58:29] < General >   Log likelihood (N=11005):  -34356.64 Gradient norm:      3e+03  
[14:58:34] < General >   Log likelihood (N=11005):   -34336.6 Gradient norm:      3e+03  
[14:58:38] < General >   Log likelihood (N=11005):  -34326.83 Gradient norm:      1e+03  
[14:58:42] < General >   Log likelihood (N=11005):  -34325.57 Gradient norm:   

[15:05:02] < General >   Log likelihood (N=11005):  -24524.74 Gradient norm:      2e+04  
[15:05:07] < General >   Log likelihood (N=11005):  -24518.11 Gradient norm:      2e+03  
[15:05:11] < General >   Log likelihood (N=11005):  -24506.89 Gradient norm:      2e+03  
[15:05:14] < General >   Log likelihood (N=11005):  -26229.55 Gradient norm:      7e+03  
[15:05:18] < General >   Log likelihood (N=11005):  -24505.72 Gradient norm:      1e+03  
[15:05:22] < General >   Log likelihood (N=11005):  -24503.34 Gradient norm:      4e+03  
[15:05:25] < General >   Log likelihood (N=11005):  -24502.53 Gradient norm:      2e+03  
[15:05:29] < General >   Log likelihood (N=11005):  -24501.96 Gradient norm:      9e+02  
[15:05:35] < General >   Log likelihood (N=11005):  -24501.74 Gradient norm:      1e+03  
[15:05:39] < General >   Log likelihood (N=11005):  -24500.44 Gradient norm:      4e+03  
[15:05:44] < General >   Log likelihood (N=11005):  -24498.48 Gradient norm:      5e+03  
[15:05:49]

[15:12:09] < General >   Log likelihood (N=11005):   -24483.9 Gradient norm:      2e+02  
[15:12:14] < General >   Log likelihood (N=11005):  -24481.86 Gradient norm:      2e+02  
[15:12:19] < General >   Log likelihood (N=11005):  -24694.54 Gradient norm:      5e+04  
[15:12:24] < General >   Log likelihood (N=11005):  -24481.83 Gradient norm:      9e+02  
[15:12:29] < General >   Log likelihood (N=11005):  -24478.57 Gradient norm:      5e+02  
[15:12:33] < General >   Log likelihood (N=11005):  -24481.96 Gradient norm:      3e+03  
[15:12:38] < General >   Log likelihood (N=11005):  -24477.96 Gradient norm:      1e+03  
[15:12:42] < General >   Log likelihood (N=11005):  -24475.85 Gradient norm:      9e+01  
[15:12:47] < General >   Log likelihood (N=11005):  -24475.28 Gradient norm:      3e+02  
[15:12:52] < General >   Log likelihood (N=11005):  -24475.24 Gradient norm:      4e+01  
[15:12:56] < General >   Log likelihood (N=11005):  -24475.24 Gradient norm:      6e+01  
[15:13:01]

[15:19:24] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:          7  
[15:19:29] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:          8  
[15:19:34] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:      1e+01  
[15:19:38] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:      2e+01  
[15:19:43] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:      1e+02  
[15:19:47] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:      3e+01  
[15:19:52] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:      2e+01  
[15:19:56] < General >   Log likelihood (N=11005):  -24474.38 Gradient norm:      2e+03  
[15:20:00] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:      1e+01  
[15:20:03] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:          5  
[15:20:07] < General >   Log likelihood (N=11005):  -24474.06 Gradient norm:          4  
[15:20:11]

[15:25:52] < General >   Log likelihood (N=11005):  -24473.83 Gradient norm:      6e+01  
[15:25:56] < General >   Log likelihood (N=11005):  -24473.82 Gradient norm:      9e+01  
[15:26:01] < General >   Log likelihood (N=11005):  -24473.81 Gradient norm:      9e+01  
[15:26:05] < General >   Log likelihood (N=11005):  -24477.07 Gradient norm:      6e+03  
[15:26:10] < General >   Log likelihood (N=11005):  -24473.81 Gradient norm:      1e+02  
[15:26:15] < General >   Log likelihood (N=11005):   -24473.8 Gradient norm:      9e+01  
[15:26:20] < General >   Log likelihood (N=11005):   -24473.8 Gradient norm:      6e+01  
[15:26:24] < General >   Log likelihood (N=11005):  -24473.79 Gradient norm:      2e+01  
[15:26:29] < General >   Log likelihood (N=11005):  -24473.79 Gradient norm:      2e+01  
[15:26:33] < General >   Log likelihood (N=11005):  -24473.79 Gradient norm:      2e+02  
[15:26:38] < General >   Log likelihood (N=11005):  -24473.79 Gradient norm:      9e+01  
[15:26:42]

[15:32:35] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:      2e+01  
[15:32:39] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:      3e+01  
[15:32:44] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:      2e+01  
[15:32:48] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:      9e+01  
[15:32:52] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:          8  
[15:32:56] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:      1e+01  
[15:33:00] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:          5  
[15:33:04] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:          9  
[15:33:09] < General >   Log likelihood (N=11005):  -24472.51 Gradient norm:      3e+02  
[15:33:13] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:          4  
[15:33:17] < General >   Log likelihood (N=11005):  -24472.37 Gradient norm:      1e+01  
[15:33:22]

[15:39:04] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      3e+01  
[15:39:08] < General >   Log likelihood (N=11005):  -24472.85 Gradient norm:      1e+03  
[15:39:12] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      2e+01  
[15:39:16] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          7  
[15:39:20] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          4  
[15:39:25] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          4  
[15:39:28] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      4e+01  
[15:39:32] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          9  
[15:39:36] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          5  
[15:39:40] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          3  
[15:39:44] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.8  
[15:39:48]

[15:44:41] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.3  
[15:44:45] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.4  
[15:44:48] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.5  
[15:44:52] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.6  
[15:44:55] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      3e+01  
[15:44:59] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.4  
[15:45:02] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.2  
[15:45:05] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.8  
[15:45:09] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.2  
[15:45:12] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          1  
[15:45:16] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.8  
[15:45:19]

[15:49:55] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.7  
[15:49:57] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.6  
[15:50:01] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          1  
[15:50:05] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.9  
[15:50:08] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          4  
[15:50:11] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.2  
[15:50:16] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:        0.4  
[15:50:20] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          1  
[15:50:23] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          5  
[15:50:27] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          3  
[15:50:30] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:          2  
[15:50:33]

[15:54:02] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:05] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:08] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:10] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:12] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:14] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:16] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:19] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:21] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:24] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:28] < General >   Log likelihood (N=11005):   -24472.3 Gradient norm:      0.002  
[15:54:31]

In [8]:
# Display the estimated-parameter table produced by results.getEstimatedParameters()
pandasResults

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CDS,3.192171,1.000203,3.191525,0.001415,1.000218,3.191477,0.001415
ASC_CDV,5.709538,0.289388,19.729675,0.0,0.289377,19.730468,0.0
ASC_CMO,8.488201,0.082734,102.595991,0.0,0.082811,102.501311,0.0
ASC_MCB,6.203964,0.222047,27.939878,0.0,0.22123,28.042988,0.0
ASC_MCM,15.056226,0.090384,166.580615,0.0,0.095422,157.785454,0.0
ASC_OTD,7.885171,0.144269,54.655938,0.0,0.143615,54.905009,0.0
ASC_OTN,9.372549,0.071491,131.100562,0.0,0.070463,133.013721,0.0
B_DIST,-0.041064,0.00067,-61.283813,0.0,0.000908,-45.245093,0.0
MU,0.889628,0.009634,92.340546,0.0,0.009208,96.613032,0.0
