In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv

# Load the SMTO 2015 input table (path is relative to the notebook's working directory).
df = pd.read_csv('../../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')

# Drop rows whose Level is 'Other'. The explicit copy makes the filtered frame
# independent of the one read from disk, so the column assignments below do not
# raise SettingWithCopyWarning.
df = df[df['Level'] != 'Other'].copy()

# Order matters: a school's position in this list is its integer alternative id.
school_codes = ['SG', 'SC', 'MI', 'YK','YG', 'RY','OC']
# Replace each school code by its position in school_codes
# (raises ValueError for a code that is not in the list).
df['School'] = df['School'].apply(school_codes.index)

# Enrol.<code> takes the UG.<code> value on rows whose Level is 'UG' and the
# Grad.<code> value on every other row. Series.where does this column-wise
# instead of calling a Python lambda once per row.
is_undergrad = df['Level'] == 'UG'
for code in school_codes:
    df['Enrol.' + code] = df['UG.' + code].where(is_undergrad, df['Grad.' + code])

# Snapshot of the prepared data, so later cells can restore df from it.
full_df = df.copy()

In [2]:
def run_model(name, with_ASCs, combined = True, closest = [], alt_spec_closest = False):
    """
    Estimate a logit school-choice model and return the simulated choice probabilities.

    Reads the module-level `df` (data) and `school_codes` (alternatives).

    name: String used as the Biogeme model name and in the printed header
    with_ASCs: Boolean, if True estimate the ASCs (except ASC_SG, which stays fixed at 0)
               and fix B_ENROL at 1; otherwise fix every ASC at 0 and estimate B_ENROL
    combined: Boolean, if True estimate B_FAM_DIST; otherwise fix B_FAM_DIST at 0
    closest: List of school codes whose utility includes a closest-school dummy term;
             if empty, no utility includes that term
    alt_spec_closest: Boolean, if True each school in `closest` uses its own
                      B_CLOSEST_<code> coefficient; otherwise they share the generic B_CLOSEST

    Returns the pd.DataFrame produced by the Biogeme simulation, re-indexed with df.index,
    with one 'Prob.<code>' entry per school.
    """
    database = db.Database("SMTO", df.select_dtypes(include = 'number'))
    variables = database.variables

    # Last Beta argument is the status flag: 0 = estimated, 1 = kept fixed at the
    # starting value (see the Biogeme Beta documentation).
    B_CLOSEST = Beta('B_CLOSEST', 0, None, None, 0 if closest else 1)
    B_DIST = Beta('B_DIST', 0, None, None, 0)
    B_ENROL = Beta('B_ENROL', 1, None, None, 1 if with_ASCs else 0)
    B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0 if combined else 1)

    V, av = {}, {}
    for alt, code in enumerate(school_codes):
        # 'SG' is the reference alternative: its ASC is never estimated.
        asc_is_free = with_ASCs and code != 'SG'
        asc = Beta('ASC_' + code, 0, None, None, 0 if asc_is_free else 1)
        b_closest_alt = Beta('B_CLOSEST_' + code, 0, None, None, 0 if alt_spec_closest else 1)

        # The closest-school term only enters the utility of schools listed in `closest`.
        if code in closest:
            closest_coef = b_closest_alt if alt_spec_closest else B_CLOSEST
            closest_term = closest_coef * variables["Closest." + code]
        else:
            closest_term = 0

        distance = variables['Dist.' + code]
        V[alt] = (asc
                  + closest_term
                  + B_ENROL * variables["Enrol." + code]
                  + B_DIST * distance
                  + B_FAM_DIST * distance * variables["Family"])
        av[alt] = 1  # every school is available to every respondent

    # Weighted maximum-likelihood estimation.
    logprob = models.loglogit(V, av, variables["School"])
    formulas = {'loglike': logprob, 'weight': variables["Exp_Segment"]}
    estimator = bio.BIOGEME(database, formulas, numberOfThreads=1)
    estimator.modelName = name

    betas = estimator.estimate().getBetaValues()
    print("Results for " + name + " model:")
    print(betas)

    # Simulate the choice probability of every alternative at the estimated betas.
    simulate = {'Prob.' + code: models.logit(V, av, alt) for alt, code in enumerate(school_codes)}
    simulator = bio.BIOGEME(database, simulate)
    return simulator.simulate(betas).set_index(df.index)

In [3]:
def get_cm(probs, hardmax, schools=None, codes=None):
    """
    Build a confusion matrix (list of rows) from predicted choice probabilities.

    Row r covers the observations whose observed school index equals r; column c
    corresponds to codes[c].

    probs: pd.DataFrame with one 'Prob.<code>' column per school code
    hardmax: Boolean, if True each observation counts once, in the column with the
             highest probability; if False each row holds the per-column sums of
             the probabilities
    schools: optional pd.Series of observed school indices, aligned with the index
             of probs; defaults to the module-level df['School']
    codes: optional list of school codes; defaults to the module-level school_codes

    Returns a list with len(codes) rows, each a list with len(codes) entries
    (counts when hardmax is True, probability sums otherwise).
    """
    if codes is None:
        codes = school_codes
    if schools is None:
        schools = df['School']

    # Loop-invariant work: build the column list and select the columns once.
    prob_cols = ['Prob.' + code for code in codes]
    prob_frame = probs[prob_cols]

    cm = []
    if hardmax:
        # Name of the most likely column for every observation, computed once.
        predicted = prob_frame.idxmax(axis = 1)
        for school in range(len(codes)):
            predicted_here = predicted[schools == school]
            cm.append([(predicted_here == col).sum() for col in prob_cols])
    else:
        for school in range(len(codes)):
            cm.append(prob_frame[schools == school].sum().values.tolist())
    return cm

def get_accuracy(cm):
    """
    Return the accuracy, in percent, of a square confusion matrix.

    cm: list of equally long rows (list of lists); the diagonal holds the
        correctly predicted mass
    Returns 100 * (sum of the diagonal) / (sum of all entries).
    Raises ZeroDivisionError when the entries sum to zero (e.g. an empty matrix).
    """
    # The matrix size comes from cm itself rather than from a global list.
    correct = sum(cm[i][i] for i in range(len(cm)))
    # Row-major sum of every entry, without building an intermediate flat list.
    total = sum(value for row in cm for value in row)
    return correct / total * 100

In [None]:
# Restore the working frame from the snapshot saved in full_df.
df = full_df.copy()

# One row per model: (name, with_ASCs, combined, closest, alt_spec_closest).
# The Prop_* rows pass with_ASCs=False and alt_spec_closest=False; the Eric_* rows
# pass with_ASCs=True, with alt_spec_closest=True whenever `closest` is non-empty.
model_specs = (
    ('Prop_Closest_0', False, True, [], False),
    ('Prop_Closest_2', False, True, ['MI', 'SG'], False),
    ('Prop_Closest_5', False, True, ['MI', 'YK', 'OC', 'SG', 'YG'], False),
    ('Prop_Closest_7', False, True, school_codes, False),
    ('Eric_Closest_0', True, True, [], False),
    ('Eric_Closest_2', True, True, ['MI', 'SG'], True),
    ('Eric_Closest_5', True, True, ['MI', 'YK', 'OC', 'SG', 'YG'], True),
    ('Eric_Closest_7', True, True, school_codes, True),
)

for name, with_ASCs, combined, closest, alt_spec_closest in model_specs:
    probs = run_model(
        name,
        with_ASCs,
        combined = combined,
        closest = closest,
        alt_spec_closest = alt_spec_closest,
    )
    # Hard confusion matrix counts argmax predictions; soft one sums probabilities.
    hard_cm = get_cm(probs, True)
    soft_cm = get_cm(probs, False)
    print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
    print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))
    print()

Results for Prop_Closest_0 model:
{'B_DIST': -0.13415780267314728, 'B_ENROL': 0.9202053070704863, 'B_FAM_DIST': 0.06594386641279779}
Hardmax Accuracy: 49.59 %
Softmax Accuracy: 36.79 %

