In [116]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv
import numpy as np

# Load the SMTO 2019 respondent-level input table.
df = pd.read_csv('../../Data/SMTO_2019/SMTO_2019_Complete_Input.csv')

# Keep only respondents with a usable 'Family' flag and store it as an int.
# The cleaning is done by assigning new objects rather than calling
# `inplace=True` on a column selection: the chained in-place form is not
# guaranteed to write back into `df` (it silently stops doing so under
# pandas Copy-on-Write).
df = (
    df.assign(Family=df['Family'].replace('', np.nan))
      .dropna(subset=['Family'])
      .astype({'Family': int})
)

# Split by school type while 'School' still holds the original codes, so that
# `unis` / `cols` contain the codes themselves rather than integer indices.
uni_df = df[df['School_Type'] == 'University']
col_df = df[df['School_Type'] == 'College']
unis = uni_df['School'].unique().tolist()
cols = col_df['School'].unique().tolist()

# Optional filters, currently disabled:
#df = df[df['Level'] != 'Other']
#df = df[df['School_Type'] == 'University']

# Encode each school as its position in `school_codes` (order of first
# appearance). A dict lookup avoids a linear `list.index` scan per row.
school_codes = df['School'].unique().tolist()
school_index = {code: position for position, code in enumerate(school_codes)}
df['School'] = df['School'].map(school_index)

# Pristine copy that later cells use to reset `df`.
full_df = df.copy()
df.head()

Unnamed: 0,Liv_Arr,Children,Cars,Income,Home_Zone,School_Name,Campus,Work,Licence,Mode,...,Dist.SHD,Dist.SHH,Dist.SHT,Dist.MI,Dist.SC,Dist.SG,Dist.YK,Dist.YG,Dist.RY,Dist.OC
0,Live with family/parents,0.0,0.0,,3851.0,Centennial College,Progress Campus,NW,False,Transit,...,29.60268,4.821538,16.18253,19.03615,54.90925,32.74982,40.58064,43.9828,33.22763,31.52583
1,Live with family/parents,0.0,1.0,,181.0,Centennial College,Morningside Campus,NW,False,Transit,...,35.02901,43.01722,51.23784,22.93553,22.04248,8.990107,9.21471,8.279897,10.30155,10.30233
2,Live with family/parents,0.0,2.0,I don't know,1039.0,Centennial College,Progress Campus,NW,False,Transit,...,68.36338,71.35209,81.92638,54.53982,13.64408,35.73098,40.61661,29.41865,35.05555,36.44957
4,Live with family/parents,1.0,1.0,,600.0,Centennial College,Progress Campus,NW,False,,...,49.27058,61.53779,69.75841,41.28113,11.49956,25.83839,18.81172,15.84128,25.16296,26.82406
5,Live with roommates,,0.0,I don't know,544.0,Centennial College,Progress Campus,PT,False,Transit,...,50.584,48.82541,59.43179,32.1869,10.84569,13.23639,27.52188,12.41869,12.52769,13.66335


In [121]:
# Read the per-school information table and index it by school code so that
# rows can be looked up directly with `enrol_df.loc[code]`.
enrol_df = pd.read_csv(
    '../../Data/School_Info_2019.csv'
).set_index(keys='Code')

def code_to_log_enrol(code):
    """
    Return the natural logarithm of the total enrollment of a campus.

    Looks up the 'Total' column of the module-level `enrol_df` for the row
    whose index equals `code`.

    Parameters
    ----------
    code : index label of `enrol_df` (a school/campus code)

    Returns
    -------
    float
        log(Total), or np.nan when no usable enrollment figure is available
        (the value is missing, zero or negative).

    Raises
    ------
    KeyError
        If `code` is not present in the index of `enrol_df`.
    """
    # Single label-based lookup instead of chained `.loc[code]['Total']`.
    total = enrol_df.loc[code, 'Total']
    # math.log raises ValueError for values <= 0; treat those, like missing
    # values, as "no enrollment information" so callers can test with np.isnan.
    if pd.isna(total) or total <= 0:
        return np.nan
    return math.log(total)

In [122]:
def run_model(name, with_ASCs, combined = True):
    """
    Estimate a nested logit school-choice model with Biogeme and return the
    estimated parameter values.

    Reads the module-level `df`, `school_codes`, `unis` and
    `code_to_log_enrol`. Alternatives are numbered by their position in
    `school_codes`; every alternative is always available.

    Utility of alternative i:
        constant_i + B_DIST * Dist.<code>            [+ B_FAM_DIST * Dist.<code> * Family]
    where constant_i is the log enrollment of the school (added as a fixed
    term) when available, and an estimated parameter 'ASC_<code>' otherwise.
    The bracketed interaction term is only included when `combined` is true.

    Schools listed in `unis` form one nest with parameter NEST_UNI; all other
    schools form a second nest whose parameter is fixed at 1.0.

    Parameters
    ----------
    name : str
        Used as the Biogeme model name and in the printed header.
    with_ASCs :
        Accepted for interface compatibility; not read by this function.
    combined : bool
        Whether to include the distance x family interaction term.

    Returns
    -------
    The object returned by `results.getBetaValues()`.
    """
    distance_columns = ['Dist.' + code for code in school_codes]
    database = db.Database("SMTO", df[distance_columns + ['Family', 'School']])

    # Parameters to estimate. Positional arguments follow Biogeme's
    # Beta(name, value, lower bound, upper bound, status) convention.
    B_DIST = Beta('B_DIST', 0, None, None, 0)
    B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)
    NEST_UNI = Beta('NEST_UNI', 1, 1, None, 0)

    V, av = {}, {}
    uni_nest_list, col_nest_list = [], []

    for alt, code in enumerate(school_codes):
        log_enrol = code_to_log_enrol(code)
        if np.isnan(log_enrol):
            # No enrollment information available: estimate a constant instead
            constant = Beta('ASC_' + code, 0, None, None, 0)
        else:
            constant = log_enrol

        distance = database.variables['Dist.' + code]
        utility = constant + B_DIST * distance
        if combined:
            utility = utility + B_FAM_DIST * distance * database.variables["Family"]

        V[alt] = utility
        av[alt] = 1

        # Assign the alternative to the university or the college nest
        target_nest = uni_nest_list if code in unis else col_nest_list
        target_nest.append(alt)

    # Each nest is a (nest parameter, list of alternatives) pair
    nests = ((NEST_UNI, uni_nest_list), (1.0, col_nest_list))

    logprob = models.lognested(V, av, nests, database.variables["School"])
    formulas = {'loglike': logprob}
    estimator = bio.BIOGEME(database, formulas, numberOfThreads=1)
    estimator.modelName = name
    betas = estimator.estimate(saveIterations=True).getBetaValues()

    print("Results for " + name + " model:")
    print(betas)
    print()
    return betas

In [107]:
school_codes

['MCM', 'MCB', 'OC', 'OTN', 'OTD', 'RY', 'SG', 'SC', 'MI', 'YK', 'YG']

In [123]:
def get_cm(name, betas, with_ASCs, combined = True):
    """
    Print softmax and hardmax confusion matrices for a fitted model.

    Works on the module-level `df` and `school_codes`. For every school code
    a utility column 'V_<code>' and a probability column 'P_<code>' are
    written into `df` (existing columns of the same name are overwritten).

    Utility: [ASC_<code>] + B_DIST * Dist.<code> [+ B_FAM_DIST * Dist.<code> * Family]
      * the ASC term is used only when `with_ASCs` is true, and is taken as 0
        for code 'SG';
      * the family interaction is used only when `combined` is true.
    Probabilities are a plain softmax over the V_ columns of each row; no
    nest parameter enters this calculation.

    NOTE(review): with `with_ASCs` true, `betas` must hold 'ASC_<code>' for
    every code except 'SG', otherwise a KeyError is raised. `run_model` in
    this notebook only creates an ASC for schools without enrollment data -
    confirm the two are meant to be used together in that mode.

    Parameters
    ----------
    name : str
        Model name used in the printed headers.
    betas : mapping
        Parameter values keyed by name ('B_DIST', 'B_FAM_DIST', 'ASC_<code>').
    with_ASCs : bool
        Whether to add alternative-specific constants from `betas`.
    combined : bool
        Whether to add the distance x family interaction term.

    Returns
    -------
    None. Results are printed.
    """
    v_cols = ['V_' + code for code in school_codes]
    p_cols = ['P_' + code for code in school_codes]
    n_schools = len(school_codes)

    # Systematic utility of each alternative
    for code in school_codes:
        asc = betas['ASC_' + code] if (with_ASCs and code != 'SG') else 0
        utility = asc + betas['B_DIST'] * df['Dist.' + code]
        if combined:
            utility = utility + betas['B_FAM_DIST'] * df['Dist.' + code] * df['Family']
        df['V_' + code] = utility

    # Row-wise softmax. The V_ columns are selected by name (not by position
    # at the end of the frame), so re-running on a frame that already carries
    # V_/P_ columns still uses the right inputs. Subtracting the row maximum
    # leaves the probabilities unchanged but prevents overflow in exp().
    v_values = df[v_cols].to_numpy(dtype=float)
    exp_v = np.exp(v_values - v_values.max(axis=1, keepdims=True))
    p_values = exp_v / exp_v.sum(axis=1, keepdims=True)
    for position, column in enumerate(p_cols):
        df[column] = p_values[:, position]

    # Chosen school plus the probability of every alternative
    probs = df[['School'] + p_cols]

    print("Softmax confusion matrix for " + name + " model:")
    softmax_cm = []
    for school in range(n_schools):
        row = probs.loc[probs['School'] == school, p_cols].sum().values
        softmax_cm.append(row.tolist())
        print(*row)

    print(softmax_cm)
    # Accuracy = probability mass on the diagonal / total mass, taken over
    # all schools rather than a fixed number of them.
    total = sum(sum(row) for row in softmax_cm)
    correct = sum(softmax_cm[i][i] for i in range(n_schools))
    accuracy = correct / total if total else float('nan')
    print('\nAccuracy: ' + str(accuracy))

    print("\nHardmax confusion matrix for " + name + " model:")
    # Most probable alternative per respondent, computed once for all rows
    predicted = probs[p_cols].idxmax(axis = 1)
    for school in range(n_schools):
        chosen = predicted[probs['School'] == school]
        print(*[(chosen == 'P_' + j).sum() for j in school_codes])

    print()

In [76]:
# Disabled driver loop: the code below sits inside a bare triple-quoted string,
# so none of it executes; running this cell only echoes the string itself.
'''
for (x, y) in (('Eric', True),('Proposed', False)):
    df = full_df.copy()
    print("----------- Combined ----------")
    get_cm(x, run_model(x, y), y)
    
    #print("----------- Family ----------")
    #df = full_df[full_df['Family'] == 1]
    #get_cm(x, run_model(x, y, False), y, False)
    
    #print("----------- Non-Family ----------")
    #df = full_df[full_df['Family'] == 0]
    #get_cm(x, run_model(x, y, False), y, False)
'''

'\nfor (x, y) in ((\'Eric\', True),(\'Proposed\', False)):\n    df = full_df.copy()\n    print("----------- Combined ----------")\n    get_cm(x, run_model(x, y), y)\n    \n    #print("----------- Family ----------")\n    #df = full_df[full_df[\'Family\'] == 1]\n    #get_cm(x, run_model(x, y, False), y, False)\n    \n    #print("----------- Non-Family ----------")\n    #df = full_df[full_df[\'Family\'] == 0]\n    #get_cm(x, run_model(x, y, False), y, False)\n'

In [124]:
# Reset the working frame from the saved copy, then estimate the 'Proposed'
# model on all respondents; the returned betas are the cell's displayed value.
df = full_df.copy()
print("----------- Combined ----------")
run_model(name='Proposed', with_ASCs=True)

----------- Combined ----------


KeyboardInterrupt: 