In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv

# Load the SMTO 2015 input file and drop rows whose Level is 'Other'.
# .copy() detaches the filtered frame from the one read from disk, so the
# column assignments below never write into a slice (SettingWithCopyWarning).
df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df = df[df['Level'] != 'Other'].copy()

# Encode each campus code as its position in school_codes.
# A code that is not listed raises ValueError.
school_codes = ['SG', 'SC', 'MI', 'YK','YG', 'RY','OC']
df['School'] = df['School'].apply(lambda x: school_codes.index(x))

# Per-campus enrolment column: take 'UG.<code>' on rows whose Level is 'UG'
# and 'Grad.<code>' on every other row (vectorized, one pass per campus).
is_undergrad = df['Level'] == 'UG'
for code in school_codes:
    df['Enrol.' + code] = df['UG.' + code].where(is_undergrad, df['Grad.' + code])
df.columns

Index(['Campus', 'Level', 'Status', 'Mode_Actual', 'Gender', 'Licence', 'Work',
       'Age', 'HomeZone', 'Family',
       ...
       'Closest.YG', 'Closest.YK', 'PD', 'Enrol.SG', 'Enrol.SC', 'Enrol.MI',
       'Enrol.YK', 'Enrol.YG', 'Enrol.RY', 'Enrol.OC'],
      dtype='object', length=107)

In [2]:
# --- Model configuration ---
name = 'Proposed'
with_ASCs = False   # when True, the ASC of every campus except 'SG' gets status 0
combined = True     # when True, add the distance x Family interaction term

# Biogeme database built from the numeric columns of df only.
database = db.Database("SMTO", df.select_dtypes(include = 'number'))
ASCs, V, av = [], {}, {}

# Last Beta argument is the status flag; only the status-0 betas show up in
# the printed estimates below.
B_DIST = Beta('B_DIST', 0, None, None, 0)
B_ENROL = Beta('B_ENROL', 1, None, None, 1)
B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)

# One utility function per campus, keyed by the campus position in school_codes.
for i, code in enumerate(school_codes):
    asc_status = 0 if with_ASCs and code != 'SG' else 1
    ASCs.append(Beta('ASC_' + code, 0, None, None, asc_status))
    dist = database.variables['Dist.' + code]
    utility = ASCs[i] + B_ENROL * database.variables["Enrol." + code] + B_DIST * dist
    if combined:
        utility = utility + B_FAM_DIST * dist * database.variables["Family"]
    V[i] = utility
    av[i] = 1  # every campus is available to every respondent

# Weighted log-likelihood of the observed school choice.
logprob = models.loglogit(V, av, database.variables["School"])
test_dict = {'loglike': logprob, 'weight': database.variables["Exp_Segment"]}
biogeme = bio.BIOGEME(database, test_dict, numberOfThreads=1)
biogeme.modelName = name
results = biogeme.estimate(saveIterations=True)
betas = results.getBetaValues()

# Header line, estimated betas, then a blank line.
print("Results for " + name + " model:", betas, "", sep="\n")

Results for Proposed model:
{'B_DIST': -0.13475000404502052, 'B_FAM_DIST': 0.06520057534791564}



In [3]:
# Load 2019 Data
full_df = pd.read_csv('../../Data/SMTO_2019/SMTO_2019_Complete_Input.csv')

# Campus codes in order of first appearance: all campuses, then split by School_Type.
new_school_codes = full_df['School'].unique().tolist()
uni_codes = full_df.loc[full_df['School_Type'] == 'University', 'School'].unique().tolist()
col_codes = full_df.loc[full_df['School_Type'] == 'College', 'School'].unique().tolist()


def _school_position(school_code):
    """Return the position of school_code within new_school_codes."""
    return new_school_codes.index(school_code)


# Convert School column to numeric
full_df['School'] = full_df['School'].apply(_school_position)

# Remove rows with missing information, then store Family as an integer
full_df = full_df.dropna(subset=['Family'])
full_df['Family'] = full_df['Family'].mul(1).astype(int)

# Only the choice, the Family flag and the per-campus distances go into the database.
dist_cols = ['Dist.' + code for code in new_school_codes]
cols_to_keep = ['School', 'Family', *dist_cols]
new_database = db.Database("SMTO_2019", full_df[cols_to_keep])

In [4]:
# Load enrollment data, indexed by the 'Code' column
enrol_path = '../../Data/School_Info_2019_Pred_Enrol.csv'
enrol_df = pd.read_csv(enrol_path).set_index('Code')

def code_to_log_enrol(code):
    """
    Look up the 'Total' enrollment of the campus `code` in enrol_df and
    return its natural logarithm.

    Raises KeyError if `code` is not in the enrol_df index.
    A missing (NaN) total propagates as NaN.
    """
    total_enrolment = enrol_df.loc[code, 'Total']
    return math.log(total_enrolment)

In [None]:
# Build the 2019 utilities in fresh dicts rather than writing into the ones
# left over from estimation, so no stale 2015 alternative can leak into the
# simulated choice set.
V, av = {}, {}
for i, code in enumerate(new_school_codes):
    # Log-enrolment enters as a plain number (implicit coefficient of 1).
    enrollment = code_to_log_enrol(code)
    dist = new_database.variables['Dist.' + code]
    V[i] = enrollment + B_DIST * dist + B_FAM_DIST * dist * new_database.variables["Family"]
    av[i] = 1  # every campus is available to every respondent

# One simulated probability column per campus, named 'Prob.<code>'.
simulate = {'Prob.' + code: models.logit(V, av, i) for i, code in enumerate(new_school_codes)}
sim_biogeme = bio.BIOGEME(new_database, simulate)
# Re-attach the full_df index so probabilities can be aligned with full_df rows.
probs = sim_biogeme.simulate(betas).set_index(full_df.index)
hard_cm, soft_cm = [], []

In [14]:
def get_accuracy(cm):
    """
    Given confusion matrix as 2D array (list of rows), return accuracy.

    Accuracy is the sum of the diagonal divided by the sum of all entries,
    expressed as a percentage. Raises ZeroDivisionError if the entries sum to 0.
    """
    correct = sum(cm[i][i] for i in range(len(cm)))
    total = sum(sum(row) for row in cm)
    return correct / total * 100


# Start from empty matrices so re-running this cell does not append duplicate rows.
hard_cm, soft_cm = [], []
prob_cols = ['Prob.' + code for code in new_school_codes]
for i in range(len(new_school_codes)):
    # Simulated probabilities of the respondents whose observed school is i.
    school_probs = probs.loc[full_df['School'] == i, prob_cols]
    # Hard assignment: count how often each campus has the highest probability.
    predicted = school_probs.idxmax(axis=1)
    hard_cm.append([(predicted == col).sum() for col in prob_cols])
    # Soft assignment: total probability mass given to each campus.
    soft_cm.append(school_probs.sum().values.tolist())

print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))

Hardmax Accuracy: 34.33 %
Softmax Accuracy: 24.94 %


In [10]:
# Column totals of the simulated probabilities (one column per campus), smallest first
probs.sum(axis=0).sort_values(ascending=True)

Prob.CEG       0.968568
Prob.CDS       1.021997
Prob.CPI       1.120589
Prob.CDV       1.862359
Prob.MOI       7.038287
Prob.MCB      28.247471
Prob.MOS      60.012749
Prob.CST      68.320504
Prob.DWH      70.854962
Prob.OTN      76.301257
Prob.CAS     104.138213
Prob.YG      107.871307
Prob.MCM     151.282823
Prob.SHH     164.719527
Prob.CMO     170.737135
Prob.OC      181.751952
Prob.OTD     213.942204
Prob.SHD     340.141023
Prob.SHT     350.197483
Prob.DOS     393.129659
Prob.SC      493.711189
Prob.CPR     520.416063
Prob.MI      599.352757
Prob.MOF     746.401724
Prob.RY     1487.914747
Prob.YK     1747.985050
Prob.SG     2915.558400
dtype: float64

In [12]:
# Number of rows per encoded school value, left unsorted
full_df.loc[:, 'School'].value_counts(sort=False)

0      299
8      509
16     299
24     523
1      157
9       63
17      67
25    1287
2       59
10    1033
18    2288
26      90
3       39
11      27
19     280
4       12
12     260
20     214
5        2
13      31
21     150
6        1
14      11
22    2498
7       11
15     258
23     537
Name: School, dtype: int64