In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv

df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df = df[df['Level'] != 'Other']
school_codes = ['SG', 'SC', 'MI', 'YK','YG', 'RY','OC']
df['School'] = df['School'].apply(lambda x: school_codes.index(x))

In [2]:
# Load 2019 Data
new_df = pd.read_csv('../../Data/SMTO_2019/SMTO_2019_Complete_Input.csv')
new_school_codes = new_df['School'].unique().tolist()
uni_codes = new_df[new_df['School_Type'] == 'University']['School'].unique().tolist()

# Convert School column to numeric
new_df['School'] = new_df['School'].apply(lambda x: new_school_codes.index(x))

# Remove rows with missing information
new_df['Family'] = (new_df['Family'] == True) * 1
new_df['Family'].value_counts()

0    10460
1     6056
Name: Family, dtype: int64

In [3]:
for code in school_codes:
    df['Closest.' + code] = ((df['Closest.' + code]) & (df['Dist.' + code] <= 2)).astype(int)
    new_df['Closest.' + code] = ((new_df['Closest.' + code]) & (new_df['Dist.' + code] <= 2)).astype(int)

In [4]:
# Load enrollment data
enrol_df = pd.read_csv('../../Data/School_Info_2019_Pred_Enrol.csv').set_index('Code')

def code_to_log_enrol(code):
    return math.log(enrol_df.loc[code]['Total'])

def get_accuracy(cm):
    correct = sum([cm[i][i] for i in range(len(cm))])
    return correct/sum(sum(cm,[])) * 100

In [5]:
# No Closest model
database = db.Database("SMTO_2015", df.select_dtypes(include = 'number'))
V, av = {}, {}
B_DIST = Beta('B_DIST', 0, None, None, 0)
B_ENROL = Beta('B_ENROL', 1, None, None, 0)
B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)

for i in range(len(school_codes)):
    code = school_codes[i]
    V[i] = B_ENROL * database.variables["Total." + code] + database.variables['Dist.' + code] * (B_DIST + B_FAM_DIST * database.variables["Family"])
    av[i] = 1   

logprob = models.loglogit(V, av, database.variables["School"])
test_dict = {'loglike': logprob, 'weight': database.variables["Exp_Segment"]}
biogeme  = bio.BIOGEME(database, test_dict)
results = biogeme.estimate()
betas = results.getBetaValues()
betas

{'B_DIST': -0.13706131520670153,
 'B_ENROL': 1.006296222787749,
 'B_FAM_DIST': 0.06619384981254627}

In [6]:
simulate = {'Prob.' + school_codes[i]: models.logit(V, av, i) for i in range(len(school_codes))}
sim_biogeme = bio.BIOGEME(database, simulate)
probs = sim_biogeme.simulate(betas).set_index(df.index)    

hard_cm, soft_cm = [], []
for i in range(len(school_codes)):
    hard_cm.append([(probs[df['School'] == i][['Prob.' + j for j in school_codes]].idxmax(axis = 1) == 'Prob.' + k).sum() for k in school_codes])
    soft_cm.append((probs[df['School'] == i][['Prob.' + j for j in school_codes]].sum().values.tolist()))     

print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))

Hardmax Accuracy: 47.15 %
Softmax Accuracy: 34.31 %


In [7]:
# Closest model
B_DIST = Beta('B_DIST', 0, None, None, 0)
B_ENROL = Beta('B_ENROL', 1, None, None, 0)
B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)
B_CLOSEST = Beta('B_CLOSEST', 0, None, None, 0)

for i in range(len(school_codes)):
    code = school_codes[i]
    V[i] = B_ENROL * database.variables["Total." + code] + database.variables['Dist.' + code] * (B_DIST + B_FAM_DIST * database.variables["Family"]) + B_CLOSEST * database.variables['Closest.' + code]

logprob = models.loglogit(V, av, database.variables["School"])
test_dict = {'loglike': logprob, 'weight': database.variables["Exp_Segment"]}
biogeme  = bio.BIOGEME(database, test_dict)
results = biogeme.estimate()
betas2 = results.getBetaValues()
betas2

{'B_CLOSEST': 0.8494102407297817,
 'B_DIST': -0.11911691175045107,
 'B_ENROL': 1.0067965524075508,
 'B_FAM_DIST': 0.05008697540446607}

In [8]:
probs = sim_biogeme.simulate(betas2).set_index(df.index)    

hard_cm, soft_cm = [], []
for i in range(len(school_codes)):
    hard_cm.append([(probs[df['School'] == i][['Prob.' + j for j in school_codes]].idxmax(axis = 1) == 'Prob.' + k).sum() for k in school_codes])
    soft_cm.append((probs[df['School'] == i][['Prob.' + j for j in school_codes]].sum().values.tolist()))     

print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))

Hardmax Accuracy: 47.16 %
Softmax Accuracy: 33.58 %


In [9]:
cols = ['Dist.' + code for code in new_school_codes] + ['Closest.' + code for code in new_school_codes] + ['Family']
new_database = db.Database("SMTO_2019", new_df[cols])
for i in range(len(new_school_codes)):
    code = new_school_codes[i]
    V[i] = B_ENROL * code_to_log_enrol(code) + new_database.variables['Dist.' + code] * (B_DIST + B_FAM_DIST * new_database.variables["Family"])
    av[i] = 1
           
simulate = {'Prob.' + new_school_codes[i]: models.logit(V, av, i) for i in range(len(new_school_codes))}
sim_biogeme = bio.BIOGEME(new_database, simulate)
probs = sim_biogeme.simulate(betas).set_index(new_df.index)    

hard_cm, soft_cm = [], []
for i in range(len(new_school_codes)):
    hard_cm.append([(probs[new_df['School'] == i][['Prob.' + j for j in new_school_codes]].idxmax(axis = 1) == 'Prob.' + k).sum() for k in new_school_codes])
    soft_cm.append((probs[new_df['School'] == i][['Prob.' + j for j in new_school_codes]].sum().values.tolist()))     

print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))

Hardmax Accuracy: 33.06 %
Softmax Accuracy: 24.68 %


In [10]:
for i in range(len(new_school_codes)):
    code = new_school_codes[i]
    V[i] = B_ENROL * code_to_log_enrol(code) + new_database.variables['Dist.' + code] * (B_DIST + B_FAM_DIST * new_database.variables["Family"])+ B_CLOSEST * new_database.variables['Closest.' + code]
           
simulate = {'Prob.' + new_school_codes[i]: models.logit(V, av, i) for i in range(len(new_school_codes))}
sim_biogeme = bio.BIOGEME(new_database, simulate)
probs = sim_biogeme.simulate(betas2).set_index(new_df.index)    

hard_cm, soft_cm = [], []
for i in range(len(new_school_codes)):
    hard_cm.append([(probs[new_df['School'] == i][['Prob.' + j for j in new_school_codes]].idxmax(axis = 1) == 'Prob.' + k).sum() for k in new_school_codes])
    soft_cm.append((probs[new_df['School'] == i][['Prob.' + j for j in new_school_codes]].sum().values.tolist()))     

print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))

Hardmax Accuracy: 32.21 %
Softmax Accuracy: 25.54 %
