In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv

df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df = df[df['Level'] != 'Other']
school_codes = ['SG', 'SC', 'MI', 'YK','YG', 'RY','OC']
df['School'] = df['School'].apply(lambda x: school_codes.index(x))
df['Extra_Util'] = df['Closest_School'].replace({'SG':0, 'SC':1, 'MI':1, 'YK':1,'YG':0, 'RY':0,'OC':0})
df['Closest_School_Code'] = df['Closest_School'].replace({'SG':1, 'SC':2, 'MI':3, 'YK':4,'YG':5, 'RY':6,'OC':7})

for code in school_codes:
    df['Enrol.' + code] = df.apply(lambda x: x['UG.' + code] if x['Level'] == 'UG' else x['Grad.' + code], axis=1)
df.columns
full_df = df.copy()

In [2]:
def run_model(name, with_ASCs, combined = True):
    database = db.Database("SMTO", df.select_dtypes(include = 'number'))
    ASCs, V, av = [], {}, {}
    B_DIST = Beta('B_DIST', 0, None, None, 0)
    B_ENROL = Beta('B_ENROL', 1, None, None, 1 if with_ASCs else 0)
    B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)
    B_CLOSEST_SCHOOL = Beta('B_CLOSEST_SCHOOL', 0, None, None, 0)

    for i in range(len(school_codes)):
        code = school_codes[i]
        ASCs.append(Beta('ASC_' + code, 0, None, None, 0 if with_ASCs and code != 'SG' else 1))
        if combined:
            V[i] = ASCs[i] + B_CLOSEST_SCHOOL * database.variables["Extra_Util"] + B_ENROL *  database.variables["Enrol." + code] + B_DIST * database.variables['Dist.' + code] + B_FAM_DIST * database.variables["Dist." + code] * database.variables["Family"]
        else:
            V[i] = ASCs[i] + B_CLOSEST_SCHOOL * database.variables["Extra_Util"] + B_ENROL *  database.variables["Enrol." + code] + B_DIST * database.variables['Dist.' + code]
        av[i] = 1   
        
    logprob = models.loglogit(V, av, database.variables["School"])
    test_dict = {'loglike': logprob, 'weight': database.variables["Exp_Segment"]}
    biogeme  = bio.BIOGEME(database,test_dict,numberOfThreads=1)
    biogeme.modelName = name
    results = biogeme.estimate(saveIterations=True)
    betas = results.getBetaValues()
    
    print("Results for " + name + " model:")
    print(betas)
    print()
    return betas

In [3]:
school_codes

['SG', 'SC', 'MI', 'YK', 'YG', 'RY', 'OC']

In [4]:
def get_cm(name, betas, with_ASCs, combined = True):
    
    school_code_nums = [1,2,3,4,5,6,7]
    for i in range(len(school_codes)):
        code = school_codes[i]
        code_num = school_code_nums[i]
        if with_ASCs:
            if combined:
                df['V_'+ code] = (betas['ASC_' + code] if code != 'SG' else 0) + ((betas['B_CLOSEST_SCHOOL'] * df['Extra_Util']) if (df['Closest_School_Code'] == code_num) else 0)  + betas['B_DIST'] * df['Dist.' + code] + betas['B_FAM_DIST'] * df['Dist.' + code] * df['Family'] + df['Total.' + code]
            else:
                df['V_'+ code] = (betas['ASC_' + code] if code != 'SG' else 0) + ((betas['B_CLOSEST_SCHOOL'] * df['Extra_Util']) if (df['Closest_School_Code'] == code_num) else 0) + betas['B_DIST'] * df['Dist.' + code] + df['Total.' + code]
        else:
            if combined:
                df['V_'+ code] = ((betas['B_CLOSEST_SCHOOL'] * df['Extra_Util']) if (df['Closest_School_Code'] == code_num) else 0) + betas['B_DIST'] * df['Dist.' + code] + betas['B_FAM_DIST'] * df['Dist.' + code] * df['Family'] + df['Total.' + code] * betas['B_ENROL']
            else:
                df['V_'+ code] = ((betas['B_CLOSEST_SCHOOL'] * df['Extra_Util']) if (df['Closest_School_Code'] == code_num) else 0) + betas['B_DIST'] * df['Dist.' + code] + df['Total.' + code] * betas['B_ENROL']

    utils = df.iloc[:,-7:]
    for i in range(len(school_codes)):
        code = school_codes[i]
        df['P_' + code] = utils.apply(lambda x: math.exp(x['V_' + code]) / sum([math.exp(j) for j in x]), axis = 1)
    probs = pd.concat((df['School'], df.iloc[:,-7:]), axis=1)
    
    print("Softmax confusion matrix for " + name + " model:")
    softmax_cm = []
    for school in range(len(school_codes)):
        softmax_cm.append(probs[probs['School'] == school][['P_' + i for i in school_codes]].sum().values.tolist())
        print(*probs[probs['School'] == school][['P_' + i for i in school_codes]].sum().values)
    
    print(softmax_cm)
    accuracy = 0
    total = 0
    for i in range(7):
        total += sum(softmax_cm[i])
        accuracy += softmax_cm[i][i]
    accuracy = accuracy/total
    print('\nAccuracy: ' + str(accuracy))
    
    
    print("\nHardmax confusion matrix for " + name + " model:")
    for school in range(len(school_codes)):
        print(*[(probs[probs['School'] == school][['P_' + i for i in school_codes]].idxmax(axis = 1) == 'P_' + j).sum() for j in school_codes])
    
    print()

In [5]:
for (x, y) in (('Eric', True),('Proposed', False)):
    df = full_df.copy()
    print("----------- Combined ----------")
    get_cm(x, run_model(x, y), y)
    
    #print("----------- Family ----------")
    #df = full_df[full_df['Family'] == 1]
    #get_cm(x, run_model(x, y, False), y, False)
    
    #print("----------- Non-Family ----------")
    #df = full_df[full_df['Family'] == 0]
    #get_cm(x, run_model(x, y, False), y, False)

----------- Combined ----------
Results for Eric model:
{'ASC_MI': 0.2011806898377365, 'ASC_OC': 0.06357656413189902, 'ASC_RY': 0.06470721849183579, 'ASC_SC': 0.3618409607319917, 'ASC_YG': 0.20797003569327974, 'ASC_YK': 0.2374327087295077, 'B_CLOSEST_SCHOOL': 0.0, 'B_DIST': -0.13582354467887925, 'B_FAM_DIST': 0.07077240285997809}



ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().