In [1]:
import csv
import math

import numpy as np
import pandas as pd

import biogeme.biogeme as bio
import biogeme.database as db
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable

# Load the input table and drop rows whose Level is 'Other'.
# The explicit .copy() gives an independent frame, so the column assignments
# below do not write into a slice of the frame returned by read_csv.
df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Complete_Input.csv')
df = df[df['Level'] != 'Other'].copy()

# Order matters: a school's position in this list is its integer alternative id.
school_codes = ['SG', 'SC', 'MI', 'YK','YG', 'RY','OC']

# Replace each school code by its index in school_codes
# (list.index raises ValueError for a code that is not listed).
df['School'] = df['School'].apply(lambda x: school_codes.index(x))

# Enrol.<code> takes the UG.<code> value for rows with Level == 'UG'
# and the Grad.<code> value for every other row.
is_undergrad = df['Level'] == 'UG'
for code in school_codes:
    df['Enrol.' + code] = df['UG.' + code].where(is_undergrad, df['Grad.' + code])

# Pristine snapshot; later cells rebuild the working `df` from it.
full_df = df.copy()

In [2]:
def run_model(name, with_ASCs, combined = True):
    """Estimate a logit model over the schools in ``school_codes`` with Biogeme.

    The numeric columns of the module-level ``df`` form the Biogeme database.
    Each school's utility is ``ASC + B_ENROL * Enrol.<code> + B_DIST * Dist.<code>``,
    plus ``B_FAM_DIST * Dist.<code> * Family`` when ``combined`` is true.

    Parameters
    ----------
    name : str
        Assigned to the Biogeme ``modelName`` and used in the printed header.
    with_ASCs : bool
        If true, every ASC except ``ASC_SG`` is created with status 0 and
        ``B_ENROL`` with status 1; if false, all ASCs get status 1 and
        ``B_ENROL`` status 0.  (In Biogeme's ``Beta`` signature the fifth
        argument is the status flag: 0 = estimated, 1 = held fixed.)
    combined : bool, default True
        Whether to include the ``Family`` x distance interaction term.

    Returns
    -------
    The object returned by ``results.getBetaValues()`` (also printed).
    """
    database = db.Database("SMTO", df.select_dtypes(include = 'number'))
    variables = database.variables

    beta_dist = Beta('B_DIST', 0, None, None, 0)
    beta_enrol = Beta('B_ENROL', 1, None, None, 1 if with_ASCs else 0)
    beta_fam_dist = Beta('B_FAM_DIST', 0, None, None, 0)

    utilities = {}
    availability = {}
    for alt, code in enumerate(school_codes):
        # 'SG' always keeps status 1, as does every ASC when with_ASCs is false.
        asc_status = 0 if with_ASCs and code != 'SG' else 1
        asc = Beta('ASC_' + code, 0, None, None, asc_status)

        utility = asc + beta_enrol * variables["Enrol." + code] + beta_dist * variables['Dist.' + code]
        if combined:
            utility = utility + beta_fam_dist * variables["Dist." + code] * variables["Family"]

        utilities[alt] = utility
        availability[alt] = 1  # every alternative is marked available

    logprob = models.loglogit(utilities, availability, variables["School"])
    formulas = {'loglike': logprob, 'weight': variables["Exp_Segment"]}

    estimator = bio.BIOGEME(database, formulas, numberOfThreads=1)
    estimator.modelName = name
    betas = estimator.estimate(saveIterations=True).getBetaValues()

    print("Results for " + name + " model:")
    print(betas)
    print()
    return betas

In [3]:
def get_cm(name, betas, with_ASCs, combined = True):
    """Print softmax and hardmax confusion matrices for an estimated model.

    Works on the module-level ``df`` and adds (or overwrites) one ``V_<code>``
    utility column and one ``P_<code>`` probability column per entry of
    ``school_codes``.

    Parameters
    ----------
    name : str
        Label used in the printed headers.
    betas : mapping
        Coefficients indexed by name.  ``B_DIST`` is always read;
        ``B_FAM_DIST`` only when ``combined``; ``ASC_<code>`` (for every code
        except 'SG') only when ``with_ASCs``; ``B_ENROL`` only when
        ``with_ASCs`` is false.
    with_ASCs : bool
        If true, utilities include the ASCs (0 for 'SG') and add
        ``Total.<code>`` with an implicit coefficient of 1; otherwise
        ``Total.<code>`` is scaled by ``betas['B_ENROL']`` and no ASC is added.
    combined : bool, default True
        Whether to add the ``B_FAM_DIST * Dist.<code> * Family`` term.

    Returns
    -------
    None.  Each printed row is one observed school (in ``school_codes``
    order); each printed column is one predicted school.
    """
    utility_cols = ['V_' + code for code in school_codes]
    prob_cols = ['P_' + code for code in school_codes]

    # NOTE(review): run_model builds its utilities from 'Enrol.<code>', while
    # this function uses 'Total.<code>' - confirm that this is intended.
    for code in school_codes:
        dist = df['Dist.' + code]
        if with_ASCs:
            constant = betas['ASC_' + code] if code != 'SG' else 0
            size_term = df['Total.' + code]
        else:
            constant = 0
            size_term = df['Total.' + code] * betas['B_ENROL']

        utility = constant + betas['B_DIST'] * dist
        if combined:
            utility = utility + betas['B_FAM_DIST'] * dist * df['Family']
        df['V_' + code] = utility + size_term

    # Select the utility columns by name rather than by position, so the
    # result does not depend on the number of schools or on whether df
    # already carried V_/P_ columns from an earlier call.
    utils = df[utility_cols]

    # Row-wise softmax.  Subtracting each row's maximum leaves the
    # probabilities unchanged but keeps exp() from overflowing.
    shifted = utils.sub(utils.max(axis=1), axis=0)
    exp_utils = np.exp(shifted)
    shares = exp_utils.div(exp_utils.sum(axis=1), axis=0)
    for code in school_codes:
        df['P_' + code] = shares['V_' + code]

    probs = df[['School'] + prob_cols]

    print("Softmax confusion matrix for " + name + " model:")
    for school in range(len(school_codes)):
        print(*probs.loc[probs['School'] == school, prob_cols].sum().values)

    print("\nHardmax confusion matrix for " + name + " model:")
    # Most probable school per row, computed once and then grouped by the observed school.
    predicted = probs[prob_cols].idxmax(axis = 1)
    for school in range(len(school_codes)):
        chosen = predicted[probs['School'] == school]
        print(*[(chosen == 'P_' + j).sum() for j in school_codes])

    print()

In [4]:
# For each model specification (label, with_ASCs flag), estimate the model and
# print its confusion matrices on three samples: all rows, rows with
# Family == 1, and rows with Family == 0.  Only the full sample uses the
# Family x distance interaction (combined=True).
for model_name, with_ascs in (('Eric', True), ('Proposed', False)):
    segments = (
        ("----------- Combined ----------", full_df, True),
        ("----------- Family ----------", full_df[full_df['Family'] == 1], False),
        ("----------- Non-Family ----------", full_df[full_df['Family'] == 0], False),
    )
    for header, sample, combined in segments:
        # run_model and get_cm read the module-level `df`, and get_cm adds
        # columns to it.  Copying the sample keeps full_df untouched and
        # avoids pandas' SettingWithCopyWarning on the filtered slices.
        df = sample.copy()
        print(header)
        get_cm(model_name, run_model(model_name, with_ascs, combined), with_ascs, combined)

----------- Combined ----------
Results for Eric model:
{'ASC_MI': 0.20118036434246545, 'ASC_OC': 0.06357574560641173, 'ASC_RY': 0.06470701039785538, 'ASC_SC': 0.36184190572835867, 'ASC_YG': 0.20797065171077458, 'ASC_YK': 0.23743216956064692, 'B_DIST': -0.13582353230363345, 'B_FAM_DIST': 0.070772379326524}

Softmax confusion matrix for Eric model:
2571.0958699644843 296.1062537823324 318.9483554565836 1116.0588218313687 82.90544763234428 1358.7188693866754 168.16638194621174
296.759531066591 278.10217582127405 32.24789831226019 252.62209606550823 24.723977124900294 170.13747391022576 19.406847699239265
258.74707517143986 24.09326729550723 268.9495529148846 219.11405763242885 9.051148519045764 132.20602726933146 17.838871197362614
876.5932074607556 199.6668318016154 244.30293837014975 1194.4075110708636 49.897944232907385 461.8407533330817 57.29081373063864
103.64810137502175 26.11821796136037 21.7082745321475 91.49939809468509 7.537367085437445 57.79831009146681 6.690330859881192
948.1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Softmax confusion matrix for Eric model:
731.4452896547597 207.1366933268724 204.11198650330502 682.5714462152771 40.2012125335824 538.0141739314413 48.51919783476053
216.49161019985877 144.36722666289373 27.620839920651612 204.51176856212302 16.6689887716554 167.20441530690198 14.135150575915011
183.1898168454982 20.63372724026189 144.5738412646765 171.0488711480401 6.677967285230955 127.2789314489335 12.596844767358723
532.526923246266 162.28161265515064 187.90466622434812 679.455718714101 31.699977667458256 385.1137795138622 35.01732197881524
57.787765606893224 20.839468047497174 17.03511891488182 59.10711466464752 3.540822196424408 42.89333229532482 3.796378274331062
521.9294051403011 173.70718195812418 178.06064886225818 539.7084859890844 29.912010140170285 383.02402068286534 34.658247227195005
65.44737109142417 22.21753802557699 20.038732189078203 65.45549931701363 3.9070817856116586 48.595141754966306 4.338635836329097

Hardmax confusion matrix for Eric model:
1125 181 154 992 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Softmax confusion matrix for Proposed model:
842.3552339794345 182.6029438510732 208.93024851897377 658.8290169622529 49.27855578241533 447.8990947811222 62.1049061247268
252.96606270612315 129.56238609449798 28.315291578591225 199.7422102945148 20.753777492970954 141.30332815948577 18.35694367381574
208.43868905905293 17.563208252687847 148.6334973016911 162.86029568505595 8.007600065285363 104.55488785316874 15.941821783058387
614.2014854591675 143.21480710279624 192.98624436902085 658.756688009343 38.920805085631265 321.03515892284986 44.88481105119242
66.78016988687202 18.47353125946857 17.476311481641027 57.19915699302885 4.355108743206829 35.83991614834922 4.875805487433508
601.9318553387928 153.8963940525037 182.71298499479212 521.9371983730996 36.740723943334054 319.3561852305199 44.42465806695916
75.51893999116179 19.669030614691614 20.548032738646693 63.352083921936604 4.804872323668174 40.54310347931282 5.56393693058236

Hardmax confusion matrix for Proposed model:
1313 142 