In [1]:
# Import packages
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv
import numpy as np

# Load the complete SMTO 2019 input table
full_df = pd.read_csv('../../../Data/SMTO_2019/SMTO_2019_Complete_Input.csv')

# Campus codes in order of first appearance: every school, then split by type.
# These lists are built before any rows are dropped further down.
school_codes = full_df['School'].unique().tolist()
uni_codes = full_df.loc[full_df['School_Type'] == 'University', 'School'].unique().tolist()
col_codes = full_df.loc[full_df['School_Type'] == 'College', 'School'].unique().tolist()

# Recode School as an integer: the position of the campus code in school_codes
full_df['School'] = full_df['School'].apply(lambda campus: school_codes.index(campus))

# Discard rows without a Family value, then store Family as an integer
# (multiplying by 1 first, as the original did, before the integer cast)
full_df = (
    full_df
    .dropna(subset=['Family'])
    .assign(Family=lambda frame: (frame['Family'] * 1).astype(int))
)

# Per-school-type views of the cleaned table
uni_df = full_df.loc[full_df['School_Type'] == 'University']
col_df = full_df.loc[full_df['School_Type'] == 'College']

In [2]:
# Load the per-campus enrollment table, indexed by campus code
enrol_df = (
    pd.read_csv('../../../Data/School_Info_2019.csv')
    .set_index('Code')
)

def code_to_log_enrol(code):
    """
    Look up a campus in enrol_df by its code and return the natural
    logarithm of its 'Total' enrollment.

    Raises KeyError when the code is not present in the enrollment table.
    When the stored total is NaN (no enrollment information), the result
    is NaN as well.
    """
    campus_row = enrol_df.loc[code]
    total_enrollment = campus_row['Total']
    return math.log(total_enrollment)

In [3]:
def get_accuracy(cm):
    """
    Given a confusion matrix as a 2D array, return the accuracy in percent.

    cm is indexed as cm[i][j]; the diagonal entries cm[i][i] are counted as
    correct and every entry contributes to the total. Rows may be lists,
    tuples or any other iterable of numbers (e.g. rows of a numpy array),
    not only lists.

    A matrix whose entries sum to zero is not handled specially: with plain
    Python numbers the division raises ZeroDivisionError.
    """
    correct = sum(cm[i][i] for i in range(len(cm)))
    # Flat left-to-right sum over all cells. Unlike sum(cm, []), this does not
    # require every row to be a list and does not build intermediate lists.
    total = sum(value for row in cm for value in row)
    return correct / total * 100

In [16]:
# Estimate a multinomial logit of school choice for each (data, campus codes) pair.
for df, codes in ((full_df, school_codes),):
    # Alternative ids must equal the values stored in the numeric 'School'
    # column, which was encoded as each code's position in school_codes.
    # Taking them from df['School'].unique() and pairing them positionally with
    # codes is only right if every school still has rows in df and first
    # appears in the same order as in codes; otherwise utilities are attached
    # to the wrong alternative, or the loop runs out of ids.
    school_nums = [school_codes.index(code) for code in codes]
    cols_to_keep = ['School', 'Family'] + ['Dist.' + code for code in codes]
    database = db.Database("SMTO_2019", df[cols_to_keep])
    ASCs, V, av = [], {}, {}
    B_DIST = Beta('B_DIST', 0, None, None, 0)
    B_FAM_DIST = Beta('B_FAM_DIST', 0, None, None, 0)

    for code, school_num in zip(codes, school_nums):
        dist = database.variables['Dist.' + code]
        enrollment = code_to_log_enrol(code)
        if np.isnan(enrollment):  # No enrollment information available
            # Estimate an alternative-specific constant for this campus
            asc = Beta('ASC_' + code, 0, None, None, 0)
        else:
            # Known enrollment: log(enrollment) enters the utility as a fixed constant
            asc = enrollment
        ASCs.append(asc)
        # Utility: constant + distance effect + distance x Family interaction
        V[school_num] = asc + B_DIST * dist + B_FAM_DIST * dist * database.variables["Family"]
        av[school_num] = 1  # every campus is treated as available to everyone

    logprob = models.loglogit(V, av, database.variables["School"])
    biogeme = bio.BIOGEME(database, logprob, numberOfThreads=1)
    results = biogeme.estimate()
    betas = results.getBetaValues()
    print(betas)
    
    # Disabled: simulation and confusion-matrix evaluation, kept as an inert string literal.
    """simulate = {'Prob.' + codes[i]: models.logit(V, av, school_nums[i]) for i in range(len(codes))}
    sim_biogeme = bio.BIOGEME(database, simulate)
    probs = sim_biogeme.simulate(betas).set_index(df.index)    
    hard_cm, soft_cm = [], []
    for i in range(len(codes)):
        hard_cm.append([(probs[full_df['School'] == school_nums[i]][['Prob.' + j for j in codes]].idxmax(axis = 1) == 'Prob.' + k).sum() for k in codes])
        soft_cm.append((probs[full_df['School'] == school_nums[i]][['Prob.' + j for j in codes]].sum().values.tolist()))    

    print("Hardmax Accuracy: {:2.2f} %".format(get_accuracy(hard_cm)))
    print("Softmax Accuracy: {:2.2f} %".format(get_accuracy(soft_cm)))"""

{'ASC_CDS': 3.024670519423971, 'ASC_CDV': 3.7427251877395284, 'ASC_CMO': 8.30295391970257, 'ASC_MCB': 6.679542367791751, 'ASC_MCM': 14.763427577264302, 'ASC_OTD': 8.248844567444067, 'ASC_OTN': 9.565721862253193, 'B_DIST': -0.036415079482295803, 'B_FAM_DIST': -0.002423058541245792}


In [17]:
# Report each estimated ASC together with exp(ASC).
print("\tEstimated ASC\t\tEstimated Enrollment")
for asc in ASCs:
    # Float entries are constants that were not estimated, so they have no
    # entry in betas; only the remaining (Beta expression) entries are reported.
    if not isinstance(asc, float):
        # The parameter name is the text before the first "(" in the expression's string form
        name = str(asc).split("(")[0]
        estimate = betas[name]
        print("\t".join([name, str(estimate), str(math.exp(estimate))]))

	Estimated ASC		Estimated Enrollment
ASC_CMO	8.30295391970257	4035.7761624998
ASC_CDV	3.7427251877395284	42.21287155723552
ASC_CDS	3.024670519423971	20.5872205043546
ASC_MCM	14.763427577264302	2580329.281556645
ASC_MCB	6.679542367791751	795.9547741149407
ASC_OTN	9.565721862253193	14267.248321270277
ASC_OTD	8.248844567444067	3823.2058119570465
