## Data Loading and Transformation

In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv

hh_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Respondents.csv')

# Load relevant columns
df = ps_df[['pscampusattend', 'personstatusgrad', 'personstatustime', 'psuniversityinvolvednumyears']]
df = df.join(hh_df[['HmTTS2006', 'hhlivingsituation']])
df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'hhlivingsituation': 'Family', 
                       'personstatusgrad': 'Level', 'personstatustime': 'Status', 'psuniversityinvolvednumyears': 'Years'})
df = df.dropna() # Remove rows with missing data

# Convert Campus column to numerical column
campus_name_to_num = {"Downtown Toronto (St. George)": 0, "Scarborough (UTSC)": 1, "Mississauga (UTM)": 2,
                      "Keele": 3, "Glendon": 4, "RyersonU": 5, "OCADu": 6}
df.replace({'Campus': campus_name_to_num}, inplace=True)

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Family'] = (df['Family'] == 'Live with family/parents')*1

# All campuses available to all students
df['Available'] = 1

# Dataframe with walk distances
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Function for distance lookup
not_found = set()
def find_distance(origin, destination):
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return 0
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return 0
    return dists[i*2392 + j] / 1000

# List of campus' TTS zones from Joven's MOE data
campus_zones = [69, 566, 3631, 391, 225, 38, 67]

# Load distances into dataframe
for i in range(len(campus_zones)):
    df["Dist" + str(i)] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i]))
print("# of zones not found:", len(not_found))

# Load enrollment data
enrollment_df = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv').set_index('School')
def get_log_enrollment(level, school):
    return math.log1p(enrollment_df.loc[school][level]) # Level: UG, Grad, Total

# of zones not found: 127


## Running Model

In [10]:
def print_results(results, i):
    print("___________Segment " + str(i) + "__________")
    #print("n:" + str(results.getGeneralStatistics()['Sample size'][0]), "\tR^2", results.getGeneralStatistics()['Rho-square for the init. model'][0])
    print(results.getEstimatedParameters()[['Value', 'p-value']])
    print()

    
def run_model(run_name, num_segments, row_to_segment, segment_to_level = None):
    global df
    
    df_in_func = df.copy()
    df_in_func['Segment'] = df_in_func.apply(row_to_segment, axis=1)
    df_in_func = df_in_func.drop(columns=['Level', 'Status', 'Years', 'HomeZone', 'Family'])
    
    for i in range(num_segments):
        temp_df = df_in_func.copy()
        database = db.Database("SMTO", temp_df)
        globals().update(database.variables)
        database.remove(Dist0 == 0) # Remove unknown distances
        database.remove(Segment != i)

        # Beta initialization: (name, value, lowerbound, upperbound, status, desc='')
        # Status 0 if estimated, 1 if maintained - reference choice should be 1
        if segment_to_level: # Enrollment
            level = segment_to_level(i)
            ASC_SG = Beta('ASC_SG', get_log_enrollment(level, 'SG'), None, None, 1)
            ASC_SC = Beta('ASC_SC', get_log_enrollment(level, 'SC'), None, None, 1)
            ASC_MI = Beta('ASC_MI', get_log_enrollment(level, 'MI'), None, None, 1)
            ASC_YK = Beta('ASC_YK', get_log_enrollment(level, 'YK'), None, None, 1)
            ASC_YG = Beta('ASC_YG', get_log_enrollment(level, 'YG'), None, None, 1)
            ASC_RY = Beta('ASC_RY', get_log_enrollment(level, 'RY'), None, None, 1)
            ASC_OC = Beta('ASC_OC', get_log_enrollment(level, 'OC'), None, None, 1)
        else:
            ASC_SG = Beta('ASC_SG', 0, None, None, 0)
            ASC_SC = Beta('ASC_SC', 0, None, None, 0)
            ASC_MI = Beta('ASC_MI', 0, None, None, 0)
            ASC_YK = Beta('ASC_YK', 0, None, None, 0)
            ASC_YG = Beta('ASC_YG', 0, None, None, 1)
            ASC_RY = Beta('ASC_RY', 0, None, None, 0)
            ASC_OC = Beta('ASC_OC', 0, None, None, 0)

        B_DIST = Beta('B_DIST', 0, None, None, 0)

        # Variables: from columns in database
        AV = DefineVariable('AV', Available, database)
        SG_DIST = DefineVariable('SG_DIST', Dist0, database)
        SC_DIST = DefineVariable('SC_DIST', Dist1, database)
        MI_DIST = DefineVariable('MI_DIST', Dist2, database)
        YK_DIST = DefineVariable('YK_DIST', Dist3, database)
        YG_DIST = DefineVariable('YG_DIST', Dist4, database)
        RY_DIST = DefineVariable('RY_DIST', Dist5, database)
        OC_DIST = DefineVariable('OC_DIST', Dist6, database)

        # Utility Functions: note ASC_YG is 0
        V0 = ASC_SG + B_DIST * SG_DIST
        V1 = ASC_SC + B_DIST * SC_DIST
        V2 = ASC_MI + B_DIST * MI_DIST
        V3 = ASC_YK + B_DIST * YK_DIST
        V4 = ASC_YG + B_DIST * YG_DIST
        V5 = ASC_RY + B_DIST * RY_DIST
        V6 = ASC_OC + B_DIST * OC_DIST

        V  = {0: V0, 1: V1, 2: V2, 3: V3, 4: V4, 5: V5, 6: V6}
        av = {0: AV, 1: AV, 2: AV, 3: AV, 4: AV, 5: AV, 6: AV}

        logprob = models.loglogit(V, av, Campus)

        biogeme = bio.BIOGEME(database, logprob, numberOfThreads=1)
        biogeme.modelName = run_name + str(i)
        results = biogeme.estimate(saveIterations=False)
        print_results(results, i)
        
        full_results['Sample_Size'].append(results.getGeneralStatistics()['Sample size'][0])
        full_results['Log_Lhood'].append(results.getGeneralStatistics()['Final log likelihood'][0])
        full_results['Akaike'].append(results.getGeneralStatistics()['Akaike Information Criterion'][0])
        full_results['Bayesian'].append(results.getGeneralStatistics()['Bayesian Information Criterion'][0])
        
        full_results['B_Dist'].append(results.getEstimatedParameters()['Value']['B_DIST'])
        full_results['B_Dist'].append(results.getEstimatedParameters()['p-value']['B_DIST'])

## Segmentation Method (must start at 0)

In [3]:
def segments13(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return (0 if x.Status == 'FT' else 2) + (not x.Family) + (1 if x.Years == 0 else 5)
    else:
        return 9 + (0 if x.Status == 'FT' else 2) + (not x.Family)
def segments13_to_level(segment):
    if segment == 0:
        return 'Total'
    elif segment < 9:
        return 'UG'
    else:
        return 'Grad'
    
    
def segments2(x):
    return (not x.Family)*1
def segments2_to_level(segment):
    return 'Total'


def segments7(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return (not x.Family) + (1 if x.Years == 0 else 3)
    else:
        return 5 + (not x.Family)
def segments7_to_level(segment):
    if segment == 0:
        return 'Total'
    elif segment  < 5:
        return 'UG'
    else:
        return 'Grad'
    

def segments4(x):
    return (not x.Family) + (0 if (x.Level == 'UG') else 2)
def segments4_to_level(segment):
    if segment < 2:
        return 'Grad' # Includes other
    else:
        return 'UG'
    

def miller_segments(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return 3 if x.Status == 'PT' else (not x.Family) + 1
    else:
        return 6 if x.Status == 'PT' else (4 + (not x.Family))
def miller_segments_to_level(segment):
    if segment == 0:
        return 'Total'
    elif segment < 4:
        return 'UG'
    else:
        return 'Grad'

## Main Code

In [4]:
#run_model("LC_Segment_", 13, segments13, segments13_to_level)
#run_model("LC_ASC_Segment_", 13, segments13)
#run_model("LC_Bool_Segment_", 2, segments2)
#run_model("LC_TriLevel_Segment_", 7, segments7, segments7_to_level)
#run_model("LC_TriLevel_ASC_Segment_", 7, segments7)
#run_model("LC_BiLevel_Segment_", 4, segments4, segments4_to_level)
#run_model("LC_BiLevel_ASC_Segment_", 4, segments4)
#run_model("LC_Miller_Segment_", 7, miller_segments, miller_segments_to_level)
#run_model("LC_Miller_ASC_Segment_", 7, miller_segments)

In [11]:
result_headers = ('Sample_Size', 'B_Dist', 'B_Dist_p', 'Log_Lhood', 'Akaike', 'Bayesian')
full_results = {}
for header in result_headers:
    full_results[header] = []
run_model("LC_Miller_ASC_Segment_", 7, miller_segments)
#run_model("LC_Miller_Segment_", 7, miller_segments, miller_segments_to_level)

73        6.776886
87        1.572915
155       1.297744
243       1.897934
278       5.671636
           ...    
14877     9.536488
14929    41.918740
14953    21.296900
15130    18.082850
15175    12.097850
Name: Dist0, Length: 309, dtype: float64
Dist0


AttributeError: 'Series' object has no attribute 'setRow'

In [None]:
"""
with open('Run_Results.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    headers = ("Segment", 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6)
    writer.writerow(headers)
    for header in result_headers:
        writer.writerow([header] + full_results[header])
"""