## Data Loading and Transformation

In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd
import math
import csv

hh_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Respondents.csv')

# Load relevant columns
df = ps_df[['pscampusattend', 'personstatusgrad', 'personstatustime', 'psuniversityinvolvednumyears']]
df = df.join(hh_df[['HmTTS2006', 'hhlivingsituation']])
df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusattend': 'Campus', 'hhlivingsituation': 'Family', 
                       'personstatusgrad': 'Level', 'personstatustime': 'Status', 'psuniversityinvolvednumyears': 'Years'})
df = df.dropna() # Remove rows with missing data

# Convert Campus column to numerical column
campus_name_to_num = {"Downtown Toronto (St. George)": 0, "Scarborough (UTSC)": 1, "Mississauga (UTM)": 2,
                      "Keele": 3, "Glendon": 4, "RyersonU": 5, "OCADu": 6}
df.replace({'Campus': campus_name_to_num}, inplace=True)

# Convert columns to numerical
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Family'] = (df['Family'] == 'Live with family/parents')*1

# All campuses available to all students
df['Available'] = 1

# Dataframe with walk distances
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Function for distance lookup
not_found = set()
def find_distance(origin, destination):
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return 0
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return 0
    return dists[i*2392 + j] / 1000

# List of campus' TTS zones from Joven's MOE data
campus_zones = (69, 566, 3631, 391, 225, 38, 67)
school_codes = ('SG', 'SC', 'MI', 'YK', 'YG', 'RY', 'OC')

# Load distances into dataframe
for i in range(len(campus_zones)):
    df["Dist_" + school_codes[i]] = df['HomeZone'].apply(lambda x: find_distance(x, campus_zones[i]))
print("# of zones not found:", len(not_found))

# Load enrollment data
enrollment_df = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv').set_index('School')
def get_log_enrollment(level, school):
    return math.log1p(enrollment_df.loc[school][level]) # Level: UG, Grad, Total

# of zones not found: 127


## Running Model

In [2]:
def print_results(results, i):
    print("___________Segment " + str(i) + "__________")
    #print("n:" + str(results.getGeneralStatistics()['Sample size'][0]), "\tR^2", results.getGeneralStatistics()['Rho-square for the init. model'][0])
    print(results.getEstimatedParameters()[['Value', 'p-value']])
    print()

    
def run_model(run_name, num_segments, row_to_segment, segment_to_level = None):
    global df
    
    df_in_func = df.copy()
    df_in_func['Segment'] = df_in_func.apply(row_to_segment, axis=1)
    df_in_func = df_in_func.drop(columns=['Level', 'Status', 'Years', 'HomeZone', 'Family'])
    
    for i in range(num_segments):
        temp_df = df_in_func.copy()
        database = db.Database("SMTO", temp_df)
        globals().update(database.variables)
        database.remove(Dist_SG == 0) # Remove unknown distances
        database.remove(Segment != i)

        # Beta initialization: (name, value, lowerbound, upperbound, status, desc='')
        # Status 0 if estimated, 1 if maintained - reference choice should be 1
        
        ASCs, Vs, AVs = [], [], {}
        if segment_to_level: # Enrollment
            level = segment_to_level(i)
            log_enrols = []
            for j in range(len(school_codes)):
                log_enrols.append(get_log_enrollment(level, school_codes[j]))
                ASCs.append(Beta('ASC_' + school_codes[j], log_enrols[j], None, None, 1))           
        else:
            for j in range(len(school_codes)):
                ASCs.append(Beta('ASC_' + school_codes[j], 0, None, None, 0 if school_codes[j] != 'SG' else 1))

        B_DIST = Beta('B_DIST', -0.0005, None, None, 0)
        
        for j in range(len(school_codes)):
            Vs[j] = ASCs[j] + B_DIST * database.variables["Dist_" + school_codes[i]]     
            AVs[j] = Available

        logprob = models.loglogit(Vs, AVs, Campus)

        import biogeme.messaging as msg
        logger = msg.bioMessage()
        logger.setDebug()
                
        biogeme = bio.BIOGEME(database, logprob, numberOfThreads=1)
        biogeme.modelName = run_name + str(i)
        results = biogeme.estimate(saveIterations=False)
        print_results(results, i)
        
        full_results['Sample_Size'].append(results.getGeneralStatistics()['Sample size'][0])
        full_results['Log_Lhood'].append(results.getGeneralStatistics()['Final log likelihood'][0])
        full_results['Akaike'].append(results.getGeneralStatistics()['Akaike Information Criterion'][0])
        full_results['Bayesian'].append(results.getGeneralStatistics()['Bayesian Information Criterion'][0])
        
        full_results['B_Dist'].append(results.getEstimatedParameters()['Value']['B_DIST'])
        full_results['B_Dist_p'].append(results.getEstimatedParameters()['p-value']['B_DIST'])

## Segmentation Method (must start at 0)

In [3]:
def segments13(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return (0 if x.Status == 'FT' else 2) + (not x.Family) + (1 if x.Years == 0 else 5)
    else:
        return 9 + (0 if x.Status == 'FT' else 2) + (not x.Family)
def segments13_to_level(segment):
    if segment == 0:
        return 'Total'
    elif segment < 9:
        return 'UG'
    else:
        return 'Grad'
    
    
def segments2(x):
    return (not x.Family)*1
def segments2_to_level(segment):
    return 'Total'


def segments7(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return (not x.Family) + (1 if x.Years == 0 else 3)
    else:
        return 5 + (not x.Family)
def segments7_to_level(segment):
    if segment == 0:
        return 'Total'
    elif segment  < 5:
        return 'UG'
    else:
        return 'Grad'
    

def segments4(x):
    return (not x.Family) + (0 if (x.Level == 'UG') else 2)
def segments4_to_level(segment):
    if segment < 2:
        return 'Grad' # Includes other
    else:
        return 'UG'
    

def miller_segments(x):
    if x.Level == 'Other':
        return 0
    elif x.Level == 'UG':
        return 3 if x.Status == 'PT' else (not x.Family) + 1
    else:
        return 6 if x.Status == 'PT' else (4 + (not x.Family))
def miller_segments_to_level(segment):
    if segment == 0:
        return 'Total'
    elif segment < 4:
        return 'UG'
    else:
        return 'Grad'

## Main Code

In [4]:
#run_model("LC_Segment_", 13, segments13, segments13_to_level)
#run_model("LC_ASC_Segment_", 13, segments13)
#run_model("LC_Bool_Segment_", 2, segments2)
#run_model("LC_TriLevel_Segment_", 7, segments7, segments7_to_level)
#run_model("LC_TriLevel_ASC_Segment_", 7, segments7)
#run_model("LC_BiLevel_Segment_", 4, segments4, segments4_to_level)
#run_model("LC_BiLevel_ASC_Segment_", 4, segments4)
#run_model("LC_Miller_Segment_", 7, miller_segments, miller_segments_to_level)
#run_model("LC_Miller_ASC_Segment_", 7, miller_segments)

In [5]:
result_headers = ('Sample_Size', 'B_Dist', 'B_Dist_p', 'Log_Lhood', 'Akaike', 'Bayesian')
full_results = {}
for header in result_headers:
    full_results[header] = []
run_model("LC_Miller_ASC_Segment_", 7, miller_segments)
run_model("LC_Miller_Segment_", 7, miller_segments, miller_segments_to_level)

[15:58:26] < General >   Remove 7 unused variables from the database as only 3 are used.
[15:58:26] < General >   Log likelihood (N=309):  -601.2862
[15:58:26] < General >   Minimize with tol 1e-07
[15:58:26] < Debug >     ASC_MI:          0
[15:58:26] < Debug >     ASC_OC:          0
[15:58:26] < Debug >     ASC_RY:          0
[15:58:26] < Debug >     ASC_SC:          0
[15:58:26] < Debug >     ASC_YG:          0
[15:58:26] < Debug >     ASC_YK:          0
[15:58:26] < Debug >     B_DIST:    -0.0005
[15:58:26] < General >   Log likelihood (N=309):  -601.2862 Gradient norm:      1e+02  
[15:58:26] < Debug >     ASC_MI: -0.2483039
[15:58:26] < Debug >     ASC_OC: -0.2895243
[15:58:26] < Debug >     ASC_RY:  0.8577772
[15:58:26] < Debug >     ASC_SC:  -0.255174
[15:58:26] < Debug >     ASC_YG: -0.2276937
[15:58:26] < Debug >     ASC_YK: -0.04220185
[15:58:26] < Debug >     B_DIST:    -0.0005
[15:58:26] < General >   Log likelihood (N=309):  -482.6405 Gradient norm:      9e+01  
[15:58:26

[15:58:26] < General >   Results saved in file LC_Miller_ASC_Segment_0~05.html
[15:58:26] < General >   Results saved in file LC_Miller_ASC_Segment_0~05.pickle
___________Segment 0__________
           Value       p-value
ASC_MI -2.224623  2.267632e-09
ASC_OC -3.610918  4.680485e-07
ASC_RY  0.825834  3.133293e-09
ASC_SC -2.358152  2.470266e-09
ASC_YG -1.906169  3.661367e-09
ASC_YK -0.666479  8.392125e-04
B_DIST -0.000500  1.000000e+00

[15:58:28] < General >   Remove 7 unused variables from the database as only 3 are used.
[15:58:29] < General >   Log likelihood (N=6843):  -13315.86
[15:58:29] < General >   Minimize with tol 1e-07
[15:58:29] < Debug >     ASC_MI:          0
[15:58:29] < Debug >     ASC_OC:          0
[15:58:29] < Debug >     ASC_RY:          0
[15:58:29] < Debug >     ASC_SC:          0
[15:58:29] < Debug >     ASC_YG:          0
[15:58:29] < Debug >     ASC_YK:          0
[15:58:29] < Debug >     B_DIST:    -0.0005
[15:58:29] < General >   Log likelihood (N=6843):  -1

[15:58:31] < General >   Remove 7 unused variables from the database as only 3 are used.
[15:58:32] < General >   Log likelihood (N=3334):  -6487.664
[15:58:32] < General >   Minimize with tol 1e-07
[15:58:32] < Debug >     ASC_MI:          0
[15:58:32] < Debug >     ASC_OC:          0
[15:58:32] < Debug >     ASC_RY:          0
[15:58:32] < Debug >     ASC_SC:          0
[15:58:32] < Debug >     ASC_YG:          0
[15:58:32] < Debug >     ASC_YK:          0
[15:58:32] < Debug >     B_DIST:    -0.0005
[15:58:32] < General >   Log likelihood (N=3334):  -6487.664 Gradient norm:      6e+02  
[15:58:32] < Debug >     ASC_MI: -0.4396729
[15:58:32] < Debug >     ASC_OC: -0.4815782
[15:58:32] < Debug >     ASC_RY:  0.2059789
[15:58:32] < Debug >     ASC_SC: -0.3869033
[15:58:32] < Debug >     ASC_YG: -0.5964298
[15:58:32] < Debug >     ASC_YK:  0.1640736
[15:58:32] < Debug >     B_DIST:    -0.0005
[15:58:32] < General >   Log likelihood (N=3334):    -6003.7 Gradient norm:      4e+02  
[15:58:

[15:58:34] < Debug >     ASC_MI:  -2.051272
[15:58:34] < Debug >     ASC_OC:  -2.108428
[15:58:34] < Debug >     ASC_RY: -0.8641055
[15:58:34] < Debug >     ASC_SC:  -1.828127
[15:58:34] < Debug >     ASC_YG:  -2.801576
[15:58:34] < Debug >     ASC_YK: -0.5658853
[15:58:34] < Debug >     B_DIST:    -0.0005
[15:58:34] < General >   Log likelihood (N=689):  -1087.776 Gradient norm:      9e-05 Hessian norm:       2e+02 BHHH norm:       2e+02
[15:58:34] < General >   Results saved in file LC_Miller_ASC_Segment_3~05.html
[15:58:34] < General >   Results saved in file LC_Miller_ASC_Segment_3~05.pickle
___________Segment 3__________
           Value       p-value
ASC_MI -2.051272  0.000000e+00
ASC_OC -2.108428  0.000000e+00
ASC_RY -0.864106  3.552714e-15
ASC_SC -1.828127  0.000000e+00
ASC_YG -2.801576  0.000000e+00
ASC_YK -0.565885  1.207432e-08
B_DIST -0.000500  1.000000e+00

[15:58:35] < General >   Remove 7 unused variables from the database as only 3 are used.
[15:58:35] < General >   Log

[15:58:36] < General >   Results saved in file LC_Miller_ASC_Segment_4~05.html
[15:58:36] < General >   Results saved in file LC_Miller_ASC_Segment_4~05.pickle
___________Segment 4__________
           Value  p-value
ASC_MI -3.670858      0.0
ASC_OC -4.007328      0.0
ASC_RY -1.714798      0.0
ASC_SC -3.670858      0.0
ASC_YG -4.923621      0.0
ASC_YK -1.234744      0.0
B_DIST -0.000500      1.0

[15:58:38] < General >   Remove 7 unused variables from the database as only 3 are used.
[15:58:38] < General >   Log likelihood (N=2225):   -4329.65
[15:58:38] < General >   Minimize with tol 1e-07
[15:58:38] < Debug >     ASC_MI:          0
[15:58:38] < Debug >     ASC_OC:          0
[15:58:38] < Debug >     ASC_RY:          0
[15:58:38] < Debug >     ASC_SC:          0
[15:58:38] < Debug >     ASC_YG:          0
[15:58:38] < Debug >     ASC_YK:          0
[15:58:38] < Debug >     B_DIST:    -0.0005
[15:58:38] < General >   Log likelihood (N=2225):   -4329.65 Gradient norm:      6e+02  
[15:

[15:58:41] < Debug >     ASC_RY:  -1.880638
[15:58:41] < Debug >     ASC_SC:  -5.084836
[15:58:41] < Debug >     ASC_YG:  -4.467719
[15:58:41] < Debug >     ASC_YK: -0.9147084
[15:58:41] < Debug >     B_DIST:    -0.0005
[15:58:41] < General >   Log likelihood (N=339):  -365.1925 Gradient norm:          2  
[15:58:41] < Debug >     ASC_MI:  -5.161828
[15:58:41] < Debug >     ASC_OC:  -2.737889
[15:58:41] < Debug >     ASC_RY:  -1.813124
[15:58:41] < Debug >     ASC_SC:  -5.161828
[15:58:41] < Debug >     ASC_YG:  -4.435286
[15:58:41] < Debug >     ASC_YK: -0.9071434
[15:58:41] < Debug >     B_DIST:    -0.0005
[15:58:41] < General >   Log likelihood (N=339):  -365.1039 Gradient norm:        0.9  
[15:58:41] < Debug >     ASC_MI:  -5.199764
[15:58:41] < Debug >     ASC_OC:  -2.741271
[15:58:41] < Debug >     ASC_RY:  -1.811143
[15:58:41] < Debug >     ASC_SC:  -5.199764
[15:58:41] < Debug >     ASC_YG:   -4.40277
[15:58:41] < Debug >     ASC_YK: -0.9047902
[15:58:41] < Debug >     B_DIST:

[15:58:53] < General >   Remove 7 unused variables from the database as only 3 are used.
[15:58:54] < General >   Log likelihood (N=2225):  -2198.074
[15:58:54] < General >   Minimize with tol 1e-07
[15:58:54] < Debug >     B_DIST:    -0.0005
[15:58:54] < General >   Log likelihood (N=2225):  -2198.074 Gradient norm:      6e-14  
[15:58:54] < Debug >     B_DIST:    -0.0005
[15:58:54] < General >   Log likelihood (N=2225):  -2198.074 Gradient norm:      6e-14 Hessian norm:       8e-12 BHHH norm:       9e-27
[15:58:54] < General >   Results saved in file LC_Miller_Segment_5~04.html
[15:58:54] < General >   Results saved in file LC_Miller_Segment_5~04.pickle
___________Segment 5__________
         Value  p-value
B_DIST -0.0005      1.0

[15:58:55] < General >   Remove 7 unused variables from the database as only 3 are used.
[15:58:55] < General >   Log likelihood (N=339):  -388.2551
[15:58:55] < General >   Minimize with tol 1e-07
[15:58:55] < Debug >     B_DIST:    -0.0005
[15:58:55] < G

In [6]:
with open('Run_Results.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    headers = ("Segment", 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6)
    writer.writerow(headers)
    for header in result_headers:
        writer.writerow([header] + full_results[header])

In [7]:
df.head()

Unnamed: 0,Campus,Level,Status,Years,HomeZone,Family,Available,Dist_SG,Dist_SC,Dist_MI,Dist_YK,Dist_YG,Dist_RY,Dist_OC
0,1,UG,FT,2.0,261,1,1,10.25606,14.88098,29.20657,22.59214,9.218413,9.580635,11.24173
1,0,Grad,FT,3.0,71,0,1,1.132351,23.0392,19.6429,15.87906,11.21115,2.675173,2.723838
2,0,UG,FT,5.0,3714,1,1,23.31923,45.63271,4.51742,28.58045,32.5552,24.964,23.68615
3,0,UG,FT,2.0,74,0,1,0.699414,24.11954,19.43932,16.81186,12.83041,2.314008,1.541276
4,0,Grad,FT,6.0,71,0,1,1.132351,23.0392,19.6429,15.87906,11.21115,2.675173,2.723838


In [8]:
df_in_func = df.copy()
df_in_func = df_in_func.drop(columns=['Level', 'Status', 'Years', 'HomeZone', 'Family'])
database = db.Database("SMTO", df_in_func)
database.variables

{'Campus': Campus,
 'Available': Available,
 'Dist_SG': Dist_SG,
 'Dist_SC': Dist_SC,
 'Dist_MI': Dist_MI,
 'Dist_YK': Dist_YK,
 'Dist_YG': Dist_YG,
 'Dist_RY': Dist_RY,
 'Dist_OC': Dist_OC}

In [9]:
database.data

Unnamed: 0,Campus,Available,Dist_SG,Dist_SC,Dist_MI,Dist_YK,Dist_YG,Dist_RY,Dist_OC
0,1,1,10.256060,14.88098,29.20657,22.59214,9.218413,9.580635,11.241730
1,0,1,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838
2,0,1,23.319230,45.63271,4.51742,28.58045,32.555200,24.964000,23.686150
3,0,1,0.699414,24.11954,19.43932,16.81186,12.830410,2.314008,1.541276
4,0,1,1.132351,23.03920,19.64290,15.87906,11.211150,2.675173,2.723838
...,...,...,...,...,...,...,...,...,...
15221,3,1,7.120260,19.01731,23.76046,14.19530,5.732956,6.553092,8.193741
15222,3,1,15.917590,12.03644,32.96591,17.68772,6.019180,15.242170,16.903260
15223,3,1,2.783940,25.09743,17.84462,15.53600,12.829600,4.733398,3.979057
15224,3,1,23.379880,26.15476,37.35434,13.30458,15.379040,23.250580,24.518920
