## Data Loading and Transformation (Run Once)

In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable, exp, log
import pandas as pd
import math
import csv

# Read the SMTO 2015 household and respondent tables plus the campus lookup
hh_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Households.csv')
ps_df = pd.read_csv('../../Data/SMTO_2015/SMTO_2015_Respondents.csv')
campus_info = pd.read_csv('Campus_Info.csv')

# Survey column name -> short name used throughout the notebook
column_names = {
    'HmTTS2006': 'HomeZone',
    'pscampusattend': 'Campus',
    'hhlivingsituation': 'Family',
    'personstatusgrad': 'Level',
    'personstatustime': 'Status',
    'psuniversityinvolvednumyears': 'Years',
    'psmainmodefalltypical': 'Mode',
}
respondent_columns = ['pscampusattend', 'personstatusgrad', 'personstatustime',
                      'psuniversityinvolvednumyears', 'psmainmodefalltypical']
household_columns = ['HmTTS2006', 'hhlivingsituation']

# Select the relevant columns (joined on the row index), rename them and
# discard rows with missing data
df = (
    ps_df[respondent_columns]
    .join(hh_df[household_columns])
    .rename(columns=column_names)
    .dropna()
)


def campus_row(school_name):
    """Return the row label in campus_info of the first row whose 'School' equals school_name."""
    matching_rows = campus_info.index[campus_info['School'] == school_name]
    return matching_rows.tolist()[0]


# Replace the campus name with its row label in campus_info
df['Campus'] = df['Campus'].apply(campus_row)

# Mode names -> mode codes: 0 for car/taxi, 1 for transit, 2 for walk/bicycle
mode_name_to_num = {
    "Car - Driver alone": 0,
    "Car - Driver with passenger(s)": 0,
    "Car - Passenger": 0,
    "Taxi": 0,
    "Transit Bus": 1,
    "Streetcar": 1,
    "Subway/RT": 1,
    "GO Bus": 1,
    "GO Train": 1,
    "Walk": 2,
    "Bicycle": 2,
}
df = df.replace({'Mode': mode_name_to_num})
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')

# Family becomes 1 for respondents living with family/parents, 0 otherwise
df['Family'] = (df['Family'] == 'Live with family/parents') * 1

# Keep only the rows whose mode was mapped to one of the three codes
df = df[df['Mode'].isin([0, 1, 2])]
df['Mode'] = pd.to_numeric(df['Mode'], downcast='signed')

# All campuses available to all students
df['Available'] = 1

# Level-of-service tables. Each 'Data' column is used by find_value() as a
# flattened origin-by-destination matrix, addressed as i * 2392 + j where
# i and j are positions of the zones in `origins`.

# Dataframe with walk distances
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
# Zone ids in order of first appearance in the file. A set must not be used
# here: set iteration order is not guaranteed to follow the file order, and
# find_value() relies on each zone's position to index the flat matrices.
origins = df_path['Origin'].unique().tolist()
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

# Function for distance lookup
not_found = set()  # zone ids that were requested but are absent from `origins`

# Row length used to address the flattened level-of-service lists.
ZONES_PER_ROW = 2392

# Cached zone id -> position map for `origins`. It is rebuilt whenever
# `origins` is rebound to a different list object (in-place edits of the
# same list are not detected).
_zone_index_cache = {'source': None, 'positions': {}}


def _zone_positions():
    """Return a dict mapping each zone id to its first position in `origins`."""
    if _zone_index_cache['source'] is not origins:
        positions = {}
        for position, zone in enumerate(origins):
            # setdefault keeps the first position, matching list.index()
            positions.setdefault(zone, position)
        _zone_index_cache['source'] = origins
        _zone_index_cache['positions'] = positions
    return _zone_index_cache['positions']


def find_value(origin, destination, mode):
    """Look up a travel time or distance between two zones.

    Parameters:
        origin: zone id of the trip origin.
        destination: zone id of the trip destination.
        mode: 0 - AIVTT (AutoTravelTimes / 60), 1 - TPTT
            (TransitTravelTimes / 60), 2 - Dist (dists / 1000).

    Returns:
        The scaled table entry for the zone pair. Returns 0 when either zone
        is not in `origins` (the missing zone is recorded in `not_found`;
        the destination is only checked if the origin was found) or when
        `mode` is not 0, 1 or 2 (a message is printed in that case).
    """
    positions = _zone_positions()

    i = positions.get(origin)
    if i is None:
        not_found.add(origin)
        return 0

    j = positions.get(destination)
    if j is None:
        not_found.add(destination)
        return 0

    # Position of the (origin, destination) pair in the flattened tables
    offset = i * ZONES_PER_ROW + j

    if mode == 0:
        return AutoTravelTimes[offset] / 60
    elif mode == 1:
        return TransitTravelTimes[offset] / 60
    elif mode == 2:
        return dists[offset] / 1000
    else:
        print("Invalid mode!")
        return 0

# Campus TTS zones (from Joven's MOE data) and the matching school codes
campus_zones = list(campus_info['Zone'])
school_codes = list(campus_info['Code'])

# (column name, find_value mode) pairs, in the order the columns are created
measures = (("AIVTT", 0), ("TPTT", 1), ("DIST", 2))

# One column per campus and measure, looked up from each respondent's home zone
for code, zone in zip(school_codes, campus_zones):
    for measure_name, measure_mode in measures:
        df[code + "_" + measure_name] = df['HomeZone'].apply(
            lambda home: find_value(home, zone, measure_mode))
print("# of zones not found:", len(not_found))

# The same measures for the campus each respondent actually attends
for measure_name, measure_mode in measures:
    df[measure_name] = df.apply(
        lambda row: find_value(row.HomeZone, campus_zones[row.Campus], measure_mode),
        axis=1)

# Remove rows with unknowns: keep a row only if its SG or SC distance is positive
has_known_distance = df[['SG_DIST', 'SC_DIST']].gt(0).any(axis=1)
df = df[has_known_distance]

# All modes and campuses available to all students
df['Available'] = 1

# Enrollment table, indexed by its 'School' column
enrollment_df = pd.read_csv('../../Data/Enrolment/Joven_Enrollment.csv')
enrollment_df = enrollment_df.set_index('School')


def get_log_enrollment(level, school):
    """Return log(1 + enrollment) for `school` in column `level` (UG, Grad or Total)."""
    school_row = enrollment_df.loc[school]
    return math.log1p(school_row[level])

# of zones not found: 122


## Mode Choice Model

In [2]:
# Estimate a three-alternative mode choice logit on the full sample.
# Alternatives follow the Mode coding built during data loading:
# 0 = car/taxi, 1 = transit, 2 = walk/bicycle.
# The resulting `betas` dict is read later by run_model().

# Work on a copy without the segmentation columns
# (presumably because the Biogeme database expects numeric columns - TODO confirm)
temp = df.copy()
temp = temp.drop(columns=['Level', 'Status', 'Years', 'HomeZone', 'Family'])

# Load data into Biogeme database
database = db.Database("SMTO_Full_Data", temp)

# Make variable names global
# (the column names used below - AIVTT, TPTT, DIST, Available, Mode - come from this call)
globals().update(database.variables)

# Beta initialization: (name, value, lowerbound, upperbound, status, desc='')
# Status 0 if estimated, 1 if maintained - reference choice should be 1
ASC_AU = Beta('ASC_AU', 0, None, None, 1)
ASC_TR = Beta('ASC_TR', 0, None, None, 0)
ASC_AC = Beta('ASC_AC', 0, None, None, 0)
B_AIVTT = Beta('B_AIVTT', -0.05, None, None, 0)
B_TPTT = Beta('B_TPTT', -0.05, None, None, 0)
B_DIST = Beta('B_DIST', -0.0005, None, None, 0)

# Mode Choice Utility Functions: ASC_AU is 0
# Each utility uses only its own attribute: auto time, transit time, distance
V0 = ASC_AU + B_AIVTT * AIVTT
V1 = ASC_TR + B_TPTT * TPTT
V2 = ASC_AC + B_DIST * DIST

# Model definition
# Keys are the mode codes; every alternative uses the same Available column
V  = {0: V0, 1: V1, 2: V2}
av = {0: Available, 1: Available, 2: Available}
logprob = models.loglogit(V, av, Mode)

# Create the Biogeme object
biogeme  = bio.BIOGEME(database, logprob, numberOfThreads=1)
biogeme.modelName = "Mode_Choice_Model"

# Estimate the parameters
results = biogeme.estimate(saveIterations=True)
# Keep only the estimated values; the results object itself is discarded
betas = results.getBetaValues()
del results
# Bare expression so the notebook displays the estimated betas
betas

{'ASC_AC': 4.02808766197126,
 'ASC_TR': 2.012491812096565,
 'B_AIVTT': -0.5737258653133533,
 'B_DIST': -0.48027241851348973,
 'B_TPTT': -0.24055582623340382}

## Running Model

In [3]:
def print_results(results, i):
    """Print a banner for segment `i` followed by the estimates' Value and p-value columns.

    `results` must provide getEstimatedParameters() returning a table with
    'Value' and 'p-value' columns. A blank line is printed at the end.
    """
    banner = f"___________Segment {i}__________"
    estimates = results.getEstimatedParameters()
    print(banner)
    print(estimates[['Value', 'p-value']])
    print()

    
def run_model(run_name, num_segments, row_to_segment, segment_to_level = None):
    """Estimate one campus-choice logit model per segment and record the results.

    Reads the module-level `df`, `betas` (mode choice estimates) and
    `school_codes`, and appends one entry per segment to every list in the
    module-level `full_results` dict, which must already contain the keys
    'Sample_Size', 'Log_Lhood', 'Akaike', 'Bayesian', 'B_ACCESS',
    'B_ACCESS_P' and 'ASC_<code>' for each school code.

    Parameters:
        run_name: prefix of the Biogeme model name; the segment number is appended.
        num_segments: number of segments; segments 0 .. num_segments - 1 are estimated.
        row_to_segment: function applied to each row of `df` returning its segment number.
        segment_to_level: optional function mapping a segment number to an
            enrollment level. When given, each campus constant is fixed at
            get_log_enrollment(level, code); otherwise the constants are
            estimated, with 'YG' fixed at 0.
    """
    df_in_func = df.copy()
    df_in_func['Segment'] = df_in_func.apply(row_to_segment, axis=1)
    df_in_func = df_in_func.drop(columns=['Level', 'Status', 'Years', 'HomeZone', 'Family'])

    for i in range(num_segments):
        # Fresh database per segment, since rows are removed from it below
        database = db.Database("SMTO", df_in_func.copy())
        variables = database.variables
        # Still published as globals, as before, in case other cells use the names
        globals().update(variables)
        database.remove(variables['SG_DIST'] == 0) # Remove unknown distances
        database.remove(variables['Segment'] != i)

        if segment_to_level: # Enrollment
            level = segment_to_level(i)
            log_enroll = [get_log_enrollment(level, code) for code in school_codes]
            # Status 1: constants are held at the log-enrollment values
            ASCs = [Beta('ASC_' + code, value, None, None, 1)
                    for code, value in zip(school_codes, log_enroll)]
        else:
            # Constants are estimated, except 'YG' which is held at 0
            ASCs = [Beta('ASC_' + code, 0, None, None, 0 if code != 'YG' else 1)
                    for code in school_codes]

        B_ACCESS = Beta('B_ACCESS', 0, None, None, 0)

        V, av = {}, {}
        for j, code in enumerate(school_codes):
            # Log of the summed exponentiated mode utilities to this campus,
            # using the mode choice estimates (the auto term has no constant)
            accessibility = log(exp(variables[code + "_AIVTT"] * betas['B_AIVTT']) +
                                exp(variables[code + "_DIST"] * betas['B_DIST'] + betas['ASC_AC']) +
                                exp(variables[code + "_TPTT"] * betas['B_TPTT'] + betas['ASC_TR']))
            av[j] = variables['Available']
            V[j] = ASCs[j] + B_ACCESS * accessibility

        logprob = models.loglogit(V, av, variables['Campus'])

        biogeme = bio.BIOGEME(database, logprob, numberOfThreads=1)
        biogeme.modelName = run_name + str(i)
        results = biogeme.estimate(saveIterations=False)
        print_results(results, i)

        # Fetch each results table once instead of once per recorded value
        statistics = results.getGeneralStatistics()
        estimates = results.getEstimatedParameters()

        full_results['Sample_Size'].append(statistics['Sample size'][0])
        full_results['Log_Lhood'].append(statistics['Final log likelihood'][0])
        full_results['Akaike'].append(statistics['Akaike Information Criterion'][0])
        full_results['Bayesian'].append(statistics['Bayesian Information Criterion'][0])
        full_results['B_ACCESS'].append(estimates['Value']['B_ACCESS'])
        full_results['B_ACCESS_P'].append(estimates['p-value']['B_ACCESS'])

        if segment_to_level:
            for code, value in zip(school_codes, log_enroll):
                full_results['ASC_' + code].append(value)
        else:
            for code in school_codes:
                # 'YG' is fixed at 0, so it is recorded directly rather than read from the estimates
                full_results['ASC_' + code].append(estimates['Value']['ASC_' + code] if code != 'YG' else 0)
        

## Segmentation Methods (segment numbers must start at 0)

In [4]:
def segments13(x):
    """Return the segment number (0-12) of row `x` in the 13-segment scheme.

    'Other' level is segment 0. UG rows fall in 1-4 when Years == 0 and 5-8
    otherwise; every other level falls in 9-12. Within each group, +2 when
    Status is not 'FT' and +1 when Family is falsy.
    """
    if x.Level == 'Other':
        return 0
    status_offset = 0 if x.Status == 'FT' else 2
    away_offset = not x.Family
    if x.Level == 'UG':
        years_offset = 1 if x.Years == 0 else 5
        return status_offset + away_offset + years_offset
    return 9 + status_offset + away_offset
def segments13_to_level(segment):
    """Return the enrollment level for a segments13 segment: 0 -> 'Total', below 9 -> 'UG', else 'Grad'."""
    if segment >= 9:
        return 'Grad'
    return 'UG' if segment != 0 else 'Total'
    
    
def segments2(x):
    """Return 0 when row `x` has a truthy Family value, 1 otherwise."""
    return 0 if x.Family else 1
def segments2_to_level(segment):
    """Return the enrollment level for a segments2 segment; always 'Total'."""
    return 'Total'


def segments7(x):
    """Return the segment number (0-6) of row `x` in the 7-segment scheme.

    'Other' level is segment 0. UG rows fall in 1-2 when Years == 0 and 3-4
    otherwise; every other level falls in 5-6. Within each pair, +1 when
    Family is falsy.
    """
    if x.Level == 'Other':
        return 0
    away_offset = not x.Family
    if x.Level != 'UG':
        return 5 + away_offset
    years_offset = 1 if x.Years == 0 else 3
    return away_offset + years_offset
def segments7_to_level(segment):
    """Return the enrollment level for a segments7 segment: 0 -> 'Total', below 5 -> 'UG', else 'Grad'."""
    if segment >= 5:
        return 'Grad'
    return 'UG' if segment != 0 else 'Total'
    

def segments4(x):
    """Return the segment number (0-3) of row `x` in the 4-segment scheme.

    UG rows fall in 0-1 and every other level in 2-3; within each pair,
    +1 when Family is falsy.
    """
    level_offset = 0 if x.Level == 'UG' else 2
    return (not x.Family) + level_offset
def segments4_to_level(segment):
    """Return the enrollment level for a segments4 segment.

    segments4() assigns UG rows to segments 0-1 and all other levels to
    segments 2-3, so segments below 2 map to 'UG' and the rest to 'Grad'.
    (The previous mapping was inverted.)
    """
    if segment < 2:
        return 'UG'
    else:
        return 'Grad' # Includes other
    

def miller_segments(x):
    """Return the segment number (0-6) of row `x` in the Miller scheme.

    'Other' level is segment 0. UG rows use 1-3 and every other level 4-6:
    the first two of each group are non-'PT' rows with truthy and falsy
    Family respectively, and the third is Status 'PT'.
    """
    if x.Level == 'Other':
        return 0
    first_segment = 1 if x.Level == 'UG' else 4
    if x.Status == 'PT':
        return first_segment + 2
    return first_segment + (not x.Family)
def miller_segments_to_level(segment):
    """Return the enrollment level for a Miller segment: 0 -> 'Total', below 4 -> 'UG', else 'Grad'."""
    if segment >= 4:
        return 'Grad'
    return 'UG' if segment != 0 else 'Total'

## Main Code

In [5]:
# Alternative runs (disabled):
#run_model("LC_Segment_", 13, segments13, segments13_to_level)
#run_model("LC_ASC_Segment_", 13, segments13)
#run_model("LC_Bool_Segment_", 2, segments2, segments2_to_level)
#run_model("LC_Bool_ASC_Segment_", 2, segments2)
#run_model("LC_TriLevel_Segment_", 7, segments7, segments7_to_level)
#run_model("LC_TriLevel_ASC_Segment_", 7, segments7)
#run_model("LC_BiLevel_Segment_", 4, segments4, segments4_to_level)
#run_model("LC_BiLevel_ASC_Segment_", 4, segments4)

# Result rows: sample size and accessibility estimate, one ASC per school,
# then the fit statistics
result_headers = (
    ['Sample_Size', 'B_ACCESS', 'B_ACCESS_P']
    + ['ASC_' + code for code in school_codes]
    + ['Log_Lhood', 'Akaike', 'Bayesian']
)

# run_model() appends one value per segment to each of these lists
full_results = {header: [] for header in result_headers}

# Miller segmentation: first with estimated ASCs, then with the
# enrollment-based ASCs
run_model("LC_Miller_ASC_Segment_", 7, miller_segments)
run_model("LC_Miller_Segment_", 7, miller_segments, miller_segments_to_level)

___________Segment 0__________
             Value       p-value
ASC_MI   -0.157172  7.479480e-01
ASC_OC   -1.963596  1.163815e-02
ASC_RY    2.449956  1.978417e-13
ASC_SC   -0.446260  3.797255e-01
ASC_SG    1.595978  3.786613e-06
ASC_YK    1.398151  1.301061e-04
B_ACCESS  1.576996  1.398881e-14

___________Segment 1__________
             Value   p-value
ASC_MI    1.090280  0.000000
ASC_OC    0.003987  0.968774
ASC_RY    2.127983  0.000000
ASC_SC    1.224162  0.000000
ASC_SG    2.139558  0.000000
ASC_YK    2.169079  0.000000
B_ACCESS  1.292926  0.000000

___________Segment 2__________
             Value       p-value
ASC_MI    0.791563  1.207512e-07
ASC_OC   -0.633404  1.272531e-05
ASC_RY    0.596828  3.722337e-06
ASC_SC    0.591976  7.558308e-05
ASC_SG    1.514116  0.000000e+00
ASC_YK    1.927695  0.000000e+00
B_ACCESS  2.240838  0.000000e+00

___________Segment 3__________
             Value       p-value
ASC_MI    0.756802  1.161835e-02
ASC_OC    0.457886  1.255972e-01
ASC_RY    1.67

In [6]:
# Header row: "Segment" followed by the segment numbers 0-6 twice
# (one set of columns per run_model call)
headers = ("Segment",) + tuple(range(7)) * 2

# One CSV row per result header, holding the values recorded for every segment
with open('Run_Results.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(headers)
    writer.writerows([header] + full_results[header] for header in result_headers)