# Biogeme Test - SMTO Data

Gonna try to adapt the Biogeme Swissmetro example to university choice using SMTO data set. We'll try to predict __mode__ choice using distance.

In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable
import pandas as pd

hh_df = pd.read_csv('../../Data/SMTO_2019/SMTO_2019_Households.csv', low_memory = False)
ps_df = pd.read_csv('../../Data/SMTO_2019/SMTO_2019_Respondents.csv', low_memory = False)

In [2]:
# Load relevant columns
df = ps_df[['pscampusmain']]
df = df.join(ps_df['psinstitution'])
df = df.join(hh_df[['HmTTS2006']])
df = df.join(ps_df[['psmainmodefalltypical']])
df = df.rename(columns={'HmTTS2006': 'HomeZone', 'pscampusmain': 'Campus', 'psmainmodefalltypical': 'Mode'})
#df = df.dropna() # Remove rows with missing data

# Convert Campus and Mode column to numerical column
campus_name_to_num = {"Story Arts Centre ": 0, "Ashtonbee Campus": 1, "Progress Campus": 2,
                      "Morningside Campus": 3, "Downsview Campus": 4, "Eglinton Learning Site": 5, "Pickering Learning Site": 6,
                      "Performing Arts Commons at Daniels Spectrum": 7, "Oshawa Campus": 8, "Whitby Campus": 9,
                      "McMaster Campus": 10, "Burlington Campus/Ron Joyce Centre": 11, "Fennell Campus (Main Campus)": 12, "Stoney Creek Campus ": 13,
                      "Institute for Applied Health Sciences": 14, "Downtown Oshawa Campus": 15, "North Oshawa Campus": 16,
                      "Davis Campus, Brampton": 17, "Hazel McCallion Campus, Mississauga": 18, "Trafalgar Road Campus, Oakville": 19, "Mississauga Campus": 20,
                      "Scarborough Campus": 21, "St. George Campus": 22, "Keele Campus": 23,
                      "Glendon Campus": 24}

mode_name_to_num = {"Drive alone": 0, "Drive with passenger(s) (household members only)": 0, "Drive with passenger(s) (including non-household members)": 0, "Auto passenger (driver is a household member)":0, "Auto passenger (driver is a non-household member)":0, "Ride-hailing alone (UberX, Lyft etc.)":0, "Ride-hailing with other passengers (Uberpool, Lyftpool etc.)": 0, "Taxi": 0, 
                    "Transit Bus": 1, "Streetcar": 1, "Subway/RT": 1, "GO Bus": 1, "GO Train": 1, 
                    "Walk": 2, "Bicycle": 2}
#"Bikeshare": 2

df.replace({'Campus': campus_name_to_num}, inplace=True)
df.replace({'Mode': mode_name_to_num}, inplace=True)
print(df.shape)
#temp_df = df[df['psinstitution'] == 'Ryerson University']
#temp_df.head()

df.loc[df['psinstitution'] == 'Ryerson University', 'Campus'] = 25
df.loc[df['psinstitution'] == 'OCAD University', 'Campus'] = 26

# Remove "Other" Modes
df = df[(df['Mode'] == 0) | (df['Mode'] == 1) | (df['Mode'] == 2)]
print(df.shape)
df['HomeZone'] = pd.to_numeric(df['HomeZone'], downcast='signed')
df['Mode'] = pd.to_numeric(df['Mode'], downcast='signed')
df['Campus'] = pd.to_numeric(df['Campus'], downcast='signed')
del df['psinstitution']

df.head()

(19135, 4)
(10561, 4)


Unnamed: 0,Campus,HomeZone,Mode
0,2,3851.0,1
1,3,181.0,1
2,2,1039.0,1
5,2,544.0,1
11,2,548.0,1


In [3]:
# Dataframe with walk distances
df_path = pd.read_csv('../../../LoS/Walk_Distances.csv')
origins = list(set(list(df_path['Origin'])))
dists = list(df_path['Data'])

# Dataframe with AutoTravelTimes
df_att = pd.read_csv('../../../LoS/Auto_Travel_Times.csv')
AutoTravelTimes = list(df_att['Data'])

# Dataframe with TransitTravelTimes
df_ttt = pd.read_csv('../../../LoS/Transit_Travel_Times.csv')
TransitTravelTimes = list(df_ttt['Data'])

In [4]:
not_found = set()
# Function for distance/AutoTravelTime/TransitTravelTime lookup

def find_value(origin, destination, mode_num):
    # mode_num: 0 - to find AutoTravelTime
    #           1 - to find TransitTravelTime
    #           2 - to find Walking Distance
    try:
        i = origins.index(origin)
    except ValueError:
        not_found.add(origin)
        return 0
    try:
        j = origins.index(destination)
    except ValueError:
        not_found.add(destination)
        return 0
    
    if mode_num == 0:
        return AutoTravelTimes[i*2392 + j]
    elif mode_num == 1:
        return TransitTravelTimes[i*2392 + j]
    elif mode_num == 2:
        return dists[i*2392 + j] / 1000
    else:
        print("ERROR: Enter correct mode_num!")
        return 0

print(df.shape)
df.head()

(10561, 3)


Unnamed: 0,Campus,HomeZone,Mode
0,2,3851.0,1
1,3,181.0,1
2,2,1039.0,1
5,2,544.0,1
11,2,548.0,1


In [5]:
# List of campus' TTS zones from Joven's MOE data
campus_zones = [282, 526, 493, 564, 419, 200, 1042, 17, 1179, 1148, 5198, 4069, 5142, 5119, 5198, 1208, 1179, 3325, 3842, 4029, 3631, 566, 69, 391, 225, 38, 67]
# Add column with campus zones
df['CampusZone'] = df['Campus'].apply(lambda x: campus_zones[x])

# Add columns for Distance, Transit Travel Time, Auto Travel Time
df['AIVTT'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 0), axis=1)
df['TPTT'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 1), axis=1)
df['Dist'] = df.apply(lambda x: find_value(x.HomeZone, x.CampusZone, 2), axis=1)

IndexError: list index out of range

In [None]:
print(df.shape)
df.head()

In [None]:
# Load data into Biogeme database
database = db.Database("SMTO", df)

# Definition of new variables: adding columns to the database 
Available =  DefineVariable('Available', 1, database) # All modes available to all students

# Make variable names global
globals().update(database.variables)

# Remove unknown values
database.remove(Dist == 0.0)
database.remove(TPTT == 0.0)
database.remove(AIVTT == 0.0)

In [None]:
df['Mode'].value_counts()

In [None]:
"""
# Here we use the "biogeme" way for backward compatibility
exclude = HomeToMainCampusKM > 130
database.remove(exclude)
database.data

# Parameters to be estimated
ASC_CAR = Beta('ASC_CAR',0,None,None,0)
ASC_TRAIN = Beta('ASC_TRAIN',0,None,None,0)
ASC_SM = Beta('ASC_SM',0,None,None,1)
B_TIME = Beta('B_TIME',0,None,None,0)
B_COST = Beta('B_COST',0,None,None,0)

# Definition of new variables
SM_COST =  SM_CO   * (  GA   ==  0  ) 
TRAIN_COST =  TRAIN_CO   * (  GA   ==  0  )

# Definition of new variables: adding columns to the database 
CAR_AV_SP =  DefineVariable('CAR_AV_SP',CAR_AV  * (  SP   !=  0  ),database)
TRAIN_AV_SP =  DefineVariable('TRAIN_AV_SP',TRAIN_AV  * (  SP   !=  0  ),database)
TRAIN_TT_SCALED = DefineVariable('TRAIN_TT_SCALED',\
                                 TRAIN_TT / 100.0,database)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED',\
                                   TRAIN_COST / 100,database)
SM_TT_SCALED = DefineVariable('SM_TT_SCALED', SM_TT / 100.0,database)
SM_COST_SCALED = DefineVariable('SM_COST_SCALED', SM_COST / 100,database)
CAR_TT_SCALED = DefineVariable('CAR_TT_SCALED', CAR_TT / 100,database)
CAR_CO_SCALED = DefineVariable('CAR_CO_SCALED', CAR_CO / 100,database)

# Definition of the utility functions
V1 = ASC_TRAIN + \
     B_TIME * TRAIN_TT_SCALED + \
     B_COST * TRAIN_COST_SCALED
V2 = ASC_SM + \
     B_TIME * SM_TT_SCALED + \
     B_COST * SM_COST_SCALED
V3 = ASC_CAR + \
     B_TIME * CAR_TT_SCALED + \
     B_COST * CAR_CO_SCALED

# Associate utility functions with the numbering of alternatives
V = {1: V1,
     2: V2,
     3: V3}

# Associate the availability conditions with the alternatives
av = {1: TRAIN_AV_SP,
      2: SM_AV,
      3: CAR_AV_SP}
      
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V,av,CHOICE)

"""


# Beta initialization: (name, value, lowerbound, upperbound, status, desc='')
# Status 0 if estimated, 1 if maintained - reference choice should be 1
ASC_AU = Beta('ASC_AU', 0, None, None, 1)
ASC_TR = Beta('ASC_TR', 0, None, None, 0)
ASC_AC = Beta('ASC_AC', 0, None, None, 0)
B_AIVTT = Beta('B_AIVTT', -0.001, None, None, 0)
B_TPTT = Beta('B_TPTT', -0.001, None, None, 0)
B_DIST = Beta('B_DIST', -0.0001, None, None, 0)


# Variables: from columns in database
AV = DefineVariable('AV', Available, database)
AU_AIVTT = DefineVariable('AU_AIVTT', AIVTT, database)
TR_TPTT = DefineVariable('TR_TPTT', TPTT, database)
AC_DIST = DefineVariable('AC_DIST', Dist, database)

# Mode Choice Utility Functions: ASC_AU is 0
V0 = ASC_AU + B_AIVTT * AU_AIVTT 
V1 = ASC_TR + B_TPTT * TR_TPTT
V2 = ASC_AC + B_DIST * AC_DIST


V  = {0: V0, 1: V1, 2: V2}
av = {0: AV, 1: AV, 2: AV}

In [None]:
database.data

In [None]:
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V, av, Mode)

# Define level of verbosity
import biogeme.messaging as msg
logger = msg.bioMessage()
logger.setDebug()
#logger.setWarning()
#logger.setGeneral()
#logger.setDetailed()

# Create the Biogeme object
biogeme  = bio.BIOGEME(database,logprob,numberOfThreads=1)
biogeme.modelName = "SMTO_2019_ModeChoice_Run2"
#,numberOfThreads=1

# Estimate the parameters
results = biogeme.estimate(saveIterations=True)
biogeme.createLogFile()

# Print the estimated values
betas = results.getBetaValues()
for k,v in betas.items():
    print(f"{k:10}=\t{v:.3g}")

# Get the results in a pandas table
pandasResults = results.getEstimatedParameters()
print(pandasResults)

Upon running, this produces several output files. The most important is the html file.

### Resources
UT Austin example: https://www.youtube.com/watch?v=QeJgyBIaXMQ  
Biogeme example: https://www.youtube.com/watch?v=OiM94B8WayA  
Nested logit example: https://www.youtube.com/watch?v=vEhvf54IKvs