In [None]:
import pandas as pd
import biogeme.biogeme as bio
import biogeme.database as db
from biogeme import models
from biogeme.expressions import Beta
from biogeme.expressions import Variable
import numpy as np

In [None]:
# Load the preprocessed dataset. The path is relative, so it resolves against
# the directory the notebook kernel is started from.
data = pd.read_csv('../data/preprocessed_data.csv')

In [None]:
# Assign every distinct raw user id a consecutive integer starting at 1,
# numbered in order of first appearance in the data.
USER_MAP = {
    raw_id: rank
    for rank, raw_id in enumerate(data['user_id'].unique(), start=1)
}

# Drop the four columns that are not used further down in this notebook:
# the section-level argmax columns and the list of available modes.
data = data.drop(columns=[
    'section_mode_argmax',
    'available_modes',
    'section_distance_argmax',
    'section_duration_argmax',
])

# Swap the raw ids for their integer codes.
data['user_id'] = data['user_id'].apply(lambda raw_id: USER_MAP[raw_id])

In [None]:
# Convert travel times from minutes to hours for every column whose name contains 'tt_'.
# Note: `data` is overwritten, so running this cell a second time divides by 60 again.
tt_columns = [name for name in data.columns if 'tt_' in name]
data[tt_columns] = data[tt_columns] / 60.

In [None]:
# Scale currency amounts by 1/10 for every column whose name contains 'cost_'.
# Note: `data` is overwritten, so running this cell a second time divides by 10 again.
cost_columns = [name for name in data.columns if 'cost_' in name]
data[cost_columns] = data[cost_columns] / 10.

In [None]:
# Show summary statistics of the rescaled columns: first the travel-time
# columns, then the cost columns, one table each.
for marker in ('tt_', 'cost_'):
    display(data[[c for c in data.columns if marker in c]].describe())

In [None]:

# For every column whose name contains 'av_', print the column sum next to the
# total row count (the sum is the number of available rows if the flags are 0/1).
for c in filter(lambda name: 'av_' in name, data.columns):
    print(f"Feature {c} availability: {data.loc[:, c].sum()}/{data.shape[0]}")

In [None]:
# data.drop(columns=['tt_unknown','av_unknown'], inplace=True)
# data.drop(index=data.loc[(data.chosen == 9)].index, inplace=True)

In [None]:
# Wrap the prepared DataFrame in a Biogeme Database named 'emission'; the
# model below is estimated against this object.
database = db.Database('emission', data)

In [None]:
# Variable definitions.
# Each Variable is created from a column name of the DataFrame wrapped by
# `database`, so every string below must match a column that is still present
# in `data` at this point.

# ID.
# Integer user code produced by the USER_MAP remapping. It is defined here but
# not referenced by any utility function in the cells shown.
USER_ID = Variable('user_id')

# Availability.
# One av_* column per alternative; these feed the `av` mapping passed to the logit model.
AV_P_MICRO = Variable('av_p_micro')
AV_NO_TRIP = Variable('av_no_trip')
AV_S_CAR = Variable('av_s_car')
AV_TRANSIT = Variable('av_transit')
AV_CAR = Variable('av_car')
AV_S_MICRO = Variable('av_s_micro')
AV_RIDEHAIL = Variable('av_ridehail')
AV_WALK = Variable('av_walk')
AV_UNKNOWN = Variable('av_unknown')

# Time.
# One tt_* column per alternative (divided by 60 in an earlier cell).
TT_P_MICRO = Variable('tt_p_micro')
TT_NO_TRIP = Variable('tt_no_trip')
TT_S_CAR = Variable('tt_s_car')
TT_TRANSIT = Variable('tt_transit')
TT_CAR = Variable('tt_car')
TT_S_MICRO = Variable('tt_s_micro')
TT_RIDEHAIL = Variable('tt_ridehail')
TT_WALK = Variable('tt_walk')
TT_UNKNOWN = Variable('tt_unknown')

# Cost.
# One cost_* column per alternative (divided by 10 in an earlier cell).
CO_P_MICRO = Variable('cost_p_micro')
CO_NO_TRIP = Variable('cost_no_trip')
CO_S_CAR = Variable('cost_s_car')
CO_TRANSIT = Variable('cost_transit')
CO_CAR = Variable('cost_car')
CO_S_MICRO = Variable('cost_s_micro')
CO_RIDEHAIL = Variable('cost_ridehail')
CO_WALK = Variable('cost_walk')
CO_UNKNOWN = Variable('cost_unknown')

# Choice.
# Column holding the chosen alternative; passed as the choice argument to loglogit.
CHOICE = Variable('chosen')

In [None]:
# Parameters

# Generic time and cost coefficients, shared by every utility function.
# Beta arguments are (name, initial value, lower bound, upper bound, status).
# Both coefficients start at 0, have no lower bound, and have an upper bound of 0,
# so that additional time or cost can never increase the utility.
# Status 0 means the parameter is updated (estimated) by the optimization algorithm.
B_TIME = Beta('B_TIME', 0, None, 0, 0)
B_COST = Beta('B_COST', 0, None, 0, 0)

# Alternative-Specific Constants.
# All are unbounded and start at 0. ASC_NO_TRIP and ASC_UNKNOWN are created with
# status 1, i.e. they are held fixed at 0 instead of being estimated
# (presumably as the reference levels for the other constants — confirm).
ASC_P_MICRO = Beta('ASC_P_MICRO', 0, None, None, 0)
ASC_NO_TRIP = Beta('ASC_NO_TRIP', 0, None, None, 1)
ASC_S_CAR = Beta('ASC_S_CAR', 0, None, None, 0)
ASC_TRANSIT = Beta('ASC_TRANSIT', 0, None, None, 0)
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_S_MICRO = Beta('ASC_S_MICRO', 0, None, None, 0)
ASC_RIDEHAIL = Beta('ASC_RIDEHAIL', 0, None, None, 0)
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_UNKNOWN = Beta('ASC_UNKNOWN', 0, None, None, 1)

In [None]:
# Utility functions: one per alternative.
# Every utility has the same linear form:
#   alternative-specific constant + B_TIME * travel time + B_COST * cost

V_P_MICRO = ASC_P_MICRO + B_TIME * TT_P_MICRO + B_COST * CO_P_MICRO

V_NO_TRIP = ASC_NO_TRIP + B_TIME * TT_NO_TRIP + B_COST * CO_NO_TRIP

# Utility of the s_car alternative.
# It must use the s_car constant and the s_car time/cost columns. Using the
# plain car terms here would make this utility identical to V_CAR and leave
# ASC_S_CAR, TT_S_CAR and CO_S_CAR out of the model entirely.
V_S_CAR = (
    ASC_S_CAR +
    B_TIME * TT_S_CAR +
    B_COST * CO_S_CAR
)

# Remaining utilities, each of the form
#   alternative-specific constant + B_TIME * travel time + B_COST * cost

V_TRANSIT = ASC_TRANSIT + B_TIME * TT_TRANSIT + B_COST * CO_TRANSIT

V_CAR = ASC_CAR + B_TIME * TT_CAR + B_COST * CO_CAR

V_S_MICRO = ASC_S_MICRO + B_TIME * TT_S_MICRO + B_COST * CO_S_MICRO

V_RIDEHAIL = ASC_RIDEHAIL + B_TIME * TT_RIDEHAIL + B_COST * CO_RIDEHAIL

V_WALK = ASC_WALK + B_TIME * TT_WALK + B_COST * CO_WALK

V_UNKNOWN = ASC_UNKNOWN + B_TIME * TT_UNKNOWN + B_COST * CO_UNKNOWN

In [None]:
# Map each alternative ID to its utility function.
# The integer keys are the alternative IDs; the same keys are used by the
# availability mapping, and the `chosen` column is matched against them by the
# logit model (assumes `chosen` is coded 1-9 in this order — TODO confirm
# against the preprocessing step).
V = {
    1: V_P_MICRO, 
    2: V_NO_TRIP,
    3: V_S_CAR, 
    4: V_TRANSIT,
    5: V_CAR, 
    6: V_S_MICRO,
    7: V_RIDEHAIL, 
    8: V_WALK, 
    9: V_UNKNOWN
}

In [None]:
# Availability.
# Maps each alternative ID (same keys as the utility mapping V) to its av_* column,
# so availability is taken per row from the data.
# NOTE(review): the earlier note on this cell said that costs and times are already
# masked off for rows where a mode is not available to the user, and that availability
# could therefore be specified as 1 for every mode (so every other mode is included
# when computing the logprob for a certain mode). The mapping below does NOT do that:
# it passes the av_* columns. Confirm which of the two behaviours is intended.

av = {
    1: AV_P_MICRO,
    2: AV_NO_TRIP,
    3: AV_S_CAR,
    4: AV_TRANSIT,
    5: AV_CAR,
    6: AV_S_MICRO,
    7: AV_RIDEHAIL,
    8: AV_WALK,
    9: AV_UNKNOWN
}

In [None]:
# Log of the logit probability of the chosen alternative, built from the
# utilities V, the availabilities av and the observed choice column.
logprob = models.loglogit(V, av, CHOICE)

In [None]:
# Build the Biogeme model object from the database and the log-probability expression.
model = bio.BIOGEME(database, logprob)
# Name attached to this model run (presumably used by Biogeme to name its output files — confirm).
model.modelName = 'noMaskNoTrip'

In [None]:
# Estimate the free parameters of the model; the returned object holds the estimation results.
results = model.estimate()

In [None]:
# Print the short textual summary of the estimation results.
print(results.short_summary())

In [None]:
# Display the estimated parameters as the cell output.
# NOTE(review): camelCase accessor; newer Biogeme releases may expose a snake_case name — confirm against the installed version.
results.getEstimatedParameters()

In [None]:
# Display the correlation results between the estimated parameters as the cell output.
# NOTE(review): camelCase accessor; newer Biogeme releases may expose a snake_case name — confirm against the installed version.
results.getCorrelationResults()

In [None]:
def generate_metrics(model: bio.BIOGEME, data: pd.DataFrame):
    """
    Placeholder: given a test dataframe and the trained model, generate
    precision, recall, F1.

    Not implemented yet. The body is empty, so calling this function currently
    does nothing and returns None.

    Args:
        model: the trained Biogeme model.
        data: the test dataframe. Note that this parameter shadows the
            notebook-level `data` variable inside the function.

    TODO: implement the metric computation.
    """

    pass