Install biogeme: `pip3 install biogeme==3.2.12`

In [None]:
import pandas as pd
import numpy as np
from enum import Enum
from sklearn.model_selection import train_test_split

import pandas as pd
import biogeme.biogeme as bio
import biogeme.database as db
from biogeme import models
from biogeme.expressions import Beta, DefineVariable
from biogeme.expressions import Variable
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, r2_score, ConfusionMatrixDisplay

%matplotlib inline

In [None]:
# Experiment-wide constants.
# SEED is the single source of randomness for this notebook: it is handed to
# NumPy's global generator right below and reused wherever a random_state is
# required.
SEED = 19348

# Travel-mode labels; their position (1-based) is the alternative id used when
# choice probabilities are simulated and plotted further down.
TARGETS = [
    'p_micro',
    'no_trip',
    's_car',
    'transit',
    'car',
    's_micro',
    'ridehail',
    'walk',
    'unknown',
]

# Seed NumPy's global random generator as well.
np.random.seed(SEED)

In [None]:
class SPLIT_TYPE(Enum):
    """Stratification strategies understood by ``get_train_test_splits``."""
    # Stratify on ``user_id``.
    INTRA_USER = 0
    # Stratify on the ``target`` column.
    TARGET = 1
    # Stratify on the ``section_mode_argmax`` column.
    MODE = 2
    

class SPLIT(Enum):
    """Identifies which partition of the data a helper is operating on."""
    # Partition on which scalers/models are fitted.
    TRAIN = 0
    # Partition that only reuses what was fitted on TRAIN.
    TEST = 1


def _split_within_users(data: pd.DataFrame, test_ratio, shuffle):
    """Stratify on ``user_id``; rows of single-observation users are assigned at random."""

    # A user with a single row cannot be stratified, so those rows are set
    # aside and, as per the mobilitynet modeling pipeline, randomly assigned
    # to either the training or the test set.
    per_user_counts = data.user_id.value_counts()
    single_count_ids = per_user_counts[per_user_counts == 1].index
    is_single = data.user_id.isin(single_count_ids)

    data_filtered = data.loc[~is_single, :].reset_index(drop=True)
    data_single_counts = data.loc[is_single, :].reset_index(drop=True)

    X_tr, X_te = train_test_split(
        data_filtered, test_size=test_ratio, shuffle=shuffle,
        stratify=data_filtered.user_id, random_state=SEED
    )

    # Drawn from NumPy's global generator (seeded at the top of the notebook).
    assigned = np.random.choice(['train', 'test'], len(data_single_counts))

    X_tr_merged = pd.concat(
        [X_tr, data_single_counts.loc[assigned == 'train', :]],
        ignore_index=True, axis=0
    )
    X_te_merged = pd.concat(
        [X_te, data_single_counts.loc[assigned == 'test', :]],
        ignore_index=True, axis=0
    )

    return X_tr_merged, X_te_merged


def get_train_test_splits(
    data: pd.DataFrame, how: SPLIT_TYPE = SPLIT_TYPE.INTRA_USER, test_ratio=0.2, shuffle=True
):
    """Split ``data`` into a train and a test frame, stratified according to ``how``.

    Args:
        data: frame to split. Needs ``user_id`` for ``INTRA_USER``, ``target``
            for ``TARGET`` and ``section_mode_argmax`` for ``MODE``.
        how: a ``SPLIT_TYPE`` member selecting the stratification column. The
            default used to be the ``SPLIT_TYPE`` class itself, which matched no
            branch and therefore always raised; it is now ``INTRA_USER``.
        test_ratio: fraction of rows placed in the test frame.
        shuffle: forwarded to ``train_test_split``.

    Returns:
        ``(train_frame, test_frame)``.

    Raises:
        NotImplementedError: if ``how`` is not a known ``SPLIT_TYPE`` member.
    """

    if how == SPLIT_TYPE.INTRA_USER:
        return _split_within_users(data, test_ratio, shuffle)

    if how == SPLIT_TYPE.TARGET:
        stratify_on = data.target
    elif how == SPLIT_TYPE.MODE:
        stratify_on = data.section_mode_argmax
    else:
        raise NotImplementedError("Unknown split type")

    X_tr, X_te = train_test_split(
        data, test_size=test_ratio, shuffle=shuffle, stratify=stratify_on,
        random_state=SEED
    )

    return X_tr, X_te

## Modeling

The following are common features across all datasets:

```
{'age_21___25_years_old', 'cost_unknown', 'start_local_dt_hour', 'av_walk', 'distance', 'duration', 'av_unknown', 'ft_job', 'end_local_dt_hour', 'cost_no_trip', 'cost_s_micro', 'mph', 'n_residents_u18', 'is_paid', 'n_motor_vehicles', 'target', 'n_working_residents', 'section_distance_argmax', 'n_residence_members', 'has_medical_condition', 'primary_job_description_Other', 'cost_walk', 'cost_p_micro', 'av_transit', 'age_16___20_years_old', 'income_category', 'av_s_car', 'av_no_trip', 'cost_s_car', 'multiple_jobs', 'n_residents_with_license', 'section_duration_argmax', 'age_26___30_years_old', 'cost_car', 'av_p_micro', 'av_ridehail', 'av_car', 'cost_transit', 'available_modes', 'av_s_micro', 'has_drivers_license', 'cost_ridehail', 'user_id', 'section_mode_argmax', 'is_student'}
```

In [None]:
# Input data selection.

# Each entry pairs a preprocessed CSV path with a short tag for that dataset.
DATA_SOURCES = [
    ('../data/filtered_data/preprocessed_data_Stage_database.csv', 'allceo'),
    ('../data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv', 'nicr'),
    ('../data/filtered_data/preprocessed_data_openpath_prod_durham.csv', 'durham'),
]

# Position in DATA_SOURCES of the dataset used by the rest of the notebook.
DB_IX = 2

# Unpack the selected (path, tag) pair.
PATH, CURRENT_DB = DATA_SOURCES[DB_IX]

In [None]:
# Load the CSV selected through DB_IX into the working frame.
data = pd.read_csv(PATH)

In [None]:
# Remove exact duplicate rows in place.
data.drop_duplicates(inplace=True)

In [None]:
# (rows, columns) remaining after de-duplication.
print(data.shape)

In [None]:
def norm_data(df: pd.DataFrame, split: SPLIT, scaler=None):
    """Standardise the non-dummy columns of ``df`` with a ``StandardScaler``.

    Columns whose name contains one of the dummy-feature markers below, plus
    ``user_id``, ``target`` and ``section_mode_argmax``, are passed through
    untouched. In the returned frame the scaled columns come first, followed
    by the pass-through columns.

    Args:
        df: frame to normalise; must contain ``user_id``, ``target`` and
            ``section_mode_argmax``.
        split: ``SPLIT.TRAIN`` fits a new scaler (any ``scaler`` argument is
            ignored); ``SPLIT.TEST`` applies the given, already fitted one.
        scaler: fitted scaler, required for ``SPLIT.TEST``.

    Returns:
        ``(normalised_frame, scaler)``.

    Raises:
        NotImplementedError: if ``split`` is neither TRAIN nor TEST.
        AttributeError: if ``split`` is TEST and no scaler was supplied. The
            exception type matches what the missing scaler used to trigger
            implicitly, now with an explicit message.
    """

    if split not in (SPLIT.TRAIN, SPLIT.TEST):
        raise NotImplementedError("Unknown split")

    if split == SPLIT.TEST and scaler is None:
        raise AttributeError("Expected a fitted scaler for the test split.")

    # Ignore dummy features (1/0): any column whose name contains a marker.
    dummy_markers = (
        'age_', 'av_', 'gender_', 'primary_job_description', 'is_',
        'highest_education', '_job', 'has_'
    )
    ignore_cols = [
        c for c in df.columns if any(marker in c for marker in dummy_markers)
    ] + ['user_id', 'target', 'section_mode_argmax']

    to_scale = df.loc[:, [c for c in df.columns if c not in ignore_cols]]
    ignored = df.loc[:, ignore_cols]

    if split == SPLIT.TRAIN:
        scaler = StandardScaler()
        values = scaler.fit_transform(to_scale)
    else:
        values = scaler.transform(to_scale)

    scaled = pd.DataFrame(values, columns=to_scale.columns, index=to_scale.index)

    return pd.concat([scaled, ignored], axis=1), scaler

In [None]:
def drop_columns(df: pd.DataFrame):
    """Remove the unwanted columns from ``df`` in place and return the same frame.

    Columns that are not present are silently skipped.
    """
    unwanted = ['available_modes']

    # errors='ignore' covers the "only if present" check.
    df.drop(columns=unwanted, inplace=True, errors='ignore')

    return df

In [None]:
def get_duration_estimate(df: pd.DataFrame, dset: SPLIT, model_dict: dict):
    """Add per-mode travel-time estimates (``tt_<mode>`` columns) to ``df``.

    For every value of ``section_mode_argmax`` a linear regression predicts
    ``section_duration_argmax`` from ``section_distance_argmax`` and ``mph``.
    Each row's prediction is multiplied by the row's ``av_<mode>`` column to
    produce ``tt_<mode>``.

    Args:
        df: frame holding the feature, duration, mode and ``av_*`` columns.
            It is modified in place.
        dset: ``SPLIT.TRAIN`` fits a model for every mode that has none yet;
            ``SPLIT.TEST`` only applies the models in ``model_dict``.
        model_dict: ``{mode: {'model': fitted regressor}}``. May be ``None``
            for TRAIN, in which case a new dict is created.

    Returns:
        ``(model_dict, df)``.

    Raises:
        AttributeError: if ``dset`` is TEST and ``model_dict`` is ``None``.
    """

    X_features = ['section_distance_argmax', 'mph']
    modes = ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']

    if model_dict is None:
        if dset == SPLIT.TEST:
            raise AttributeError("Expected model dict for testing.")
        if dset == SPLIT.TRAIN:
            model_dict = dict()

    # Float column so predictions are stored without a dtype change. NaN marks
    # rows whose mode has no model.
    estimate = pd.Series(np.nan, index=df.index, dtype=float)

    for section_mode in df.section_mode_argmax.unique():
        in_mode = df.section_mode_argmax == section_mode
        X = df.loc[in_mode, X_features]
        y_true = df.loc[in_mode, 'section_duration_argmax'].values.ravel()

        newly_fitted = dset == SPLIT.TRAIN and section_mode not in model_dict
        if newly_fitted:
            model = LinearRegression(fit_intercept=True)
            model.fit(X, y_true)
            model_dict[section_mode] = {'model': model}

        if section_mode not in model_dict:
            # Mode absent from training: previously NaN predictions were fed
            # to r2_score and the model lookup below failed with a KeyError.
            print(f"No duration model for {section_mode}; its estimate is left as NaN.")
            continue

        y_pred = model_dict[section_mode]['model'].predict(X)
        estimate.loc[in_mode] = y_pred

        if newly_fitted:
            r2 = r2_score(y_pred=y_pred, y_true=y_true)
            print(f"Train R2 for {section_mode}: {r2}")
        elif dset == SPLIT.TEST:
            r2 = r2_score(y_pred=y_pred, y_true=y_true)
            print(f"Test R2 for {section_mode}: {r2}")

    # Broadcast the single estimate to every mode, gated by the av_* column.
    for mode in modes:
        df['tt_' + mode] = df['av_' + mode] * estimate

    return model_dict, df

In [None]:
# Now, we drop columns, split the data, and normalize

# drop_columns edits the frame in place and returns that same frame.
data = drop_columns(data)

# Stratified on user_id; single-observation users are assigned at random.
train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.INTRA_USER, shuffle=True)

# The scaler is fitted on the training frame only and then reused for the test frame.
train_data, scaler = norm_data(train_data, split=SPLIT.TRAIN)
test_data, _ = norm_data(test_data, SPLIT.TEST, scaler)

In [None]:
# Re-code the raw user ids as consecutive integers starting at 1.
# NOTE(review): presumably done so user_id is numeric downstream -- confirm.
USERS = list(data.user_id.unique())

USER_MAP = dict(zip(USERS, range(1, len(USERS) + 1)))

# Every id in either split originates from `data`, so the lookup cannot miss.
for frame in (train_data, test_data):
    frame['user_id'] = frame['user_id'].apply(lambda uid: USER_MAP[uid])

In [None]:
# Side-by-side histograms of the target column: train on the left, test on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 7))
for axis, frame in zip(ax, (train_data, test_data)):
    frame.target.hist(ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Fit the per-mode duration regressions on the training frame ...
params, train_data = get_duration_estimate(train_data, SPLIT.TRAIN, None)
print(10 * "-")
# ... and reuse those fitted models (params) on the test frame.
_, test_data = get_duration_estimate(test_data, SPLIT.TEST, params)

In [None]:
# Drop section_mode

# get_duration_estimate has already consumed section_mode_argmax, so remove it
# from the training frame. The test frame drops it later, in the cell that
# builds the test database.
train_data.drop(columns=['section_mode_argmax'], inplace=True)
# test_data.drop(columns=['section_mode_argmax'], inplace=True)

In [None]:
# Display the (rows, columns) of both frames as the cell output.
train_data.shape, test_data.shape

In [None]:
# List the training columns that will be loaded into the Biogeme database.
print(train_data.columns.tolist())

In [None]:
# Some helper functions that will help ease redundancy in the code.

def get_database(df: pd.DataFrame, split: SPLIT):
    """Wrap ``df`` in a Biogeme database named ``<split name>_db`` (e.g. ``TRAIN_db``)."""
    database_name = f"{split.name}_db"
    return db.Database(database_name, df)


def get_variables(database: db.Database):
    """Build the ``{NAME: Variable(column)}`` dictionary used by the model helpers.

    Fixed entries cover the user-level, section, cost, availability,
    travel-time and choice columns. One-hot columns are discovered from
    ``database.data`` by substring (``gender_``, ``highest_education``,
    ``primary_job_description``) and exposed as ``G_<i>``, ``E_<i>`` and
    ``PJ_<i>``, numbered in column order.
    """

    column_names = list(database.data)

    leading_columns = [
        # User-level features.
        ('START_HOUR', 'start_local_dt_hour'),
        ('END_HOUR', 'end_local_dt_hour'),
        ('TRIP_DISTANCE', 'distance'),
        ('INCOME', 'income_category'),
        ('N_MEMBERS', 'n_residence_members'),
        ('N_U18', 'n_residents_u18'),
        ('N_LICENSE', 'n_residents_with_license'),
        ('N_VEHICLES', 'n_motor_vehicles'),
        ('LICENSE', 'has_drivers_license'),
        ('CONDITION', 'has_medical_condition'),
        ('FT_JOB', 'ft_job'),
        ('MULTIPLE_JOBS', 'multiple_jobs'),
        # Sections
        ('DISTANCE_ARGMAX', 'section_distance_argmax'),
        ('TT_ARGMAX', 'section_duration_argmax'),
        ('MPH', 'mph'),
        # Costs
        ('COST_P_MICRO', 'cost_p_micro'),
        ('COST_NO_TRIP', 'cost_no_trip'),
        ('COST_S_CAR', 'cost_s_car'),
        ('COST_CAR', 'cost_car'),
        ('COST_S_MICRO', 'cost_s_micro'),
        ('COST_RIDEHAIL', 'cost_ridehail'),
        ('COST_WALK', 'cost_walk'),
        ('COST_UNKNOWN', 'cost_unknown'),
        ('COST_TRANSIT', 'cost_transit'),
        # Availability.
        ('AV_P_MICRO', 'av_p_micro'),
        ('AV_NO_TRIP', 'av_no_trip'),
        ('AV_S_CAR', 'av_s_car'),
        ('AV_TRANSIT', 'av_transit'),
        ('AV_CAR', 'av_car'),
        ('AV_S_MICRO', 'av_s_micro'),
        ('AV_RIDEHAIL', 'av_ridehail'),
        ('AV_WALK', 'av_walk'),
        ('AV_UNKNOWN', 'av_unknown'),
    ]

    trailing_columns = [
        # Times.
        ('TT_P_MICRO', 'tt_p_micro'),
        ('TT_NO_TRIP', 'tt_no_trip'),
        ('TT_S_CAR', 'tt_s_car'),
        ('TT_TRANSIT', 'tt_transit'),
        ('TT_CAR', 'tt_car'),
        ('TT_S_MICRO', 'tt_s_micro'),
        ('TT_RIDEHAIL', 'tt_ridehail'),
        ('TT_WALK', 'tt_walk'),
        ('TT_UNKNOWN', 'tt_unknown'),
        # Choice.
        ('CHOICE', 'target'),
    ]

    # (key prefix, substring identifying the one-hot column family)
    one_hot_families = [
        ('G', 'gender_'),
        ('E', 'highest_education'),
        ('PJ', 'primary_job_description'),
    ]

    variables = {key: Variable(column) for key, column in leading_columns}

    one_hot = [
        (prefix, [Variable(name) for name in column_names if marker in name])
        for prefix, marker in one_hot_families
    ]

    variables.update({key: Variable(column) for key, column in trailing_columns})

    # One-hot entries go last: all G_*, then all E_*, then all PJ_*.
    for prefix, members in one_hot:
        for position, member in enumerate(members):
            variables[prefix + '_' + str(position)] = member

    return variables


# def exclude_from_db(v_dict: dict, db: db.Database):
#     EXCLUDE = (v_dict['CHOICE'] == 2) + (v_dict['CHOICE'] == 9) > 0
#     db.remove(EXCLUDE)

def get_params(variables):
    """Create the Biogeme ``Beta`` parameters for the model.

    One coefficient ``B_<NAME>`` is created per key of ``variables``, plus one
    alternative-specific constant ``ASC_<MODE>`` per travel mode. All start at
    0, are unbounded, and have status 0.

    Every ASC used to be constructed with the name ``'ASC_P_MICRO'``, which made
    all alternatives share a single parameter. Each constant now carries its
    own name.

    NOTE(review): a logit model normally needs one ASC held fixed as the
    reference alternative for identification -- confirm whether one of these
    should be created with status 1.

    Returns:
        dict mapping parameter name to its ``Beta`` expression.
    """

    param_dict = {'B_' + k: Beta('B_' + k, 0, None, None, 0) for k in variables.keys()}

    asc_modes = (
        'P_MICRO', 'NO_TRIP', 'S_CAR', 'TRANSIT', 'CAR',
        'S_MICRO', 'RIDEHAIL', 'WALK', 'UNKNOWN'
    )
    for mode in asc_modes:
        asc_name = 'ASC_' + mode
        param_dict[asc_name] = Beta(asc_name, 0, None, None, 0)

    return param_dict


def get_utility_functions(v: dict):
    """Assemble the utility expression of every alternative.

    ``v`` must hold, under the same keys, both the variables (``INCOME``,
    ``TT_CAR``, ``G_0``, ...) and the parameters (``B_INCOME``, ``ASC_CAR``,
    ...). Each modelled alternative gets

        ASC_<MODE> + ohe + user + trip + TT_<MODE> * B_TT_<MODE>
            + COST_<MODE> * B_COST_<MODE>

    where ``user``, ``ohe`` and ``trip`` are linear terms shared by all
    alternatives. ``V_NO_TRIP`` and ``V_UNKNOWN`` are fixed at -100.

    One-hot variables are selected by key *prefix* (``G_``, ``E_``, ``PJ_``).
    The previous substring test also matched ``DISTANCE_ARGMAX`` and
    ``MULTIPLE_JOBS`` (both contain ``E_``), adding those terms twice.

    Returns:
        dict with ``user``, ``ohe``, ``trip`` and one ``V_<MODE>`` entry per
        alternative. Loop helper names are no longer leaked into the result.
    """

    ## User-level utility.
    user = 1.
    for name in [
        'INCOME', 'N_MEMBERS',
        'N_U18', 'N_LICENSE', 'N_VEHICLES', 'LICENSE', 'CONDITION', 'FT_JOB', 'MULTIPLE_JOBS'
    ]:
        user += v[name] * v['B_' + name]

    # OHE (One-hot encoded utility.)
    # A prefix match also excludes the matching 'B_...' parameter keys.
    ohe = 1.
    for name in [key for key in v if key.startswith(('G_', 'E_', 'PJ_'))]:
        ohe += v[name] * v['B_' + name]

    ## Trip utility.
    trip = 1.
    for name in ['MPH', 'DISTANCE_ARGMAX', 'TT_ARGMAX', 'START_HOUR', 'END_HOUR', 'TRIP_DISTANCE']:
        trip += v[name] * v['B_' + name]

    utilities = {'user': user, 'ohe': ohe, 'trip': trip}

    for mode in ['P_MICRO', 'S_MICRO', 'S_CAR', 'CAR', 'TRANSIT', 'WALK', 'RIDEHAIL']:
        utilities['V_' + mode] = v['ASC_' + mode] + \
            ohe + user + trip + \
            v['TT_' + mode] * v['B_TT_' + mode] + \
            v['COST_' + mode] * v['B_COST_' + mode]

    # These two alternatives are not modelled; they get a constant utility.
    utilities['V_NO_TRIP'] = -100
    utilities['V_UNKNOWN'] = -100

    return utilities


def get_utility_mapping(var: dict):
    """Map each alternative id (1-9) to its utility expression ``var['V_<MODE>']``."""
    ordered_modes = (
        'P_MICRO', 'NO_TRIP', 'S_CAR', 'TRANSIT', 'CAR',
        'S_MICRO', 'RIDEHAIL', 'WALK', 'UNKNOWN'
    )
    return {
        alt_id: var['V_' + mode]
        for alt_id, mode in enumerate(ordered_modes, start=1)
    }


def get_availability_mapping(var: dict):
    """Map each alternative id (1-9) to its availability variable ``var['AV_<MODE>']``."""
    ordered_modes = (
        'P_MICRO', 'NO_TRIP', 'S_CAR', 'TRANSIT', 'CAR',
        'S_MICRO', 'RIDEHAIL', 'WALK', 'UNKNOWN'
    )
    return {
        alt_id: var['AV_' + mode]
        for alt_id, mode in enumerate(ordered_modes, start=1)
    }

In [None]:
# # First, drop columns.

# train_data = drop_columns(train_data)

# train_data, scaler = norm_data(train_data, split=SPLIT.TRAIN)

# get dbs.
# The resulting database is named 'TRAIN_db' (split name + '_db').
train_db = get_database(train_data, SPLIT.TRAIN)

# get vars.
# Dictionary of Biogeme Variable objects keyed by upper-case name.
train_vars = get_variables(train_db)

In [None]:
# Show the variable dictionary as the cell output.
train_vars

In [None]:
# One B_<NAME> coefficient per variable, plus the ASC_* constants.
train_params = get_params(train_vars)

In [None]:
# Show the parameter dictionary as the cell output.
train_params

In [None]:
# Merge the parameters into the variable dictionary; get_utility_functions
# looks up both kinds of entries in this one dict.
train_vars.update(train_params)

In [None]:
# Build the utility expressions and merge them in as well, so the V_* entries
# are available to get_utility_mapping.
train_V = get_utility_functions(train_vars)
train_vars.update(train_V)

In [None]:
# Alternative id -> utility expression / availability variable.
V = get_utility_mapping(train_vars)
av = get_availability_mapping(train_vars)
# Log-probability of the chosen alternative; this is the expression that gets estimated.
logprob = models.loglogit(V, av, train_vars['CHOICE'])

# logit1 = models.logit(V, av, 1)
# logit2 = models.logit(V, av, 2)
# logit3 = models.logit(V, av, 3)
# logit4 = models.logit(V, av, 4)
# logit5 = models.logit(V, av, 5)
# logit6 = models.logit(V, av, 6)
# logit7 = models.logit(V, av, 7)
# logit8 = models.logit(V, av, 8)
# logit9 = models.logit(V, av, 9)

# models = {f'logit_{ix}': logit for ix, logit in enumerate(
#     [logit1, logit2, logit3, logit4, logit5, logit6, logit7, logit8, logit9]
# )}

# Estimation object over the training database.
model = bio.BIOGEME(train_db, logprob)
model.modelName = 'customUtility-new'
# Turn off the HTML and pickle result files.
model.generate_html = False
model.generate_pickle = False

In [None]:
# Run the estimation on the training database.
train_results = model.estimate()

In [None]:
# Print the short summary of the estimation results.
print(train_results.short_summary())

In [None]:
# Print the table of estimated parameters.
print(train_results.getEstimatedParameters())

In [None]:
from biogeme.expressions import Derive


def simulate_results(V, av, db, beta_dict):
    """Simulate the choice probability of every alternative on ``db``.

    Args:
        V: alternative id -> utility expression.
        av: alternative id -> availability expression.
        db: Biogeme database to simulate on. Inside this function the name
            shadows the ``biogeme.database`` module alias.
        beta_dict: estimated parameter values, passed as ``theBetaValues``.

    Returns:
        The output of ``BIOGEME.simulate``, with one ``'Prob. <mode>'`` entry
        per label in ``TARGETS``. Alternative ids are the 1-based positions in
        ``TARGETS``.

    The willingness-to-pay ratios that used to be built here were never added
    to the simulation, and they differentiated with respect to
    ``scaled_cost_*`` names that ``get_variables`` does not define, so that
    dead computation has been removed.
    """

    simulate = {
        'Prob. ' + label: models.logit(V, av, alt_id)
        for alt_id, label in enumerate(TARGETS, start=1)
    }

    biosim = bio.BIOGEME(db, simulate)
    biosim.modelName = 'test-3'

    return biosim.simulate(theBetaValues=beta_dict)

In [None]:
# test_data was already passed through norm_data() with the train-fitted scaler
# right after the split, so it must not be scaled a second time here. A second
# pass would standardise the features twice and would hand the scaler the tt_*
# columns added by get_duration_estimate(), which it was never fitted on.
test_data = drop_columns(test_data)

# The training frame dropped this column earlier; do the same for the test frame.
test_data.drop(columns=['section_mode_argmax'], inplace=True)

# get dbs.
test_db = get_database(test_data, SPLIT.TEST)

In [None]:
# Apply the utilities and the estimated betas to the test database.
test_probs = simulate_results(V, av, test_db, train_results.getBetaValues())
# test_utilities = get_utility_df(train_results, test_data)

In [None]:
# Preview the simulated probabilities (one 'Prob. <mode>' column per alternative).
display(test_probs.head())

In [None]:
# argmax starts from 0. Offset all predicted indices by 1.
# The result is the most probable alternative id for every test row.
choices = np.argmax(test_probs.values, axis=1) + 1

In [None]:
# The observed choice lives in the `target` column (the column the model's
# CHOICE variable reads). The notebook does not reference a `chosen` column
# anywhere else.
y_true = test_data.target
score = f1_score(y_true, choices, average='weighted')

print(score)

In [None]:
# Bar chart of how often each alternative was predicted.
fig, ax = plt.subplots()

counts = pd.Series(choices).value_counts()
ix = counts.index.tolist()

# Alternative ids are 1..len(TARGETS); ids that were never predicted get height 0.
_x = list(range(1, len(TARGETS) + 1))
height = [counts[i] if i in ix else 0 for i in _x]

ax.bar(x=_x, height=height)
ax.set_xticks(range(1, 10, 1))
ax.set_xticklabels(TARGETS, rotation=45)
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

fig, ax = plt.subplots()

# The matrix has one row/column for every class present in y_true OR in the
# predictions. Tick labels therefore come from that union rather than from
# y_true alone, which could mislabel the axes (or fail on a tick-count
# mismatch) whenever a class was predicted but never observed.
present_labels = sorted(set(y_true) | set(choices))
labelset = [TARGETS[int(label) - 1] for label in present_labels]

cm = ConfusionMatrixDisplay.from_predictions(
    y_true=y_true,
    y_pred=choices,
    labels=present_labels,
    display_labels=labelset,
    xticks_rotation=45,
    ax=ax,
)

plt.tight_layout()
plt.show()

In [None]:
# np.diag(cm.confusion_matrix)/np.sum(cm.confusion_matrix, axis=1)

In [None]:
# u_np = test_utilities.values
# choice_df = np.exp(u_np)/np.sum(np.exp(u_np), axis=1, keepdims=True)

# choice_df = pd.DataFrame(choice_df, columns=test_utilities.columns)
# display(choice_df.head())