In [42]:
import numpy as np
import pandas as pd
from biogeme import models as models, database as db, biogeme as bio
from utils import add_utilities

In [2]:
from biogeme.expressions import Beta

In [6]:
# Read data
df = pd.read_csv("../data/lpmc.dat",sep='\t')
df.shape

(81086, 32)

In [7]:
df.head()

Unnamed: 0,trip_id,household_id,person_n,trip_n,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_ccharge,driving_traffic_percent
0,0,0,0,0,4,3,1,1,1.0,1,...,0.134444,0.0,0.016667,0.0,0,0.052222,1.5,0.14,0.0,0.111702
1,1,0,0,1,4,3,1,1,1.0,1,...,0.109444,0.0,0.055556,0.0,0,0.059444,1.5,0.15,0.0,0.11215
2,2,0,0,2,4,3,1,1,1.0,1,...,0.203056,0.0,0.210278,0.0,0,0.236667,1.5,0.79,0.0,0.203052
3,3,0,0,3,4,3,1,1,1.0,1,...,0.205556,0.0,0.258611,0.0,0,0.233333,1.5,0.78,0.0,0.160714
4,4,0,1,2,4,3,1,4,1.0,1,...,0.203056,0.0,0.189444,0.0,0,0.229167,1.5,0.78,0.0,0.130909


In [8]:
df.columns

Index(['trip_id', 'household_id', 'person_n', 'trip_n', 'travel_mode',
       'purpose', 'fueltype', 'faretype', 'bus_scale', 'survey_year',
       'travel_year', 'travel_month', 'travel_date', 'day_of_week',
       'start_time', 'age', 'female', 'driving_license', 'car_ownership',
       'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
       'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'pt_interchanges',
       'dur_driving', 'cost_transit', 'cost_driving_fuel',
       'cost_driving_ccharge', 'driving_traffic_percent'],
      dtype='object')

In [18]:
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int'] 
df['cost_driving'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

In [20]:
cols = ['trip_id', 'travel_mode',      
        'dur_walking', 'dur_cycling', 'dur_pt_total', 'dur_driving', 
        'cost_transit', 'cost_driving']

df = df[cols]

In [21]:
#  (1: walk, 2: cycle, 3: public transport, 4: drive)
df.travel_mode.value_counts()

4    35808
3    28605
1    14268
2     2405
Name: travel_mode, dtype: int64

In [23]:
data = db.Database("lmc",df)

In [24]:
data.variables

{'trip_id': trip_id,
 'travel_mode': travel_mode,
 'dur_walking': dur_walking,
 'dur_cycling': dur_cycling,
 'dur_pt_total': dur_pt_total,
 'dur_driving': dur_driving,
 'cost_transit': cost_transit,
 'cost_driving': cost_driving}

In [26]:
globals().update(data.variables)

In [28]:
# Create parameters
asc_walk = Beta('asc_walk',0,None,None,0)
asc_cycle = Beta('asc_cycle',0,None,None,0)
asc_trsit = Beta('asc_trsit',0,None,None,0)
asc_drive = Beta('asc_drive',0,None,None,1)

b_dur = Beta('b_dur',0,None,None,0)
b_cos = Beta('b_cos',0,None,None,0)

In [29]:
cols

['trip_id',
 'travel_mode',
 'dur_walking',
 'dur_cycling',
 'dur_pt_total',
 'dur_driving',
 'cost_transit',
 'cost_driving']

In [32]:
# Create utility function
v_walk = asc_walk + b_dur * dur_walking
v_cycle = asc_cycle + b_dur * dur_cycling
v_trsit = asc_trsit + b_dur * dur_pt_total + b_cos * cost_transit
v_drive = asc_drive + b_dur * dur_driving + b_cos * cost_driving
v = {1:v_walk, 2:v_cycle, 3:v_trsit, 4:v_drive}


In [34]:
logprob = models.logit(v, None, travel_mode)

In [35]:
mlogit = bio.BIOGEME(data, logprob)
mlogit.modelName = "11_logit_lmc"

In [49]:
%%time
res = mlogit.estimate(saveIterations=True,file_iterations="11_logit_lmc_params.txt")

CPU times: user 9min 32s, sys: 1.9 s, total: 9min 34s
Wall time: 2min 38s


In [50]:
res.getEstimatedParameters()

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycle,-277.126484,1.797693e+308,-1.541567e-306,1.0,1.797693e+308,-1.541567e-306,1.0
asc_trsit,22.301737,3.073347,7.256498,3.972378e-13,4.829463,4.61785,4e-06
asc_walk,-73.650299,5.006554e-13,-147107800000000.0,0.0,8.653971e-13,-85105780000000.0,0.0
b_cos,-12.201773,1.723986,-7.077652,1.466161e-12,2.945159,-4.142993,3.4e-05
b_dur,-315.786761,45.34525,-6.964054,3.306244e-12,78.43443,-4.026124,5.7e-05


In [54]:
v_walk

(asc_walk(-73.65029911809745) + (b_dur(-315.786760914279) * dur_walking))

In [55]:
mlogit.loadSavedIteration("11_logit_lmc_params.txt")

In [56]:
mlogit.loglike

exp(_bioLogLogitFullChoiceSet(1:(asc_walk(-73.65029911809745) + (b_dur(-315.786760914279) * dur_walking)), 2:(asc_cycle(-277.12648351963844) + (b_dur(-315.786760914279) * dur_cycling)), 3:((asc_trsit(22.301736979691917) + (b_dur(-315.786760914279) * dur_pt_total)) + (b_cos(-12.201772975805605) * cost_transit)), 4:((asc_drive(0) + (b_dur(-315.786760914279) * dur_driving)) + (b_cos(-12.201772975805605) * cost_driving))))

In [59]:
def add_utilities(db, V):
    df = db.data
    for idx, v in V.items():
        df[f'V_{idx}'] = db.valuesFromDatabase(v)
    Vs = [f'V_{idx}' for idx in V.keys()]
    df["PRED"] = np.argmax(df[Vs].values, axis=1) + 1
    return df

In [60]:
test = add_utilities(data, v)

In [61]:
test.head()

Unnamed: 0,trip_id,travel_mode,dur_walking,dur_cycling,dur_pt_total,dur_driving,cost_transit,cost_driving,V_1,V_2,V_3,V_4,PRED
0,0,4,0.218056,0.092222,0.151111,0.052222,1.5,0.14,-142.509357,-306.24904,-43.719811,-18.199335,4
1,1,4,0.188889,0.061944,0.165,0.059444,1.5,0.15,-133.29891,-296.687719,-48.105738,-20.602034,4
2,2,4,1.037778,0.355833,0.413333,0.236667,1.5,0.79,-401.366782,-389.493939,-126.526117,-84.375601,4
3,3,4,1.039444,0.384444,0.464167,0.233333,1.5,0.78,-401.893093,-398.528949,-142.578611,-83.20096,4
4,4,4,1.037778,0.355833,0.3925,0.229167,1.5,0.78,-401.366782,-389.493939,-119.947226,-81.885182,4


In [62]:
def accuracy(actual, pred):
    res = np.sum(np.array(actual) == np.array(pred)) / len(actual)
    return res

In [63]:
accuracy(test['travel_mode'], test["PRED"])

0.5334829686998989

In [64]:
test['travel_mode'].value_counts()

4    35808
3    28605
1    14268
2     2405
Name: travel_mode, dtype: int64

In [65]:
test['PRED'].value_counts()

4    63773
3    17313
Name: PRED, dtype: int64