In [2]:
# Record which larch version is installed in this environment.
!pip list | grep larch

larch                         5.5.5


In [64]:
import os
import numpy as np
import pandas as pd
import larch
import larch.exampville
from larch import P, X

# Exampville Mode Choice

Discrete choice modeling is at the heart of many transportation planning models.
In this example, we will examine the development of a mode choice model for 
Exampville, an entirely fictional town built for the express purpose of 
demonstrating the use of discrete choice modeling tools for transportation 
planning.

In [65]:
# Open the Exampville skims file as a larch.OMX object in read mode ('r').
skims = larch.OMX(larch.exampville.files.skims, mode='r')

In [66]:
# Show the OMX summary: matrix shape, the skim tables it holds, and its lookups.
skims

<larch.OMX> ⋯/exampville_skims.omx
 |  shape:(40, 40)
 |  data:
 |    AUTO_COST    (float64)
 |    AUTO_DIST    (float64)
 |    AUTO_TIME    (float64)
 |    BIKE_TIME    (float64)
 |    TRANSIT_FARE (float64)
 |    TRANSIT_IVTT (float64)
 |    TRANSIT_OVTT (float64)
 |    WALK_DIST    (float64)
 |    WALK_TIME    (float64)
 |  lookup:
 |    TAZ_AREA_TYPE (40 |S3)
 |    TAZ_ID        (40 int64)

In [67]:
# Load the household (hh), person (pp) and tour tables from the Exampville CSV files.
hh = pd.read_csv(larch.exampville.files.hh)
pp = pd.read_csv(larch.exampville.files.person)
tour = pd.read_csv(larch.exampville.files.tour)

In [68]:
# (rows, columns) of the household, person and tour tables.
hh.shape, pp.shape, tour.shape

((5000, 13), (12349, 12), (20739, 11))

In [69]:
# Column names, dtypes and non-null counts for the household table.
hh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   X            5000 non-null   float64
 1   Y            5000 non-null   float64
 2   INCOME       5000 non-null   float64
 3   N_VEHICLES   5000 non-null   int64  
 4   HHSIZE       5000 non-null   int64  
 5   geometry     5000 non-null   object 
 6   HOMETAZ      5000 non-null   int64  
 7   HHID         5000 non-null   int64  
 8   N_TRIPS      5000 non-null   int64  
 9   N_TRIPS_HBW  5000 non-null   int64  
 10  N_TRIPS_HBO  5000 non-null   int64  
 11  N_TRIPS_NHB  5000 non-null   int64  
 12  N_WORKERS    5000 non-null   int64  
dtypes: float64(3), int64(9), object(1)
memory usage: 507.9+ KB


In [70]:
# Column names, dtypes and non-null counts for the person table.
pp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12349 entries, 0 to 12348
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   PERSONID       12349 non-null  int64
 1   HHID           12349 non-null  int64
 2   HHIDX          12349 non-null  int64
 3   AGE            12349 non-null  int64
 4   WORKS          12349 non-null  int64
 5   N_WORK_TOURS   12349 non-null  int64
 6   N_OTHER_TOURS  12349 non-null  int64
 7   N_TOURS        12349 non-null  int64
 8   N_TRIPS        12349 non-null  int64
 9   N_TRIPS_HBW    12349 non-null  int64
 10  N_TRIPS_HBO    12349 non-null  int64
 11  N_TRIPS_NHB    12349 non-null  int64
dtypes: int64(12)
memory usage: 1.1 MB


In [71]:
# Column names, dtypes and non-null counts for the tour table.
tour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20739 entries, 0 to 20738
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   TOURID       20739 non-null  int64
 1   HHID         20739 non-null  int64
 2   PERSONID     20739 non-null  int64
 3   DTAZ         20739 non-null  int64
 4   TOURMODE     20739 non-null  int64
 5   TOURPURP     20739 non-null  int64
 6   N_STOPS      20739 non-null  int64
 7   N_TRIPS      20739 non-null  int64
 8   N_TRIPS_HBW  20739 non-null  int64
 9   N_TRIPS_HBO  20739 non-null  int64
 10  N_TRIPS_NHB  20739 non-null  int64
dtypes: int64(11)
memory usage: 1.7 MB


In [72]:
# Preview the first rows of the tour table.
tour.head()

Unnamed: 0,TOURID,HHID,PERSONID,DTAZ,TOURMODE,TOURPURP,N_STOPS,N_TRIPS,N_TRIPS_HBW,N_TRIPS_HBO,N_TRIPS_NHB
0,0,50000,60000,22,1,1,0,2,2,0,0
1,1,50000,60001,4,1,1,0,2,2,0,0
2,2,50000,60001,10,2,2,0,2,0,2,0
3,3,50000,60002,20,1,1,0,2,2,0,0
4,4,50000,60002,20,1,2,1,3,0,2,1


In [73]:
# Tally tours by purpose code: 1 = work, 2 = non-work.
tour.TOURPURP.value_counts()

2    13175
1     7564
Name: TOURPURP, dtype: int64

In [74]:
# Restrict the analysis to work tours (TOURPURP == 1)
df = tour.loc[tour["TOURPURP"] == 1]
df.shape

(7564, 11)

In [75]:
# Attach household and person attributes to every work tour.
# Both merges are lookups (many tours -> one household / one person), so
# `validate` makes pandas raise a MergeError if a key is duplicated on the
# right-hand side instead of silently multiplying tour rows.
# The N_TRIPS* columns exist in several tables: pandas suffixes the tour copies
# with _x and the household copies with _y; the person copies keep plain names.
n_tours = len(df)
df = (
    df
    .merge(hh, on="HHID", validate="many_to_one")
    .merge(pp, on=["HHID", "PERSONID"], validate="many_to_one")
)
# An inner merge drops tours whose keys are missing on the right; fail loudly if so.
assert len(df) == n_tours, "merging must not add or drop tours"
df.shape

(7564, 33)

In [76]:
# Larch looks up skim values by zero-based position, so derive zero-based
# companions of the home and destination TAZ columns by subtracting one.
df = df.assign(
    HOMETAZi=df["HOMETAZ"] - 1,
    DTAZi=df["DTAZ"] - 1,
)

In [77]:
# Transpose the first rows so the many merged columns read vertically.
df.head().T

Unnamed: 0,0,1,2,3,4
TOURID,0,1,3,7,10
HHID,50000,50000,50000,50001,50003
PERSONID,60000,60001,60002,60004,60006
DTAZ,22,4,20,25,25
TOURMODE,1,1,1,1,1
TOURPURP,1,1,1,1,1
N_STOPS,0,0,0,2,1
N_TRIPS_x,2,2,2,4,3
N_TRIPS_HBW_x,2,2,2,1,1
N_TRIPS_HBO_x,0,0,0,1,1


In [78]:
# Pull the skim values for each tour's (HOMETAZi, DTAZi) zone pair; the result
# has one column per skim table (AUTO_COST, AUTO_TIME, ...).
# NOTE(review): presumably the first argument indexes rows (origin) and the
# second indexes columns (destination) — confirm against get_rc_dataframe docs.
los_data = skims.get_rc_dataframe(df["HOMETAZi"], df["DTAZi"])
los_data

Unnamed: 0,AUTO_COST,AUTO_DIST,AUTO_TIME,BIKE_TIME,TRANSIT_FARE,TRANSIT_IVTT,TRANSIT_OVTT,WALK_DIST,WALK_TIME
0,0.588461,1.681318,5.043955,8.406591,0.0,0.000000,2.241758,1.681318,33.626365
1,1.925965,5.502758,16.508274,16.843315,2.5,1.535243,47.830632,3.368663,67.373260
2,0.621348,1.775280,5.325840,8.876400,0.0,0.000000,35.505600,1.775280,35.505600
3,0.480560,1.373029,9.119086,6.865143,0.0,0.000000,27.460572,1.373029,27.460572
4,0.480560,1.373029,9.119086,6.865143,0.0,0.000000,27.460572,1.373029,27.460572
...,...,...,...,...,...,...,...,...,...
7559,0.990705,2.830584,4.842947,8.820128,2.5,1.652554,21.496711,1.764026,35.280512
7560,1.066707,3.047736,7.201386,15.238678,2.5,3.746109,11.138607,3.047736,60.954713
7561,1.294075,3.697356,7.290231,18.486780,2.5,10.514265,33.910995,3.697356,73.947121
7562,1.066707,3.047736,7.201386,15.238678,2.5,3.746109,11.138607,3.047736,60.954713


In [79]:
# Append the skim (level-of-service) columns to the tour table; join aligns on the index.
# Any skim columns already present are dropped first so this cell can be re-run:
# a plain df.join(los_data) raises on the second run because the column names overlap.
df = df.drop(columns=los_data.columns, errors="ignore").join(los_data)
df.shape

(7564, 44)

## Model Definition

In [80]:
# Define numbers as names for modes
DA, SR, Walk, Bike, Transit = 1, 2, 3, 4, 5

# Wrap the merged tour table as idco ("co") data with the five mode alternatives.
# ch_name must be the actual choice column, TOURMODE (the same column used for
# m.choice_co_code); the previous value 'TOUR_MODE' matched no column in df.
dfs = larch.DataFrames(
    co=df,
    alt_codes=[DA, SR, Walk, Bike, Transit],
    alt_names=['DA', 'SR', 'Walk', 'Bike', 'Transit'],
    ch_name='TOURMODE',
)

In [81]:
# Define model: build it on the prepared DataFrames and give it a title
# (the title is reused later when the report is created).
m = larch.Model(dataservice=dfs)
m.title = "Work Tour Mode Choice"

In [82]:
# Utility functions, one per alternative.
# P.<name> is a parameter to estimate; X.<name> / X("expr") is a data column or expression.

# Drive alone: auto time and full auto cost; no constant and no income term.
m.utility_co[DA] = (
    P.InVehTime * X.AUTO_TIME
    + P.Cost * X.AUTO_COST
)

# Shared ride: same auto time, but only half of the auto cost.
m.utility_co[SR] = (
    P.ASC_SR
    + P.InVehTime * X.AUTO_TIME
    + P.Cost * (X.AUTO_COST * 0.5)
    + P("LogIncome:SR") * X("log(INCOME)")
)

# Walk and bike share a single non-motorized time parameter.
m.utility_co[Walk] = (
    P.ASC_Walk
    + P.NonMotorTime * X.WALK_TIME
    + P("LogIncome:Walk") * X("log(INCOME)")
)

m.utility_co[Bike] = (
    P.ASC_Bike
    + P.NonMotorTime * X.BIKE_TIME
    + P("LogIncome:Bike") * X("log(INCOME)")
)

# Transit: in-vehicle time shares the auto parameter; out-of-vehicle time has its own.
m.utility_co[Transit] = (
    P.ASC_Transit
    + P.InVehTime * X.TRANSIT_IVTT
    + P.OutVehTime * X.TRANSIT_OVTT
    + P.Cost * X.TRANSIT_FARE
    + P("LogIncome:Transit") * X("log(INCOME)")
)

In [83]:
# Display the model's graph before any nesting nodes are added.
m.graph

In [84]:
# Nested logit model: add three nesting nodes, each with its own Mu parameter.
#   Car      -> DA, SR
#   NonMotor -> Walk, Bike
#   Motor    -> Car (the nest above), Transit
Car = m.graph.new_node(parameter='Mu:Car', children=[DA,SR], name='Car')
NonMotor = m.graph.new_node(parameter='Mu:NonMotor', children=[Walk,Bike], name='NonMotor')
Motor = m.graph.new_node(parameter='Mu:Motor', children=[Car,Transit], name='Motor')

In [85]:
# Display the graph again, now with the Car, NonMotor and Motor nests.
m.graph

In [86]:
# The chosen alternative is identified by the mode code stored in the TOURMODE column.
m.choice_co_code = 'TOURMODE'

In [87]:
# Availability of each alternative, given as an expression over the data columns
# (or the constant 1 for "always available").
# NOTE(review): the time thresholds are presumably in minutes — confirm skim units.
m.availability_co_vars = {
    DA: 'AGE >= 16',            # drive alone requires age 16 or older
    SR: 1,                      # shared ride is always available
    Walk: 'WALK_TIME < 60',     # walk only when walk time is under 60
    Bike: 'BIKE_TIME < 60',     # bike only when bike time is under 60
    Transit: 'TRANSIT_FARE>0',  # transit only where a positive fare is recorded
}

## Model Estimation

In [88]:
# Load the data the model needs from its data service.
m.load_data()

In [89]:
# Per-alternative counts of how often each mode was chosen and was available.
m.dataframes.choice_avail_summary()

Unnamed: 0,name,chosen,available
1,DA,6052.0,7564.0
2,SR,810.0,7564.0
3,Walk,196.0,4179.0
4,Bike,72.0,7564.0
5,Transit,434.0,4199.0
< Total All Alternatives >,,7564.0,


In [90]:
# Descriptive statistics for the idco variables used by the model.
m.dataframes.data_co.statistics()

Unnamed: 0,n,minimum,maximum,median,histogram,mean,stdev,zeros,positives,negatives,nonzero_minimum,nonzero_maximum,nonzero_mean,nonzero_stdev
AUTO_COST,7564,0.194926,4.30796,1.00945,"2021-07-05T15:13:13.949321  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/",1.20601,0.754844,0,7564,0,0.194926,4.30796,1.20601,0.754844
AUTO_TIME,7564,0.930008,29.4415,7.61571,"2021-07-05T15:13:14.001007  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/",8.22287,4.58134,0,7564,0,0.930008,29.4415,8.22287,4.58134
BIKE_TIME,7564,2.78465,52.2321,13.5864,"2021-07-05T15:13:14.049226  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/",15.9827,9.29732,0,7564,0,2.78465,52.2321,15.9827,9.29732
TRANSIT_FARE,7564,0.0,2.5,2.5,"2021-07-05T15:13:14.089349  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/  Histograms are purple if the data is represented as discrete values.",1.38782,1.24238,3365,4199,0,2.5,2.5,2.5,0.0
TRANSIT_IVTT,7564,0.0,12.1668,1.44769,"2021-07-05T15:13:14.134147  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/  Histograms are orange if the zeros are numerous and have been excluded.",2.62277,3.38088,3365,4199,0,0.959322,12.1668,4.72461,3.26497
TRANSIT_OVTT,7564,0.759059,128.652,37.0499,"2021-07-05T15:13:14.182851  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/",37.6614,23.2194,0,7564,0,0.759059,128.652,37.6614,23.2194
WALK_TIME,7564,11.1386,208.928,54.3454,"2021-07-05T15:13:14.232235  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/",63.9308,37.1893,0,7564,0,11.1386,208.928,63.9308,37.1893
log(INCOME),7564,7.59035,14.2181,10.6719,"2021-07-05T15:13:14.359513  image/svg+xml  Matplotlib v3.4.2, https://matplotlib.org/",10.8414,1.01301,0,7564,0,7.59035,14.2181,10.8414,1.01301


In [91]:
# Estimate the parameters by maximizing the log likelihood with the SLSQP method.
# The run prints a hint to set global bounds via model.set_cap() if results are poor.
result = m.maximize_loglike(method='slsqp')

Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
ASC_Bike,-0.25843,0.0,0.0,-inf,inf,0,,-0.25843
ASC_SR,1.422554,0.0,0.0,-inf,inf,0,,1.422554
ASC_Transit,6.753916,0.0,0.0,-inf,inf,0,,6.753916
ASC_Walk,8.621143,0.0,0.0,-inf,inf,0,,8.621143
Cost,-0.175666,0.0,0.0,-inf,inf,0,,-0.175666
InVehTime,-0.123722,0.0,0.0,-inf,inf,0,,-0.123722
LogIncome:Bike,-0.196942,0.0,0.0,-inf,inf,0,,-0.196942
LogIncome:SR,-0.193766,0.0,0.0,-inf,inf,0,,-0.193766
LogIncome:Transit,-0.55711,0.0,0.0,-inf,inf,0,,-0.55711
LogIncome:Walk,-0.522755,0.0,0.0,-inf,inf,0,,-0.522755


if you get poor results, consider setting global bounds with model.set_cap()


In [92]:
# Compute the parameter covariance; presumably this is what supplies the
# Std Err / t Stat columns in the parameter summary that follows.
m.calculate_parameter_covariance()

In [93]:
# Table of estimated values with standard errors, t statistics and significance flags.
m.parameter_summary()

Unnamed: 0,Value,Std Err,t Stat,Signif,Null Value
ASC_Bike,-0.258,1.34,-0.19,,0.0
ASC_SR,1.42,1.0,1.42,,0.0
ASC_Transit,6.75,2.06,3.27,**,0.0
ASC_Walk,8.62,1.14,7.57,***,0.0
Cost,-0.176,0.12,-1.47,,0.0
InVehTime,-0.124,0.0292,-4.24,***,0.0
LogIncome:Bike,-0.197,0.124,-1.59,,0.0
LogIncome:SR,-0.194,0.135,-1.43,,0.0
LogIncome:Transit,-0.557,0.169,-3.29,***,0.0
LogIncome:Walk,-0.523,0.1,-5.21,***,0.0


In [94]:
# Goodness-of-fit summary: case count, log likelihoods and rho squared.
m.estimation_statistics()

Statistic,Aggregate,Per Case
Number of Cases,7564,7564
Log Likelihood at Convergence,-3493.04,-0.46
Log Likelihood at Null Parameters,-10644.66,-1.41
Rho Squared w.r.t. Null Parameters,0.672,0.672


## Save and Report Model

In [95]:
# Start a report that carries the model's title.
report = larch.Reporter(title=m.title)

In [96]:
# Append a section heading followed by the parameter summary table to the report.
report << '# Parameter Summary' << m.parameter_summary()

Unnamed: 0,Value,Std Err,t Stat,Signif,Null Value
ASC_Bike,-0.258,1.34,-0.19,,0.0
ASC_SR,1.42,1.0,1.42,,0.0
ASC_Transit,6.75,2.06,3.27,**,0.0
ASC_Walk,8.62,1.14,7.57,***,0.0
Cost,-0.176,0.12,-1.47,,0.0
InVehTime,-0.124,0.0292,-4.24,***,0.0
LogIncome:Bike,-0.197,0.124,-1.59,,0.0
LogIncome:SR,-0.194,0.135,-1.43,,0.0
LogIncome:Transit,-0.557,0.169,-3.29,***,0.0
LogIncome:Walk,-0.523,0.1,-5.21,***,0.0


In [97]:
# Write the report to an HTML file in the working directory, replacing any
# existing copy; the model itself is passed along as the report's metadata.
report.save('./exampville_mode_choice.html', overwrite=True, metadata=m)

'./exampville_mode_choice.html'