# Regularization: $L_1$ and $L_2$

Author: Tianyu Du
Date: Sept. 28, 2022

Regularization is also known as **weight decay** or **penalized regression**. Adding a regularization term to the loss shrinks the coefficient magnitudes and helps prevent over-fitting.

Specifically, we add the $L_1$ or $L_2$ norm of coefficients to the loss (negative log-likelihood) function.

$$
\text{Loss} = \text{NegativeLogLikelihood} + \alpha \sum_{c \in \text{model coefficients}} ||c||_p \quad p \in \{1, 2\}
$$

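Readers can adjust the weight $\alpha$ (passed to the models below as the `regularization_weight` argument) to control the strength of regularization; the norm $p$ is selected with the `regularization` argument (`'L1'` or `'L2'`).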

In [2]:
import argparse

import numpy as np
import pandas as pd
import torch

from torch_choice.data import ChoiceDataset, JointDataset, utils
from torch_choice.model.nested_logit_model import NestedLogitModel
from torch_choice.model import ConditionalLogitModel
from torch_choice.utils.run_helper import run

In [3]:
# Pick the compute device once, then report which one the tutorial will use.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device == 'cuda':
    print(f'CUDA device used: {torch.cuda.get_device_name()}')
else:
    print('Running tutorial on CPU.')

CUDA device used: NVIDIA GeForce RTX 3090


## Conditional Logit Model

In [4]:
# Load the ModeCanada data, keep only the rows with `noalt == 4`, and order
# the rows by `case`. Built as one chain so no frame is mutated in place.
df = (
    pd.read_csv('./public_datasets/ModeCanada.csv')
    .query('noalt == 4')
    .reset_index(drop=True)
    .sort_values(by='case')
)

# Integer codes for the alternatives, assigned in the order of `item_names`.
item_names = ['air', 'bus', 'car', 'train']
num_items = 4
encoder = {name: code for code, name in zip(range(num_items), item_names)}

# The `alt` of every row flagged `choice == 1`, ordered by `case`, then
# translated to its integer code.
chosen_alt = df.loc[df['choice'] == 1].sort_values(by='case')['alt'].reset_index(drop=True)
item_index = torch.LongTensor(chosen_alt.map(lambda alt: encoder[alt]))

# Observables indexed by (case, alt); `cost`, `freq` and `ovt` are grouped
# into one observable, `ivt` is kept as a separate observable.
price_cost_freq_ovt = utils.pivot3d(
    df, dim0='case', dim1='alt', values=['cost', 'freq', 'ovt'])
price_ivt = utils.pivot3d(df, dim0='case', dim1='alt', values='ivt')

# One income value per case (the first row of each case), as a column vector.
income_per_case = df.groupby('case')['income'].first()
session_income = torch.Tensor(income_per_case.values).view(-1, 1)

dataset = ChoiceDataset(
    item_index=item_index,
    price_cost_freq_ovt=price_cost_freq_ovt,
    session_income=session_income,
    price_ivt=price_ivt,
).to(device)
print(dataset)

No `session_index` is provided, assume each choice instance is in its own session.
ChoiceDataset(label=[], item_index=[2779], user_index=[], session_index=[2779], item_availability=[], price_cost_freq_ovt=[2779, 4, 3], session_income=[2779, 1], price_ivt=[2779, 4, 1], device=cuda:0)


In [5]:
# Randomly split the dataset into a 70% training part and a 30% test part.
# The permutation is drawn from a generator with a fixed seed so that the split
# (and every result reported below) is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7

# shuffle the dataset.
N = len(dataset)
shuffle_index = np.random.default_rng(SPLIT_SEED).permutation(N)

# Compute the boundary once so the two slices cannot overlap or leave a gap.
num_train = int(TRAIN_FRACTION * N)
train_index = shuffle_index[:num_train]
test_index = shuffle_index[num_train:]

# splits of dataset.
dataset_train, dataset_test = dataset[train_index], dataset[test_index]

In [6]:
# One row per observable: name -> (coefficient variation, number of parameters).
# Keeping both properties side by side guarantees the two dictionaries below
# always list exactly the same observables, in the same order.
_feature_specs = {
    'price_cost_freq_ovt': ('constant', 3),
    'session_income': ('item', 1),
    'price_ivt': ('item-full', 1),
    'intercept': ('item', 1),
}

# Keyword arguments shared by every ConditionalLogitModel built in this notebook.
conditional_logit_common_arguments = dict(
    coef_variation_dict={name: variation
                         for name, (variation, _) in _feature_specs.items()},
    num_param_dict={name: num_params
                    for name, (_, num_params) in _feature_specs.items()},
    num_items=4,
)

In [7]:
def train_conditional_logit_model(regularization, regularization_weight,
                                  num_epochs=50000, learning_rate=0.003, batch_size=-1):
    """Build a conditional logit model, train it with `run`, and print its weight norm.

    The model is created from `conditional_logit_common_arguments`, moved to
    `device`, and handed to `run` together with `dataset_train` and
    `dataset_test` (all notebook-level globals).

    Args:
        regularization: regularization type forwarded to `ConditionalLogitModel`;
            this notebook passes `None`, `'L1'` or `'L2'`.
        regularization_weight: regularization weight forwarded to
            `ConditionalLogitModel`; this notebook passes `None` together with
            `regularization=None`.
        num_epochs: number of epochs forwarded to `run`.
        learning_rate: learning rate forwarded to `run`.
        batch_size: batch size forwarded to `run`. The default `-1` is the value
            this tutorial has always used (presumably full-batch training --
            confirm against `run`).

    Returns:
        None. The fitted model is not returned; only the summary line is printed.
    """
    model = ConditionalLogitModel(**conditional_logit_common_arguments,
                                  regularization=regularization,
                                  regularization_weight=regularization_weight).to(device)

    run(model, dataset_train, dataset_test=dataset_test, num_epochs=num_epochs,
        learning_rate=learning_rate, batch_size=batch_size)

    # report total model weight: the SUM of the L2 norms of the individual
    # parameter tensors (not the L2 norm of all weights concatenated).
    # Detach and convert to a Python float so the printed line is a plain number
    # rather than a tensor repr carrying device / grad_fn information.
    total_norm = float(sum(torch.norm(param.detach(), p=2) for param in model.parameters()))
    print('Total weight L2 norm:', total_norm)

In [7]:
# Baseline run: neither a regularization type nor a weight is passed to the model.
train_conditional_logit_model(regularization=None, regularization_weight=None)

ConditionalLogitModel(
  (coef_dict): ModuleDict(
    (price_cost_freq_ovt): Coefficient(variation=constant, num_items=4, num_users=None, num_params=3, 3 trainable parameters in total, device=cuda:0).
    (session_income): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
    (price_ivt): Coefficient(variation=item-full, num_items=4, num_users=None, num_params=1, 4 trainable parameters in total, device=cuda:0).
    (intercept): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
  )
)
Conditional logistic discrete choice model, expects input features:

X[price_cost_freq_ovt] with 3 parameters, with constant level variation.
X[session_income] with 1 parameters, with item level variation.
X[price_ivt] with 1 parameters, with item-full level variation.
X[intercept] with 1 parameters, with item level variation.
device=cuda:0
ChoiceDataset(label=[], item_

In [10]:
# Same model trained with regularization='L1' and a weight of 5.
train_conditional_logit_model(regularization='L1', regularization_weight=5)

ConditionalLogitModel(
  (coef_dict): ModuleDict(
    (price_cost_freq_ovt): Coefficient(variation=constant, num_items=4, num_users=None, num_params=3, 3 trainable parameters in total, device=cuda:0).
    (session_income): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
    (price_ivt): Coefficient(variation=item-full, num_items=4, num_users=None, num_params=1, 4 trainable parameters in total, device=cuda:0).
    (intercept): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
  )
)
Conditional logistic discrete choice model, expects input features:

X[price_cost_freq_ovt] with 3 parameters, with constant level variation.
X[session_income] with 1 parameters, with item level variation.
X[price_ivt] with 1 parameters, with item-full level variation.
X[intercept] with 1 parameters, with item level variation.
device=cuda:0
ChoiceDataset(label=[], item_

In [9]:
# Same model trained with regularization='L2' and the same weight of 5.
train_conditional_logit_model(regularization='L2', regularization_weight=5)

ConditionalLogitModel(
  (coef_dict): ModuleDict(
    (price_cost_freq_ovt): Coefficient(variation=constant, num_items=4, num_users=None, num_params=3, 3 trainable parameters in total, device=cuda:0).
    (session_income): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
    (price_ivt): Coefficient(variation=item-full, num_items=4, num_users=None, num_params=1, 4 trainable parameters in total, device=cuda:0).
    (intercept): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
  )
)
Conditional logistic discrete choice model, expects input features:

X[price_cost_freq_ovt] with 3 parameters, with constant level variation.
X[session_income] with 1 parameters, with item level variation.
X[price_ivt] with 1 parameters, with item-full level variation.
X[intercept] with 1 parameters, with item level variation.
device=cuda:0
ChoiceDataset(label=[], item_

In [8]:
# regularization='L1' again, now with a much larger weight (1E5) than the
# weight-5 run, to compare the reported total weight norm.
train_conditional_logit_model(regularization='L1', regularization_weight=1E5)

ConditionalLogitModel(
  (coef_dict): ModuleDict(
    (price_cost_freq_ovt): Coefficient(variation=constant, num_items=4, num_users=None, num_params=3, 3 trainable parameters in total, device=cuda:0).
    (session_income): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
    (price_ivt): Coefficient(variation=item-full, num_items=4, num_users=None, num_params=1, 4 trainable parameters in total, device=cuda:0).
    (intercept): Coefficient(variation=item, num_items=4, num_users=None, num_params=1, 3 trainable parameters in total, device=cuda:0).
  )
)
Conditional logistic discrete choice model, expects input features:

X[price_cost_freq_ovt] with 3 parameters, with constant level variation.
X[session_income] with 1 parameters, with item level variation.
X[price_ivt] with 1 parameters, with item-full level variation.
X[intercept] with 1 parameters, with item level variation.
device=cuda:0
ChoiceDataset(label=[], item_

## Nested Logit Model

In [12]:
# Load the HC data (first CSV column is the index) and renumber the rows.
df = pd.read_csv('./public_datasets/HC.csv', index_col=0).reset_index(drop=True)

# what was actually chosen: the `idx.id2` value of every row whose `depvar`
# is True, ordered by `idx.id1`.
item_index = df[df['depvar'] == True].sort_values(by='idx.id1')['idx.id2'].reset_index(drop=True)
item_names = ['ec', 'ecc', 'er', 'erc', 'gc', 'gcc', 'hpc']
num_items = df['idx.id2'].nunique()
# `zip` below silently truncates to the shorter input, so make sure the
# hand-written name list covers exactly as many items as the data contains.
assert num_items == len(item_names), (
    f'expected {len(item_names)} distinct items in idx.id2, found {num_items}')

# cardinal encoder.
encoder = dict(zip(item_names, range(num_items)))
item_index = torch.LongTensor(item_index.map(lambda x: encoder[x]))

# category feature: no category feature, all features are item-level.
category_dataset = ChoiceDataset(item_index=item_index.clone()).to(device)

# item feature.
item_feat_cols = ['ich', 'och', 'icca', 'occa', 'inc.room', 'inc.cooling', 'int.cooling']
price_obs = utils.pivot3d(df, dim0='idx.id1', dim1='idx.id2', values=item_feat_cols)

item_dataset = ChoiceDataset(item_index=item_index, price_obs=price_obs).to(device)

dataset = JointDataset(category=category_dataset, item=item_dataset)

# Which items belong to which category, written with the item names.
category_to_item_names = {0: ['gcc', 'ecc', 'erc', 'hpc'],
                          1: ['gc', 'ec', 'er']}

# encode items to integers (sorted within each category). Building a new dict
# avoids rebinding values of the dict that is being iterated over.
category_to_item = {
    category: sorted(encoder[item] for item in items)
    for category, items in category_to_item_names.items()
}

No `session_index` is provided, assume each choice instance is in its own session.
No `session_index` is provided, assume each choice instance is in its own session.


In [14]:
def train_nested_logit_model(regularization, regularization_weight, num_epochs=10000):
    """Build a nested logit model and train it with `run` on the joint dataset.

    The model uses the notebook-level `category_to_item` mapping, has no
    category-level coefficients, and has a single item-level observable
    `price_obs` with 7 parameters and 'constant' variation. It is moved to
    `device` and handed to `run` together with the notebook-level `dataset`.

    Args:
        regularization: regularization type forwarded to `NestedLogitModel`;
            this notebook passes `None`, `"L1"` or `"L2"`.
        regularization_weight: regularization weight forwarded to
            `NestedLogitModel`; this notebook passes `None` together with
            `regularization=None`.
        num_epochs: number of epochs forwarded to `run`.

    Returns:
        None. The fitted model is not returned.
    """
    model = NestedLogitModel(
        category_to_item=category_to_item,
        category_coef_variation_dict={},
        category_num_param_dict={},
        item_coef_variation_dict={'price_obs': 'constant'},
        item_num_param_dict={'price_obs': 7},
        regularization=regularization,
        regularization_weight=regularization_weight,
        shared_lambda=True,
    ).to(device)
    run(model, dataset, num_epochs=num_epochs)

In [18]:
# Baseline run: neither a regularization type nor a weight is passed.
train_nested_logit_model(regularization=None, regularization_weight=None)

NestedLogitModel(
  (category_coef_dict): ModuleDict()
  (item_coef_dict): ModuleDict(
    (price_obs): Coefficient(variation=constant, num_items=7, num_users=None, num_params=7, 7 trainable parameters in total, device=cuda:0).
  )
)
JointDataset with 2 sub-datasets: (
	category: ChoiceDataset(label=[], item_index=[250], user_index=[], session_index=[250], item_availability=[], device=cuda:0)
	item: ChoiceDataset(label=[], item_index=[250], user_index=[], session_index=[250], item_availability=[], price_obs=[250, 7, 7], device=cuda:0)
)
Epoch 1000: Log-likelihood=-226.63345336914062
Epoch 2000: Log-likelihood=-189.08030700683594
Epoch 3000: Log-likelihood=-181.08639526367188
Epoch 4000: Log-likelihood=-179.11544799804688
Epoch 5000: Log-likelihood=-178.78994750976562
Epoch 6000: Log-likelihood=-178.64102172851562
Epoch 7000: Log-likelihood=-178.50711059570312
Epoch 8000: Log-likelihood=-178.36279296875
Epoch 9000: Log-likelihood=-178.23562622070312
Epoch 10000: Log-likelihood=-178.1572

In [16]:
# Same model trained with regularization "L1" and a weight of 10.
train_nested_logit_model(regularization="L1", regularization_weight=10)

NestedLogitModel(
  (category_coef_dict): ModuleDict()
  (item_coef_dict): ModuleDict(
    (price_obs): Coefficient(variation=constant, num_items=7, num_users=None, num_params=7, 7 trainable parameters in total, device=cuda:0).
  )
)
JointDataset with 2 sub-datasets: (
	category: ChoiceDataset(label=[], item_index=[250], user_index=[], session_index=[250], item_availability=[], device=cuda:0)
	item: ChoiceDataset(label=[], item_index=[250], user_index=[], session_index=[250], item_availability=[], price_obs=[250, 7, 7], device=cuda:0)
)
Epoch 1000: Log-likelihood=-186.81593322753906
Epoch 2000: Log-likelihood=-187.0428924560547
Epoch 3000: Log-likelihood=-188.46871948242188
Epoch 4000: Log-likelihood=-187.3245849609375
Epoch 5000: Log-likelihood=-187.10488891601562
Epoch 6000: Log-likelihood=-187.18087768554688
Epoch 7000: Log-likelihood=-187.34005737304688
Epoch 8000: Log-likelihood=-187.11846923828125
Epoch 9000: Log-likelihood=-187.3697509765625
Epoch 10000: Log-likelihood=-187.0865

In [17]:
# Same model trained with regularization "L2" and the same weight of 10.
train_nested_logit_model(regularization="L2", regularization_weight=10)

NestedLogitModel(
  (category_coef_dict): ModuleDict()
  (item_coef_dict): ModuleDict(
    (price_obs): Coefficient(variation=constant, num_items=7, num_users=None, num_params=7, 7 trainable parameters in total, device=cuda:0).
  )
)
JointDataset with 2 sub-datasets: (
	category: ChoiceDataset(label=[], item_index=[250], user_index=[], session_index=[250], item_availability=[], device=cuda:0)
	item: ChoiceDataset(label=[], item_index=[250], user_index=[], session_index=[250], item_availability=[], price_obs=[250, 7, 7], device=cuda:0)
)
Epoch 1000: Log-likelihood=-219.621826171875
Epoch 2000: Log-likelihood=-200.87660217285156
Epoch 3000: Log-likelihood=-192.0721435546875
Epoch 4000: Log-likelihood=-183.12820434570312
Epoch 5000: Log-likelihood=-182.87225341796875
Epoch 6000: Log-likelihood=-183.52407836914062
Epoch 7000: Log-likelihood=-183.50723266601562
Epoch 8000: Log-likelihood=-183.5075225830078
Epoch 9000: Log-likelihood=-183.50465393066406
Epoch 10000: Log-likelihood=-183.50736