## Train and compare the performance of residential location choice models

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from functions import set_seed, train, evaluate_nn, Config
from models import MNL_Choice, SCL_Choice, GNNChoiceModel, MLP_Choice
from data_process import load_data, spatial_choice_dataset_interact
# Load the shared inputs used by every model cell below. load_data() is a
# project helper; judging by how the results are used later, `comm` and `hh`
# are presumably the community and household tables, `edge_index` the graph
# connectivity handed to the graph-based models, and `distance_to_work` an
# array indexed by household position -- TODO confirm against data_process.
comm, hh, edge_index, distance_to_work = load_data()

# Feature names handed to the dataset constructor together with `comm`
# (presumably columns of that table -- confirm against data_process).
# Split into two literals purely for readability: names without a suffix
# first, then the ones carrying the "_scaled" suffix. The resulting order is
# unchanged.
comm_features = [
    "pop_density", "white_prop", "black_prop",
    "single_res", "multi_res", "office", "retail", "land_mix",
] + [
    "transit_a_scaled",
    "med_house_age_scaled",
    "med_value_scaled",
    "h_units_scaled",
    "median_inc_scaled",
]

# Feature names handed to the dataset constructor together with the `hh`
# rows (presumably household-level columns -- confirm against data_process).
hh_features = [
    "hh_income_scaled",
    "race_white",
    "race_black",
]

### The GNN model

In [None]:
# Train and evaluate the GNN choice model with k-fold cross-validation, then
# save the per-fold metrics to CSV and print their means.
import os  # cell-local import: only needed to create the output folder

# --- Training and evaluation configuration ---------------------------------
device = torch.device("cpu")
config = Config()
config.bs = 32
config.num_hidden = 64
config.dropout = 0
config.optimizer = "adam"  # one of [adam, sgd]
config.lr = 0.01
config.lr_scheduler = "one_cycle"  # one of [step, one_cycle, exp, none]
config.n_epoch = 20
config.model = "GATConv"  # or any other model name; also used in the CSV file names
config.heads = 4  # Number of attention heads for GAT
config.mode = "disabled"  # online or disabled
config.residual = True
config.seed = 100

n_folds = 10  # number of cross-validation folds
my_dataset = spatial_choice_dataset_interact
criterion = nn.NLLLoss()  # identical for every fold, so it is built once

# --- Cross-validation loop --------------------------------------------------
train_results = []
test_results = []
for i in range(n_folds):
    # Fold i holds out every n_folds-th household, starting at offset i;
    # all remaining households form the training set.
    test_idx = np.arange(i, len(hh), n_folds)
    train_idx = np.setdiff1d(np.arange(len(hh)), test_idx)
    # NOTE(review): hh.loc is label-based while distance_to_work is indexed by
    # position, so this assumes hh carries a default 0..n-1 index -- confirm.
    train_dataset = my_dataset(
        comm,
        hh.loc[train_idx, :],
        distance_to_work[train_idx],
        comm_features,
        hh_features,
    )
    test_dataset = my_dataset(
        comm,
        hh.loc[test_idx, :],
        distance_to_work[test_idx],
        comm_features,
        hh_features,
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.bs, shuffle=True
    )
    # The whole held-out fold is evaluated as a single batch.
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=len(test_dataset), shuffle=False
    )

    # Re-seed before building the model so every fold starts from the same
    # random state.
    set_seed(config.seed)
    model = GNNChoiceModel(
        train_dataset[0][0].shape[-1],  # last dimension of one sample's input
        config.num_hidden,
        edge_index,
        dropout=config.dropout,
        heads=config.heads,
        residual=config.residual,
    ).to(device)

    # Re-seed again so training itself starts from the same random state.
    set_seed(config.seed)
    # NOTE(review): the held-out loader is also passed to train(); confirm it
    # is only used for monitoring and not for model selection.
    model = train(
        model,
        criterion,
        train_loader,
        None,
        test_loader,
        config,
        device,
        verbose=False,
    )
    train_results.append(evaluate_nn(model, train_loader, comm))
    test_results.append(evaluate_nn(model, test_loader, comm))

    # Adjacent f-strings are concatenated by the parser. Unlike a backslash
    # continuation inside one literal, this keeps the source indentation out
    # of the printed line.
    print(
        f"LLL_train: {train_results[-1]['LLL']:.4f} \t "
        f"LLL_test: {test_results[-1]['LLL']:.4f} \t "
        f"accuracy_train: {train_results[-1]['accuracy']:.4f} \t "
        f"accuracy_test: {test_results[-1]['accuracy']:.4f} \t "
        f"Fold: {i + 1}/{n_folds}"
    )  # Log the results

# --- Collect, save and summarise --------------------------------------------
# Turn the list of per-fold metric dicts into one column per metric.
train_results = {
    key: [d[key] for d in train_results] for key in train_results[0].keys()
}
test_results = {
    key: [d[key] for d in test_results] for key in test_results[0].keys()
}
train_results = pd.DataFrame(train_results)
test_results = pd.DataFrame(test_results)

# to_csv does not create missing parent folders, so make sure it exists.
os.makedirs("results", exist_ok=True)
train_results.to_csv(f"results/{config.model}_train_results.csv", index=False)
test_results.to_csv(f"results/{config.model}_test_results.csv", index=False)

print(f"Train results mean: \n{train_results.mean()}")
print(f"Test results mean: \n{test_results.mean()}")

LLL_train: -11566.5605 	 LLL_test: -1350.1887 	         accuracy_train: 0.1384 	 accuracy_test: 0.1094 	         Fold: 1/10
LLL_train: -11564.4082 	 LLL_test: -1333.9615 	         accuracy_train: 0.1395 	 accuracy_test: 0.0990 	         Fold: 2/10
LLL_train: -11558.5752 	 LLL_test: -1326.8843 	         accuracy_train: 0.1367 	 accuracy_test: 0.1458 	         Fold: 3/10
LLL_train: -11613.8643 	 LLL_test: -1313.5713 	         accuracy_train: 0.1364 	 accuracy_test: 0.1276 	         Fold: 4/10
LLL_train: -11607.5010 	 LLL_test: -1284.3704 	         accuracy_train: 0.1346 	 accuracy_test: 0.1641 	         Fold: 5/10
LLL_train: -11612.0410 	 LLL_test: -1329.2964 	         accuracy_train: 0.1387 	 accuracy_test: 0.1172 	         Fold: 6/10
LLL_train: -11564.8408 	 LLL_test: -1311.0835 	         accuracy_train: 0.1361 	 accuracy_test: 0.1016 	         Fold: 7/10
LLL_train: -11628.8145 	 LLL_test: -1267.0181 	         accuracy_train: 0.1335 	 accuracy_test: 0.1589 	         Fold: 8/10
LLL_trai

### MLP model

In [3]:
# Train and evaluate the MLP choice model with k-fold cross-validation, then
# save the per-fold metrics to CSV and print their means.
import os  # cell-local import: only needed to create the output folder

# --- Training and evaluation configuration ---------------------------------
# Defined here so this cell does not depend on the GNN cell having been run.
device = torch.device("cpu")
config = Config()
config.bs = 32
config.num_hidden = 64
config.dropout = 0
config.optimizer = "adam"  # one of [adam, sgd]
config.lr = 0.01
config.lr_scheduler = "one_cycle"  # one of [step, one_cycle, exp, none]
config.n_epoch = 20
config.model = "MLP"  # or any other model name; also used in the CSV file names
config.mode = "disabled"  # online or disabled
config.seed = 100

n_folds = 10  # number of cross-validation folds
my_dataset = spatial_choice_dataset_interact
criterion = nn.NLLLoss()  # identical for every fold, so it is built once

# --- Cross-validation loop --------------------------------------------------
train_results = []
test_results = []
for i in range(n_folds):
    # Fold i holds out every n_folds-th household, starting at offset i;
    # all remaining households form the training set.
    test_idx = np.arange(i, len(hh), n_folds)
    train_idx = np.setdiff1d(np.arange(len(hh)), test_idx)
    # NOTE(review): hh.loc is label-based while distance_to_work is indexed by
    # position, so this assumes hh carries a default 0..n-1 index -- confirm.
    train_dataset = my_dataset(
        comm,
        hh.loc[train_idx, :],
        distance_to_work[train_idx],
        comm_features,
        hh_features,
    )
    test_dataset = my_dataset(
        comm,
        hh.loc[test_idx, :],
        distance_to_work[test_idx],
        comm_features,
        hh_features,
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.bs, shuffle=True
    )
    # The whole held-out fold is evaluated as a single batch.
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=len(test_dataset), shuffle=False
    )

    # Re-seed before building the model so every fold starts from the same
    # random state.
    set_seed(config.seed)
    model = MLP_Choice(
        train_dataset[0][0].shape[-1],  # last dimension of one sample's input
        config.num_hidden,
        config.dropout,
    ).to(device)

    # Re-seed again so training itself starts from the same random state.
    set_seed(config.seed)
    # NOTE(review): the held-out loader is also passed to train(); confirm it
    # is only used for monitoring and not for model selection.
    model = train(
        model,
        criterion,
        train_loader,
        None,
        test_loader,
        config,
        device,
        verbose=False,
    )
    train_results.append(evaluate_nn(model, train_loader, comm))
    test_results.append(evaluate_nn(model, test_loader, comm))

    # Adjacent f-strings are concatenated by the parser. Unlike a backslash
    # continuation inside one literal, this keeps the source indentation out
    # of the printed line.
    print(
        f"LLL_train: {train_results[-1]['LLL']:.4f} \t "
        f"LLL_test: {test_results[-1]['LLL']:.4f} \t "
        f"accuracy_train: {train_results[-1]['accuracy']:.4f} \t "
        f"accuracy_test: {test_results[-1]['accuracy']:.4f} \t "
        f"Fold: {i + 1}/{n_folds}"
    )  # Log the results

# --- Collect, save and summarise --------------------------------------------
# Turn the list of per-fold metric dicts into one column per metric.
train_results = {
    key: [d[key] for d in train_results] for key in train_results[0].keys()
}
test_results = {
    key: [d[key] for d in test_results] for key in test_results[0].keys()
}
train_results = pd.DataFrame(train_results)
test_results = pd.DataFrame(test_results)

# to_csv does not create missing parent folders, so make sure it exists.
os.makedirs("results", exist_ok=True)
train_results.to_csv(f"results/{config.model}_train_results.csv", index=False)
test_results.to_csv(f"results/{config.model}_test_results.csv", index=False)

print(f"Train results mean: \n{train_results.mean()}")
print(f"Test results mean: \n{test_results.mean()}")


LLL_train: -11756.6230 	 LLL_test: -1352.9230 	         accuracy_train: 0.1283 	 accuracy_test: 0.1172 	         Fold: 1/10
LLL_train: -11761.9961 	 LLL_test: -1351.3311 	         accuracy_train: 0.1314 	 accuracy_test: 0.0964 	         Fold: 2/10
LLL_train: -11758.9531 	 LLL_test: -1341.5723 	         accuracy_train: 0.1265 	 accuracy_test: 0.1302 	         Fold: 3/10
LLL_train: -11800.4072 	 LLL_test: -1317.3148 	         accuracy_train: 0.1265 	 accuracy_test: 0.1276 	         Fold: 4/10
LLL_train: -11814.6699 	 LLL_test: -1296.9275 	         accuracy_train: 0.1210 	 accuracy_test: 0.1562 	         Fold: 5/10
LLL_train: -11775.7949 	 LLL_test: -1335.1088 	         accuracy_train: 0.1323 	 accuracy_test: 0.1016 	         Fold: 6/10
LLL_train: -11772.7666 	 LLL_test: -1321.4036 	         accuracy_train: 0.1314 	 accuracy_test: 0.1016 	         Fold: 7/10
LLL_train: -11823.1211 	 LLL_test: -1280.6267 	         accuracy_train: 0.1251 	 accuracy_test: 0.1484 	         Fold: 8/10
LLL_trai

### SCL model

In [4]:
# Train and evaluate the SCL choice model with k-fold cross-validation, then
# save the per-fold metrics to CSV and print their means.
import os  # cell-local import: only needed to create the output folder

# --- Training and evaluation configuration ---------------------------------
# Defined here so this cell does not depend on the GNN cell having been run.
device = torch.device("cpu")
config = Config()
config.num_hidden = 64
config.dropout = 0
config.optimizer = "lbfgs"  # "lbfgs" here; the neural-network cells use "adam"
config.lr = 0.1
config.n_epoch = 20
config.model = "SCL"  # or any other model name; also used in the CSV file names
config.mode = "disabled"  # online or disabled
config.seed = 100

n_folds = 10  # number of cross-validation folds
my_dataset = spatial_choice_dataset_interact
criterion = nn.NLLLoss()  # identical for every fold, so it is built once

# --- Cross-validation loop --------------------------------------------------
train_results = []
test_results = []
for i in range(n_folds):
    # Fold i holds out every n_folds-th household, starting at offset i;
    # all remaining households form the training set.
    test_idx = np.arange(i, len(hh), n_folds)
    train_idx = np.setdiff1d(np.arange(len(hh)), test_idx)
    # NOTE(review): hh.loc is label-based while distance_to_work is indexed by
    # position, so this assumes hh carries a default 0..n-1 index -- confirm.
    train_dataset = my_dataset(
        comm,
        hh.loc[train_idx, :],
        distance_to_work[train_idx],
        comm_features,
        hh_features,
    )
    test_dataset = my_dataset(
        comm,
        hh.loc[test_idx, :],
        distance_to_work[test_idx],
        comm_features,
        hh_features,
    )

    # Both loaders use one batch that contains the entire fold.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=len(train_dataset), shuffle=True
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=len(test_dataset), shuffle=False
    )

    # Re-seed before building the model so every fold starts from the same
    # random state.
    set_seed(config.seed)
    model = SCL_Choice(train_dataset[0][0].shape[-1], edge_index).to(device)

    # Re-seed again so training itself starts from the same random state.
    set_seed(config.seed)
    # NOTE(review): the held-out loader is also passed to train(); confirm it
    # is only used for monitoring and not for model selection.
    model = train(
        model,
        criterion,
        train_loader,
        None,
        test_loader,
        config,
        device,
        verbose=False,
    )
    train_results.append(evaluate_nn(model, train_loader, comm))
    test_results.append(evaluate_nn(model, test_loader, comm))

    # Adjacent f-strings are concatenated by the parser. Unlike a backslash
    # continuation inside one literal, this keeps the source indentation out
    # of the printed line.
    print(
        f"LLL_train: {train_results[-1]['LLL']:.4f} \t "
        f"LLL_test: {test_results[-1]['LLL']:.4f} \t "
        f"accuracy_train: {train_results[-1]['accuracy']:.4f} \t "
        f"accuracy_test: {test_results[-1]['accuracy']:.4f} \t "
        f"Fold: {i + 1}/{n_folds}"
    )  # Log the results
    # mu is reported as the sigmoid of the model's raw parameter.
    print(f"Estimated mu value={torch.sigmoid(model.mu_raw).detach().numpy()}")

# --- Collect, save and summarise --------------------------------------------
# Turn the list of per-fold metric dicts into one column per metric.
train_results = {
    key: [d[key] for d in train_results] for key in train_results[0].keys()
}
test_results = {
    key: [d[key] for d in test_results] for key in test_results[0].keys()
}
train_results = pd.DataFrame(train_results)
test_results = pd.DataFrame(test_results)

# to_csv does not create missing parent folders, so make sure it exists.
os.makedirs("results", exist_ok=True)
train_results.to_csv(f"results/{config.model}_train_results.csv", index=False)
test_results.to_csv(f"results/{config.model}_test_results.csv", index=False)

print(f"Train results mean: \n{train_results.mean()}")
print(f"Test results mean: \n{test_results.mean()}")


LLL_train: -11981.8516 	 LLL_test: -1360.5092 	         accuracy_train: 0.1233 	 accuracy_test: 0.1094 	         Fold: 1/10
Estimated mu value=0.9999631643295288
LLL_train: -11996.9883 	 LLL_test: -1344.0950 	         accuracy_train: 0.1236 	 accuracy_test: 0.1094 	         Fold: 2/10
Estimated mu value=0.999929666519165
LLL_train: -11975.9453 	 LLL_test: -1366.6655 	         accuracy_train: 0.1248 	 accuracy_test: 0.1224 	         Fold: 3/10
Estimated mu value=0.9999314546585083
LLL_train: -11988.4316 	 LLL_test: -1354.8347 	         accuracy_train: 0.1257 	 accuracy_test: 0.1224 	         Fold: 4/10
Estimated mu value=0.9999300241470337
LLL_train: -12026.8535 	 LLL_test: -1315.6599 	         accuracy_train: 0.1184 	 accuracy_test: 0.1484 	         Fold: 5/10
Estimated mu value=0.9999678134918213
LLL_train: -11981.1895 	 LLL_test: -1360.4285 	         accuracy_train: 0.1265 	 accuracy_test: 0.1016 	         Fold: 6/10
Estimated mu value=0.9999291896820068
LLL_train: -11995.2080 	 LLL_

### MNL model

In [5]:
# Train and evaluate the MNL choice model with k-fold cross-validation, then
# save the per-fold metrics to CSV and print their means.
import os  # cell-local import: only needed to create the output folder

# --- Training and evaluation configuration ---------------------------------
# Defined here so this cell does not depend on the GNN cell having been run.
device = torch.device("cpu")
config = Config()
config.num_hidden = 64
config.dropout = 0
config.optimizer = "lbfgs"  # "lbfgs" here; the neural-network cells use "adam"
config.lr = 0.1
config.n_epoch = 20
config.model = "MNL"  # or any other model name; also used in the CSV file names
config.mode = "disabled"  # online or disabled
config.seed = 100

n_folds = 10  # number of cross-validation folds
my_dataset = spatial_choice_dataset_interact
criterion = nn.NLLLoss()  # identical for every fold, so it is built once

# --- Cross-validation loop --------------------------------------------------
train_results = []
test_results = []
for i in range(n_folds):
    # Fold i holds out every n_folds-th household, starting at offset i;
    # all remaining households form the training set.
    test_idx = np.arange(i, len(hh), n_folds)
    train_idx = np.setdiff1d(np.arange(len(hh)), test_idx)
    # NOTE(review): hh.loc is label-based while distance_to_work is indexed by
    # position, so this assumes hh carries a default 0..n-1 index -- confirm.
    train_dataset = my_dataset(
        comm,
        hh.loc[train_idx, :],
        distance_to_work[train_idx],
        comm_features,
        hh_features,
    )
    test_dataset = my_dataset(
        comm,
        hh.loc[test_idx, :],
        distance_to_work[test_idx],
        comm_features,
        hh_features,
    )

    # Both loaders use one batch that contains the entire fold.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=len(train_dataset), shuffle=True
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=len(test_dataset), shuffle=False
    )

    # Re-seed before building the model so every fold starts from the same
    # random state.
    set_seed(config.seed)
    # The model is sized from the last and first dimensions of one sample's
    # input tensor.
    model = MNL_Choice(
        train_dataset[0][0].shape[-1],
        train_dataset[0][0].shape[0],
    ).to(device)

    # Re-seed again so training itself starts from the same random state.
    set_seed(config.seed)
    # NOTE(review): the held-out loader is also passed to train(); confirm it
    # is only used for monitoring and not for model selection.
    model = train(
        model,
        criterion,
        train_loader,
        None,
        test_loader,
        config,
        device,
        verbose=False,
    )
    train_results.append(evaluate_nn(model, train_loader, comm))
    test_results.append(evaluate_nn(model, test_loader, comm))

    # Adjacent f-strings are concatenated by the parser. Unlike a backslash
    # continuation inside one literal, this keeps the source indentation out
    # of the printed line.
    print(
        f"LLL_train: {train_results[-1]['LLL']:.4f} \t "
        f"LLL_test: {test_results[-1]['LLL']:.4f} \t "
        f"accuracy_train: {train_results[-1]['accuracy']:.4f} \t "
        f"accuracy_test: {test_results[-1]['accuracy']:.4f} \t "
        f"Fold: {i + 1}/{n_folds}"
    )  # Log the results

# --- Collect, save and summarise --------------------------------------------
# Turn the list of per-fold metric dicts into one column per metric.
train_results = {
    key: [d[key] for d in train_results] for key in train_results[0].keys()
}
test_results = {
    key: [d[key] for d in test_results] for key in test_results[0].keys()
}
train_results = pd.DataFrame(train_results)
test_results = pd.DataFrame(test_results)

# to_csv does not create missing parent folders, so make sure it exists.
os.makedirs("results", exist_ok=True)
train_results.to_csv(f"results/{config.model}_train_results.csv", index=False)
test_results.to_csv(f"results/{config.model}_test_results.csv", index=False)

print(f"Train results mean: \n{train_results.mean()}")
print(f"Test results mean: \n{test_results.mean()}")


LLL_train: -11981.8496 	 LLL_test: -1360.5039 	         accuracy_train: 0.1233 	 accuracy_test: 0.1094 	         Fold: 1/10
LLL_train: -11996.9863 	 LLL_test: -1344.0957 	         accuracy_train: 0.1236 	 accuracy_test: 0.1094 	         Fold: 2/10
LLL_train: -11975.9414 	 LLL_test: -1366.6665 	         accuracy_train: 0.1248 	 accuracy_test: 0.1224 	         Fold: 3/10
LLL_train: -11988.4287 	 LLL_test: -1354.8396 	         accuracy_train: 0.1257 	 accuracy_test: 0.1224 	         Fold: 4/10
LLL_train: -12026.8516 	 LLL_test: -1315.6589 	         accuracy_train: 0.1184 	 accuracy_test: 0.1484 	         Fold: 5/10
LLL_train: -11981.1875 	 LLL_test: -1360.4270 	         accuracy_train: 0.1265 	 accuracy_test: 0.1016 	         Fold: 6/10
LLL_train: -11995.2070 	 LLL_test: -1347.7151 	         accuracy_train: 0.1262 	 accuracy_test: 0.1146 	         Fold: 7/10
LLL_train: -12041.0664 	 LLL_test: -1300.9236 	         accuracy_train: 0.1213 	 accuracy_test: 0.1380 	         Fold: 8/10
LLL_trai