# Data Modeling
lets use the cleaned up data to try and predict house prices!
We'll build the following models:
 - An XGBoost model
 - A simple feed-forward network with Keras
 - An optimized feed-forward network with Keras and optuna

In [102]:
%load_ext autoreload
%autoreload 2
import datetime
import numpy as np
import pandas as pd
import optuna
from keras import backend as K
from optuna.trial import TrialState
# mute output to declutter notebook
from tensorflow import keras
import wandb
from wandb.integration.keras import WandbCallback
import os

os.environ["WANDB_SILENT"] = "True"

optuna.logging.set_verbosity(optuna.logging.WARNING)
from optuna.visualization import plot_optimization_history, plot_slice, plot_param_importances
from utils.utils import get_log_dir, get_project_root, create_submission

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We'll load the data from our HDF. Since the Dataset has no real validation set, we will split the training set and use 10% as our validation set.

In [103]:
# load the data from the preprocessed h5 file
with pd.HDFStore(get_project_root() / "data/formatted/dataset_clean.h5") as store:
    x_train = store["x_train"]
    y_train = store["y_train"]
    x_test = store["x_test"]

x_val = x_train[round(len(x_train) * 0.9):]
x_train = x_train[:round(len(x_train) * 0.9)]
y_val = y_train[round(len(y_train) * 0.9):]
y_train = y_train[:round(len(y_train) * 0.9)]

# create log targets
y_train_log = np.log(y_train)
y_val_log = np.log(y_val)

## Building Models

### 1. Simple FFN Network with Keras

We want to use root mean squared error as our loss function. This loss function doesn't exist in Keras, so we'll write it ourselves.
To track our progress, we'll use weights & biases and save the best model in a log folder, so we can recover them quickly

In [104]:
def rmsle(y_pred, y_true):
    return K.sqrt(K.mean(K.square(K.log(float(y_pred + 1)) - K.log(float(y_true + 1)))))

In [105]:
metrics = ["msle", rmsle]

In [106]:
def simple_ffn(X, Y, validation_data):
    # build the model
    model = keras.Sequential(
        [
            keras.layers.Input(shape=(len(X.columns),)),
            keras.layers.Dense(units=1000, activation="relu"),
            keras.layers.Dense(units=1000, activation="relu"),
            keras.layers.Dense(units=100, activation="relu"),
            keras.layers.Dense(units=10, activation="relu"),
            keras.layers.Dense(units=1),
        ])
    logs = get_log_dir() / datetime.datetime.now().strftime("%d-%m-%Y--%H-%M-%S")
    # create folder to save models at
    logs.mkdir(parents=True, exist_ok=True)
    best_model_path = logs / "best.hdf5"
    # save the best model, so we can compare the best, not the latest at the end.
    callbacks = [keras.callbacks.ModelCheckpoint(
        filepath=best_model_path.as_posix(),
        monitor="val_rmsle",
        mode="min",
        save_best_only=True,
        verbose=False
    )]
    opt = keras.optimizers.Adam()
    model.compile(optimizer=opt,
                  loss=rmsle,
                  metrics=metrics)
    wandb.init(project="kaggle-house-prices", group="simple_ffn", entity="fogx")
    callbacks.append(WandbCallback())
    model.fit(X, Y,
              epochs=100,
              validation_data=validation_data,
              verbose=1,
              callbacks=callbacks)
    wandb.finish()
    try:
        # load the best weights after training.
        model.load_weights(best_model_path)
    except FileNotFoundError:
        # if the model didn't improve just use the original
        pass
    return model

In [107]:
# call our method to fit the model
simple_ffn = simple_ffn(x_train, y_train_log, (x_val, y_val_log))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### 2. Modelling with optuna
Optuna tries to find optimal values by training surrogate models, and searching for good parameters. This means we don’t have to train the entire model, but only a small version of it. It also tries to optimize its search by “following” good leads and dropping/pruning bad ones.

In [108]:
sampler = optuna.samplers.TPESampler(seed=10)  # Make the sampler behave in a deterministic way.
study = optuna.create_study(study_name="stock_tpe_optuna", direction="minimize", sampler=sampler,
                            # pruner=optuna.pruners.NopPruner() # add the NopPruner to disable pruning
                            )

In [109]:
from optuna.integration import TFKerasPruningCallback
from keras.callbacks import ModelCheckpoint


def optuna_tpe(trial):
    n_layers = trial.suggest_int("n_layers", 1, 10)
    layers = []
    # create input and regularization layers
    input_layer = keras.layers.Input(len(x_train.columns))
    reg_layer = keras.layers.Normalization()
    reg_layer.adapt(x_train)
    model_list = [input_layer]
    # generate deep layers
    for i in range(n_layers):
        model_list.append(keras.layers.Dense(
            units=trial.suggest_int(f"layer_{i}", 16, 1024),
            activation=trial.suggest_categorical(f"activation_{i}", ["relu", "tanh", "linear"]),
        ))
        model_list.append(keras.layers.Dropout(trial.suggest_float(f"dropout_l{i}", 0, 0.5)))
    # add output layer without activation function to get a regression value
    model_list.append(keras.layers.Dense(1))
    model = keras.Sequential(model_list)
    optimizer = create_optimizer(trial)
    model.compile(
        loss="mse",
        optimizer=optimizer,
        metrics=metrics,
    )
    # define logs_dir for the best models and check that it exists
    logs_dir = get_log_dir() / f'{trial.number}--{datetime.datetime.now().strftime("%d-%m-%H-%M")}'
    logs_dir.mkdir(parents=True, exist_ok=True)
    callbacks = [
        ModelCheckpoint(filepath=(logs_dir / "best.hdf5").as_posix(), verbose=False),
        TFKerasPruningCallback(trial, f"val_{metrics[0]}")]
    # train the model
    model.fit(
        x_train,
        y_train_log,
        validation_data=(x_val, y_val_log),
        epochs=trial.suggest_int("epochs", 1, 500),
        batch_size=trial.suggest_int("batch_size", 16, 500),
        callbacks=callbacks,
        verbose=False
    )
    # evaluate on validation set and return just the mse. since our loss function is also MSE we can just return that
    model_eval = model.evaluate(x_val, y_val)
    return model_eval[0]


def create_optimizer(trial):
    # We optimize the choice of optimizers as well as their parameters.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])
    # optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    if optimizer_name == "Adam":
        adam_lr = trial.suggest_float("adam_lr", 1e-5, 1e-1, log=True)
        return keras.optimizers.Adam(learning_rate=adam_lr)
    elif optimizer_name == "RMSprop":
        rms_lr = trial.suggest_float("rmsprop_lr", 1e-5, 1e-1, log=True)
        rms_decay = trial.suggest_float("rmsprop_decay", 0.85, 0.99)
        rms_momentum = trial.suggest_float("rmsprop_momentum", 1e-5, 1e-1, log=True)
        return keras.optimizers.RMSprop(learning_rate=rms_lr, momentum=rms_momentum, decay=rms_decay)
    else:
        sgd_lr = trial.suggest_float("sgd_lr", 1e-5, 1e-1, log=True)
        sgd_momentum = trial.suggest_float("sgd_momentum", 1e-5, 1e-1, log=True)
        return keras.optimizers.SGD(learning_rate=sgd_lr, momentum=sgd_momentum)

In [110]:
start = datetime.datetime.now()
study.optimize(optuna_tpe,
               n_trials=100,  # run 100 trials
               # timeout=1800 # run for 30 minutes
               )
end = datetime.datetime.now()



First we can look at how our trials improved over time.

In [111]:
plot_optimization_history(study)

We can also look at which features had the most impact and what the algorithm preferred when training

In [112]:
plot_param_importances(study)

We see that it trended towards layers of size 300, and preferred 7+ layers. RELU was also much more used than the other activation functions.

In [113]:
plot_slice(study, ["layer_0", "layer_1", "layer_2", "n_layers", "epochs", "activation_1"])

We can view the best trial with `study.best_trial` and `study.best_params`. To load the trial, we have to access our logs and load the network with these parameters.

In [117]:
print(study.best_value)
study.best_params

36092211200.0


{'n_layers': 6,
 'layer_0': 242,
 'activation_0': 'relu',
 'dropout_l0': 0.10359288564510727,
 'layer_1': 157,
 'activation_1': 'relu',
 'dropout_l1': 0.1763777378855086,
 'layer_2': 482,
 'activation_2': 'relu',
 'dropout_l2': 0.45285103548806827,
 'layer_3': 493,
 'activation_3': 'tanh',
 'dropout_l3': 0.11917891003150101,
 'layer_4': 403,
 'activation_4': 'linear',
 'dropout_l4': 0.4653187381634932,
 'layer_5': 187,
 'activation_5': 'relu',
 'dropout_l5': 0.12813268247518356,
 'optimizer': 'RMSprop',
 'rmsprop_lr': 0.005058843190771526,
 'rmsprop_decay': 0.9360055719915432,
 'rmsprop_momentum': 0.012415987431436591,
 'epochs': 124,
 'batch_size': 93}

ERROR! Session/line number was not unique in database. History logging moved to new session 166


In [115]:
best_model = keras.models.load_model(
    get_log_dir() / f'{study.best_trial.number}--{study.best_trial.datetime_start.strftime("%d-%m-%H-%M")}' / "best.hdf5",
    custom_objects={"rmsle": rmsle})

To predict a single value in our pandas dataframe, we have to convert it to an (1,-1) ndarray first

In [116]:
best_model.predict(x_test.iloc[1].to_numpy().reshape(1, -1))

array([[11.713863]], dtype=float32)

We'll use our utility function `create_submission` to create the csv that we want to upload

In [None]:
create_submission(best_model, "test")