In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [34]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

In [4]:
import plotly
import plotly.graph_objects as go

In [5]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
from functions import read_data, seed_everything
from mlp import objective, EarlyStopper

In [7]:
from config import list_cols_with_na

In [8]:
seed_everything()

In [9]:
df_train, df_test = read_data()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_final.drop(columns="SalePrice", inplace=True)


In [10]:
# Pipeline for scaling
# One hot encoder
cols_oh = [i for i in df_test.columns if df_test[i].dtype == "O"]
cols_mm = [i for i in df_test.columns if i not in cols_oh]

ct = ColumnTransformer([("oh", OneHotEncoder(sparse_output=False,
                                             handle_unknown="infrequent_if_exist"), cols_oh),
                        ("mm", MinMaxScaler(), cols_mm)],
                        remainder="passthrough",
                        verbose_feature_names_out=False)

pipe = Pipeline([("ct", ct)])

In [11]:
# Fit and transform the pipeline
x_train, x_valid, y_train, y_valid = train_test_split(df_train.drop(columns="SalePrice"),
                                                      df_train["SalePrice"],
                                                      train_size=0.8, shuffle=True,
                                                      random_state=12)
x_train = pipe.fit_transform(x_train)
x_valid = pipe.transform(x_valid)

x_test = df_test.copy()
x_test = pipe.transform(x_test)

In [12]:
# Load data
tensor_train = TensorDataset(torch.from_numpy(x_train).to(dtype=torch.float), torch.from_numpy(y_train.values.reshape(-1, 1)).to(dtype=torch.float))
x_valid = torch.from_numpy(x_valid).to(dtype=torch.float)
y_valid = torch.from_numpy(y_valid.values.reshape(-1, 1)).to(dtype=torch.float)

In [13]:
# MLP initialisation
# Device
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device, torch.cuda.get_device_name(device))

cuda NVIDIA GeForce GTX 1660 SUPER


In [None]:
# Tune model
study = optuna.create_study()
study.optimize(lambda trial: objective(trial, tensor_train, x_valid, y_valid), n_jobs=-1, n_trials=500)

In [39]:
# Get best params
class MLP(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        # Input layer
        self.input = nn.Linear(input_size, 1193)

        # Output
        self.output = nn.Linear(1193, 1)


    def forward(self, x):
        x = F.sigmoid(self.input(x))
        return self.output(x)

study.best_params

{'n_layers': 1,
 'n_outputs_0': 1193,
 'act_func_0': 'Sigmoid',
 'do_0': False,
 'bn_0': False,
 'learning_rate': 0.000832146685052974,
 'batch_size': 4,
 'optimiser_name': 'Adam'}

In [40]:
# Initialise parameters
model = MLP(x_train.shape[1]).to(device)
num_epochs = 700
criterion = nn.MSELoss()
learning_rate = study.best_params["learning_rate"]
batch_size = study.best_params["batch_size"]
optimiser = optim.Adam(model.parameters(), lr=learning_rate)
print(model)

MLP(
  (input): Linear(in_features=296, out_features=1193, bias=True)
  (output): Linear(in_features=1193, out_features=1, bias=True)
)


In [25]:
# Load data
loader_train = DataLoader(tensor_train, batch_size=batch_size)

In [42]:
# train the model
list_errors_train = []
list_errors_valid = []
for epoch in range(num_epochs):
    # Train
    model.train()
    for batch, (inputs_train, targets_train) in enumerate(loader_train):
        inputs_train = inputs_train.to(device)
        targets_train = targets_train.to(device)
        outputs_train = model(inputs_train)

        optimiser.zero_grad()
        loss_train = criterion(outputs_train, targets_train)
        list_errors_train.append(loss_train.item())
        loss_train.backward()
        optimiser.step()

    # Validation
    model.eval()
    with torch.no_grad():
        x_valid = x_valid.to(device)
        y_valid = y_valid.to(device)
        outputs_valid = model(x_valid)

        loss_valid = criterion(outputs_valid, y_valid)
        list_errors_valid.append(loss_valid.item())

    # Early stopping
    early_stopper = EarlyStopper(patience=10, min_delta=0.01)
    if early_stopper.early_stop(loss_valid):
        break

In [48]:
# Plot loss graph
import plotly
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(num_epochs),
                         y=list_errors_train,
                         name="Train loss",
                         mode="lines"))
fig.add_trace(go.Scatter(x=np.arange(num_epochs),
                         y=list_errors_valid,
                         name="Valid loss",
                         mode="lines"))
fig.update_layout(title="Loss graph",
                  xaxis_title="Epochs",
                  yaxis_title="Loss")
plotly.offline.iplot(fig)

In [60]:
# Predict
with torch.no_grad():
    predict = model(torch.from_numpy(x_test).to(device, dtype=torch.float)).cpu().numpy().reshape(-1)
predict

array([11.880913 , 11.9201145, 12.281382 , ..., 12.035653 , 11.336559 ,
       12.161473 ], dtype=float32)

In [61]:
# Output submission
submission = pd.DataFrame({"Id": np.arange(1461, 1461 + len(df_test)),
                           "SalePrice": np.exp(predict)})
submission.to_csv("./submission/submission.csv", index=False)

In [66]:
predict

array([11.880913 , 11.9201145, 12.281382 , ..., 12.035653 , 11.336559 ,
       12.161473 ], dtype=float32)