In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [3]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [4]:
import plotly
import plotly.graph_objects as go

In [5]:
from functions import read_data
from mlp import MLP

In [6]:
from config import list_cols_with_na

In [7]:
# Load the training and test frames through the project-local helper; it returns two
# objects that the rest of the notebook treats as DataFrames (df_train has "SalePrice").
# NOTE(review): the captured output of this cell shows a pandas SettingWithCopyWarning for an
# in-place drop of "SalePrice" — presumably raised inside read_data; confirm and fix it there.
df_train, df_test = read_data()

found 0 physical cores < 1
  File "c:\Users\Aaron\projects\kaggle-house_prices\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_final.drop(columns="SalePrice", inplace=True)


In [8]:
# Preprocessing pipeline: one-hot encode the categorical columns, min-max scale the rest.
# Column roles are read from df_test; this assumes df_test has the same feature columns
# as df_train minus "SalePrice" — TODO confirm against read_data.
cols_oh = [i for i in df_test.columns if df_test[i].dtype == "O"]   # object dtype -> categorical
cols_mm = [i for i in df_test.columns if i not in cols_oh]          # everything else -> numeric

# handle_unknown="ignore": the encoder is fitted on the training frame only, so a category
# that occurs only in the test frame is encoded as all zeros instead of raising in transform().
ct = ColumnTransformer([("oh", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cols_oh),
                        ("mm", MinMaxScaler(), cols_mm)],
                        remainder="passthrough",
                        verbose_feature_names_out=False)

pipe = Pipeline([("ct", ct)])

In [9]:
# Split off the target, fit the preprocessing on the training features only,
# then apply the already-fitted pipeline to a copy of the test frame.
y_train = df_train["SalePrice"]
x_train = pipe.fit_transform(df_train.drop(columns="SalePrice"))
x_test = pipe.transform(df_test.copy())

In [10]:
# MLP initialisation
# Device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
# Hyperparameters
# Seed torch's global generator before the model and the DataLoader are created, so that
# weight initialisation, batch shuffling and dropout masks repeat on Restart & Run All.
# (Some GPU kernels can still introduce small run-to-run differences.)
seed = 42
torch.manual_seed(seed)

num_epochs = 200          # full passes over the training set
batch_size = 2 ** 4       # mini-batch size (16)
learning_rate = 0.012     # step size handed to the optimiser
criterion = nn.MSELoss()  # mean squared error between prediction and target

In [12]:
# Load data
# Features: numpy arrays -> float32 tensors living on the selected device.
x_train, x_test = (torch.from_numpy(arr).to(device, torch.float32)
                   for arr in (x_train, x_test))

# Target: float32 column vector of shape (n, 1), the same layout as the
# model's single-unit output.
y_train = torch.from_numpy(y_train.values).to(device, torch.float32).reshape(-1, 1)

# Dataloader: shuffled mini-batches of (features, target) pairs
data_train = TensorDataset(x_train, y_train)
loader_train = DataLoader(dataset=data_train, shuffle=True, batch_size=batch_size)

In [13]:
# Model
# The input width is the number of columns produced by the preprocessing step.
model = MLP(input_size=x_train.size(1))
model = model.to(device)
optimiser = optim.Adam(params=model.parameters(), lr=learning_rate)
model

MLP(
  (input1): Linear(in_features=302, out_features=250, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (hidden1): Linear(in_features=250, out_features=100, bias=True)
  (output1): Linear(in_features=100, out_features=1, bias=True)
)

In [14]:
# Train model
# list_errors holds one value per epoch: the mean of that epoch's mini-batch losses.
list_errors = []
model.train()   # keep dropout active even if the model was put in eval mode by an earlier run
for epoch in range(num_epochs):
    list_epoch_errors = []
    for inputs, targets in loader_train:
        optimiser.zero_grad()               # clear gradients left over from the previous step
        outputs = model(inputs)
        # nn.MSELoss is called as criterion(input, target): prediction first, ground truth second
        loss = criterion(outputs, targets)
        loss.backward()
        optimiser.step()
        list_epoch_errors.append(loss.item())
    epoch_error = np.mean(list_epoch_errors)
    list_errors.append(epoch_error)
    if (epoch + 1) % 25 == 0:               # progress report every 25 epochs
        print(f"Epoch {epoch + 1} error: {epoch_error:.2f}")

Epoch 25 error: 861836729.91
Epoch 50 error: 625729769.35
Epoch 75 error: 473507370.70
Epoch 100 error: 415588881.39
Epoch 125 error: 414785513.22
Epoch 150 error: 338217052.13
Epoch 175 error: 365558934.74
Epoch 200 error: 317301683.30


In [15]:
# Plot loss graph
# The x axis is 1-based so it matches the "Epoch N" numbers printed by the training loop,
# and its length follows list_errors so the trace stays aligned if training stopped early.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, len(list_errors) + 1)),
                         y=list_errors,
                         name="Training loss"))
fig.update_layout(title="Training Loss Graph",
                  xaxis_title="No. of epochs",
                  yaxis_title="Loss")
plotly.offline.iplot(fig)

In [16]:
# Predict
# Switch to inference mode first: the model contains a Dropout layer, which would otherwise
# keep randomly zeroing activations and make the predictions noisy and non-repeatable.
model.eval()
with torch.no_grad():   # no gradients needed for inference
    # Move to host memory and flatten (n, 1) -> (n,) for the submission frame
    predict = model(x_test).cpu().numpy().reshape(-1)

In [17]:
# Output submission
# Ids are consecutive integers starting at 1461, one per test row
# (presumably the first Id of the raw test file — TODO confirm).
submission = pd.DataFrame(
    {
        "Id": 1461 + np.arange(len(df_test)),
        "SalePrice": predict,
    }
)
submission.to_csv("./submission/submission.csv", index=False)