In [1]:
import random
import numpy as np
import pandas as pd

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

In [4]:
import plotly
import plotly.graph_objects as go

In [5]:
from functions import read_data
from mlp import MLP

In [6]:
from config import list_cols_with_na

In [7]:
# Load the training and test frames through the project-local `read_data` helper.
# NOTE(review): the captured output of this cell shows a joblib/loky
# "physical cores" message and a pandas SettingWithCopyWarning on
# `df_test_final.drop(..., inplace=True)`; presumably both originate inside the
# helper or its imports — confirm and fix there.
df_train, df_test = read_data()

found 0 physical cores < 1
  File "c:\Users\Aaron\projects\kaggle-house_prices\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_final.drop(columns="SalePrice", inplace=True)


In [8]:
# Random seeding: give PyTorch, NumPy and the stdlib RNG the same fixed seed so
# that weight initialisation and batch shuffling are repeatable across runs.
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(12)

In [9]:
# Preprocessing pipeline.
# Object-dtype columns are one-hot encoded; every other column is min-max scaled.
# The two column groups are derived from the dtypes of df_test.
cols_oh = [col for col in df_test.columns if df_test[col].dtype == "O"]
cols_mm = [col for col in df_test.columns if col not in cols_oh]

encoder = OneHotEncoder(sparse_output=False, handle_unknown="infrequent_if_exist")
scaler = MinMaxScaler()
ct = ColumnTransformer(
    transformers=[("oh", encoder, cols_oh), ("mm", scaler, cols_mm)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

pipe = Pipeline(steps=[("ct", ct)])

In [10]:
# Separate features from the target, hold out 15% of the rows for validation,
# then fit the preprocessing on the training split only and reuse the fitted
# pipeline for the validation and test features.
y_train_o = df_train["SalePrice"]
x_train_o = df_train.drop(columns="SalePrice")
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_o,
    y_train_o,
    train_size=0.85,
    shuffle=True,
    random_state=12,
)
x_test = df_test.copy()
x_train = pipe.fit_transform(x_train)
x_valid, x_test = pipe.transform(x_valid), pipe.transform(x_test)

In [11]:
# MLP initialisation
# Compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [12]:
# Hyperparameters for the training loop
num_epochs = 50          # full passes over the training loader
batch_size = 16          # 2 ** 4 rows per mini-batch
learning_rate = 0.012    # step size handed to the optimiser
criterion = nn.MSELoss() # mean squared error between prediction and target

In [13]:
class MLP(nn.Module):
    """Small fully connected regressor: input_size -> 100 -> 50 -> 1.

    ReLU follows each of the first two linear layers, with dropout (p=0.4)
    applied between them; the last layer emits one unactivated value per row.

    NOTE: this definition rebinds the name `MLP` that the notebook also imports
    from the local `mlp` module, so later cells use this class, not the import.
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: number of features in each input row.
        """
        super().__init__()
        # Attribute names and registration order determine the module repr and
        # the state_dict keys, so they are kept exactly as they were.
        self.input1 = nn.Linear(in_features=input_size, out_features=100)
        self.dropout1 = nn.Dropout(p=0.4)
        self.hidden1 = nn.Linear(in_features=100, out_features=50)
        self.output1 = nn.Linear(in_features=50, out_features=1)

    def forward(self, x):
        """Return the network output for `x`, whose last dimension is `input_size`."""
        first_activation = F.relu(self.input1(x))
        # Dropout is only active in training mode; in eval mode it is the identity.
        regularised = self.dropout1(first_activation)
        second_activation = F.relu(self.hidden1(regularised))
        prediction = self.output1(second_activation)
        return prediction


In [14]:
# Model: size the input layer to the preprocessed feature count, move the
# network to the selected device and attach an Adam optimiser to its weights.
n_features = x_train.shape[1]
model = MLP(input_size=n_features)
model = model.to(device)
optimiser = optim.Adam(params=model.parameters(), lr=learning_rate)
model

MLP(
  (input1): Linear(in_features=297, out_features=100, bias=True)
  (dropout1): Dropout(p=0.4, inplace=False)
  (hidden1): Linear(in_features=100, out_features=50, bias=True)
  (output1): Linear(in_features=50, out_features=1, bias=True)
)

In [15]:
# Load data: turn the preprocessed NumPy features and the pandas targets into
# float32 tensors on `device`. The x_*/y_* names are rebound in place, so this
# cell expects the NumPy/pandas versions produced by the preprocessing cell.
def _to_device_tensor(array):
    """Wrap a NumPy array as a float32 tensor placed on `device`."""
    return torch.from_numpy(array).to(device, torch.float32)


x_train, x_valid, x_test = (
    _to_device_tensor(features) for features in (x_train, x_valid, x_test)
)
# Targets become column vectors of shape (n, 1).
y_train, y_valid = (
    _to_device_tensor(target.values.reshape(-1, 1)) for target in (y_train, y_valid)
)

# Dataloader: only the training loader shuffles its batches.
data_train = TensorDataset(x_train, y_train)
loader_train = DataLoader(data_train, batch_size=batch_size, shuffle=True)
data_valid = TensorDataset(x_valid, y_valid)
loader_valid = DataLoader(data_valid, batch_size=batch_size)

In [16]:
# Train model
# Each list receives one value per epoch: the mean of that epoch's batch losses.
list_errors_train = []
list_errors_valid = []
for epoch in range(num_epochs):
    list_epoch_errors_train = []
    list_epoch_errors_valid = []
    # Train: dropout is active and the optimiser steps once per mini-batch.
    model.train()
    for inputs, targets in loader_train:
        optimiser.zero_grad()
        outputs = model(inputs)
        # The loss module is called as criterion(input, target): prediction first.
        loss = criterion(outputs, targets)
        loss.backward()
        optimiser.step()
        list_epoch_errors_train.append(loss.item())
    list_errors_train.append(np.mean(list_epoch_errors_train))
    # Valid: dropout is disabled and autograd is switched off, because no
    # backward pass happens here and tracking gradients only costs memory/time.
    model.eval()
    with torch.no_grad():
        for inputs_v, targets_v in loader_valid:
            loss_v = criterion(model(inputs_v), targets_v)
            list_epoch_errors_valid.append(loss_v.item())
    list_errors_valid.append(np.mean(list_epoch_errors_valid))
    # Print epoch count every tenth epoch
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1} finished")

Epoch 10 finished
Epoch 20 finished
Epoch 30 finished
Epoch 40 finished
Epoch 50 finished


In [17]:
# Plot loss graph: per-epoch mean training and validation loss on shared axes.
epoch_axis = list(range(num_epochs))
fig = go.Figure(
    data=[
        go.Scatter(x=epoch_axis, y=list_errors_train, name="Training loss"),
        go.Scatter(x=epoch_axis, y=list_errors_valid, name="Valid loss"),
    ]
)
fig.update_layout(
    title="Loss Graph",
    xaxis_title="No. of epochs",
    yaxis_title="Loss",
)
plotly.offline.iplot(fig)

In [18]:
# Predict: run the trained network in eval mode over the train, valid and test
# features, without gradient tracking, and flatten each result to a 1-D NumPy array.
model.eval()
with torch.no_grad():
    predict_train, predict_valid, predict_test = (
        model(features).cpu().numpy().reshape(-1)
        for features in (x_train, x_valid, x_test)
    )

In [19]:
# Write to working file: append this run's train/valid RMSE and the model layout.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, because that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_train = np.sqrt(mean_squared_error(y_train.cpu().numpy(), predict_train))
rmse_valid = np.sqrt(mean_squared_error(y_valid.cpu().numpy(), predict_valid))
with open("working.txt", "a") as f:
    f.write(f"train: {rmse_train}\n")
    f.write(f"valid: {rmse_valid}\n")
    f.write(f"Epochs {num_epochs}: {model}")
    f.write("\n")