In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 1024, 288, 128, 1
from sklearn.utils import shuffle

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

### Data preprocessing. Inspiration - https://www.kaggle.com/vikassingh1996/extensive-data-preprocessing-and-modeling#4.-Feature-Engineering-

In [2]:
df = pd.read_csv('train.csv') ## Import data
labe = df['SalePrice'].to_numpy()
df.drop(columns = ['SalePrice','Id'], inplace=True)
df = pd.get_dummies(df, dummy_na=True, drop_first=True)
df.fillna(df.median(),inplace=True)
a = df.to_numpy()

# col = [i for i in df] 
# print(col)

RS = RobustScaler().fit(a)
scaledA = RS.transform(a)
data = scaledA
scF = np.max(labe)
labe = labe/np.max(labe)
train_data, test_data, train_labels, test_labels = train_test_split(data, labe, test_size=0.2)

# print(len(data),len(data[0]))
# print(data)
# print(labe)

In [3]:
# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(H, D_out),
)
model.cuda()
loss_fn = torch.nn.MSELoss(reduction='mean')
learning_rate = 5e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
epochs = 1500
for t in range(epochs):
    j = 0
    data, labe = shuffle(train_data, train_labels)
    while j < len(train_data):
        x = data[j:j+N]
        x = torch.tensor(x, dtype=torch.float32)
        y = labe[j:j+N]
        y = torch.tensor(y, dtype=torch.float32)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        j += N
    print(t, loss.item())

RuntimeError: Expected object of backend CUDA but got backend CPU for argument #4 'mat1'

In [None]:
with torch.no_grad():
    x = torch.tensor(test_data, dtype=torch.float32)
    y = torch.tensor(test_labels, dtype=torch.float32)
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(loss.item()*scF)

### Thats a lot of error when thinking about the loss - that it's averaged over all the samples and still the error is that high. Introducing more layers and reducing the depth of each layer might help

Also reducing the learning rate just to see if that is the issue

In [None]:
N, D_in, H1, H2, D_out = 256, 288, 32, 32, 1
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H1),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(H1, H2),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(H2, D_out)
)
model.cuda()

loss_fn = torch.nn.MSELoss(reduction='mean')
learning_rate = 5e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
epochs = 1500
for t in range(epochs):
    j = 0
    data, labe = shuffle(train_data, train_labels)
    while j < len(train_data):
        x = data[j:j+N]
        x = torch.tensor(x, dtype=torch.float32)
        y = labe[j:j+N]
        y = torch.tensor(y, dtype=torch.float32)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        j += N
    print(t, loss.item())

In [None]:
with torch.no_grad():
    x = torch.tensor(test_data, dtype=torch.float32)
    y = torch.tensor(test_labels, dtype=torch.float32)
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(loss.item()*scF)
    print(loss.item())    

Sometimes this model gives worse results than the previos network. Tweaking the second network to improve results

Trying to introduce weights decay and dropout and see if that helps.

In [None]:
N, D_in, H1, H2, D_out = 32, 288, 64, 64, 1
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H1),
    torch.nn.Dropout(p=0.2),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(H1, H2),
    torch.nn.Dropout(p=0.2),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(H2, D_out)
)
model.cuda()

loss_fn = torch.nn.MSELoss(reduction='mean')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
epochs = 1500
for t in range(epochs):
    j = 0
    data, labe = shuffle(train_data, train_labels)
    while j < len(train_data):
        x = data[j:j+N]
        x = torch.tensor(x, dtype=torch.float32)
        y = labe[j:j+N]
        y = torch.tensor(y, dtype=torch.float32)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        j += N
    print(t, loss.item())

In [None]:
with torch.no_grad():
    x = torch.tensor(test_data, dtype=torch.float32)
    y = torch.tensor(test_labels, dtype=torch.float32)
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(loss.item()*scF)
    print(loss.item())    

Much better than the previous case. Dropout indeed works :)

In [None]:
N, D_in, H1, H2, D_out = 32, 288, 64, 64, 1
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H1),
    torch.nn.Dropout(p=0.2),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(H1, H2),
    torch.nn.Dropout(p=0.2),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(H2, D_out)
)
model.cuda()

loss_fn = torch.nn.MSELoss(reduction='mean')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-2) ## Has inbuilt weights decay param of 1e-2
epochs = 1500
for t in range(epochs):
    j = 0
    data, labe = shuffle(train_data, train_labels)
    while j < len(train_data):
        x = data[j:j+N]
        x = torch.tensor(x, dtype=torch.float32)
        y = labe[j:j+N]
        y = torch.tensor(y, dtype=torch.float32)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        j += N
    print(t, loss.item())

In [None]:
with torch.no_grad():
    x = torch.tensor(test_data, dtype=torch.float32)
    y = torch.tensor(test_labels, dtype=torch.float32)
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(loss.item()*scF)
    print(loss.item())

Weights decay + dropout + more depth with shallower layers has given the best results till now. This is mainly because the data has lesser samples and more features per sample. We have to base the network architecture accordingly to get the maximum benifit.