In [1]:
import pandas as pd
import torch
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from sklearn.model_selection import train_test_split

input_dir = '/kaggle/input/house-prices-advanced-regression-techniques/'

In [2]:
def read_and_preprocess_csv(input_dir):
    '''
    Reads and preprocesses train and test csv's

    Args:
        input_dir (str): directory of the house prices dataset

    Returns:
        X_train (pd.DataFrame): input for the model training
        y_train (pd.Series): labels for training
        X_test (pd.DataFrame): input for the model testing
    '''
    train_csv = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    test_csv = pd.read_csv(os.path.join(input_dir, 'test.csv'))

    ground_truth = 'SalePrice'

    features = pd.concat(
        (train_csv.drop(columns=['Id', ground_truth]),
         test_csv.drop(columns=['Id']))
    )

    numerical_features = features.dtypes[features.dtypes != 'object'].index

    features[numerical_features] = features[numerical_features].fillna(features[numerical_features].mean())

    features = pd.get_dummies(features, dummy_na=True)
        
    scaler = StandardScaler()
    features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

    features = features.astype('float32')
    
    y_train = np.log1p(train_csv[ground_truth])
    # y_train = train_csv[ground_truth]

    X_train = features.iloc[:len(y_train)]
    X_test = features.iloc[len(y_train):]

    return X_train, y_train, X_test

X_train_df, y_train_df, X_test_df = read_and_preprocess_csv(input_dir)

print(f'X_train shape: {X_train_df.shape}, y_train shape: {y_train_df.shape}, X_test shape: {X_test_df.shape}')

X_train_df

X_train shape: (1460, 330), y_train shape: (1460,), X_test shape: (1459, 330)


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,0.067331,-0.202068,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.525202,0.580907,-0.293130,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0
1,-0.873616,0.501870,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.572250,1.178112,-0.293130,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0
2,0.067331,-0.061280,0.137197,0.646183,-0.507284,0.980221,0.848965,0.334828,0.097873,-0.293130,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0
3,0.302568,-0.436714,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.572250,-0.494941,-0.293130,...,-0.049029,0.395018,-0.018512,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693,0.0
4,0.067331,0.689587,0.518903,1.355551,-0.507284,0.947203,0.753229,1.387486,0.468931,-0.293130,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.067331,-0.342855,-0.285470,-0.063185,-0.507284,0.914184,0.753229,-0.572250,-0.969192,-0.293130,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0
1456,-0.873616,0.736516,0.381311,-0.063185,0.391237,0.220801,0.178812,0.094060,0.765338,0.670525,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0
1457,0.302568,-0.155138,-0.142806,0.646183,3.086800,-1.000876,1.040437,-0.572250,-0.365400,-0.293130,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0
1458,-0.873616,-0.061280,-0.057207,-0.772552,0.391237,-0.703711,0.561757,-0.572250,-0.861608,5.790313,...,-0.049029,0.395018,-0.018512,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693,0.0


In [3]:
X_train_df, X_valid_df, y_train_df, y_valid_df = train_test_split(X_train_df, y_train_df, test_size=0.2, random_state=1)

X_train = torch.tensor(X_train_df.values, dtype=torch.float32)
y_train = torch.tensor(y_train_df.values, dtype=torch.float32).unsqueeze(1)
X_valid = torch.tensor(X_valid_df.values, dtype=torch.float32)
y_valid = torch.tensor(y_valid_df.values, dtype=torch.float32).unsqueeze(1)

print(X_train.shape, y_train.shape)

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)

batch_size = 64

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

torch.Size([1168, 330]) torch.Size([1168, 1])


In [4]:
class CustomMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(330, 256),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        return self.net(x)

def get_gpus():
    gpu_count = torch.cuda.device_count()
    return [torch.device(f'cuda:{i}') for i in range(gpu_count)]

devices = get_gpus()
print(devices)

net = CustomMLP()

[device(type='cuda', index=0), device(type='cuda', index=1)]


In [6]:
def train(net, dataloader, num_epochs, lr, momentum, devices):
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])

    loss = nn.MSELoss()
    optim = torch.optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    for epoch in range(num_epochs):
        net.train()
        for X, y in dataloader:
            X, y = X.to(devices[0]), y.to(devices[0])
            optim.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optim.step()
        if (epoch+1) % 5 == 0:
            print(f'Epoch {epoch+1} Loss: {l.item():.5f}')

train(net, train_loader, 100, lr=0.001, momentum=0.9, devices=devices)

  return F.linear(input, self.weight, self.bias)


Epoch 5 Loss: 2.25123
Epoch 10 Loss: 3.37110
Epoch 15 Loss: 3.54134
Epoch 20 Loss: 2.13126
Epoch 25 Loss: 1.66895
Epoch 30 Loss: 1.49272
Epoch 35 Loss: 1.63901
Epoch 40 Loss: 1.06410
Epoch 45 Loss: 4.29376
Epoch 50 Loss: 0.81860
Epoch 55 Loss: 2.06815
Epoch 60 Loss: 0.90963
Epoch 65 Loss: 1.28576
Epoch 70 Loss: 0.77915
Epoch 75 Loss: 0.82414
Epoch 80 Loss: 0.83678
Epoch 85 Loss: 0.91088
Epoch 90 Loss: 1.65972
Epoch 95 Loss: 1.51636
Epoch 100 Loss: 0.95176


In [7]:
def eval_log_rmse(net, dataloader, devices):
    net.eval()
    total_loss, count = 0.0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(devices[0]), y.to(devices[0])
            y = torch.clamp(y, min=1.0)
            outputs = net(X)
            pred = torch.clamp(outputs, min=1.0)
            loss = torch.sqrt(torch.mean( (torch.log(pred) - torch.log(y) )**2))
            total_loss += loss.item() * y.size(0)
            count += y.size(0)

    return total_loss / count
            
rmse = eval_log_rmse(net, valid_loader, devices)
print(f'Validation accuracy: {rmse:.5f}')

Validation accuracy: 0.24596


In [8]:
X_test_tensor = torch.tensor(X_test_df.values, dtype=torch.float32).to(devices[0])

net.eval()
with torch.no_grad():
    preds = net(X_test_tensor).squeeze(1).cpu().numpy()

house_ids = pd.read_csv(os.path.join(input_dir, 'test.csv'))['Id']

submission = pd.DataFrame({
    'Id': house_ids,
    'SalePrice': np.expm1(preds).squeeze()
})
print(submission)
submission.to_csv('submission.csv', index=False)

        Id      SalePrice
0     1461   55909.746094
1     1462   15938.531250
2     1463  170394.812500
3     1464   94239.445312
4     1465  210807.046875
...    ...            ...
1454  2915   49645.968750
1455  2916   44455.550781
1456  2917  169528.140625
1457  2918   90549.406250
1458  2919  129695.460938

[1459 rows x 2 columns]
