## Setup

In the cell below we load the data and split it into training and validation sets. We also load the test data.

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the training and test sets, indexed by the `Id` column
X_full = pd.read_csv('./input/train.csv', index_col='Id')
X_test_full = pd.read_csv('./input/test.csv', index_col='Id')

# Keep only rows that have a sale price, then peel the target off the predictors
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full.SalePrice
X_full = X_full.drop(columns=['SalePrice'])

# Categorical columns earmarked for Ordinal Encoding
ord_categorical_cols = [
    'Street', 'Alley',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional',
    'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC', 'Fence',
]

# Every remaining object-typed column is earmarked for OneHotEncoding
ohe_categorical_cols = [
    cname
    for cname, col_dtype in X_full.dtypes.items()
    if col_dtype == "object" and cname not in ord_categorical_cols
]

# For now BOTH groups of categorical columns are dropped, so only the
# non-object columns remain in X_full.
X_full = X_full.drop(columns=ohe_categorical_cols + ord_categorical_cols)


# Hold out 20% of the rows for validation (fixed random_state for a stable split)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y,
    train_size=0.8, test_size=0.2,
    random_state=0,
)

## Preprocessing

Transforming the data to `tensor` type so that we can use it in the `PyTorch` model.

In [9]:
from torch import tensor


def _as_float_tensor(values):
    """Copy array-like `values` into a torch tensor and cast it to float32."""
    return tensor(values).float()


# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if
                  X_train_full[cname].dtype in ['int64', 'float64']]

# Keep the numerical columns only. (The previous version first assigned the
# categorical + numerical list to `my_cols` and then immediately overwrote it;
# that dead assignment is removed so the cell says what it actually does.)
my_cols = numerical_cols

# Convert every split to float32 tensors for the PyTorch model.
# No explicit DataFrame .copy() is needed: torch.tensor() copies its input.
X_train = _as_float_tensor(X_train_full[my_cols].to_numpy())
X_valid = _as_float_tensor(X_valid_full[my_cols].to_numpy())
X_test = _as_float_tensor(X_test_full[my_cols].to_numpy())

# Targets are reshaped to a single column, i.e. shape (n_rows, 1)
y_train = _as_float_tensor(y_train.values.reshape(-1, 1))
y_valid = _as_float_tensor(y_valid.values.reshape(-1, 1))

# "Deploy" set: all labelled rows (train + validation together)
X_deploy = _as_float_tensor(X_full[my_cols].to_numpy())
y_deploy = _as_float_tensor(y.values.reshape(-1, 1))

### Creating the pipeline

First we define the preprocessing steps, which describe how we handle missing data. After trying different ways of dealing with missing data, the following works best:
- Replace any missing numerical value with the `mean` value in a particular column.

In this step, we also create the `Regressor` neural network model, which we wrap in a `NeuralNetRegressor` object. Lastly, we bundle everything together into an ML pipeline which we call `model_pipeline`.

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numerical preprocessing: fill each NaN with the mean of its column
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
])

In [11]:
## Model Definition

from torch import nn
import torch.nn.functional as F

class Regressor(nn.Module):
    """Fully connected regression network with two hidden ReLU layers.

    The layer widths used to be hard-coded; they are now constructor
    arguments whose defaults reproduce the original architecture, so
    ``Regressor()`` builds exactly the same network as before.

    Args:
        n_features: Number of values in each input row (default 36).
        hidden_1: Width of the first hidden layer (default 26).
        hidden_2: Width of the second hidden layer (default 52).
    """

    def __init__(self, n_features=36, hidden_1=26, hidden_2=52):
        super().__init__()

        self.first_layer = nn.Linear(n_features, hidden_1)
        self.second_layer = nn.Linear(hidden_1, hidden_2)
        self.final_layer = nn.Linear(hidden_2, 1)

    def forward(self, x_batch):
        """Map a batch of shape (batch, n_features) to predictions of shape (batch, 1)."""
        # Cast defensively to float32, the dtype of the nn.Linear weights.
        # NOTE(review): the batch presumably arrives as float64 after the
        # upstream preprocessing steps even though the tensors were cast
        # earlier -- confirm before removing this cast.
        X = self.first_layer(x_batch.float())
        X = F.relu(X)

        X = self.second_layer(X)
        X = F.relu(X)

        return self.final_layer(X)

## Declare Model

from skorch import NeuralNetRegressor
from torch import optim

# Wrap the torch module in skorch's regressor so it can be used as an estimator
# in the pipeline: Adam optimizer, 500 training epochs, no per-epoch logging.
skorch_regressor = NeuralNetRegressor(
    module=Regressor,
    optimizer=optim.Adam,
    max_epochs=500,
    verbose=0,
)

skorch_regressor

<class 'skorch.regressor.NeuralNetRegressor'>[uninitialized](
  module=<class '__main__.Regressor'>,
)

In [12]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
import torch

# Seed torch's global RNG before fitting. The network's initial weights are
# drawn at random, so without a seed every Restart & Run All trains a
# different model and the metrics reported below cannot be reproduced.
torch.manual_seed(0)

# Bundle preprocessing and modeling code in a pipeline:
# mean imputation -> robust scaling -> neural-network regressor
model_pipeline = Pipeline(steps=[('preprocessor', numerical_transformer),
                                 ('normalize', RobustScaler()),
                                 ('model', skorch_regressor)])
model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('imputer', SimpleImputer())])),
                ('normalize', RobustScaler()),
                ('model',
                 <class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=Regressor(
    (first_layer): Linear(in_features=36, out_features=26, bias=True)
    (second_layer): Linear(in_features=26, out_features=52, bias=True)
    (final_layer): Linear(in_features=52, out_features=1, bias=True)
  ),
))])

In [13]:
### Evaluate Model
from sklearn.metrics import mean_absolute_error

# Predict once per split and flatten to 1-D before computing the MAE.
train_pred = model_pipeline.predict(X_train).reshape(-1)
valid_pred = model_pipeline.predict(X_valid).reshape(-1)

# These scores are for the held-out *validation* split (X_valid / y_valid).
# They were previously printed as "Test", which was misleading because the
# notebook keeps a separate, unlabelled test set in X_test.
print("Train MAE : {}".format(mean_absolute_error(y_train, train_pred)))
print("Valid MAE : {}".format(mean_absolute_error(y_valid, valid_pred)))

print("\nTrain R^2 : {}".format(model_pipeline.score(X_train, y_train)))
print("Valid R^2 : {}".format(model_pipeline.score(X_valid, y_valid)))

Train MAE : 16703.05859375
Test  MAE : 21337.0859375

Train R^2 : 0.877506186505159
Test  R^2 : 0.6845423939729588
