## Importing packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

##  Metric definition

In [2]:
def rmse(predictions, reals):
    error = np.sqrt(mean_squared_error(predictions, reals))
    return error

## Loading data

In [3]:
DATA_PATH = '../data/'
train_data = pd.read_csv(DATA_PATH + 'train.csv')

In [4]:
target = np.log(train_data['SalePrice'])
exclude_columns = ['SalePrice','Id','PoolQC','MiscFeature','Fence','Alley']
train_data = train_data.drop(exclude_columns, axis=1)

In [5]:
numerical_features = train_data.select_dtypes(['int64','float64']).columns
categorical_features = train_data.select_dtypes(['object']).columns

## Pipeline creation

Creating the pipelines for numerical and categorical variables.
For numerical features, the strategy for missing values is to impute the median of that features
For categorical, the strategy is to imput the most frequent value of that category and then to apply one hot encoding to the features. If a category is present only in the test set, then it should be ignored.

In [6]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(np.log1p, validate=True))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ])

In [8]:
rf_model = RandomForestRegressor(n_estimators=100)

In [9]:
rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', rf_model)
])

## Creating a test set for evaluation

In [10]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size=0.33, shuffle=True, random_state=0)

In [11]:
rf_pipeline.fit(X_train, y_train)
train_predictions = rf_pipeline.predict(X_train)
train_error = rmse(train_predictions, y_train)
test_predictions = rf_pipeline.predict(X_test)
test_error = rmse(test_predictions, y_test)

print('RMSE on the training set {}'.format(train_error))
print('RMSE on the testing set {}'.format(rmse(test_predictions, y_test)))

RMSE on the training set 0.05506999845145948
RMSE on the testing set 0.13257448656554563


## Experimenting with XGBoost

In [12]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [53]:
d_train = xgb.DMatrix(X_train, label=np.array(y_train).reshape(-1))

In [122]:
xgbm = xgb.XGBRegressor(max_depth=3,
                        learning_rate=0.01,
                        n_estimators=5000,
                        subsample=0.5,
                        colsample_bytree=0.5,
                        objective='reg:squarederror',
                        n_jobs=-1)

In [123]:
cv_result = xgb.cv(xgbm.get_xgb_params(),
                   d_train,
                   num_boost_round=10000,
                   nfold=10,
                   early_stopping_rounds=100,
                   shuffle=True,
                   verbose_eval=200,
                   seed=0)

print('\nRSME:\n{}'.format(cv_result.min()))

[0]	train-rmse:11.4181+0.00528713	test-rmse:11.418+0.0476852
[200]	train-rmse:1.5637+0.000988714	test-rmse:1.56402+0.0386249
[400]	train-rmse:0.252671+0.000990898	test-rmse:0.266444+0.0217045
[600]	train-rmse:0.108279+0.00147489	test-rmse:0.143585+0.0169133
[800]	train-rmse:0.0906773+0.00149908	test-rmse:0.133711+0.0178225
[1000]	train-rmse:0.0826122+0.00144531	test-rmse:0.130762+0.0184208
[1200]	train-rmse:0.0764472+0.00134489	test-rmse:0.129319+0.0188328
[1400]	train-rmse:0.0712989+0.00122234	test-rmse:0.12844+0.0193539
[1600]	train-rmse:0.0668175+0.00121404	test-rmse:0.127648+0.0197561
[1800]	train-rmse:0.0628387+0.00116555	test-rmse:0.127333+0.0197329
[2000]	train-rmse:0.0593611+0.00109491	test-rmse:0.127016+0.0198
[2200]	train-rmse:0.056186+0.00104709	test-rmse:0.12664+0.0198073
[2400]	train-rmse:0.053323+0.00100908	test-rmse:0.126605+0.0197935
[2600]	train-rmse:0.0507126+0.000971153	test-rmse:0.126399+0.0197713

RSME:
train-rmse-mean    0.050957
train-rmse-std     0.000684
test-r

In [124]:
print('Best number of trees = {}'.format(cv_result.shape[0]))
xgbm.set_params(n_estimators=cv_result.shape[0])

Best number of trees = 2582


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=2582,
             n_jobs=-1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.5, verbosity=1)

## Training full model

In [125]:
train_data = preprocessor.transform(train_data)
target = np.array(target).reshape((-1, ))

In [126]:
xgbm.fit(train_data, target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=2582,
             n_jobs=-1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.5, verbosity=1)

## Making submission

In [127]:
test_data = pd.read_csv(DATA_PATH + 'test.csv')

In [128]:
X_test = preprocessor.transform(test_data)
test_prediction = np.exp(xgbm.predict(X_test))

In [129]:
test_prediction = pd.Series(test_prediction, name='SalePrice')
test_prediction = pd.concat([test_prediction, test_data['Id']], axis=1).set_index('Id')
test_prediction.to_csv('../output/submission.csv')