# Problem

This is the regression task, the evaluation metric is the RMSE between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [36]:
import numpy as np
import pandas as pd

In [37]:
train_dataset = pd.read_csv('/content/train.csv')
train_dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [38]:
X = train_dataset.loc[:, train_dataset.columns.drop('SalePrice')]
Y = train_dataset.loc[:, 'SalePrice']
X.shape, Y.shape

((1460, 80), (1460,))

In [39]:
numerical_features_ids = np.where(X.dtypes != 'object')[0]
categorical_features_ids = np.where(X.dtypes == 'object')[0]
len(numerical_features_ids), len(categorical_features_ids)

(37, 43)

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)
X_train.shape, X_val.shape, Y_train.shape, Y_val.shape

((1168, 80), (292, 80), (1168,), (292,))

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        numerical_features_mask = X.dtypes != 'object'
        X.loc[:, numerical_features_mask].fillna(value=-999, inplace=True)
        X.loc[:, ~numerical_features_mask].fillna(value='-999', inplace=True)
        return X

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

preprocessing_pipeline = Pipeline(steps=[
    ('imputer', CustomTransformer()),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

In [43]:
X_train = preprocessing_pipeline.fit_transform(X_train)
X_val = preprocessing_pipeline.transform(X_val)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, numerical_features_mask].fillna(value=-999, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, ~numerical_features_mask].fillna(value='-999', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, numerical_features_mask].fillna(value=-999, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [44]:
def mean_squared_error(
    y_true,
    y_pred,
    *,
    sample_weight=None,
    multioutput="uniform_average",
    squared="deprecated",
):

    output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)

In [45]:
def root_mean_squared_error(
    y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
):

    output_errors = np.sqrt(
        mean_squared_error(
            y_true, y_pred, sample_weight=sample_weight, multioutput="raw_values"
        )
    )

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None

    return np.average(output_errors, weights=multioutput)


In [46]:
def root_mean_squared_log_error(
    y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
):

    if (y_true < 0).any() or (y_pred < 0).any():
        raise ValueError(
            "Root Mean Squared Logarithmic Error cannot be used when "
            "targets contain negative values."
        )

    return root_mean_squared_error(
        np.log1p(y_true),
        np.log1p(y_pred),
        sample_weight=sample_weight,
        multioutput=multioutput,
    )

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

cv_params = {'n_estimators': [100, 250, 500],
             'max_depth': [10, 25, 50],
             'min_samples_leaf': [1, 10, 20]}
cv_reg = GridSearchCV(estimator=RandomForestRegressor(),
                      param_grid=cv_params,
                      scoring=make_scorer(root_mean_squared_log_error, greater_is_better=False),
                      verbose=2,
                      cv=5)

In [48]:
cv_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   7.5s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   6.4s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   7.7s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   6.3s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=100; total time=   7.3s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=250; total time=  16.9s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=250; total time=  17.8s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=250; total time=  17.4s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=250; total time=  16.8s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=250; total time=  17.7s
[CV] END .max_depth=10, min_samples_leaf=1, n_estimators=500; total time=  34.2s
[CV] END .max_depth=10, min_samples_leaf=1, n_e

In [49]:
cv_reg.best_estimator_

In [50]:
cv_reg.score(X_val, Y_val)

-0.16418320145516244

In [51]:
X = preprocessing_pipeline.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, numerical_features_mask].fillna(value=-999, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, ~numerical_features_mask].fillna(value='-999', inplace=True)


In [52]:
final_est = cv_reg.best_estimator_

In [53]:
final_est.fit(X, Y)

In [54]:
test_dataset = pd.read_csv('/content/test.csv')
X_test = test_dataset.copy()
X_test = preprocessing_pipeline.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, numerical_features_mask].fillna(value=-999, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, ~numerical_features_mask].fillna(value='-999', inplace=True)


In [55]:
submission = pd.DataFrame()
submission['Id'] = test_dataset.loc[:, 'Id']
submission['SalePrice'] = cv_reg.predict(X_test)

In [56]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,116148.053127
1,1462,148852.965099
2,1463,175452.007573
3,1464,185225.015311
4,1465,206113.159451


In [57]:
submission.to_csv('/content/rfc.csv', index=False)