In [152]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

In [153]:
df_orig = pd.read_csv("train.csv")
print(df_orig.shape)
print("Null Counts:")
print(df_orig.isnull().sum()[df_orig.isnull().sum() > 0])

(1460, 81)
Null Counts:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [154]:
df = df_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id", "SalePrice"])
print(df.shape)

(1460, 75)


In [155]:
def fill_nulls(df, mean, mode):
    df.fillna(mean, inplace=True)
    df.fillna(mode, inplace=True)

mean = df.mean()
mode = df.mode().iloc[0]
fill_nulls(df, mean, mode)
print(df.isnull().sum()[df.isnull().sum() > 0])

Series([], dtype: int64)


In [156]:
def encode_categorical_features(df, enc):
    # Flatten enc.categories_ which is a list of np.array
    cat_list = np.concatenate(enc.categories_).ravel()
    df[cat_list] = pd.DataFrame(enc.transform(df[categorical_features]).toarray(), index=df.index)
    df = df.drop(columns=categorical_features)
    return df

categorical_features = df.select_dtypes(exclude=[np.number]).columns
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df[categorical_features])
df = encode_categorical_features(df, onehot)
print(df.shape)

(1460, 203)


In [157]:
X = df
y = df_orig.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1168, 203) (1168,)
(292, 203) (292,)


In [158]:
param_dist = {"min_samples_leaf": randint(1, 50),
              "max_features": randint(100, 200),
              "min_samples_split": randint(2, 50),
              "n_estimators": randint(10, 100),
              "max_depth": randint(1, 50)
             }

regr = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=1), random_state=1, param_distributions=param_dist, n_jobs=-1, n_iter=20, verbose=100, scoring="neg_mean_squared_log_error", cv=5)
regr.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to new file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64

[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.6s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.6s

Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 


Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.2s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_fold

[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.3s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:    2.3s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.2s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dt

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:    4.3s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:    4.3s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64

[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    5.2s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    5.2s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    5.9s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:    5.9s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    7.1s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_14056_7343377822\14056-283106440-05c03f279d824c819c934a156bce6a1d.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  91 tasks      | elapsed:    7.1s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [159]:
train_rmsle = np.sqrt(-regr.score(X_train, y_train))
test_rmsle = np.sqrt(-regr.score(X_test, y_test))
print(train_rmsle, test_rmsle)
print(regr.best_params_)

0.06990406450897191 0.1531766109835723
{'max_depth': 19, 'max_features': 115, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 35}


In [160]:
df_test_orig = pd.read_csv("test.csv")
df_test = df_test_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id"])
fill_nulls(df_test, mean, mode)
df_test = encode_categorical_features(df_test, onehot)
predictions = regr.predict(df_test)
predictions = np.squeeze(predictions)
df_submit = pd.DataFrame({'Id': df_test_orig.Id, 'SalePrice': predictions})
df_submit.to_csv('submission.csv', index=False)