In [352]:
import numpy as np
np.random.seed(1)

In [353]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

In [354]:
df_orig = pd.read_csv("train.csv")
print(df_orig.shape)
print("Null Counts:")
print(df_orig.isnull().sum()[df_orig.isnull().sum() > 0])

(1460, 81)
Null Counts:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [355]:
df = df_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id", "SalePrice"])
print(df.shape)

(1460, 75)


In [356]:
def fill_nulls(df, mean, mode):
    df.fillna(mean, inplace=True)
    df.fillna(mode, inplace=True)

mean = df.mean()
mode = df.mode().iloc[0]
fill_nulls(df, mean, mode)
print(df.isnull().sum()[df.isnull().sum() > 0])

Series([], dtype: int64)


In [357]:
def encode_categorical_features(df, enc):
    # Flatten enc.categories_ which is a list of np.array
    cat_list = np.concatenate(enc.categories_).ravel()
    df[cat_list] = pd.DataFrame(enc.transform(df[categorical_features]).toarray(), index=df.index)
    df = df.drop(columns=categorical_features)
    return df

categorical_features = df.select_dtypes(exclude=[np.number]).columns
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df[categorical_features])
df = encode_categorical_features(df, onehot)
print(df.shape)

(1460, 203)


In [358]:
X = df
y = df_orig.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1168, 203) (1168,)
(292, 203) (292,)


In [359]:
class LogUniform:
    def __init__(self, low, high):
        self.low = low
        self.high = high
    
    def rvs(self, *args, **kwds):
        return 10 ** np.random.uniform(np.log10(self.low), np.log10(self.high))

In [360]:
param_dist = {"eta": LogUniform(0.01, 1),
              "max_depth": randint(1, 10),
              "min_child_weight" : randint(1, 40),
              "subsample": uniform(0.1, 0.9),
              "gamma": uniform(1, 100),
             }

regr = RandomizedSearchCV(estimator=XGBRegressor(), random_state=1, param_distributions=param_dist, n_jobs=-1, n_iter=20, verbose=100, scoring="neg_mean_squared_log_error", cv=5)
regr.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to new file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64

Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.3s
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.3s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:    5.2s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    5.3s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    8.0s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:    8.5s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:   11.9s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(203,), dtype=object).[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.9s

Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

Pickling array (shape=(203,), dtype=object).[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   14.0s

Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   14.2s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:   16.2s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   16.3s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   18.7s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:   18.7s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   22.1s
Pickling array (shape=(203,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping (shape=(170, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_12496_7197157971\12496-104958888-e9a3998e84e3416e82bbe285ae94bb1c.pkl
Pickling array (shape=(33,), dtype=object).
Pickling array (shape=(170,), dtype=object).
Pickling array (shape=(33,), dtype=int64).
Pickling array (shape=(170,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
Pickling array (shape=(203,), dtype=object).[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   22.2s

Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(33, 1168), dtype=int64).
Memmapping 

  if getattr(data, 'base', None) is not None and \


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_st...
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000011640D68>,
                                        'min_child_weight': 

In [361]:
train_rmsle = np.sqrt(-regr.score(X_train, y_train))
test_rmsle = np.sqrt(-regr.score(X_test, y_test))
print(train_rmsle, test_rmsle)
print(regr.best_params_)

0.09629301921452238 0.13995129192127811
{'eta': 0.024900208186207435, 'gamma': 54.58964059155116, 'max_depth': 5, 'min_child_weight': 22, 'subsample': 0.5634002008524778}


In [362]:
df_test_orig = pd.read_csv("test.csv")
df_test = df_test_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id"])
fill_nulls(df_test, mean, mode)
df_test = encode_categorical_features(df_test, onehot)
predictions = regr.predict(df_test)
predictions = np.squeeze(predictions)
df_submit = pd.DataFrame({'Id': df_test_orig.Id, 'SalePrice': predictions})
df_submit.to_csv('submission.csv', index=False)