In [76]:
import sys
assert sys.version_info >= (3, 5)

import sklearn
assert sklearn.__version__ >= "0.20"

import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [77]:
import pandas as pd

def load_housing_data(housing_path='data'):
    train_csv_path = os.path.join(housing_path, "train.csv")
    test_csv_path = os.path.join(housing_path, "test.csv")
    return pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

train, test = load_housing_data()

In [78]:
corr_matrix = train.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [79]:
num_att = list(train.select_dtypes([np.number]).columns)

# att = ['OverallQual', 'GrLivArea']

to_remove = ['BsmtFinSF2', 'MoSold', '3SsnPorch', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'YrSold',
            'PoolArea', 'OverallCond', 'MSSubClass', 'ScreenPorch', 'EnclosedPorch']
for t in to_remove:
    num_att.remove(t)
num_att.remove('Id')
num_att.remove('SalePrice')
print(num_att)

['LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF']


In [80]:
train.info()
# train.select_dtypes(include=['object'])
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [103]:
cat_att = ['BldgType', 'CentralAir', 'Foundation', 
           'PavedDrive', 'SaleCondition', 'MSZoning', 'BldgType', 'HouseStyle',
           'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'RoofStyle',
           'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'BsmtExposure', 'BsmtFinType1',
           'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
           'GarageType', 'GarageQual', 'GarageCond', 'Fence', 'SaleType', 'MiscFeature', 'PoolQC']
# cat_att = ['Neighborhood', 'BldgType']
print(cat_att)

['BldgType', 'CentralAir', 'Foundation', 'PavedDrive', 'SaleCondition', 'MSZoning', 'BldgType', 'HouseStyle', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'BsmtExposure', 'BsmtFinType1', 'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageQual', 'GarageCond', 'Fence', 'SaleType', 'MiscFeature', 'PoolQC']


In [104]:
for a in num_att:
    median = train[a].median()
    train[a].fillna(median, inplace=True)
    
#     median = test[a].median()
    test[a].fillna(median, inplace=True)

In [105]:
for a in cat_att:
    median = train[a].mode()
    train[a].fillna(train[a].mode()[0], inplace=True)
    test[a].fillna(train[a].mode()[0], inplace=True)

In [106]:
train[num_att].info()
test[num_att].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 24 columns):
LotFrontage     1460 non-null float64
LotArea         1460 non-null int64
OverallQual     1460 non-null int64
YearBuilt       1460 non-null int64
YearRemodAdd    1460 non-null int64
MasVnrArea      1460 non-null float64
BsmtFinSF1      1460 non-null int64
BsmtUnfSF       1460 non-null int64
TotalBsmtSF     1460 non-null int64
1stFlrSF        1460 non-null int64
2ndFlrSF        1460 non-null int64
GrLivArea       1460 non-null int64
BsmtFullBath    1460 non-null int64
FullBath        1460 non-null int64
HalfBath        1460 non-null int64
BedroomAbvGr    1460 non-null int64
KitchenAbvGr    1460 non-null int64
TotRmsAbvGrd    1460 non-null int64
Fireplaces      1460 non-null int64
GarageYrBlt     1460 non-null float64
GarageCars      1460 non-null int64
GarageArea      1460 non-null int64
WoodDeckSF      1460 non-null int64
OpenPorchSF     1460 non-null int64
dtypes: float64(3), int

In [108]:
train[cat_att].info()
test[cat_att].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 33 columns):
BldgType         1460 non-null object
CentralAir       1460 non-null object
Foundation       1460 non-null object
PavedDrive       1460 non-null object
SaleCondition    1460 non-null object
MSZoning         1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Condition1       1460 non-null object
RoofStyle        1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1460 non-null object
ExterQual        1460 non-null object
BsmtExposure     1460 non-null object
BsmtFinType1     1460 non-null object
Heating          1460 non-null object
HeatingQC        1460 non-null object
Electrical       1460 non-null object
KitchenQual      14

In [109]:
train_y = train["SalePrice"].copy()
# train = train.drop("SalePrice", axis=1)
# train = train.drop("Id", axis=1)
# test = test.drop("Id", axis=1)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [110]:
train2 = train[num_att + cat_att]
train2.info()
test2 = test[num_att + cat_att]
test2.info()

merged_df = pd.concat([train2, test2])
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 57 columns):
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
OverallQual      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
MasVnrArea       1460 non-null float64
BsmtFinSF1       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
TotRmsAbvGrd     1460 non-null int64
Fireplaces       1460 non-null int64
GarageYrBlt      1460 non-null float64
GarageCars       1460 non-null int64
GarageArea       1460 non-null int64
WoodDeckSF       1460 non-null int64
OpenPorchSF      1460 non-null int64

In [111]:
for n in cat_att:
    if test[n].isnull().values.any():
        print(n)
merged_df.isnull().values.any()
merged_df.isnull().sum().sum()

0

In [112]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
train_prepared = num_pipeline.fit_transform(train[num_att])
test_prepared  = num_pipeline.transform(test[num_att])


full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_att),
        ("cat", OneHotEncoder(), cat_att),
    ])

train_test_full_prepared = full_pipeline.fit_transform(merged_df)
train_full_prepared = full_pipeline.transform(train2)
test_full_prepared = full_pipeline.transform(test2)


In [113]:
print(train_prepared.shape)
print(test_prepared.shape)
print(train_full_prepared.shape)
print(test_full_prepared.shape)

(1460, 24)
(1459, 24)
(1460, 222)
(1459, 222)


In [115]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=300, max_features=32, random_state=42)
forest_reg.fit(train_full_prepared, train_y)

housing_predictions = forest_reg.predict(train_full_prepared)
forest_mse = mean_squared_error(train_y, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

10614.950842589253

In [116]:
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print("scores:", scores)
    print("mean:", scores.mean())
    print("std:", scores.std())

forest_reg = RandomForestRegressor(n_estimators=300, max_features=32, random_state=42)
forest_reg.fit(train_full_prepared, train_y)

forest_scores = cross_val_score(forest_reg, train_full_prepared, train_y,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

print(pd.Series(forest_rmse_scores).describe())

scores: [20988.0080271  26273.3779047  23438.16482539 40917.36596236
 35814.44361144 25667.22005173 23017.67277609 23794.92531868
 38921.75863405 24229.90888372]
mean: 28306.284599526425
std: 6938.240871180807
count       10.000000
mean     28306.284600
std       7313.548036
min      20988.008027
25%      23527.354949
50%      24948.564468
75%      33429.177185
max      40917.365962
dtype: float64


In [117]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # sprawdza 12 (3×4) kombinacji hiperparametrów
    {'n_estimators': [50, 100, 150, 200, 250, 300], 'max_features': [4, 8, 16, 32, 64]},
    # następnie sprawdza 6 (2×3) kombinacji z wyłączonym parametrem bootstrap (False)
    {'bootstrap': [False], 'n_estimators': [50, 100, 150, 200, 250, 300], 'max_features': [4, 8, 16, 32, 64]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# przeprowadza proces uczenia na pięciu podzbiorach, czyli łącznie (12+6)*5=90 przebiegów 
grid_search = GridSearchCV(forest_reg, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(train_full_prepared, train_y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [4, 8, 16, 32, 

In [118]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 64, 'n_estimators': 100}

In [119]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=64, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [120]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

34987.77185550234 {'max_features': 4, 'n_estimators': 50}
34182.31393435224 {'max_features': 4, 'n_estimators': 100}
33805.532692712 {'max_features': 4, 'n_estimators': 150}
33715.756186325314 {'max_features': 4, 'n_estimators': 200}
33707.96607701389 {'max_features': 4, 'n_estimators': 250}
33875.75612181687 {'max_features': 4, 'n_estimators': 300}
33090.39194571431 {'max_features': 8, 'n_estimators': 50}
32336.08949723862 {'max_features': 8, 'n_estimators': 100}
32121.76030387724 {'max_features': 8, 'n_estimators': 150}
31954.9921465388 {'max_features': 8, 'n_estimators': 200}
31798.09508892874 {'max_features': 8, 'n_estimators': 250}
31751.745956948434 {'max_features': 8, 'n_estimators': 300}
30909.48740994244 {'max_features': 16, 'n_estimators': 50}
30418.795699785456 {'max_features': 16, 'n_estimators': 100}
30104.042662588494 {'max_features': 16, 'n_estimators': 150}
29991.49769429499 {'max_features': 16, 'n_estimators': 200}
29938.117290457132 {'max_features': 16, 'n_estimators'

In [121]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.66198616e-03, 1.84540786e-02, 2.60429396e-01, 4.16467122e-02,
       1.18537992e-02, 8.18335072e-03, 2.48725547e-02, 6.56646217e-03,
       5.59121520e-02, 4.24560428e-02, 3.13996731e-02, 1.19933983e-01,
       2.43311862e-03, 1.77956259e-02, 1.87437187e-03, 4.77049722e-03,
       1.18436270e-03, 1.32727752e-02, 8.89250676e-03, 1.05086972e-02,
       9.49643414e-02, 2.33362292e-02, 4.70182360e-03, 5.51571691e-03,
       4.79911870e-04, 1.91359739e-05, 1.72303505e-04, 4.56185123e-05,
       7.79310043e-05, 8.88500701e-04, 3.93575458e-05, 2.79030991e-05,
       1.79314025e-05, 9.61327573e-05, 2.35329643e-03, 2.10822827e-03,
       3.84046955e-04, 3.48411294e-04, 6.33012417e-04, 1.03660318e-04,
       2.08696249e-05, 2.44414227e-05, 2.65451701e-04, 9.75871993e-05,
       4.42871402e-04, 6.68159510e-04, 3.24035233e-06, 5.36615770e-05,
       2.62461307e-04, 7.52429025e-04, 6.88587318e-04, 2.17188353e-04,
       1.70748952e-04, 4.76922159e-05, 1.15322165e-03, 1.69927154e-03,
      

In [122]:
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = []
for a in cat_encoder.categories_:
    for b in a:
        cat_one_hot_attribs.append(b)
# print(cat_one_hot_attribs)
attributes = num_att + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.26042939577370905, 'OverallQual'),
 (0.11993398310444597, 'GrLivArea'),
 (0.09496434138212155, 'GarageCars'),
 (0.0809049464811959, 'TA'),
 (0.05591215198218918, 'TotalBsmtSF'),
 (0.04245604277558613, '1stFlrSF'),
 (0.0416467121791882, 'YearBuilt'),
 (0.031399673097164474, '2ndFlrSF'),
 (0.02487255471571128, 'BsmtFinSF1'),
 (0.023336229224114297, 'GarageArea'),
 (0.018454078563561002, 'LotArea'),
 (0.017795625879863015, 'FullBath'),
 (0.014676059764177674, 'Ex'),
 (0.013272775240978852, 'TotRmsAbvGrd'),
 (0.011853799177181743, 'YearRemodAdd'),
 (0.010508697217466308, 'GarageYrBlt'),
 (0.008892506762243832, 'Fireplaces'),
 (0.008183350723486022, 'MasVnrArea'),
 (0.007661986158090684, 'LotFrontage'),
 (0.007260492777413584, 'Gd'),
 (0.0066276840450397514, 'TA'),
 (0.006566462169311741, 'BsmtUnfSF'),
 (0.005999635946706515, 'Gd'),
 (0.005515716914899853, 'OpenPorchSF'),
 (0.004770497223724585, 'BedroomAbvGr'),
 (0.004701823599784949, 'WoodDeckSF'),
 (0.0039232122083896455, 'Ex'),
 (0.

In [123]:
y_pred = grid_search.predict(test_full_prepared)
sub = pd.DataFrame({'Id': test['Id'], 'SalePrice': y_pred})

In [124]:
sub.to_csv("data/submission_script11_rf_gridsearch_morre_cat_att.csv", index=False)

In [125]:
sub.shape

(1459, 2)