In [1]:
import sys
assert sys.version_info >= (3, 5)

import sklearn
assert sklearn.__version__ >= "0.20"

import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
import pandas as pd

def load_housing_data(housing_path='data'):
    train_csv_path = os.path.join(housing_path, "train.csv")
    test_csv_path = os.path.join(housing_path, "test.csv")
    return pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

train, test = load_housing_data()

In [3]:
train['TotalLivingSF'] = train['BsmtFinSF1'] + train['BsmtFinSF2'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalLivingSF'] = test['BsmtFinSF1'] + test['BsmtFinSF2'] + test['1stFlrSF'] + test['2ndFlrSF']

train['TotalLivingSF'] = np.log(train['TotalLivingSF'])
test['TotalLivingSF'] = np.log(test['TotalLivingSF'])

In [4]:
train['TotalBthm'] = train['FullBath'] + 0.5*train['HalfBath'] + train['BsmtFullBath'] + 0.5*train['BsmtHalfBath']
test['TotalBthm'] = test['FullBath'] + 0.5*test['HalfBath'] + test['BsmtFullBath'] + 0.5*test['BsmtHalfBath']

train['TotalBthm'] = np.log(train['TotalBthm'])
test['TotalBthm'] = np.log(test['TotalBthm'])

In [5]:
corr_matrix = train.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
TotalLivingSF    0.698827
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
TotalBthm        0.603681
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorc

In [6]:
train['MSSubClass'] = train['MSSubClass'].apply(str)
train['YrSold'] = train['YrSold'].astype(str)
train['MoSold'] = train['MoSold'].astype(str)

test['MSSubClass'] = test['MSSubClass'].apply(str)
test['YrSold'] = test['YrSold'].astype(str)
test['MoSold'] = test['MoSold'].astype(str)

In [7]:
num_att = list(train.select_dtypes([np.number]).columns)

# att = ['OverallQual', 'GrLivArea']

# to_remove = ['BsmtFinSF2', 'MoSold', '3SsnPorch', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'YrSold',
#             'PoolArea', 'OverallCond', 'MSSubClass', 'ScreenPorch']

to_remove = ['PoolArea', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'ScreenPorch', '3SsnPorch',
            'BsmtFinSF2', 'MoSold', 'YrSold',
            'GarageArea', 'TotalBsmtSF']

to_remove = ['PoolArea', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'ScreenPorch', '3SsnPorch',
            'BsmtFinSF2']

for t in to_remove:
    num_att.remove(t)
num_att.remove('Id')
num_att.remove('SalePrice')

# num_att = ['OverallQual', 'GrLivArea', 'GarageCars', '1stFlrSF', 'FullBath', 'YearBuilt']

print(num_att)

['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'TotalLivingSF', 'TotalBthm']


In [8]:
cat_att = ['BldgType', 'CentralAir', 'Foundation', 
           'PavedDrive', 'SaleCondition']
cat_att = ['Neighborhood', 'BldgType', 'Heating', 'HeatingQC', 'CentralAir', 
           'Street', 'LotShape', 'LandContour', 'LotConfig',
           'LandSlope', 'Condition1', 'HouseStyle', 'RoofStyle', 
           'ExterQual', 'ExterCond', 'Foundation', 'PavedDrive', 'SaleCondition'
          , 'MoSold', 'YrSold', 'MSSubClass']

# TO ADD = Exterior1st, KitchenQual, Functional, SaleType, Alley, Condition2

# cat_att = ['Neighborhood', 'BldgType']
print(cat_att)

['Neighborhood', 'BldgType', 'Heating', 'HeatingQC', 'CentralAir', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'HouseStyle', 'RoofStyle', 'ExterQual', 'ExterCond', 'Foundation', 'PavedDrive', 'SaleCondition', 'MoSold', 'YrSold', 'MSSubClass']


In [9]:
for a in num_att:
    median = train[a].median()
    train[a].fillna(median, inplace=True)
    
#     median = test[a].median()
    test[a].fillna(median, inplace=True)

In [10]:
import seaborn as sns
from scipy.stats import norm
from scipy import stats

train['SalePrice'] = np.log(train['SalePrice'])
train_y = train["SalePrice"].copy()

train['GrLivArea'] = np.log(train['GrLivArea'])
test['GrLivArea'] = np.log(test['GrLivArea'])

train['HasBsmt'] = pd.Series(len(train['TotalBsmtSF']), index=train.index)
train['HasBsmt'] = 0 
train.loc[train['TotalBsmtSF']>0,'HasBsmt'] = 1
train.loc[train['HasBsmt']==1,'TotalBsmtSF'] = np.log(train['TotalBsmtSF'])
test['HasBsmt'] = pd.Series(len(test['TotalBsmtSF']), index=test.index)
test['HasBsmt'] = 0 
test.loc[test['TotalBsmtSF']>0,'HasBsmt'] = 1
test.loc[test['HasBsmt']==1,'TotalBsmtSF'] = np.log(test['TotalBsmtSF'])

train['1stFlrSF'] = np.log(train['1stFlrSF'])
test['1stFlrSF'] = np.log(test['1stFlrSF'])

train['HasBsmtFin'] = pd.Series(len(train['BsmtFinSF1']), index=train.index)
train['HasBsmtFin'] = 0 
train.loc[train['BsmtFinSF1']>0,'HasBsmtFin'] = 1
train.loc[train['HasBsmtFin']==1,'BsmtFinSF1'] = np.log(train['BsmtFinSF1'])
test['HasBsmtFin'] = pd.Series(len(test['BsmtFinSF1']), index=test.index)
test['HasBsmtFin'] = 0 
test.loc[test['BsmtFinSF1']>0,'HasBsmtFin'] = 1
test.loc[test['HasBsmtFin']==1,'BsmtFinSF1'] = np.log(test['BsmtFinSF1'])

train['HasWoodDeck'] = pd.Series(len(train['WoodDeckSF']), index=train.index)
train['HasWoodDeck'] = 0 
train.loc[train['WoodDeckSF']>0,'HasWoodDeck'] = 1
train.loc[train['HasWoodDeck']==1,'WoodDeckSF'] = np.log(train['WoodDeckSF'])
test['HasWoodDeck'] = pd.Series(len(test['WoodDeckSF']), index=test.index)
test['HasWoodDeck'] = 0 
test.loc[test['WoodDeckSF']>0,'HasWoodDeck'] = 1
test.loc[test['HasWoodDeck']==1,'WoodDeckSF'] = np.log(test['WoodDeckSF'])


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [11]:
corr_matrix = train.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.817184
TotalLivingSF    0.741062
GrLivArea        0.730255
GarageCars       0.680625
TotalBthm        0.668685
GarageArea       0.650888
1stFlrSF         0.608947
FullBath         0.594771
YearBuilt        0.586570
YearRemodAdd     0.565608
TotRmsAbvGrd     0.534422
GarageYrBlt      0.495794
Fireplaces       0.489449
MasVnrArea       0.426775
TotalBsmtSF      0.373009
WoodDeckSF       0.343269
LotFrontage      0.335292
OpenPorchSF      0.321053
HasWoodDeck      0.320349
2ndFlrSF         0.319300
HalfBath         0.313982
LotArea          0.257320
BsmtFullBath     0.236224
BsmtUnfSF        0.221985
BedroomAbvGr     0.209044
BsmtFinSF1       0.207962
HasBsmt          0.199634
HasBsmtFin       0.162231
ScreenPorch      0.121208
PoolArea         0.069798
3SsnPorch        0.054900
BsmtFinSF2       0.004832
BsmtHalfBath    -0.005149
Id              -0.017942
MiscVal         -0.020021
OverallCond     -0.036868
LowQualFinSF    -0.037963
KitchenAbvGr

In [12]:
train2 = train[num_att + cat_att]
train2.info()
test2 = test[num_att + cat_att]
test2.info()

merged_df = pd.concat([train2, test2])
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 49 columns):
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
MasVnrArea       1460 non-null float64
BsmtFinSF1       1460 non-null float64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null float64
1stFlrSF         1460 non-null float64
2ndFlrSF         1460 non-null int64
GrLivArea        1460 non-null float64
BsmtFullBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
TotRmsAbvGrd     1460 non-null int64
Fireplaces       1460 non-null int64
GarageYrBlt      1460 non-null float64
GarageCars       1460 non-null int64
GarageArea       1460 non-null int64
WoodDeckSF       1460 non-nu

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
train_prepared = num_pipeline.fit_transform(train[num_att])
test_prepared  = num_pipeline.transform(test[num_att])


full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_att),
        ("cat", OneHotEncoder(), cat_att),
    ])

train_test_full_prepared = full_pipeline.fit_transform(merged_df)
train_full_prepared = full_pipeline.transform(train2)
test_full_prepared = full_pipeline.transform(test2)


In [14]:
print(train_prepared.shape)
print(test_prepared.shape)
print(train_full_prepared.shape)
print(test_full_prepared.shape)

(1460, 28)
(1459, 28)
(1460, 169)
(1459, 169)


In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(bootstrap=False, n_estimators=350, max_features=32, random_state=42)
forest_reg.fit(train_full_prepared, train_y)

housing_predictions = forest_reg.predict(train_full_prepared)

forest_mse = mean_squared_error(train_y, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.0010585272760561223

In [16]:
print(housing_predictions[0])
print(train_y[0])

12.247694320220951
12.247694320220994


In [17]:
train_y_normal = np.e**train_y
pred_normal = np.e**housing_predictions

forest_mse = mean_squared_error(train_y_normal, pred_normal)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

153.62512485857778

In [18]:
print(train_y_normal[0])
print(pred_normal[0])

208500.0
208499.9999999911


In [19]:
# forest_reg.feature_importances_

cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = []
for a in cat_encoder.categories_:
    for b in a:
        cat_one_hot_attribs.append(b)
# print(cat_one_hot_attribs)
attributes = num_att + cat_one_hot_attribs
sorted(zip(forest_reg.feature_importances_, attributes), reverse=True)

[(0.17165062284247423, 'OverallQual'),
 (0.12243428163748775, 'TotalLivingSF'),
 (0.10652061868102344, 'GrLivArea'),
 (0.07386382484376063, 'YearBuilt'),
 (0.045896432976724355, 'TA'),
 (0.04074335735051051, 'TotalBthm'),
 (0.04013810031210792, 'TotalBsmtSF'),
 (0.039388626374676466, 'GarageArea'),
 (0.03932470997601245, 'GarageCars'),
 (0.037229133863270016, '1stFlrSF'),
 (0.02597715740889589, 'GarageYrBlt'),
 (0.024326094204899864, 'FullBath'),
 (0.018073382096580812, 'Fireplaces'),
 (0.016141880687307528, 'LotArea'),
 (0.015659166343435803, 'YearRemodAdd'),
 (0.011861072588633123, 'BsmtFinSF1'),
 (0.011772459041788703, '2ndFlrSF'),
 (0.009979434370559672, 'OverallCond'),
 (0.009381700352678386, 'LotFrontage'),
 (0.008988483430512879, 'Y'),
 (0.008047860810650787, 'BsmtUnfSF'),
 (0.007858956588449862, 'N'),
 (0.007670356541884721, 'Gd'),
 (0.007625417779627717, 'PConc'),
 (0.007300706568555277, 'TotRmsAbvGrd'),
 (0.0058434255528495415, 'OpenPorchSF'),
 (0.004892580106427971, 'MasVnrA

In [20]:
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print("scores:", scores)
    print("mean:", scores.mean())
    print("std:", scores.std())

forest_reg = RandomForestRegressor(bootstrap=False, n_estimators=200, max_features=32, random_state=42)
forest_reg.fit(train_full_prepared, train_y)

forest_scores = cross_val_score(forest_reg, train_full_prepared, train_y,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

print(pd.Series(forest_rmse_scores).describe())

scores: [0.12616554 0.1202385  0.13232691 0.16505326 0.15857345 0.11361385
 0.13296055 0.11908776 0.13273273 0.13271584]
mean: 0.13334683943625408
std: 0.015683608249294528
count    10.000000
mean      0.133347
std       0.016532
min       0.113614
25%       0.121720
50%       0.132521
75%       0.132904
max       0.165053
dtype: float64


In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # sprawdza 12 (3×4) kombinacji hiperparametrów
    {'n_estimators': [100, 300, 400, 500, 600], 'max_features': [16, 32, 48, 'auto']},
    # następnie sprawdza 6 (2×3) kombinacji z wyłączonym parametrem bootstrap (False)
    {'bootstrap': [False], 'n_estimators': [100, 300, 400, 500, 600], 
     'max_features': [16, 32, 48, 'auto']},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# przeprowadza proces uczenia na pięciu podzbiorach, czyli łącznie (12+6)*5=90 przebiegów 
grid_search = GridSearchCV(forest_reg, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True, verbose=3, n_jobs=-1)
grid_search.fit(train_full_prepared, train_y)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 12.2min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'max_features': [16, 32, 48, 'aut

In [22]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 32, 'n_estimators': 500}

In [23]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=32, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=500,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [24]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.13889134458088828 {'max_features': 16, 'n_estimators': 100}
0.13805678156225662 {'max_features': 16, 'n_estimators': 300}
0.1380635530082062 {'max_features': 16, 'n_estimators': 400}
0.13789514131133998 {'max_features': 16, 'n_estimators': 500}
0.1379803034048139 {'max_features': 16, 'n_estimators': 600}
0.1369595542003199 {'max_features': 32, 'n_estimators': 100}
0.13646495407230075 {'max_features': 32, 'n_estimators': 300}
0.1363620971701219 {'max_features': 32, 'n_estimators': 400}
0.1361826243724687 {'max_features': 32, 'n_estimators': 500}
0.1362476559885007 {'max_features': 32, 'n_estimators': 600}
0.1369431516820824 {'max_features': 48, 'n_estimators': 100}
0.13628079248333705 {'max_features': 48, 'n_estimators': 300}
0.13613019287566547 {'max_features': 48, 'n_estimators': 400}
0.1361451466484754 {'max_features': 48, 'n_estimators': 500}
0.13616022330141261 {'max_features': 48, 'n_estimators': 600}
0.14314179428475174 {'max_features': 'auto', 'n_estimators': 100}
0.1426375340

In [25]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([9.50740547e-03, 1.62363958e-02, 1.67177552e-01, 9.76609364e-03,
       7.02958912e-02, 1.75063867e-02, 5.26490454e-03, 1.16286820e-02,
       8.57789790e-03, 4.03981369e-02, 3.69606724e-02, 1.24654629e-02,
       1.02200052e-01, 1.17308809e-03, 2.67036641e-02, 1.62179372e-03,
       3.47699820e-03, 1.72895512e-03, 7.75531939e-03, 1.70216275e-02,
       2.37261925e-02, 4.09665111e-02, 4.01289944e-02, 3.15340965e-03,
       5.56506752e-03, 1.84429078e-03, 1.19448350e-01, 4.92040080e-02,
       3.83963448e-05, 4.95050293e-06, 1.63461558e-04, 2.85917386e-04,
       3.47370892e-04, 2.90014218e-04, 1.72611941e-03, 1.39373740e-03,
       1.54559963e-04, 7.32835271e-04, 3.40718720e-04, 1.67758759e-04,
       6.32139559e-04, 3.22663265e-05, 2.77589037e-04, 3.15846380e-04,
       1.03586371e-03, 1.34976404e-03, 1.47354654e-04, 4.79566468e-04,
       1.65735317e-04, 3.01587048e-04, 1.98440709e-04, 1.13596233e-04,
       7.52276099e-05, 1.42291039e-03, 1.30608457e-04, 3.46731086e-04,
      

In [26]:
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = []
for a in cat_encoder.categories_:
    for b in a:
        cat_one_hot_attribs.append(b)
# print(cat_one_hot_attribs)
attributes = num_att + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.16717755179889904, 'OverallQual'),
 (0.11944834968079046, 'TotalLivingSF'),
 (0.10220005197126726, 'GrLivArea'),
 (0.07029589117531986, 'YearBuilt'),
 (0.049204008043703604, 'TotalBthm'),
 (0.04677041193628597, 'TA'),
 (0.04096651109215919, 'GarageCars'),
 (0.04039813692062273, 'TotalBsmtSF'),
 (0.04012899436108428, 'GarageArea'),
 (0.03696067235129968, '1stFlrSF'),
 (0.026703664145254352, 'FullBath'),
 (0.02372619250951935, 'GarageYrBlt'),
 (0.017506386748021577, 'YearRemodAdd'),
 (0.017021627494244275, 'Fireplaces'),
 (0.01623639575215714, 'LotArea'),
 (0.012465462873777037, '2ndFlrSF'),
 (0.011628682049304184, 'BsmtFinSF1'),
 (0.009766093637729213, 'OverallCond'),
 (0.009507405465183498, 'LotFrontage'),
 (0.008608640953262522, 'N'),
 (0.008577897903046172, 'BsmtUnfSF'),
 (0.00828584693823881, 'Gd'),
 (0.0081199128726032, 'PConc'),
 (0.008076894288407655, 'Y'),
 (0.007755319392383836, 'TotRmsAbvGrd'),
 (0.005565067523328516, 'OpenPorchSF'),
 (0.0052649045351827, 'MasVnrArea'),
 (

In [27]:
# y_pred = forest_reg.predict(test_full_prepared)
y_pred = grid_search.predict(test_full_prepared)
y_pred_normal = np.e**y_pred
sub = pd.DataFrame({'Id': test['Id'], 'SalePrice': y_pred_normal})

In [28]:
sub.to_csv("data/submission_script31_some_num_to_str.csv", index=False)

In [31]:
sub.shape

(1459, 2)