In [1]:
import sys
assert sys.version_info >= (3, 5)

import sklearn
assert sklearn.__version__ >= "0.20"

import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
import pandas as pd

def load_housing_data(housing_path='data'):
    train_csv_path = os.path.join(housing_path, "train.csv")
    test_csv_path = os.path.join(housing_path, "test.csv")
    return pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

train, test = load_housing_data()

In [4]:
train = train[train.GrLivArea < 4500]
train.shape

(1458, 81)

In [20]:
outliers = [30, 88, 462, 631, 1322]
train = train.drop(train.index[outliers])

In [21]:
train.shape

(1453, 81)

In [22]:
train['TotalLivingSF'] = train['BsmtFinSF1'] + train['BsmtFinSF2'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalLivingSF'] = test['BsmtFinSF1'] + test['BsmtFinSF2'] + test['1stFlrSF'] + test['2ndFlrSF']

train['TotalLivingSF'] = np.log(train['TotalLivingSF'])
test['TotalLivingSF'] = np.log(test['TotalLivingSF'])

In [23]:
train['MSSubClass'] = train['MSSubClass'].apply(str)
train['YrSold'] = train['YrSold'].astype(str)
train['MoSold'] = train['MoSold'].astype(str)

test['MSSubClass'] = test['MSSubClass'].apply(str)
test['YrSold'] = test['YrSold'].astype(str)
test['MoSold'] = test['MoSold'].astype(str)

In [24]:
train['Electrical'] = train['Electrical'].fillna("SBrkr")

In [25]:
train['FireplaceQu'] = train['FireplaceQu'].fillna("None") 
test['FireplaceQu'] = test['FireplaceQu'].fillna("None") 

In [26]:
train['TotalPorchSF'] = train['OpenPorchSF'] + train['3SsnPorch'] + train['EnclosedPorch'] + train['ScreenPorch'] + train['WoodDeckSF']
test['TotalPorchSF'] = test['OpenPorchSF'] + test['3SsnPorch'] + test['EnclosedPorch'] + test['ScreenPorch'] + test['WoodDeckSF']

In [27]:
test['SaleType'] = test['SaleType'].fillna(test['SaleType'].mode()[0])

In [28]:
test['KitchenQual'] = test['KitchenQual'].fillna("TA") 

In [29]:
bsmt_cat_att = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

for a in bsmt_cat_att:
    train[a].fillna('NB', inplace=True)
    test[a].fillna('NB', inplace=True)

In [30]:
test['Exterior1st'] = test['Exterior1st'].fillna(test['Exterior1st'].mode()[0])

In [31]:
test['MSZoning'] = test.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

In [32]:
train['Alley'] = train['Alley'].fillna("NA")
test['Alley'] = test['Alley'].fillna("NA")

In [33]:
train['Has2ndFloor'] = train['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
test['Has2ndFloor'] = test['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)

In [34]:
test['Exterior2nd'] = test['Exterior2nd'].fillna(test['Exterior2nd'].mode()[0])

In [35]:
train['MasVnrType'] = train['MasVnrType'].fillna(train['MasVnrType'].mode()[0])
test['MasVnrType'] = test['MasVnrType'].fillna(test['MasVnrType'].mode()[0])

In [36]:
garage_cat_att = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

for a in garage_cat_att:
    train[a].fillna('NG', inplace=True)
    test[a].fillna('NG', inplace=True)
    

In [37]:
train['Fence'] = train['Fence'].fillna("NF")
test['Fence'] = test['Fence'].fillna("NF")

In [38]:
test['Functional'] = test['Functional'].fillna("Typ")

In [39]:
train['MiscFeature'] = train['MiscFeature'].fillna("None")
test['MiscFeature'] = test['MiscFeature'].fillna("None")

In [40]:
corr_matrix = train.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.797383
GrLivArea        0.735921
TotalLivingSF    0.709391
TotalBsmtSF      0.653681
GarageCars       0.641738
1stFlrSF         0.632977
GarageArea       0.630535
FullBath         0.562426
TotRmsAbvGrd     0.537797
YearBuilt        0.523215
YearRemodAdd     0.506928
GarageYrBlt      0.486373
MasVnrArea       0.484192
Fireplaces       0.471084
BsmtFinSF1       0.409789
TotalPorchSF     0.394582
LotFrontage      0.372842
WoodDeckSF       0.324164
OpenPorchSF      0.320491
2ndFlrSF         0.320327
HalfBath         0.282490
LotArea          0.268457
BsmtFullBath     0.225922
BsmtUnfSF        0.216433
BedroomAbvGr     0.168186
Has2ndFloor      0.137344
ScreenPorch      0.110612
PoolArea         0.099514
3SsnPorch        0.044207
BsmtFinSF2      -0.010707
BsmtHalfBath    -0.017955
LowQualFinSF    -0.018110
MiscVal         -0.021638
Id              -0.025531
OverallCond     -0.084097
EnclosedPorch   -0.121991
KitchenAbvGr    -0.137255
Name: SalePr

In [41]:
num_att = list(train.select_dtypes([np.number]).columns)

# att = ['OverallQual', 'GrLivArea']

# to_remove = ['BsmtFinSF2', 'MoSold', '3SsnPorch', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'YrSold',
#             'PoolArea', 'OverallCond', 'MSSubClass', 'ScreenPorch']

to_remove = ['PoolArea', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'ScreenPorch', '3SsnPorch',
            'BsmtFinSF2', 'MoSold', 'YrSold',
            'GarageArea', 'TotalBsmtSF']

to_remove = ['PoolArea', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'ScreenPorch', '3SsnPorch',
            'BsmtFinSF2']

for t in to_remove:
    num_att.remove(t)
num_att.remove('Id')
num_att.remove('SalePrice')

# num_att = ['OverallQual', 'GrLivArea', 'GarageCars', '1stFlrSF', 'FullBath', 'YearBuilt']

print(num_att)

['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'TotalLivingSF', 'TotalPorchSF', 'Has2ndFloor']


In [42]:
cat_att = ['BldgType', 'CentralAir', 'Foundation', 
           'PavedDrive', 'SaleCondition']
cat_att = ['Neighborhood', 'BldgType', 'Heating', 'HeatingQC', 'CentralAir', 
           'Street', 'LotShape', 'LandContour', 'LotConfig',
           'LandSlope', 'Condition1', 'HouseStyle', 'RoofStyle', 
           'ExterQual', 'ExterCond', 'Foundation', 'PavedDrive', 'SaleCondition',
           'KitchenQual', 'BsmtQual', 'MoSold', 'YrSold', 'MSSubClass',
           'SaleType', 'Electrical', 'BsmtExposure', 'FireplaceQu',
           'Exterior1st', 'MSZoning', 'Alley',
           'Condition2', 'RoofMatl', 'Exterior2nd', 'MasVnrType', 'BsmtCond',
           'BsmtFinType1', 'BsmtFinType2', 'Functional',
           'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
           'Fence', 'MiscFeature']

# TO ADD = Exterior1st, KitchenQual, Functional, SaleType, Alley, Condition2

# cat_att = ['Neighborhood', 'BldgType']
print(cat_att)
print(len(cat_att))
print(len(set(cat_att)))

['Neighborhood', 'BldgType', 'Heating', 'HeatingQC', 'CentralAir', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'HouseStyle', 'RoofStyle', 'ExterQual', 'ExterCond', 'Foundation', 'PavedDrive', 'SaleCondition', 'KitchenQual', 'BsmtQual', 'MoSold', 'YrSold', 'MSSubClass', 'SaleType', 'Electrical', 'BsmtExposure', 'FireplaceQu', 'Exterior1st', 'MSZoning', 'Alley', 'Condition2', 'RoofMatl', 'Exterior2nd', 'MasVnrType', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'Fence', 'MiscFeature']
44
44


In [43]:
for a in num_att:
    median = train[a].median()
    train[a].fillna(0, inplace=True)
    
#     median = test[a].median()
    test[a].fillna(0, inplace=True)

In [44]:
import seaborn as sns
from scipy.stats import norm
from scipy import stats

train['SalePrice'] = np.log(train['SalePrice'])
train_y = train["SalePrice"].copy()

train['GrLivArea'] = np.log(train['GrLivArea'])
test['GrLivArea'] = np.log(test['GrLivArea'])

train['HasBsmt'] = pd.Series(len(train['TotalBsmtSF']), index=train.index)
train['HasBsmt'] = 0 
train.loc[train['TotalBsmtSF']>0,'HasBsmt'] = 1
train.loc[train['HasBsmt']==1,'TotalBsmtSF'] = np.log(train['TotalBsmtSF'])
test['HasBsmt'] = pd.Series(len(test['TotalBsmtSF']), index=test.index)
test['HasBsmt'] = 0 
test.loc[test['TotalBsmtSF']>0,'HasBsmt'] = 1
test.loc[test['HasBsmt']==1,'TotalBsmtSF'] = np.log(test['TotalBsmtSF'])

train['1stFlrSF'] = np.log(train['1stFlrSF'])
test['1stFlrSF'] = np.log(test['1stFlrSF'])

train['HasBsmtFin'] = pd.Series(len(train['BsmtFinSF1']), index=train.index)
train['HasBsmtFin'] = 0 
train.loc[train['BsmtFinSF1']>0,'HasBsmtFin'] = 1
train.loc[train['HasBsmtFin']==1,'BsmtFinSF1'] = np.log(train['BsmtFinSF1'])
test['HasBsmtFin'] = pd.Series(len(test['BsmtFinSF1']), index=test.index)
test['HasBsmtFin'] = 0 
test.loc[test['BsmtFinSF1']>0,'HasBsmtFin'] = 1
test.loc[test['HasBsmtFin']==1,'BsmtFinSF1'] = np.log(test['BsmtFinSF1'])

train['HasWoodDeck'] = pd.Series(len(train['WoodDeckSF']), index=train.index)
train['HasWoodDeck'] = 0 
train.loc[train['WoodDeckSF']>0,'HasWoodDeck'] = 1
train.loc[train['HasWoodDeck']==1,'WoodDeckSF'] = np.log(train['WoodDeckSF'])
test['HasWoodDeck'] = pd.Series(len(test['WoodDeckSF']), index=test.index)
test['HasWoodDeck'] = 0 
test.loc[test['WoodDeckSF']>0,'HasWoodDeck'] = 1
test.loc[test['HasWoodDeck']==1,'WoodDeckSF'] = np.log(test['WoodDeckSF'])


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [45]:
corr_matrix = train.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.824807
TotalLivingSF    0.753830
GrLivArea        0.741652
GarageCars       0.683090
GarageArea       0.658926
1stFlrSF         0.617590
FullBath         0.597619
YearBuilt        0.587447
YearRemodAdd     0.565675
TotRmsAbvGrd     0.539522
Fireplaces       0.495189
MasVnrArea       0.433066
TotalPorchSF     0.404321
TotalBsmtSF      0.377081
GarageYrBlt      0.350285
WoodDeckSF       0.343042
OpenPorchSF      0.325600
2ndFlrSF         0.321461
HasWoodDeck      0.319724
HalfBath         0.312048
LotArea          0.261875
BsmtFullBath     0.233851
BsmtUnfSF        0.224678
BedroomAbvGr     0.210421
BsmtFinSF1       0.207179
HasBsmt          0.202460
LotFrontage      0.187297
HasBsmtFin       0.160396
Has2ndFloor      0.151175
ScreenPorch      0.120472
PoolArea         0.074549
3SsnPorch        0.054600
BsmtFinSF2       0.006237
BsmtHalfBath    -0.006809
MiscVal         -0.020782
Id              -0.024932
LowQualFinSF    -0.027807
OverallCond 

In [46]:
train2 = train[num_att + cat_att]
# train2.info()
test2 = test[num_att + cat_att]
# test2.info()

merged_df = pd.concat([train2, test2])
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2912 entries, 0 to 1458
Data columns (total 73 columns):
LotFrontage      2912 non-null float64
LotArea          2912 non-null int64
OverallQual      2912 non-null int64
OverallCond      2912 non-null int64
YearBuilt        2912 non-null int64
YearRemodAdd     2912 non-null int64
MasVnrArea       2912 non-null float64
BsmtFinSF1       2912 non-null float64
BsmtUnfSF        2912 non-null float64
TotalBsmtSF      2912 non-null float64
1stFlrSF         2912 non-null float64
2ndFlrSF         2912 non-null int64
GrLivArea        2912 non-null float64
BsmtFullBath     2912 non-null float64
FullBath         2912 non-null int64
HalfBath         2912 non-null int64
BedroomAbvGr     2912 non-null int64
KitchenAbvGr     2912 non-null int64
TotRmsAbvGrd     2912 non-null int64
Fireplaces       2912 non-null int64
GarageYrBlt      2912 non-null float64
GarageCars       2912 non-null float64
GarageArea       2912 non-null float64
WoodDeckSF       291

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
train_prepared = num_pipeline.fit_transform(train[num_att])
test_prepared  = num_pipeline.transform(test[num_att])


full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_att),
        ("cat", OneHotEncoder(), cat_att),
    ])

train_test_full_prepared = full_pipeline.fit_transform(merged_df)
train_full_prepared = full_pipeline.transform(train2)
test_full_prepared = full_pipeline.transform(test2)


ValueError: Found unknown categories ['150'] in column 22 during transform

In [48]:
print(train_prepared.shape)
print(test_prepared.shape)
print(train_full_prepared.shape)
print(test_full_prepared.shape)

(1453, 29)
(1459, 29)
(1453, 321)
(1459, 321)


In [30]:
from numpy import arange
from pandas import read_csv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
# define model
model = Lasso(max_iter=1e9)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
grid['alpha'] = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
grid['alpha'] = np.arange(0.0002, 0.0008, 0.00002)
# define search
search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, verbose=3)
# perform the search
results = search.fit(train_full_prepared, train_y)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)


Fitting 30 folds for each of 30 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   22.7s


MAE: -0.015
Config: {'alpha': 0.00058}


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   25.3s finished


In [31]:
cvres = results.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
    
# 12483177514831173

0.12708768637722961 {'alpha': 0.0002}
0.12667320204673427 {'alpha': 0.00022}
0.12630477311471594 {'alpha': 0.00024}
0.12598402493157335 {'alpha': 0.00026000000000000003}
0.1257129730874295 {'alpha': 0.00028}
0.12548442326734002 {'alpha': 0.00030000000000000003}
0.12529002954946755 {'alpha': 0.00031999999999999997}
0.12514133701993416 {'alpha': 0.00034}
0.125010165291219 {'alpha': 0.00035999999999999997}
0.12488741971962233 {'alpha': 0.00038}
0.12478838179967253 {'alpha': 0.00039999999999999996}
0.12470496186827641 {'alpha': 0.00042}
0.12463158967728216 {'alpha': 0.00043999999999999996}
0.12457914728033141 {'alpha': 0.00046}
0.12453393075791347 {'alpha': 0.00047999999999999996}
0.12449360770415874 {'alpha': 0.0005}
0.12446458196101874 {'alpha': 0.00052}
0.1244488622969336 {'alpha': 0.00054}
0.12444202054531156 {'alpha': 0.00056}
0.1244410956525053 {'alpha': 0.00058}
0.124449715266775 {'alpha': 0.0006}
0.12446587748483798 {'alpha': 0.00062}
0.12449404184806981 {'alpha': 0.000639999999999

In [33]:
print(np.arange(0.00002, 0.00008, 0.00002))

[2.e-05 4.e-05 6.e-05]


In [50]:
from numpy import arange
from pandas import read_csv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
model = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=42)

grid = dict()
grid['reg_alpha'] = np.arange(0.00002, 0.00008, 0.00002)
# define search
search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, verbose=3)
# perform the search
results = search.fit(train_full_prepared, train_y)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)




Fitting 20 folds for each of 3 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed:  1.0min remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.0min finished


MAE: -0.011
Config: {'reg_alpha': 6.000000000000001e-05}


In [51]:
cvres = results.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
    
# 12483177514831173

0.10505575038626985 {'reg_alpha': 2e-05}
0.10510867217970199 {'reg_alpha': 4e-05}
0.10502389789968433 {'reg_alpha': 6.000000000000001e-05}


In [None]:
#for tuning parameters
#parameters_for_testing = {
#    'colsample_bytree':[0.4,0.6,0.8],
#    'gamma':[0,0.03,0.1,0.3],
#    'min_child_weight':[1.5,6,10],
#    'learning_rate':[0.1,0.07],
#    'max_depth':[3,5],
#    'n_estimators':[10000],
#    'reg_alpha':[1e-5, 1e-2,  0.75],
#    'reg_lambda':[1e-5, 1e-2, 0.45],
#    'subsample':[0.6,0.95]  
#}

In [52]:
housing_predictions = results.predict(train_full_prepared)

In [53]:
print(housing_predictions[0])
print(train_y[0])

12.223593
12.247694320220994


In [54]:
from sklearn.metrics import mean_squared_error
train_y_normal = np.e**train_y
pred_normal = np.e**housing_predictions

forest_mse = mean_squared_error(train_y_normal, pred_normal)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

8753.587273891837

In [55]:
print(train_y_normal[0])
print(pred_normal[0])

208500.0
203534.83


In [56]:
# y_pred = forest_reg.predict(test_full_prepared)
# y_pred = grid_search.predict(test_full_prepared)
y_pred = results.predict(test_full_prepared)


y_pred_normal = np.e**y_pred
sub = pd.DataFrame({'Id': test['Id'], 'SalePrice': y_pred_normal})

In [57]:
sub.to_csv("data/submission_script62_xgb_outliers.csv", index=False)

In [26]:
sub.shape

(1459, 2)