In [1]:
import math
import pandas as pd
import numpy as np
import seaborn as sns
import pandas_profiling as pp
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# 1. read train and test data

In [2]:

# read data
X_full = pd.read_csv('./input/train.csv')
X_test_full = pd.read_csv('./input/test.csv')
# remove rows with missing target
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

In [3]:
pd.set_option('display.max_columns', X_full.shape[1])
pd.set_option('display.max_rows', X_full.shape[0])

# 2. explore data

In [4]:
X_full.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [5]:
missing_value_series = X_full.isna().sum().sort_values(ascending=False)
missing_value_series[missing_value_series.values > 0]

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageQual        81
GarageCond        81
GarageFinish      81
BsmtFinType2      38
BsmtExposure      38
BsmtCond          37
BsmtFinType1      37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64

In [6]:
msno.heatmap(X_full)
plt.show()

  


In [7]:
# pp.ProfileReport(pd.concat([y, X_full], axis=1))

In [8]:
# profile 报告，可以发现，以下特征分布比较‘偏’
skew_cols = ['BsmtUnfSF','GrLivArea','LotArea','OpenPorchSF','TotalBsmtSF']

In [9]:
# 类别型变量有哪些
category_feature = X_full.columns[X_full.dtypes==object].tolist()
print(category_feature)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [10]:
for col in category_feature:
    print(X_full[col].unique())

['RL' 'RM' 'C (all)' 'FV' 'RH']
['Pave' 'Grvl']
[nan 'Grvl' 'Pave']
['Reg' 'IR1' 'IR2' 'IR3']
['Lvl' 'Bnk' 'Low' 'HLS']
['AllPub' 'NoSeWa']
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
['Gtl' 'Mod' 'Sev']
['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']
['VinylSd' 'MetalSd' 'Wd Sdng' 'HdBoard' 'BrkFace' 'WdShing' 'CemntBd'
 'Plywood' 'AsbShng' 'Stucco' 'BrkComm' 'AsphShn' 'Stone' 'ImStucc'
 'CBlock']
['VinylSd' 'MetalSd' 'Wd Shng

In [11]:
# 数值型变量有哪些
numeric_feature = X_full.columns[X_full.dtypes!=object].tolist()
print(numeric_feature)
print(X_test_full.columns[X_test_full.dtypes!=object].tolist())

['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSol

# 3. change some numeric columns to categories since it looks like categories

In [12]:
X_full['MSSubClass'] = X_full['MSSubClass'].apply(str)
X_test_full['MSSubClass'] = X_test_full['MSSubClass'].apply(str)

X_full['MoSold'] = X_full['MoSold'].apply(str)
X_test_full['MoSold'] = X_test_full['MoSold'].apply(str)

X_full['OverallCond'] = X_full['OverallCond'].astype(str)
X_test_full['OverallCond'] = X_test_full['OverallCond'].astype(str)

# 4. do feature engineering
This step is an iteration. After calculate permuation importance, we try to explore new feature. The iteration end when we satisfy with the score :)  
* Exterior2nd is depends on Exterior1st so we merge Exterior1st and Exterior2nd to Exterior.  
* Condition2 is depends on Condition1 so we merge Condition1 and Condition2 to Condition.  
* Explore new feature which is importance for the model:  
    - TotalSF: Total square feet of the whole house  

    - TotalBathroom: Total bathroom of the whole house (include full bathroomo and half bathroom).

In [13]:
# merge "Exterior1st", "Exterior2nd" to "Exterior"
X_full['Exterior'] = X_full.apply(lambda x: x['Exterior1st'] if (pd.isnull(x['Exterior2nd'])) else str(x['Exterior1st'])+'-'+str(x['Exterior2nd']), axis=1)
X_test_full['Exterior'] =  X_test_full.apply(lambda x: x['Exterior1st'] if (pd.isnull(x['Exterior2nd'])) else str(x['Exterior1st'])+'-'+str(x['Exterior2nd']), axis=1)
X_full.drop(['Exterior1st', 'Exterior2nd'],axis=1,inplace=True)
X_test_full.drop(['Exterior1st', 'Exterior2nd'],axis=1,inplace=True)

# merge 'condition1', 'condition2' to 'Condition'
X_full['Condition'] = X_full.apply(lambda x: x['Condition1'] if (pd.isnull(x['Condition2'])) else str(x['Condition1'])+'-'+str(x['Condition2']), axis=1)
X_test_full['Condition'] =  X_test_full.apply(lambda x: x['Condition1'] if (pd.isnull(x['Condition2'])) else str(x['Condition1'])+'-'+str(x['Condition2']), axis=1)
X_full.drop(['Condition1', 'Condition2'],axis=1,inplace=True)
X_test_full.drop(['Condition1', 'Condition2'],axis=1,inplace=True)

# generate total square
X_full['TotalSF'] = X_full['TotalBsmtSF'] + X_full['1stFlrSF'] + X_full['2ndFlrSF']
X_test_full['TotalSF'] = X_test_full['TotalBsmtSF'] + X_test_full['1stFlrSF'] + X_test_full['2ndFlrSF']
X_full.drop(columns=['TotalBsmtSF','1stFlrSF','2ndFlrSF'], axis=1, inplace=True)
X_test_full.drop(columns=['TotalBsmtSF','1stFlrSF','2ndFlrSF'], axis=1, inplace=True)

# drop columns that have too many missing values
X_full.drop(columns=['Alley','MiscFeature','PoolQC','PoolArea'],axis=1,inplace=True)
X_test_full.drop(columns=['Alley','MiscFeature','PoolQC','PoolArea'],axis=1,inplace=True)
skew_cols = list(set(skew_cols)&set(X_full.columns))

# 5. choose type for each featrue
* All columns with dtype int64, float64 will be numeric feature and others will be categories  
* Seperate categories into large variety and small variety  


In [14]:
categorical_cols = X_full.columns[X_full.dtypes==object].tolist()
numerical_cols = X_full.columns[X_full.dtypes!=object].tolist()
X_full[numerical_cols].head(5)

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,YrSold,TotalSF
0,1,65.0,8450,7,2003,2003,196.0,706,0,150,0,1710,1,0,2,1,3,1,8,0,2003.0,2,548,0,61,0,0,0,0,2008,2566
1,2,80.0,9600,6,1976,1976,0.0,978,0,284,0,1262,0,1,2,0,3,1,6,1,1976.0,2,460,298,0,0,0,0,0,2007,2524
2,3,68.0,11250,7,2001,2002,162.0,486,0,434,0,1786,1,0,2,1,3,1,6,1,2001.0,2,608,0,42,0,0,0,0,2008,2706
3,4,60.0,9550,7,1915,1970,0.0,216,0,540,0,1717,1,0,1,0,3,1,7,1,1998.0,3,642,0,35,272,0,0,0,2006,2473
4,5,84.0,14260,8,2000,2000,350.0,655,0,490,0,2198,1,0,2,1,4,1,9,1,2000.0,3,836,192,84,0,0,0,0,2008,3343


In [15]:
print(sorted({x:X_full[x].nunique() for x in categorical_cols}.items(), key=lambda x:x[1], reverse=True))
X_full[categorical_cols].head(5)

[('Exterior', 67), ('Neighborhood', 25), ('Condition', 18), ('MSSubClass', 15), ('MoSold', 12), ('OverallCond', 9), ('SaleType', 9), ('HouseStyle', 8), ('RoofMatl', 8), ('Functional', 7), ('RoofStyle', 6), ('Foundation', 6), ('BsmtFinType1', 6), ('BsmtFinType2', 6), ('Heating', 6), ('GarageType', 6), ('SaleCondition', 6), ('MSZoning', 5), ('LotConfig', 5), ('BldgType', 5), ('ExterCond', 5), ('HeatingQC', 5), ('Electrical', 5), ('FireplaceQu', 5), ('GarageQual', 5), ('GarageCond', 5), ('LotShape', 4), ('LandContour', 4), ('MasVnrType', 4), ('ExterQual', 4), ('BsmtQual', 4), ('BsmtCond', 4), ('BsmtExposure', 4), ('KitchenQual', 4), ('Fence', 4), ('LandSlope', 3), ('GarageFinish', 3), ('PavedDrive', 3), ('Street', 2), ('Utilities', 2), ('CentralAir', 2)]


Unnamed: 0,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,BldgType,HouseStyle,OverallCond,RoofStyle,RoofMatl,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,Fence,MoSold,SaleType,SaleCondition,Exterior,Condition
0,60,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,1Fam,2Story,5,Gable,CompShg,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,2,WD,Normal,VinylSd-VinylSd,Norm-Norm
1,20,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,1Fam,1Story,8,Gable,CompShg,,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,,5,WD,Normal,MetalSd-MetalSd,Feedr-Norm
2,60,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,1Fam,2Story,5,Gable,CompShg,BrkFace,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,9,WD,Normal,VinylSd-VinylSd,Norm-Norm
3,70,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,1Fam,2Story,5,Gable,CompShg,,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,,2,WD,Abnorml,Wd Sdng-Wd Shng,Norm-Norm
4,60,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,1Fam,2Story,5,Gable,CompShg,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,12,WD,Normal,VinylSd-VinylSd,Norm-Norm


In [16]:
print(sorted({x:X_full[x].nunique() for x in numerical_cols}.items(), key=lambda a:a[1], reverse=True))
print(X_full['MSSubClass'].unique())
X_full[numerical_cols].head(5)

[('Id', 1460), ('LotArea', 1073), ('TotalSF', 963), ('GrLivArea', 861), ('BsmtUnfSF', 780), ('BsmtFinSF1', 637), ('GarageArea', 441), ('MasVnrArea', 327), ('WoodDeckSF', 274), ('OpenPorchSF', 202), ('BsmtFinSF2', 144), ('EnclosedPorch', 120), ('YearBuilt', 112), ('LotFrontage', 110), ('GarageYrBlt', 97), ('ScreenPorch', 76), ('YearRemodAdd', 61), ('LowQualFinSF', 24), ('MiscVal', 21), ('3SsnPorch', 20), ('TotRmsAbvGrd', 12), ('OverallQual', 10), ('BedroomAbvGr', 8), ('GarageCars', 5), ('YrSold', 5), ('BsmtFullBath', 4), ('FullBath', 4), ('KitchenAbvGr', 4), ('Fireplaces', 4), ('BsmtHalfBath', 3), ('HalfBath', 3)]
['60' '20' '70' '50' '190' '45' '90' '120' '30' '85' '80' '160' '75' '180'
 '40']


Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,YrSold,TotalSF
0,1,65.0,8450,7,2003,2003,196.0,706,0,150,0,1710,1,0,2,1,3,1,8,0,2003.0,2,548,0,61,0,0,0,0,2008,2566
1,2,80.0,9600,6,1976,1976,0.0,978,0,284,0,1262,0,1,2,0,3,1,6,1,1976.0,2,460,298,0,0,0,0,0,2007,2524
2,3,68.0,11250,7,2001,2002,162.0,486,0,434,0,1786,1,0,2,1,3,1,6,1,2001.0,2,608,0,42,0,0,0,0,2008,2706
3,4,60.0,9550,7,1915,1970,0.0,216,0,540,0,1717,1,0,1,0,3,1,7,1,1998.0,3,642,0,35,272,0,0,0,2006,2473
4,5,84.0,14260,8,2000,2000,350.0,655,0,490,0,2198,1,0,2,1,4,1,9,1,2000.0,3,836,192,84,0,0,0,0,2008,3343


In [17]:
categorical_small_variety_cols = [col for col in X_full.columns if X_full[col].nunique()<=15 and X_full[col].dtype==object]

categorical_large_variety_cols = [col for col in X_full.columns if X_full[col].nunique()>15 and X_full[col].dtype!=object]
categorical_label_cols = []

print('numerical_cols: ',numerical_cols)
print('categorical_cols: ',categorical_cols)
print('categorical_label_cols: ',categorical_label_cols )
print('categorical_small_variety_cols: ', categorical_small_variety_cols)
print('categorical_large_variety_cols: ',categorical_large_variety_cols)

numerical_cols:  ['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'YrSold', 'TotalSF']
categorical_cols:  ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'BldgType', 'HouseStyle', 'OverallCond', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MoSold', 'SaleType', 'SaleCondition', 'Exterior

# 6. Create pipeline

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
import category_encoders as ce
from xgboost import XGBRegressor

In [19]:
# preprocessing for numerical data
numerical_transformer = Pipeline(verbose=False, steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    
])
# preprocessing for categorical data
categorical_onehot_transformer = Pipeline(verbose=False, steps=[
    ('imputer_onehot', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
categorical_label_transformer = Pipeline(verbose=False, steps=[
    ('imputer_label', SimpleImputer(strategy='most_frequent')),
    ('label', ce.OrdinalEncoder())
])

categorical_count_transformer = Pipeline(verbose=False, steps=[
    ('imputer_count', SimpleImputer(strategy='most_frequent')),
    ('count', ce.TargetEncoder(handle_missing='count'))
])

#  Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(verbose=False, transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cox_box', PowerTransformer(method='yeo-johnson', standardize=False), skew_cols),
    ('cat_label', categorical_label_transformer, categorical_label_cols),
    ('cat_onehot', categorical_onehot_transformer, categorical_small_variety_cols),
    ('cat_count', categorical_count_transformer, categorical_large_variety_cols),
])
train_pipeline = Pipeline(verbose=False, steps=[
    ('preprocessor', preprocessor),
    ('scale', StandardScaler(with_mean=True, with_std=True)),
    ('model', XGBRegressor(random_state=0))
])

# 7. Calculate importance of each feature to iterate feature engineering
* Use Permutation Importance to calculate how important of each feature and use it in the iteration of feature engineering

In [20]:
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(X_full[numerical_cols], y, train_size=0.8, test_size=0.2, random_state=0)
transform_pipeline = Pipeline(verbose=False, steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scale', StandardScaler(with_mean=True, with_std=True)),
])
# Transform data
transform_pipeline.fit(X_train, y_train)
pi_X_train = pd.DataFrame(transform_pipeline.transform(X_train))
pi_X_valid = pd.DataFrame(transform_pipeline.transform(X_valid))
pi_X_train.columns = X_train.columns
pi_X_valid.columns = X_valid.columns

# Define a model and calculate permutation importance of all numeric columns
pi_model = RandomForestRegressor(n_estimators=700,max_depth=4,random_state=0)
pi_model.fit(pi_X_train,y_train)
perm = PermutationImportance(pi_model, random_state=1).fit(pi_X_valid, y_valid)
eli5.show_weights(perm, feature_names=pi_X_valid.columns.to_list(), top=100)

Weight,Feature
0.5788  ± 0.0658,TotalSF
0.3288  ± 0.0862,OverallQual
0.0206  ± 0.0051,YearBuilt
0.0059  ± 0.0017,YearRemodAdd
0.0056  ± 0.0022,BsmtFinSF1
0.0038  ± 0.0006,GarageCars
0.0025  ± 0.0009,KitchenAbvGr
0.0025  ± 0.0011,GarageArea
0.0018  ± 0.0017,GrLivArea
0.0012  ± 0.0011,Fireplaces


# Train the model
* Use XGBoost to train to model
* Using GridSearchCV to search for best hyper parameter of XGBoost  
PS: You can add more parameters. I keep small range of params to make it run faster in Kaggle.

In [22]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
param_grid = {
    'model__nthread':[2],
    'model__learning_rate':[0.04, 0.05],
    'model__max_depth':range(3,5,1),
     'model__importance_type': ['weight', 'gain', 'cover'],
#       "model__min_child_weight" : [ 1 ],
      "model__gamma": [0.0, 0.1],
#     'model__silent': [1],
    'model__n_estimators': [600, 700], #number of trees
#        'model__n_estimators': range(595,600,1), #number of trees   

}
searched_model = GridSearchCV(estimator=train_pipeline, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5, error_score='raise', verbose=1)
searched_model.fit(X_full, y)
print(searched_model.best_estimator_)
print(searched_model.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['Id', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                  

In [26]:
preds_test = searched_model.predict(X_test_full)
# Save test predictions to file
output = pd.DataFrame({'Id': X_test_full.Id,'SalePrice': preds_test})
output.to_csv('./output/submission_2.csv', index=False)

In [24]:
X_test_full.Id

0       1461
1       1462
2       1463
3       1464
4       1465
5       1466
6       1467
7       1468
8       1469
9       1470
10      1471
11      1472
12      1473
13      1474
14      1475
15      1476
16      1477
17      1478
18      1479
19      1480
20      1481
21      1482
22      1483
23      1484
24      1485
25      1486
26      1487
27      1488
28      1489
29      1490
30      1491
31      1492
32      1493
33      1494
34      1495
35      1496
36      1497
37      1498
38      1499
39      1500
40      1501
41      1502
42      1503
43      1504
44      1505
45      1506
46      1507
47      1508
48      1509
49      1510
50      1511
51      1512
52      1513
53      1514
54      1515
55      1516
56      1517
57      1518
58      1519
59      1520
60      1521
61      1522
62      1523
63      1524
64      1525
65      1526
66      1527
67      1528
68      1529
69      1530
70      1531
71      1532
72      1533
73      1534
74      1535
75      1536
76      1537