In [31]:
import numpy as np
import pandas as pd
import seaborn as sb

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [280]:
# load data

train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/test.csv')
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/sample_submission.csv')

train.set_index('Id', inplace=True)
test.set_index('Id', inplace=True)

In [281]:
# combine train and test data

all_data = pd.concat([train, test], sort=False)
all_data_index = all_data.index

train_y = train['SalePrice']

In [282]:
# find deficient data

na_check = all_data.isna().sum() / len(all_data)
na_check = na_check[na_check > 0.3]
na_check

Alley          0.932169
FireplaceQu    0.486468
PoolQC         0.996574
Fence          0.804385
MiscFeature    0.964029
SalePrice      0.499829
dtype: float64

In [283]:
# analyze NA data

# fill NA of fireplaceQu because these don't have fireplaces
all_data.loc[all_data['FireplaceQu'].isna(), 'Fireplaces'] = 0

# NA of Alley means there is no path
all_data.loc[all_data['Alley'].isna(), 'Alley'] = 'noPath'

# NA of PoolQC means there is no pool
all_data.loc[all_data['PoolQC'].isna(), 'PoolQC'] = 'noPool'

# NA of Fence means there is no fence
all_data.loc[all_data['Fence'].isna(), 'Fence'] = 'noFence'

# NA of MiscFeature means there is no other feature
all_data.loc[all_data['MiscFeature'].isna(), 'MiscFeature'] = 'noFeature'

In [228]:
#all_data = all_data.drop(na_check.keys().drop(['SalePrice']), axis=1)

In [284]:
# separate object types and non-object types

data_object = all_data.select_dtypes(include='object')
data_nonObject = all_data.select_dtypes(exclude='object')

print(data_object.columns)
print(data_nonObject.columns)

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'Kitch

In [296]:
cols = data_object.columns

for i in cols:
  print(i+' : '+all_data[i].unique())

KeyError: ignored

In [285]:
# one-hot coding

dummy_data = pd.get_dummies(data_object, drop_first=True)
dummy_data.index = all_data_index
dummy_data.describe()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_Pave,Alley_noPath,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,...,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,PoolQC_Fa,PoolQC_Gd,PoolQC_noPool,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_noFence,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MiscFeature_noFeature,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,...,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,0.047619,0.008907,0.775951,0.157588,0.995889,0.026721,0.932169,0.026036,0.005481,0.636862,0.04111,0.020555,0.898253,0.000343,0.060295,0.02912,0.004796,0.73073,0.042823,0.005481,0.003426,0.010277,0.036999,0.015074,0.09147,0.035286,0.066461,0.056526,0.03186,0.012676,0.039054,0.151764,0.007879,0.044878,0.024323,0.056869,0.081877,0.016444,0.05173,0.042823,...,0.012333,0.06372,0.005139,0.266872,0.277835,0.421377,0.04248,0.008222,0.001713,0.892086,0.025351,0.005139,0.004796,0.909215,0.02124,0.904762,0.000685,0.00137,0.996574,0.038369,0.11271,0.004111,0.804385,0.00137,0.032545,0.000343,0.964029,0.004111,0.001713,0.008907,0.003083,0.002741,0.081877,0.002398,0.865022,0.004111,0.008222,0.015759,0.822885,0.083933
std,0.212995,0.093973,0.417026,0.364417,0.063996,0.161296,0.2515,0.159271,0.073845,0.480987,0.198579,0.141913,0.302367,0.018509,0.238073,0.16817,0.0691,0.443657,0.202492,0.073845,0.05844,0.100873,0.188792,0.121867,0.288325,0.184534,0.249129,0.230975,0.175658,0.111889,0.193758,0.358854,0.088431,0.207072,0.154078,0.231631,0.274225,0.127197,0.221519,0.202492,...,0.110386,0.244296,0.071513,0.442401,0.448008,0.493864,0.201717,0.090317,0.041359,0.310325,0.157216,0.071513,0.0691,0.287352,0.144209,0.293594,0.026171,0.036999,0.05844,0.192119,0.316292,0.063996,0.396741,0.036999,0.177474,0.018509,0.18625,0.063996,0.041359,0.093973,0.055451,0.052289,0.274225,0.04892,0.341758,0.063996,0.090317,0.124562,0.381832,0.277335
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [286]:
impute = SimpleImputer(strategy='most_frequent')

impute.fit(data_nonObject)
data_nonObject_ = impute.transform(data_nonObject)

data_nonObject = pd.DataFrame(data_nonObject_, columns=data_nonObject.columns, index=all_data_index)
data_nonObject.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,57.137718,67.756423,10168.11408,6.089072,5.564577,1971.312778,1984.264474,101.396026,441.272011,49.565262,560.579993,1051.417266,1159.581706,336.483727,4.694416,1500.759849,0.429599,0.061322,1.568003,0.380267,2.860226,1.044536,6.451524,0.597122,1979.577938,1.766701,472.712573,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737,160467.6074
std,42.517628,21.59254,7886.996359,1.409947,1.113131,30.291442,20.894344,178.854579,455.606014,169.179104,439.590889,441.120498,392.362079,428.701456,46.396825,506.051045,0.524676,0.245608,0.552969,0.502872,0.822693,0.214462,1.569379,0.646129,25.605659,0.761506,215.535686,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964,59785.743435
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1895.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,60.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,0.0,220.0,793.0,876.0,0.0,0.0,1126.0,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1961.5,1.0,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,140000.0
50%,50.0,63.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.0,0.0,467.0,989.0,1082.0,0.0,0.0,1444.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1984.0,2.0,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,140000.0
75%,70.0,78.0,11570.0,7.0,6.0,2001.0,2004.0,163.5,733.0,0.0,805.0,1302.0,1387.5,704.0,0.0,1743.5,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2003.0,2.0,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,163000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1526.0,2336.0,6110.0,5095.0,2065.0,1064.0,5642.0,3.0,2.0,4.0,2.0,8.0,3.0,15.0,4.0,2207.0,5.0,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0


In [287]:
all_data = pd.merge(dummy_data, data_nonObject, right_index=True, left_index=True)
all_data.head()

Unnamed: 0_level_0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_Pave,Alley_noPath,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,...,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,150.0,856.0,856.0,854.0,0.0,1710.0,1.0,0.0,2.0,1.0,3.0,1.0,8.0,0.0,2003.0,2.0,548.0,0.0,61.0,0.0,0.0,0.0,0.0,0.0,2.0,2008.0,208500.0
2,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,284.0,1262.0,1262.0,0.0,0.0,1262.0,0.0,1.0,2.0,0.0,3.0,1.0,6.0,1.0,1976.0,2.0,460.0,298.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,181500.0
3,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,434.0,920.0,920.0,866.0,0.0,1786.0,1.0,0.0,2.0,1.0,3.0,1.0,6.0,1.0,2001.0,2.0,608.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,223500.0
4,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,540.0,756.0,961.0,756.0,0.0,1717.0,1.0,0.0,1.0,0.0,3.0,1.0,7.0,1.0,1998.0,3.0,642.0,0.0,35.0,272.0,0.0,0.0,0.0,0.0,2.0,2006.0,140000.0
5,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,490.0,1145.0,1145.0,1053.0,0.0,2198.0,1.0,0.0,2.0,1.0,4.0,1.0,9.0,1.0,2000.0,3.0,836.0,192.0,84.0,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,250000.0


In [288]:
# select reliable data
nTrain = len(train)
train = all_data[:nTrain]
test = all_data[nTrain:]

cormat = train.corr()
df = cormat.index[abs(cormat['SalePrice']) >= 0.2]
train = train[df]
test = test[df.drop('SalePrice')]
all_data = all_data[df]
all_data.head()

Unnamed: 0_level_0,MSZoning_RL,MSZoning_RM,LotShape_Reg,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_StoneBr,HouseStyle_2Story,RoofStyle_Gable,RoofStyle_Hip,Exterior1st_VinylSd,Exterior2nd_VinylSd,MasVnrType_None,MasVnrType_Stone,ExterQual_Gd,ExterQual_TA,Foundation_CBlock,Foundation_PConc,BsmtQual_Gd,BsmtQual_TA,BsmtExposure_Gd,BsmtExposure_No,BsmtFinType1_GLQ,HeatingQC_TA,CentralAir_Y,Electrical_SBrkr,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Gd,GarageType_Attchd,GarageType_BuiltIn,GarageType_Detchd,GarageFinish_Unf,GarageQual_TA,GarageCond_TA,PavedDrive_Y,SaleType_New,SaleType_WD,SaleCondition_Partial,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1,1,0,1,0,0,0,1,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,0,0,1,0,0,0,1,1,1,0,1,0,65.0,8450.0,7.0,2003.0,2003.0,196.0,706.0,150.0,856.0,856.0,854.0,1710.0,1.0,2.0,1.0,8.0,0.0,2003.0,2.0,548.0,0.0,61.0,208500.0
2,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,80.0,9600.0,6.0,1976.0,1976.0,0.0,978.0,284.0,1262.0,1262.0,0.0,1262.0,0.0,2.0,0.0,6.0,1.0,1976.0,2.0,460.0,298.0,0.0,181500.0
3,1,0,0,0,0,0,1,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0,1,1,1,0,0,1,0,0,0,1,1,1,0,1,0,68.0,11250.0,7.0,2001.0,2002.0,162.0,486.0,434.0,920.0,920.0,866.0,1786.0,1.0,2.0,1.0,6.0,1.0,2001.0,2.0,608.0,0.0,42.0,223500.0
4,1,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,1,1,0,1,0,0,1,1,1,1,1,0,1,0,60.0,9550.0,7.0,1915.0,1970.0,0.0,216.0,540.0,756.0,961.0,756.0,1717.0,1.0,1.0,0.0,7.0,1.0,1998.0,3.0,642.0,0.0,35.0,140000.0
5,1,0,0,1,0,0,1,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0,1,1,1,0,0,1,0,0,0,1,1,1,0,1,0,84.0,14260.0,8.0,2000.0,2000.0,350.0,655.0,490.0,1145.0,1145.0,1053.0,2198.0,1.0,2.0,1.0,9.0,1.0,2000.0,3.0,836.0,192.0,84.0,250000.0


In [255]:
all_data.describe()

Unnamed: 0,MSZoning_RL,MSZoning_RM,LotShape_Reg,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_StoneBr,HouseStyle_2Story,RoofStyle_Gable,RoofStyle_Hip,Exterior1st_VinylSd,Exterior2nd_VinylSd,MasVnrType_None,MasVnrType_Stone,ExterQual_Gd,ExterQual_TA,Foundation_CBlock,Foundation_PConc,BsmtQual_Gd,BsmtQual_TA,BsmtExposure_Gd,BsmtExposure_No,BsmtFinType1_GLQ,HeatingQC_TA,CentralAir_Y,Electrical_SBrkr,KitchenQual_Gd,KitchenQual_TA,GarageType_Attchd,GarageType_BuiltIn,GarageType_Detchd,GarageFinish_Unf,GarageQual_TA,GarageCond_TA,PavedDrive_Y,SaleType_New,SaleType_WD,SaleCondition_Partial,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,SalePrice
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,0.775951,0.157588,0.636862,0.024323,0.056869,0.017472,0.298732,0.791367,0.188763,0.351148,0.347379,0.59678,0.085303,0.335389,0.615964,0.42309,0.448099,0.414183,0.439534,0.094553,0.652278,0.290853,0.293594,0.932854,0.915039,0.394313,0.511134,0.590271,0.06372,0.266872,0.421377,0.892086,0.909215,0.904762,0.081877,0.865022,0.083933,69.305795,10168.11408,6.089072,1971.312778,1984.264474,102.201312,441.423235,560.772104,1051.777587,1159.581706,336.483727,1500.759849,0.429894,1.568003,0.380267,6.451524,0.597122,1978.113406,1.766621,472.874572,93.709832,47.486811,180921.19589
std,0.417026,0.364417,0.480987,0.154078,0.231631,0.131043,0.457781,0.406401,0.391388,0.477411,0.476219,0.490628,0.27938,0.472207,0.48645,0.494134,0.497384,0.492665,0.496415,0.292647,0.476329,0.454234,0.455486,0.250318,0.278871,0.488786,0.499962,0.491868,0.244296,0.442401,0.493864,0.310325,0.287352,0.293594,0.274225,0.341758,0.277335,21.312345,7886.996359,1.409947,30.291442,20.894344,178.626089,455.53275,439.468337,440.690726,392.362079,428.701456,506.051045,0.524556,0.552969,0.502872,1.569379,0.646129,24.867762,0.761494,215.357904,126.526589,67.575493,56174.332503
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1300.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,334.0,0.0,334.0,0.0,0.0,0.0,2.0,0.0,1895.0,0.0,0.0,0.0,0.0,34900.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,60.0,7478.0,5.0,1953.5,1965.0,0.0,0.0,220.0,793.0,876.0,0.0,1126.0,0.0,1.0,0.0,5.0,0.0,1961.5,1.0,320.0,0.0,0.0,163000.0
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,69.305795,9453.0,6.0,1973.0,1993.0,0.0,369.0,467.0,990.0,1082.0,0.0,1444.0,0.0,2.0,0.0,6.0,1.0,1978.113406,2.0,480.0,0.0,26.0,180921.19589
75%,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,78.0,11570.0,7.0,2001.0,2004.0,163.5,733.0,805.0,1302.0,1387.5,704.0,1743.5,1.0,2.0,1.0,7.0,1.0,2001.0,2.0,576.0,168.0,70.0,180921.19589
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,313.0,215245.0,10.0,2010.0,2010.0,1600.0,5644.0,2336.0,6110.0,5095.0,2065.0,5642.0,3.0,4.0,2.0,15.0,4.0,2207.0,5.0,1488.0,1424.0,742.0,755000.0


In [289]:
from sklearn.model_selection import train_test_split

train_x = train.drop(['SalePrice'], axis=1)
train_y = np.log1p(train['SalePrice'])

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, shuffle=True)

test_id_idx = test.index

test_x = test

In [181]:
train_x.describe()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea
count,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,6.142979,1972.816781,1985.6875,112.875178,1070.565068,1160.978596,1515.858733,1.598459,6.450342,0.605308,1979.343931,1.777397,477.92637
std,1.391344,29.919784,20.511976,190.812277,433.025093,377.198454,481.443603,0.549735,1.4918,0.65131,24.09175,0.760036,210.343873
min,2.0,1880.0,1950.0,0.0,0.0,483.0,520.0,0.0,3.0,0.0,1896.0,0.0,0.0
25%,5.0,1955.0,1967.0,0.0,794.0,865.0,1167.0,1.0,5.0,0.0,1963.0,1.0,336.0
50%,6.0,1976.0,1994.0,0.0,1020.0,1100.0,1464.0,2.0,6.0,1.0,1979.0,2.0,482.0
75%,7.0,2002.0,2004.0,178.0,1347.75,1396.75,1755.0,2.0,7.0,1.0,2002.0,2.0,591.25
max,10.0,2010.0,2010.0,1600.0,3206.0,2696.0,3493.0,4.0,12.0,3.0,2010.0,4.0,1166.0


In [290]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

xgb = XGBRegressor()

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(550,700,50),
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
grid_search = GridSearchCV(estimator=xgb, param_grid=param, cv=5, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

#print(train_x.dtypes)
#print(train_y.dtypes)

grid_search.fit(train_x, train_y)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'colsample_bylevel': 0.5, 'colsample_bytree': 0.7, 'max_depth': 4, 'n_estimators': 550}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=550,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)


In [291]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
pred_train = grid_search.predict(train_x)
pred_val = grid_search.predict(val_x)

print('train mae score: ', mean_absolute_error(train_y, pred_train))
print('val mae score:', mean_absolute_error(val_y, pred_val))

#train mae score:  0.054211420134860716
#val mae score: 0.10481369615515054

train mae score:  0.016177904110736416
val mae score: 0.09413574985483512


In [260]:
pred = grid_search.predict(test_x)
submission['SalePrice'] = np.expm1(pred)
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/submission.csv', index=False)