In [31]:
import numpy as np
import pandas as pd
import seaborn as sb

from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [225]:
# load data

train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/test.csv')
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/sample_submission.csv')

train.set_index('Id', inplace=True)
test.set_index('Id', inplace=True)

print(len(train))
print(len(test))

1460
1459


In [226]:
# combine train and test data

all_data = pd.concat([train, test], sort=False)
all_data_index = all_data.index

train_y = train['SalePrice']
print(len(all_data))

2919


In [227]:
# remove unreliable data

na_check = all_data.isna().sum() / len(all_data)
na_check = na_check[na_check > 0.3]
na_check

Alley          0.932169
FireplaceQu    0.486468
PoolQC         0.996574
Fence          0.804385
MiscFeature    0.964029
SalePrice      0.499829
dtype: float64

In [228]:
all_data = all_data.drop(na_check.keys().drop(['SalePrice']), axis=1)

In [230]:
# separate object types and non-object types

data_object = all_data.select_dtypes(include='object')
data_nonObject = all_data.select_dtypes(exclude='object')

print(data_object.columns)
print(data_nonObject.columns)

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'Ga

In [231]:
# one-hot coding

dummy_data = pd.get_dummies(data_object, drop_first=True)
dummy_data.index = all_data_index
dummy_data.describe()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,...,Electrical_SBrkr,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,...,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,0.047619,0.008907,0.775951,0.157588,0.995889,0.026036,0.005481,0.636862,0.04111,0.020555,0.898253,0.000343,0.060295,0.02912,0.004796,0.73073,0.042823,0.005481,0.003426,0.010277,0.036999,0.015074,0.09147,0.035286,0.066461,0.056526,0.03186,0.012676,0.039054,0.151764,0.007879,0.044878,0.024323,0.056869,0.081877,0.016444,0.05173,0.042823,0.06235,0.017472,...,0.915039,0.023981,0.394313,0.511134,0.003083,0.022268,0.023981,0.01199,0.000685,0.930798,0.590271,0.012333,0.06372,0.005139,0.266872,0.277835,0.421377,0.04248,0.008222,0.001713,0.892086,0.025351,0.005139,0.004796,0.909215,0.02124,0.904762,0.004111,0.001713,0.008907,0.003083,0.002741,0.081877,0.002398,0.865022,0.004111,0.008222,0.015759,0.822885,0.083933
std,0.212995,0.093973,0.417026,0.364417,0.063996,0.159271,0.073845,0.480987,0.198579,0.141913,0.302367,0.018509,0.238073,0.16817,0.0691,0.443657,0.202492,0.073845,0.05844,0.100873,0.188792,0.121867,0.288325,0.184534,0.249129,0.230975,0.175658,0.111889,0.193758,0.358854,0.088431,0.207072,0.154078,0.231631,0.274225,0.127197,0.221519,0.202492,0.241832,0.131043,...,0.278871,0.153016,0.488786,0.499962,0.055451,0.147579,0.153016,0.108861,0.026171,0.25384,0.491868,0.110386,0.244296,0.071513,0.442401,0.448008,0.493864,0.201717,0.090317,0.041359,0.310325,0.157216,0.071513,0.0691,0.287352,0.144209,0.293594,0.063996,0.041359,0.093973,0.055451,0.052289,0.274225,0.04892,0.341758,0.063996,0.090317,0.124562,0.381832,0.277335
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [232]:
impute = SimpleImputer(strategy='mean')

impute.fit(data_nonObject)
data_nonObject_ = impute.transform(data_nonObject)

data_nonObject = pd.DataFrame(data_nonObject_, columns=data_nonObject.columns, index=all_data_index)
#data_nonObject.index = all_data_index
data_nonObject.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,57.137718,69.305795,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.201312,441.423235,49.582248,560.772104,1051.777587,1159.581706,336.483727,4.694416,1500.759849,0.429894,0.061364,1.568003,0.380267,2.860226,1.044536,6.451524,0.597122,1978.113406,1.766621,472.874572,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737,180921.19589
std,42.517628,21.312345,7886.996359,1.409947,1.113131,30.291442,20.894344,178.626089,455.53275,169.176615,439.468337,440.690726,392.362079,428.701456,46.396825,506.051045,0.524556,0.245603,0.552969,0.502872,0.822693,0.214462,1.569379,0.646129,24.867762,0.761494,215.357904,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964,56174.332503
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1895.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,60.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,0.0,220.0,793.0,876.0,0.0,0.0,1126.0,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1961.5,1.0,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,163000.0
50%,50.0,69.305795,9453.0,6.0,5.0,1973.0,1993.0,0.0,369.0,0.0,467.0,990.0,1082.0,0.0,0.0,1444.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1978.113406,2.0,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,180921.19589
75%,70.0,78.0,11570.0,7.0,6.0,2001.0,2004.0,163.5,733.0,0.0,805.0,1302.0,1387.5,704.0,0.0,1743.5,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2001.0,2.0,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,180921.19589
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1526.0,2336.0,6110.0,5095.0,2065.0,1064.0,5642.0,3.0,2.0,4.0,2.0,8.0,3.0,15.0,4.0,2207.0,5.0,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0


In [234]:
print(len(dummy_data))
print(len(data_nonObject))
print(len(all_data))
all_data = pd.merge(dummy_data, data_nonObject, right_index=True, left_index=True)
all_data.head()
print(dummy_data.columns)
print(data_nonObject.columns)

2919
2919
2919
Index(['MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'Street_Pave', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg',
       'LandContour_HLS', 'LandContour_Low',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=196)
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscV

In [163]:
all_data.describe()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,...,SaleCondition_Normal,SaleCondition_Partial,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,...,5837.0,5837.0,5837.0,5837.0,4865.0,5837.0,5837.0,5837.0,5837.0,5837.0,5791.0,5835.0,5835.0,5835.0,5835.0,5837.0,5837.0,5837.0,5837.0,5833.0,5833.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5519.0,5835.0,5835.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,5837.0,2919.0
mean,0.047627,0.008909,0.775912,0.157615,0.995888,0.026041,0.005482,0.6368,0.041117,0.020559,0.898235,0.000343,0.060305,0.029125,0.004797,0.730684,0.04283,0.005482,0.003426,0.010279,0.037005,0.015076,0.091485,0.035292,0.066301,0.056536,0.031866,0.012678,0.039061,0.15179,0.007881,0.044886,0.024328,0.056879,0.081891,0.016447,0.051739,0.04283,0.062361,0.017475,...,0.822854,0.083947,1460.0,57.144081,69.304625,10168.153675,6.089258,5.564502,1971.31386,1984.267775,102.21896,441.356641,49.541045,560.844901,1051.742588,1159.565188,336.541374,4.69522,1500.801782,0.429796,0.061375,1.5681,0.380161,2.860202,1.044543,6.451602,0.597225,1978.115782,1.766752,472.908312,93.599794,47.483296,23.102279,2.602707,16.065102,2.252184,50.834675,6.213123,2007.792702,180932.645427
std,0.212994,0.093973,0.417016,0.364411,0.063996,0.15927,0.073845,0.480963,0.198578,0.141913,0.302364,0.018509,0.238071,0.16817,0.0691,0.443643,0.202492,0.073845,0.05844,0.100873,0.188791,0.121867,0.288323,0.184533,0.248829,0.230973,0.175657,0.111889,0.193757,0.358848,0.088431,0.207072,0.154077,0.23163,0.274223,0.127197,0.221518,0.202492,0.24183,0.131043,...,0.381825,0.277332,842.787043,42.514848,23.344762,7886.995779,1.409875,1.113116,30.291329,20.892822,179.329222,455.582417,169.176327,439.508471,440.758147,392.360048,428.678824,46.396784,506.040901,0.524683,0.245686,0.552919,0.502806,0.822691,0.214461,1.569368,0.646082,25.573675,0.761558,215.37939,126.246891,67.57496,64.243534,25.188146,56.183972,35.663934,567.40182,2.71476,1.314962,79440.092805
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1895.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,730.0,20.0,59.0,7476.0,5.0,5.0,1953.0,1965.0,0.0,0.0,0.0,220.0,793.0,876.0,0.0,0.0,1126.0,0.0,0.0,1.0,0.0,2.0,1.0,5.0,0.0,1960.0,1.0,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129950.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1460.0,50.0,68.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.0,0.0,467.0,989.0,1082.0,0.0,0.0,1444.0,0.0,0.0,2.0,0.0,3.0,1.0,6.0,1.0,1979.0,2.0,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2190.0,70.0,80.0,11577.0,7.0,6.0,2001.0,2004.0,164.0,733.0,0.0,806.0,1302.0,1388.0,704.0,0.0,1744.0,1.0,0.0,2.0,1.0,3.0,1.0,7.0,1.0,2002.0,2.0,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2919.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1526.0,2336.0,6110.0,5095.0,2065.0,1064.0,5642.0,3.0,2.0,4.0,2.0,8.0,3.0,15.0,4.0,2207.0,5.0,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0


In [235]:
# select reliable data
nTrain = len(train)
train = all_data[:nTrain]
test = all_data[nTrain:]

cormat = train.corr()
df = cormat.index[abs(cormat['SalePrice']) >= 0.3]
train = train[df]
test = test[df.drop('SalePrice')]

In [191]:
print(len(all_data))
print(nTrain)
print(len(train))
print(len(test))

5837
1460
1460
4377


In [236]:
from sklearn.model_selection import train_test_split

train_x = train.drop(['SalePrice'], axis=1)
train_y = train['SalePrice']

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, shuffle=True)

test_id_idx = test.index

test_x = test

In [181]:
train_x.describe()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea
count,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,6.142979,1972.816781,1985.6875,112.875178,1070.565068,1160.978596,1515.858733,1.598459,6.450342,0.605308,1979.343931,1.777397,477.92637
std,1.391344,29.919784,20.511976,190.812277,433.025093,377.198454,481.443603,0.549735,1.4918,0.65131,24.09175,0.760036,210.343873
min,2.0,1880.0,1950.0,0.0,0.0,483.0,520.0,0.0,3.0,0.0,1896.0,0.0,0.0
25%,5.0,1955.0,1967.0,0.0,794.0,865.0,1167.0,1.0,5.0,0.0,1963.0,1.0,336.0
50%,6.0,1976.0,1994.0,0.0,1020.0,1100.0,1464.0,2.0,6.0,1.0,1979.0,2.0,482.0
75%,7.0,2002.0,2004.0,178.0,1347.75,1396.75,1755.0,2.0,7.0,1.0,2002.0,2.0,591.25
max,10.0,2010.0,2010.0,1600.0,3206.0,2696.0,3493.0,4.0,12.0,3.0,2010.0,4.0,1166.0


In [241]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

xgb = XGBRegressor()

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(550,700,50),
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
grid_search = GridSearchCV(estimator=xgb, param_grid=param, cv=10, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

#print(train_x.dtypes)
#print(train_y.dtypes)

grid_search.fit(train_x, train_y)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'colsample_bylevel': 1, 'colsample_bytree': 1, 'max_depth': 2, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)


In [242]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
pred_train = grid_search.predict(train_x)
pred_val = grid_search.predict(val_x)

print('train mae score: ', mean_absolute_error(train_y, pred_train))
print('val mae score:', mean_absolute_error(val_y, pred_val))

#train mae score:  10568.155270093108
#val mae score: 17147.93216235017

train mae score:  10259.323439506636
val mae score: 17173.29493525257


In [239]:
print(len(submission))
submission.head()

1459


Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [243]:
pred = grid_search.predict(test_x)
submission['SalePrice'] = pred
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/HousePrices/submission.csv', index=False)