In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree 
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

In [3]:
train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
test.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
#Checking any missing values
NAs = pd.concat([train.isnull().sum()], axis=1, keys=["Train"])
NAs[NAs.sum(axis=1) > 0]

Unnamed: 0,Train
LotFrontage,259
Alley,1369
MasVnrType,8
MasVnrArea,8
BsmtQual,37
BsmtCond,37
BsmtExposure,38
BsmtFinType1,37
BsmtFinType2,38
Electrical,1


In [6]:
#Removing missing collumns
removeColumn1 = ['LotFrontage','Alley', 'MasVnrType','MasVnrArea', 'BsmtQual','BsmtCond', 'BsmtExposure', 
                 'BsmtFinType1', 'BsmtFinType2','Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 
                 'GarageFinish','GarageQual','GarageCond','PoolQC','Fence', 'MiscFeature']
#Removing some of string values that we don't transform to label
removeColumn2 = ['Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle',
                 'RoofMatl','Exterior1st','Exterior2nd','ExterCond','Heating','HeatingQC',
                 'CentralAir','Functional','PavedDrive', 'Foundation','MSZoning', 'Utilities','BsmtFinSF1',
                 'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','KitchenQual','GarageCars',
                 'GarageArea','SaleType']
reduced_train = train.drop(removeColumn1, axis=1)
new_reduced_train = reduced_train.drop(removeColumn2, axis=1)
reduced_test = test.drop(removeColumn1, axis=1)
new_reduced_test = reduced_test.drop(removeColumn2, axis=1)

In [7]:
#old training set
print (train.shape) 
#new training set
print (new_reduced_train.shape)

(1460, 81)
(1460, 34)


In [8]:
#old test set
print (test.shape)
#new test set
print (new_reduced_test.shape)

(1459, 80)
(1459, 33)


In [9]:
labStreet = LabelEncoder()
labLotShape = LabelEncoder()
labLandContour = LabelEncoder()
labLotConfig = LabelEncoder()
labLandSlope = LabelEncoder()
labSaleCondition = LabelEncoder()
labExterQual= LabelEncoder()

In [10]:
new_reduced_train['Street'] = labStreet.fit_transform(new_reduced_train['Street'])
new_reduced_train['LotShape'] = labLotShape.fit_transform(new_reduced_train['LotShape'])
new_reduced_train['LandContour'] = labLandContour.fit_transform(new_reduced_train['LandContour'])
new_reduced_train['LotConfig'] = labLotConfig.fit_transform(new_reduced_train['LotConfig'])
new_reduced_train['LandSlope'] = labLandSlope.fit_transform(new_reduced_train['LandSlope'])
new_reduced_train['SaleCondition'] = labSaleCondition.fit_transform(new_reduced_train['SaleCondition'])
new_reduced_train['ExterQual'] = labExterQual.fit_transform(new_reduced_train['ExterQual'])


new_reduced_train.head()

Unnamed: 0,Id,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,OverallQual,OverallCond,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition,SalePrice
0,1,60,8450,1,3,3,4,0,7,5,...,61,0,0,0,0,0,2,2008,4,208500
1,2,20,9600,1,3,3,2,0,6,8,...,0,0,0,0,0,0,5,2007,4,181500
2,3,60,11250,1,0,3,4,0,7,5,...,42,0,0,0,0,0,9,2008,4,223500
3,4,70,9550,1,0,3,0,0,7,5,...,35,272,0,0,0,0,2,2006,0,140000
4,5,60,14260,1,0,3,2,0,8,5,...,84,0,0,0,0,0,12,2008,4,250000


In [11]:
new_reduced_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 34 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
LotArea          1460 non-null int64
Street           1460 non-null int64
LotShape         1460 non-null int64
LandContour      1460 non-null int64
LotConfig        1460 non-null int64
LandSlope        1460 non-null int64
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
ExterQual        1460 non-null int64
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
TotRmsAbvGrd     1460 non-null int64
Fireplaces       1460 non-null int64
WoodDeckSF       1460 non-null int64
OpenP

In [12]:
#Checking any missing values
NAs = pd.concat([new_reduced_test.isnull().sum()], axis=1, keys=["Train"])
NAs[NAs.sum(axis=1) > 0]

Unnamed: 0,Train


In [13]:
tlabStreet = LabelEncoder()
tlabLotShape = LabelEncoder()
tlabLandContour = LabelEncoder()
tlabLotConfig = LabelEncoder()
tlabLandSlope = LabelEncoder()
tlabSaleCondition = LabelEncoder()
tlabExterQual= LabelEncoder()

In [14]:
new_reduced_test['Street'] = tlabStreet.fit_transform(new_reduced_test['Street'])
new_reduced_test['LotShape'] = tlabLotShape.fit_transform(new_reduced_test['LotShape'])
new_reduced_test['LandContour'] = tlabLandContour.fit_transform(new_reduced_test['LandContour'])
new_reduced_test['LotConfig'] = tlabLotConfig.fit_transform(new_reduced_test['LotConfig'])
new_reduced_test['LandSlope'] = tlabLandSlope.fit_transform(new_reduced_test['LandSlope'])
new_reduced_test['SaleCondition'] = tlabSaleCondition.fit_transform(new_reduced_test['SaleCondition'])
new_reduced_test['ExterQual'] = tlabExterQual.fit_transform(new_reduced_test['ExterQual'])

new_reduced_test.head()

Unnamed: 0,Id,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,OverallQual,OverallCond,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition
0,1461,20,11622,1,3,3,4,0,5,6,...,140,0,0,0,120,0,0,6,2010,4
1,1462,20,14267,1,0,3,0,0,6,6,...,393,36,0,0,0,0,12500,6,2010,4
2,1463,60,13830,1,0,3,4,0,5,5,...,212,34,0,0,0,0,0,3,2010,4
3,1464,60,9978,1,0,3,4,0,6,6,...,360,36,0,0,0,0,0,6,2010,4
4,1465,120,5005,1,0,1,4,0,8,5,...,0,82,0,0,144,0,0,1,2010,4


In [15]:
new_reduced_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 33 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
LotArea          1459 non-null int64
Street           1459 non-null int64
LotShape         1459 non-null int64
LandContour      1459 non-null int64
LotConfig        1459 non-null int64
LandSlope        1459 non-null int64
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
ExterQual        1459 non-null int64
1stFlrSF         1459 non-null int64
2ndFlrSF         1459 non-null int64
LowQualFinSF     1459 non-null int64
GrLivArea        1459 non-null int64
FullBath         1459 non-null int64
HalfBath         1459 non-null int64
BedroomAbvGr     1459 non-null int64
KitchenAbvGr     1459 non-null int64
TotRmsAbvGrd     1459 non-null int64
Fireplaces       1459 non-null int64
WoodDeckSF       1459 non-null int64
OpenP

In [16]:
y = new_reduced_train['SalePrice']
Y = y.values.reshape(-1,1)
X = new_reduced_train.drop(['SalePrice'],axis=1)


In [17]:
new_reduced_train.columns

Index(['Id', 'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
       'LotConfig', 'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'ExterQual', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SaleCondition', 'SalePrice'],
      dtype='object')

In [18]:
X.shape, y.shape

((1460, 33), (1460,))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)
#Model
#try min_samples_split (such as 2, 5, 10, 15)
# try max_depth (such as 1, 2, 3, 4, 5)
model1 = tree.DecisionTreeRegressor(max_depth=1, min_samples_split=2)
model2 = tree.DecisionTreeRegressor(max_depth=2, min_samples_split=5)
model3 = tree.DecisionTreeRegressor(max_depth=3, min_samples_split=10)
model4 = tree.DecisionTreeRegressor(max_depth=4, min_samples_split=15)
model5 = tree.DecisionTreeRegressor(max_depth=5, min_samples_split=10)
model6 = tree.DecisionTreeRegressor(max_depth=4, min_samples_split=5)
model7 = tree.DecisionTreeRegressor(max_depth=3, min_samples_split=2)
model8 = tree.DecisionTreeRegressor(max_depth=2, min_samples_split=10)
model9 = tree.DecisionTreeRegressor(max_depth=1, min_samples_split=15)

In [20]:
model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)
model4.fit(X_train,y_train)
model5.fit(X_train,y_train)
model6.fit(X_train,y_train)
model7.fit(X_train,y_train)
model8.fit(X_train,y_train)
model9.fit(X_train,y_train)

prediction1 = model1.predict(X_test)
prediction2 = model2.predict(X_test)
prediction3 = model3.predict(X_test)
prediction4 = model4.predict(X_test)
prediction5 = model5.predict(X_test)
prediction6 = model6.predict(X_test)
prediction7 = model7.predict(X_test)
prediction8 = model8.predict(X_test)
prediction9 = model9.predict(X_test)

In [21]:
#Checking prediction result quality method
from sklearn.metrics import mean_squared_log_error
def kaggle_score(y_true,y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred));
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def valid_pred(prediction):
    mae, kaggle, mape = [], [], []
    mae.append(mean_absolute_error(y_test, prediction))
    kaggle.append(kaggle_score(y_test, np.abs(prediction)))
    mape.append(mean_absolute_percentage_error(y_test, prediction))
    return print("Test MAE:", np.mean(mae), "Test Kaggle-Score:", np.mean(kaggle) , "Test MAPE:", np.mean(mape))

In [22]:
valid_pred(prediction1)
valid_pred(prediction2)
valid_pred(prediction3)
valid_pred(prediction4)
valid_pred(prediction5)
valid_pred(prediction6)
valid_pred(prediction7)
valid_pred(prediction8)
valid_pred(prediction9)

Test MAE: 40791.459342061156 Test Kaggle-Score: 0.2848384533320511 Test MAPE: 35.62568033266307
Test MAE: 31518.289584865084 Test Kaggle-Score: 0.22531876735734618 Test MAPE: 38.56882246161633
Test MAE: 28689.224792121113 Test Kaggle-Score: 0.20568588129210946 Test MAPE: 39.92668752607096
Test MAE: 27042.51874356172 Test Kaggle-Score: 0.18948207720461863 Test MAPE: 40.741401120788076
Test MAE: 24186.825487523656 Test Kaggle-Score: 0.17746844320537786 Test MAPE: 39.94985046757268
Test MAE: 27042.51874356172 Test Kaggle-Score: 0.18948207720461863 Test MAPE: 40.741401120788076
Test MAE: 28689.224792121113 Test Kaggle-Score: 0.20568588129210946 Test MAPE: 39.92668752607096
Test MAE: 31518.289584865084 Test Kaggle-Score: 0.22531876735734618 Test MAPE: 38.56882246161633
Test MAE: 40791.459342061156 Test Kaggle-Score: 0.2848384533320511 Test MAPE: 35.62568033266307


In [23]:
#5th of the prediction has minimum Kaggle-Score "0.17746844320537786". So, we will predict with model5

In [24]:
result = model5.predict(new_reduced_test)

In [25]:
result

array([122345.47668394, 146906.37951807, 161885.11864407, ...,
       146906.37951807, 122345.47668394, 224588.49056604])

In [26]:
len(result)

1459

In [27]:
#old
submission

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
5,1466,177150.989247
6,1467,172070.659229
7,1468,175110.956520
8,1469,162011.698832
9,1470,160726.247831


In [28]:
#new
sub = submission.drop(['SalePrice'], axis = 1)
sub['SalePrice'] = result
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,122345.476684
1,1462,146906.379518
2,1463,161885.118644
3,1464,170112.65493
4,1465,198720.0


In [29]:
#download as a csv file
sub.to_csv('submission.csv', index=False)