In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

# For reproducibility: seed NumPy's global random number generator
np.random.seed(0)

In [2]:
# Read the training CSV, using its first column as the row index
data = pd.read_csv('train.csv', index_col=0)

data.head() # First five rows (the display elides the middle columns of wide frames)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Dimensions of the dataset as (rows, columns)
data.shape

(1460, 80)

In [4]:
# Keep only the numerical columns (the candidate features).
# There are several ways to do this.

# First approach: list the dtypes to keep
numerical_cols = data.select_dtypes(include=['int64', 'float64'])
numerical_cols.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000


In [5]:
# Second approach: instead of listing the dtypes to keep,
# drop the object-dtype (non-numeric) columns
numerical_cols = data.select_dtypes(exclude=['object'])

# Same result as the first approach
numerical_cols.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000


In [6]:
# Count the missing (null) values in each column of the full dataset
data.isnull().sum()

MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

In [7]:
# The target has no missing values, so it can be
# separated from the training features as-is

# Target
y = data['SalePrice']

# Features: every numerical column except the target
X = numerical_cols.drop(columns=['SalePrice'])

X.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,2008
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,2007
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,2008
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,2006
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,2008


In [8]:
# Count the missing values in each feature column
X.isnull().sum()

MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [9]:
# Percentage of missing data across all feature cells

missing_data = X.isnull().sum() # Missing values per column
total_missing = missing_data.sum() # Total missing values over all columns

# np.prod replaces np.product, which was deprecated in NumPy 1.25 and
# removed in NumPy 2.0 (calling it there raises AttributeError).
total_entries = np.prod(X.shape) # Total number of cells (rows * columns)
percentage_missing = total_missing / total_entries * 100

percentage_missing

0.6621004566210046

In [10]:
# Total number of cells in X (rows * columns)
total_entries

52560

In [11]:
# Total number of cells in X that hold a missing value
total_missing

348

In [12]:
# Hold out 20% of the rows for validation; the fixed random_state
# makes the split repeatable from run to run
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [13]:
# First rows of the training features (rows were randomly assigned by the split)
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
319,60,90.0,9900,7,5,1993,1993,256.0,987,0,...,656,340,60,144,0,0,0,0,4,2009
581,20,,14585,6,6,1960,1987,85.0,594,219,...,572,216,110,0,0,0,0,0,6,2007
962,60,,12227,6,7,1977,1995,424.0,896,0,...,619,550,282,0,0,0,0,0,7,2008
79,90,72.0,10778,4,5,1968,1968,0.0,0,0,...,0,0,0,0,0,0,0,0,4,2010
6,50,85.0,14115,5,5,1993,1995,0.0,732,0,...,480,40,30,0,320,0,0,700,10,2009


In [14]:
# First rows of the validation features (rows were randomly assigned by the split)
X_valid.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
148,60,,9505,7,5,2001,2001,180.0,0,0,...,434,144,48,0,0,0,0,0,5,2010
677,70,60.0,9600,4,2,1900,1950,0.0,0,0,...,779,0,0,90,0,0,0,0,5,2006
1305,160,32.0,3363,7,5,2004,2004,117.0,0,0,...,380,0,40,0,0,0,0,0,4,2006
1373,60,75.0,9750,7,6,1998,1998,0.0,975,0,...,583,253,170,0,0,0,0,0,6,2006
1428,50,60.0,10930,5,6,1945,1950,0.0,580,0,...,288,0,0,0,0,0,0,0,4,2008


In [15]:
# Fill in the missing values.
# Create the imputer with its default settings
imputer = SimpleImputer()

# Fit the imputer on the training features only, then apply the same
# fitted imputer to the validation features
train_filled = imputer.fit_transform(X_train)
valid_filled = imputer.transform(X_valid)

# Wrap the imputed arrays back into DataFrames
imp_X_train = pd.DataFrame(train_filled)
imp_X_valid = pd.DataFrame(valid_filled)

In [16]:
# The imputed frame has lost the column names, and its
# index labels have been replaced by a default 0, 1, 2, ... range
imp_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,60.0,90.0,9900.0,7.0,5.0,1993.0,1993.0,256.0,987.0,0.0,...,656.0,340.0,60.0,144.0,0.0,0.0,0.0,0.0,4.0,2009.0
1,20.0,69.58427,14585.0,6.0,6.0,1960.0,1987.0,85.0,594.0,219.0,...,572.0,216.0,110.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0
2,60.0,69.58427,12227.0,6.0,7.0,1977.0,1995.0,424.0,896.0,0.0,...,619.0,550.0,282.0,0.0,0.0,0.0,0.0,0.0,7.0,2008.0
3,90.0,72.0,10778.0,4.0,5.0,1968.0,1968.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
4,50.0,85.0,14115.0,5.0,5.0,1993.0,1995.0,0.0,732.0,0.0,...,480.0,40.0,30.0,0.0,320.0,0.0,0.0,700.0,10.0,2009.0


In [17]:
# Shape of the training features before imputation
X_train.shape

(1168, 36)

In [18]:
# Shape of the training features after imputation (for comparison with X_train.shape)
imp_X_train.shape

(1168, 36)

In [19]:
# Shape of the validation features after imputation
imp_X_valid.shape

(292, 36)

In [20]:
# Restore the feature names and the index labels on both
# imputed frames by copying them from the pre-imputation frames
for imputed, original in ((imp_X_train, X_train), (imp_X_valid, X_valid)):
    imputed.columns = original.columns
    imputed.index = original.index

imp_X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
319,60.0,90.0,9900.0,7.0,5.0,1993.0,1993.0,256.0,987.0,0.0,...,656.0,340.0,60.0,144.0,0.0,0.0,0.0,0.0,4.0,2009.0
581,20.0,69.58427,14585.0,6.0,6.0,1960.0,1987.0,85.0,594.0,219.0,...,572.0,216.0,110.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0
962,60.0,69.58427,12227.0,6.0,7.0,1977.0,1995.0,424.0,896.0,0.0,...,619.0,550.0,282.0,0.0,0.0,0.0,0.0,0.0,7.0,2008.0
79,90.0,72.0,10778.0,4.0,5.0,1968.0,1968.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
6,50.0,85.0,14115.0,5.0,5.0,1993.0,1995.0,0.0,732.0,0.0,...,480.0,40.0,30.0,0.0,320.0,0.0,0.0,700.0,10.0,2009.0


In [21]:
# Create the decision tree regressor and fit it on the imputed training data.
# fit() returns the estimator itself, so the call can be chained.
model = DecisionTreeRegressor(random_state=0).fit(imp_X_train, y_train)

# Display the fitted model
model

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [22]:
# Predict the target for the imputed validation features
predictions = model.predict(imp_X_valid)

predictions

array([284000., 160000., 172500., 226000., 127000., 220000., 345000.,
       134000., 142000., 124000., 155000., 251000., 145000.,  93000.,
       268000., 165000., 139600., 305000., 263435., 190000., 142500.,
       152000., 127000., 149000., 213500., 134000., 175000., 197900.,
       149900., 125000., 142000., 200141., 129000., 215200., 319000.,
       204900., 174900., 122000., 117000., 116000., 137000., 191000.,
       257500., 181000., 163500., 107900., 213250., 194500., 248000.,
       154900., 200000., 186000., 174900., 145250., 125000., 139000.,
       123000., 157500., 239500., 320000., 109900., 278000., 200100.,
       152000., 231500., 139000., 119750., 233170., 239000., 137500.,
       147000., 206900., 164700., 255500., 201800., 114500., 168000.,
       127500., 269790., 199900., 339750., 350000.,  84000., 109000.,
       157500., 153000.,  99900., 110000.,  75500., 335000., 120500.,
       169000., 179200., 219500., 179000., 109900., 266500., 117000.,
       213500., 1285

In [23]:
# Number of predictions made for the validation set
len(predictions)

292

In [24]:
# Evaluate the model's performance on the validation set.
# scikit-learn metrics take (y_true, y_pred): the ground truth first,
# then the predictions. MAE is symmetric, so the value is the same either
# way, but the documented order keeps the call correct if the metric is
# later swapped for an asymmetric one.
mae = mean_absolute_error(y_valid, predictions)

mae

26064.496575342466