In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [2]:
data = pd.read_csv('train.csv', index_col=0)

data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Setting the target and separating it from the features
y = data.SalePrice

X = data.drop(['SalePrice'], axis=1)

In [4]:
# Dataframe of only categorical columns
obj_cols = data.select_dtypes('object')

obj_cols.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [5]:
# The number of categories in each categorical feature
obj_cols.nunique()

MSZoning          5
Street            2
Alley             2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        8
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          8
Exterior1st      15
Exterior2nd      16
MasVnrType        4
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        7
FireplaceQu       5
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
PoolQC            3
Fence             4
MiscFeature       4
SaleType          9
SaleCondition     6
dtype: int64

In [6]:
# List of categorical features with low cardinality < below 10 >
#  Features with less than 10 unique values
# Plus no missing values
categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() <=10 and X[col].isnull().sum() == 0]

categorical_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [8]:
len(categorical_cols)

24

In [9]:
# Creating a list of numerical columns only
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

numerical_cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [10]:
# Combining only selected columns
my_cols = numerical_cols + categorical_cols

# Setting them as the features
X = data[my_cols]

X.head(10)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,ExterCond,Foundation,Heating,HeatingQC,CentralAir,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,TA,CBlock,GasA,Ex,Y,TA,Typ,Y,WD,Normal
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,TA,BrkTil,GasA,Gd,Y,Gd,Typ,Y,WD,Abnorml
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
6,50,85.0,14115,5,5,1993,1995,0.0,732,0,...,TA,Wood,GasA,Ex,Y,TA,Typ,Y,WD,Normal
7,20,75.0,10084,8,5,2004,2005,186.0,1369,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
8,60,,10382,7,6,1973,1973,240.0,859,32,...,TA,CBlock,GasA,Ex,Y,TA,Typ,Y,WD,Normal
9,50,51.0,6120,7,5,1931,1950,0.0,0,0,...,TA,BrkTil,GasA,Gd,Y,TA,Min1,Y,WD,Abnorml
10,190,50.0,7420,5,6,1939,1950,0.0,851,0,...,TA,BrkTil,GasA,Ex,Y,TA,Typ,Y,WD,Normal


In [11]:
# Only the numerical columns have missing entries
X.isnull().sum()

MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
MSZoning           0
Street             0
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
RoofStyle    

In [12]:
# Splitting the training and validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=11)

In [13]:
# Declaring the imputer for the numerical columns
imputer = SimpleImputer()

# Encoder for the categorical columns
encoder = LabelEncoder()

In [14]:
# Imputing the numerical columns both in training and validation sets
imp_X_train = pd.DataFrame(imputer.fit_transform(X_train[numerical_cols]))
imp_X_valid = pd.DataFrame(imputer.transform(X_valid[numerical_cols]))

In [16]:
# Imputation alters the index and removes the column names 
imp_X_valid.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
dtype: int64

In [17]:
X_valid[numerical_cols].head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
128,45,55.0,4388,5,7,1930,1950,0.0,116,0,...,0,0,0,0,0,0,0,0,6,2007
456,20,80.0,9600,7,6,1973,1973,320.0,916,0,...,528,0,0,0,0,0,0,0,9,2007
1324,30,50.0,5330,4,7,1940,1950,0.0,280,0,...,0,164,0,0,0,0,0,0,12,2009
218,70,57.0,9906,4,4,1925,1950,0.0,0,0,...,210,0,172,60,0,0,0,0,9,2006
1182,120,64.0,5587,8,5,2008,2008,186.0,1480,0,...,482,162,53,0,153,0,0,0,11,2008


In [18]:
imp_X_valid.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,45.0,55.0,4388.0,5.0,7.0,1930.0,1950.0,0.0,116.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0
1,20.0,80.0,9600.0,7.0,6.0,1973.0,1973.0,320.0,916.0,0.0,...,528.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2007.0
2,30.0,50.0,5330.0,4.0,7.0,1940.0,1950.0,0.0,280.0,0.0,...,0.0,164.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,2009.0
3,70.0,57.0,9906.0,4.0,4.0,1925.0,1950.0,0.0,0.0,0.0,...,210.0,0.0,172.0,60.0,0.0,0.0,0.0,0.0,9.0,2006.0
4,120.0,64.0,5587.0,8.0,5.0,2008.0,2008.0,186.0,1480.0,0.0,...,482.0,162.0,53.0,0.0,153.0,0.0,0.0,0.0,11.0,2008.0


In [19]:
# Putting the column names back
imp_X_train.columns = X_train[numerical_cols].columns
imp_X_valid.columns = X_valid[numerical_cols].columns

In [20]:
# Note the index values did not change
imp_X_valid.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,45.0,55.0,4388.0,5.0,7.0,1930.0,1950.0,0.0,116.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0
1,20.0,80.0,9600.0,7.0,6.0,1973.0,1973.0,320.0,916.0,0.0,...,528.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2007.0
2,30.0,50.0,5330.0,4.0,7.0,1940.0,1950.0,0.0,280.0,0.0,...,0.0,164.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,2009.0
3,70.0,57.0,9906.0,4.0,4.0,1925.0,1950.0,0.0,0.0,0.0,...,210.0,0.0,172.0,60.0,0.0,0.0,0.0,0.0,9.0,2006.0
4,120.0,64.0,5587.0,8.0,5.0,2008.0,2008.0,186.0,1480.0,0.0,...,482.0,162.0,53.0,0.0,153.0,0.0,0.0,0.0,11.0,2008.0


In [21]:
# Resetting the index
imp_X_train.index = X_train[numerical_cols].index
imp_X_valid.index = X_valid[numerical_cols].index

imp_X_valid.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
128,45.0,55.0,4388.0,5.0,7.0,1930.0,1950.0,0.0,116.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2007.0
456,20.0,80.0,9600.0,7.0,6.0,1973.0,1973.0,320.0,916.0,0.0,...,528.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2007.0
1324,30.0,50.0,5330.0,4.0,7.0,1940.0,1950.0,0.0,280.0,0.0,...,0.0,164.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,2009.0
218,70.0,57.0,9906.0,4.0,4.0,1925.0,1950.0,0.0,0.0,0.0,...,210.0,0.0,172.0,60.0,0.0,0.0,0.0,0.0,9.0,2006.0
1182,120.0,64.0,5587.0,8.0,5.0,2008.0,2008.0,186.0,1480.0,0.0,...,482.0,162.0,53.0,0.0,153.0,0.0,0.0,0.0,11.0,2008.0


In [22]:
# Make copies of the categorical columns to label encode
enc_X_train = X_train[categorical_cols].copy()
enc_X_valid = X_valid[categorical_cols].copy()

In [23]:
# Label Encoding the categories

for col in categorical_cols:
    enc_X_train[col] = pd.DataFrame(encoder.fit_transform(X_train[col]))
    enc_X_valid[col] = pd.DataFrame(encoder.transform(X_valid[col]))

ValueError: y contains previously unseen labels: 'Po'

In [24]:
# Choosing between good cols and bad ones
good_cols = [col for col in categorical_cols if set(X_train[col]) == set(X_valid[col])]

len(good_cols)

11

In [26]:
X_train[categorical_cols].nunique()

MSZoning         5
Street           2
LotShape         4
LandContour      4
Utilities        2
LotConfig        5
LandSlope        3
Condition1       9
Condition2       8
BldgType         5
HouseStyle       8
RoofStyle        6
RoofMatl         8
ExterQual        4
ExterCond        4
Foundation       6
Heating          6
HeatingQC        4
CentralAir       2
KitchenQual      4
Functional       7
PavedDrive       3
SaleType         9
SaleCondition    6
dtype: int64

In [27]:
X[good_cols].head()

Unnamed: 0_level_0,MSZoning,LandContour,LotConfig,LandSlope,BldgType,HouseStyle,ExterQual,CentralAir,KitchenQual,PavedDrive,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,RL,Lvl,Inside,Gtl,1Fam,2Story,Gd,Y,Gd,Y,Normal
2,RL,Lvl,FR2,Gtl,1Fam,1Story,TA,Y,TA,Y,Normal
3,RL,Lvl,Inside,Gtl,1Fam,2Story,Gd,Y,Gd,Y,Normal
4,RL,Lvl,Corner,Gtl,1Fam,2Story,TA,Y,Gd,Y,Abnorml
5,RL,Lvl,FR2,Gtl,1Fam,2Story,Gd,Y,Gd,Y,Normal


In [28]:
X[good_cols].isnull().sum()

MSZoning         0
LandContour      0
LotConfig        0
LandSlope        0
BldgType         0
HouseStyle       0
ExterQual        0
CentralAir       0
KitchenQual      0
PavedDrive       0
SaleCondition    0
dtype: int64

In [29]:
X[good_cols].nunique()

MSZoning         5
LandContour      4
LotConfig        5
LandSlope        3
BldgType         5
HouseStyle       8
ExterQual        4
CentralAir       2
KitchenQual      4
PavedDrive       3
SaleCondition    6
dtype: int64

In [30]:
new_cols = numerical_cols + good_cols

new_X = X[new_cols]

new_X.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,LandContour,LotConfig,LandSlope,BldgType,HouseStyle,ExterQual,CentralAir,KitchenQual,PavedDrive,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,Lvl,Inside,Gtl,1Fam,2Story,Gd,Y,Gd,Y,Normal
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,Lvl,FR2,Gtl,1Fam,1Story,TA,Y,TA,Y,Normal
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,Lvl,Inside,Gtl,1Fam,2Story,Gd,Y,Gd,Y,Normal
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,Lvl,Corner,Gtl,1Fam,2Story,TA,Y,Gd,Y,Abnorml
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,Lvl,FR2,Gtl,1Fam,2Story,Gd,Y,Gd,Y,Normal


In [31]:
n_X_t, n_X_v, n_y_t, n_y_v = train_test_split(new_X, y, test_size=0.2, random_state=11)

n_X_t.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,LandContour,LotConfig,LandSlope,BldgType,HouseStyle,ExterQual,CentralAir,KitchenQual,PavedDrive,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
288,20,,8125,4,4,1971,1971,0.0,614,0,...,Lvl,Corner,Gtl,1Fam,1Story,TA,Y,TA,Y,Normal
727,20,,21695,6,9,1988,2007,260.0,808,0,...,Lvl,Corner,Gtl,1Fam,1Story,Gd,Y,Gd,Y,Normal
1005,120,43.0,3182,7,5,2005,2006,16.0,16,0,...,Lvl,Inside,Gtl,TwnhsE,1Story,Gd,Y,Gd,Y,Normal
490,180,21.0,1526,4,8,1970,2002,0.0,515,0,...,Lvl,Inside,Gtl,Twnhs,SFoyer,TA,Y,Gd,Y,Normal
1199,20,70.0,9100,7,5,2001,2001,0.0,0,0,...,Lvl,Inside,Gtl,1Fam,1Story,Gd,Y,Gd,Y,Normal


In [32]:
new_X.isnull().sum()

MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
MSZoning           0
LandContour        0
LotConfig          0
LandSlope          0
BldgType           0
HouseStyle         0
ExterQual          0
CentralAir         0
KitchenQual        0
PavedDrive         0
SaleCondition      0
dtype: int64

In [33]:
new_X.dtypes

MSSubClass         int64
LotFrontage      float64
LotArea            int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageYrBlt      float64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
MoSold             int64
YrSold             int64
MSZoning          object
LandContour       object
LotConfig         object
LandSlope         object


In [34]:

enc_new_t = n_X_t.copy()
enc_new_v = n_X_v.copy()

for col in good_cols:
    enc_new_t[col] = pd.DataFrame(encoder.fit_transform(n_X_t[col]))
    enc_new_v[col] = pd.DataFrame(encoder.transform(n_X_v[col]))

enc_new_t.head(15)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,LandContour,LotConfig,LandSlope,BldgType,HouseStyle,ExterQual,CentralAir,KitchenQual,PavedDrive,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
288,20,,8125,4,4,1971,1971,0.0,614,0,...,2.0,4.0,2.0,0.0,2.0,1.0,0.0,1.0,2.0,4.0
727,20,,21695,6,9,1988,2007,260.0,808,0,...,3.0,4.0,0.0,0.0,2.0,3.0,1.0,3.0,0.0,4.0
1005,120,43.0,3182,7,5,2005,2006,16.0,16,0,...,3.0,4.0,0.0,1.0,0.0,3.0,1.0,3.0,2.0,4.0
490,180,21.0,1526,4,8,1970,2002,0.0,515,0,...,3.0,4.0,0.0,2.0,2.0,3.0,1.0,3.0,2.0,3.0
1199,20,70.0,9100,7,5,2001,2001,0.0,0,0,...,,,,,,,,,,
1036,20,,11500,4,3,1957,1957,0.0,0,0,...,3.0,4.0,0.0,4.0,2.0,2.0,1.0,2.0,2.0,4.0
1453,180,35.0,3675,5,5,2005,2005,80.0,547,0,...,,,,,,,,,,
32,20,,8544,5,6,1966,2006,0.0,0,0,...,3.0,0.0,0.0,4.0,2.0,2.0,1.0,2.0,2.0,4.0
1126,20,60.0,10434,4,5,1955,1955,0.0,0,0,...,3.0,4.0,0.0,0.0,2.0,3.0,1.0,3.0,2.0,4.0
485,20,,7758,5,7,1962,2001,0.0,588,0,...,3.0,4.0,0.0,0.0,2.0,2.0,1.0,0.0,2.0,4.0


In [35]:
n_X_t[good_cols].head(10)

Unnamed: 0_level_0,MSZoning,LandContour,LotConfig,LandSlope,BldgType,HouseStyle,ExterQual,CentralAir,KitchenQual,PavedDrive,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
288,RL,Lvl,Corner,Gtl,1Fam,1Story,TA,Y,TA,Y,Normal
727,RL,Lvl,Corner,Gtl,1Fam,1Story,Gd,Y,Gd,Y,Normal
1005,RL,Lvl,Inside,Gtl,TwnhsE,1Story,Gd,Y,Gd,Y,Normal
490,RM,Lvl,Inside,Gtl,Twnhs,SFoyer,TA,Y,Gd,Y,Normal
1199,RL,Lvl,Inside,Gtl,1Fam,1Story,Gd,Y,Gd,Y,Normal
1036,RL,Lvl,CulDSac,Gtl,1Fam,1Story,TA,N,TA,N,Normal
1453,RM,Lvl,Inside,Gtl,TwnhsE,SLvl,TA,Y,TA,Y,Normal
32,RL,Lvl,CulDSac,Gtl,1Fam,1Story,TA,Y,Gd,Y,Normal
1126,RL,Lvl,Inside,Gtl,1Fam,1Story,TA,Y,Fa,Y,Normal
485,RL,Lvl,Corner,Gtl,1Fam,1Story,TA,Y,Gd,Y,Normal


In [37]:
enc_new_t[good_cols].head(10)

Unnamed: 0_level_0,MSZoning,LandContour,LotConfig,LandSlope,BldgType,HouseStyle,ExterQual,CentralAir,KitchenQual,PavedDrive,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
288,3.0,2.0,4.0,2.0,0.0,2.0,1.0,0.0,1.0,2.0,4.0
727,4.0,3.0,4.0,0.0,0.0,2.0,3.0,1.0,3.0,0.0,4.0
1005,4.0,3.0,4.0,0.0,1.0,0.0,3.0,1.0,3.0,2.0,4.0
490,3.0,3.0,4.0,0.0,2.0,2.0,3.0,1.0,3.0,2.0,3.0
1199,,,,,,,,,,,
1036,3.0,3.0,4.0,0.0,4.0,2.0,2.0,1.0,2.0,2.0,4.0
1453,,,,,,,,,,,
32,3.0,3.0,0.0,0.0,4.0,2.0,2.0,1.0,2.0,2.0,4.0
1126,3.0,3.0,4.0,0.0,0.0,2.0,3.0,1.0,3.0,2.0,4.0
485,3.0,3.0,4.0,0.0,0.0,2.0,2.0,1.0,0.0,2.0,4.0


In [40]:
model = DecisionTreeRegressor
model.fit(enc_new_t, y_train)

TypeError: fit() missing 1 required positional argument: 'y'