In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

np.random.seed(0)

In [2]:
data = pd.read_csv('train.csv', index_col='Id')

data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
y = data.SalePrice

X = data.drop(['SalePrice'], axis=1)

In [4]:
# Numerical features
numerical_X = X.select_dtypes(exclude='object')

# List of numerical features
numerical_cols = [col for col in numerical_X.columns]

numerical_X.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,2008
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,2007
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,2008
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,2006
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,2008


In [5]:
# Categorical features 
categorical_X = X.select_dtypes('object')

categorical_X.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [6]:
categorical_X.nunique()

MSZoning          5
Street            2
Alley             2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        8
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          8
Exterior1st      15
Exterior2nd      16
MasVnrType        4
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        7
FireplaceQu       5
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
PoolQC            3
Fence             4
MiscFeature       4
SaleType          9
SaleCondition     6
dtype: int64

In [7]:
# Select only the low cardinality columns with no missing entries
categorical_cols = [col for col in categorical_X.columns if categorical_X[col].nunique() <=10 and categorical_X[col].isnull().sum() ==0]

In [8]:
categories = X[categorical_cols]

categories.head()

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,BldgType,...,ExterCond,Foundation,Heating,HeatingQC,CentralAir,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
2,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Feedr,Norm,1Fam,...,TA,CBlock,GasA,Ex,Y,TA,Typ,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
4,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,1Fam,...,TA,BrkTil,GasA,Gd,Y,Gd,Typ,Y,WD,Abnorml
5,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,Norm,Norm,1Fam,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal


In [9]:
new_X = pd.concat([numerical_X, categories], axis=1)

new_X.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,ExterCond,Foundation,Heating,HeatingQC,CentralAir,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,TA,CBlock,GasA,Ex,Y,TA,Typ,Y,WD,Normal
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,TA,BrkTil,GasA,Gd,Y,Gd,Typ,Y,WD,Abnorml
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal


In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(new_X, y, test_size=0.2, random_state=11)

In [11]:
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,ExterCond,Foundation,Heating,HeatingQC,CentralAir,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
288,20,,8125,4,4,1971,1971,0.0,614,0,...,TA,CBlock,GasA,TA,Y,TA,Typ,Y,WD,Normal
727,20,,21695,6,9,1988,2007,260.0,808,0,...,Gd,CBlock,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
1005,120,43.0,3182,7,5,2005,2006,16.0,16,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
490,180,21.0,1526,4,8,1970,2002,0.0,515,0,...,Gd,CBlock,GasA,TA,Y,Gd,Typ,Y,WD,Normal
1199,20,70.0,9100,7,5,2001,2001,0.0,0,0,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal


In [12]:
# Impute both training and validation sets
imputer = SimpleImputer(strategy='constant')

imp_X_train = pd.DataFrame(imputer.fit_transform(X_train[numerical_cols]))
imp_X_valid = pd.DataFrame(imputer.transform(X_valid[numerical_cols]))

imp_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,20.0,0.0,8125.0,4.0,4.0,1971.0,1971.0,0.0,614.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2006.0
1,20.0,0.0,21695.0,6.0,9.0,1988.0,2007.0,260.0,808.0,0.0,...,540.0,292.0,44.0,0.0,182.0,0.0,0.0,0.0,12.0,2009.0
2,120.0,43.0,3182.0,7.0,5.0,2005.0,2006.0,16.0,16.0,0.0,...,457.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2009.0
3,180.0,21.0,1526.0,4.0,8.0,1970.0,2002.0,0.0,515.0,0.0,...,286.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2009.0
4,20.0,70.0,9100.0,7.0,5.0,2001.0,2001.0,0.0,0.0,0.0,...,573.0,356.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2009.0


In [13]:
imp_X_train.columns = X_train[numerical_cols].columns
imp_X_valid.columns = X_valid[numerical_cols].columns

imp_X_train.index = X_train.index
imp_X_valid.index = X_valid.index

In [14]:
imp_X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
288,20.0,0.0,8125.0,4.0,4.0,1971.0,1971.0,0.0,614.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2006.0
727,20.0,0.0,21695.0,6.0,9.0,1988.0,2007.0,260.0,808.0,0.0,...,540.0,292.0,44.0,0.0,182.0,0.0,0.0,0.0,12.0,2009.0
1005,120.0,43.0,3182.0,7.0,5.0,2005.0,2006.0,16.0,16.0,0.0,...,457.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2009.0
490,180.0,21.0,1526.0,4.0,8.0,1970.0,2002.0,0.0,515.0,0.0,...,286.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2009.0
1199,20.0,70.0,9100.0,7.0,5.0,2001.0,2001.0,0.0,0.0,0.0,...,573.0,356.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2009.0


In [15]:
# Label Encoding the training and validation sets

enc_X_train = X_train[categorical_cols].copy()
enc_X_valid = X_valid[categorical_cols].copy()
# enc_X_train.head()

encoder = LabelEncoder()

In [42]:
for col in categorical_cols:
    enc_X_train[col] = encoder.fit_transform(enc_X_train[col])
    enc_X_valid[col] = encoder.transform(enc_X_valid[col])
    
    
enc_X_train.head()

ValueError: y contains previously unseen labels: 'Gd'

In [18]:
enc_X_valid.nunique()

MSZoning         5
Street           1
LotShape         3
LandContour      4
Utilities        1
LotConfig        5
LandSlope        3
Condition1       8
Condition2       3
BldgType         5
HouseStyle       8
RoofStyle        5
RoofMatl         4
ExterQual        4
ExterCond        4
Foundation       5
Heating          4
HeatingQC        5
CentralAir       2
KitchenQual      4
Functional       5
PavedDrive       3
SaleType         8
SaleCondition    6
dtype: int64

In [19]:
# Figuring the encoding error above

# Function that loops through X_valid 
# And prints all the unique values in each col

def all_uniques(df):
    columns = df.columns
    
    for col in columns:
        unique = df[col].unique()
        print(f'Column: {col} \t\t {unique}')

In [20]:
# All uniques in tha validation set
all_uniques(X_valid[categorical_cols])

Column: MSZoning 		 ['RM' 'RL' 'FV' 'C (all)' 'RH']
Column: Street 		 ['Pave']
Column: LotShape 		 ['IR1' 'Reg' 'IR2']
Column: LandContour 		 ['Bnk' 'Lvl' 'HLS' 'Low']
Column: Utilities 		 ['AllPub']
Column: LotConfig 		 ['Inside' 'Corner' 'FR2' 'FR3' 'CulDSac']
Column: LandSlope 		 ['Gtl' 'Mod' 'Sev']
Column: Condition1 		 ['Feedr' 'Norm' 'PosN' 'RRAn' 'Artery' 'PosA' 'RRAe' 'RRNe']
Column: Condition2 		 ['Norm' 'Feedr' 'RRNn']
Column: BldgType 		 ['1Fam' 'TwnhsE' 'Twnhs' '2fmCon' 'Duplex']
Column: HouseStyle 		 ['1.5Unf' '1Story' '2Story' '1.5Fin' '2.5Unf' 'SLvl' 'SFoyer' '2.5Fin']
Column: RoofStyle 		 ['Gable' 'Hip' 'Gambrel' 'Flat' 'Mansard']
Column: RoofMatl 		 ['CompShg' 'WdShngl' 'WdShake' 'Tar&Grv']
Column: ExterQual 		 ['TA' 'Fa' 'Ex' 'Gd']
Column: ExterCond 		 ['Gd' 'TA' 'Fa' 'Po']
Column: Foundation 		 ['BrkTil' 'CBlock' 'PConc' 'Slab' 'Stone']
Column: Heating 		 ['GasA' 'GasW' 'Wall' 'Grav']
Column: HeatingQC 		 ['Ex' 'Fa' 'Gd' 'TA' 'Po']
Column: CentralAir 		 ['Y' 'N']
Col

In [21]:
# All uniques in the training set
all_uniques(X_train[categorical_cols])

Column: MSZoning 		 ['RL' 'RM' 'FV' 'C (all)' 'RH']
Column: Street 		 ['Pave' 'Grvl']
Column: LotShape 		 ['IR1' 'Reg' 'IR2' 'IR3']
Column: LandContour 		 ['Lvl' 'HLS' 'Bnk' 'Low']
Column: Utilities 		 ['AllPub' 'NoSeWa']
Column: LotConfig 		 ['Corner' 'Inside' 'CulDSac' 'FR2' 'FR3']
Column: LandSlope 		 ['Gtl' 'Mod' 'Sev']
Column: Condition1 		 ['Norm' 'Artery' 'RRAe' 'PosN' 'Feedr' 'RRAn' 'RRNn' 'RRNe' 'PosA']
Column: Condition2 		 ['Norm' 'PosN' 'Feedr' 'RRAe' 'RRAn' 'Artery' 'RRNn' 'PosA']
Column: BldgType 		 ['1Fam' 'TwnhsE' 'Twnhs' 'Duplex' '2fmCon']
Column: HouseStyle 		 ['1Story' 'SFoyer' 'SLvl' '2Story' '1.5Fin' '1.5Unf' '2.5Fin' '2.5Unf']
Column: RoofStyle 		 ['Gable' 'Hip' 'Mansard' 'Gambrel' 'Flat' 'Shed']
Column: RoofMatl 		 ['CompShg' 'Membran' 'Tar&Grv' 'Roll' 'ClyTile' 'WdShngl' 'Metal'
 'WdShake']
Column: ExterQual 		 ['TA' 'Gd' 'Ex' 'Fa']
Column: ExterCond 		 ['TA' 'Gd' 'Fa' 'Ex']
Column: Foundation 		 ['CBlock' 'PConc' 'Slab' 'Wood' 'BrkTil' 'Stone']
Column: Heating 

In [39]:
def the_comp(df1, df2):
    columns_1 = df1.columns
    columns_2 = df2.columns
    
    print('\nTraining dataset: \n')
    for col in columns_1:
        unique_1 = df1[col].unique()
        print(f'{col}: \t\t\t {unique_1}')
        
    print('\nThe validation set:\n ')
    for col in columns_2:
        unique_2 = df2[col].unique()
        print(f'{col}: \t\t\t {unique_2}')
        
    if len(unique_2) > len(unique_1):
        print('Yeaah')
        

In [40]:
the_comp(X_train[categorical_cols], X_valid[categorical_cols])


Training dataset: 

MSZoning: 			 ['RL' 'RM' 'FV' 'C (all)' 'RH']
Street: 			 ['Pave' 'Grvl']
LotShape: 			 ['IR1' 'Reg' 'IR2' 'IR3']
LandContour: 			 ['Lvl' 'HLS' 'Bnk' 'Low']
Utilities: 			 ['AllPub' 'NoSeWa']
LotConfig: 			 ['Corner' 'Inside' 'CulDSac' 'FR2' 'FR3']
LandSlope: 			 ['Gtl' 'Mod' 'Sev']
Condition1: 			 ['Norm' 'Artery' 'RRAe' 'PosN' 'Feedr' 'RRAn' 'RRNn' 'RRNe' 'PosA']
Condition2: 			 ['Norm' 'PosN' 'Feedr' 'RRAe' 'RRAn' 'Artery' 'RRNn' 'PosA']
BldgType: 			 ['1Fam' 'TwnhsE' 'Twnhs' 'Duplex' '2fmCon']
HouseStyle: 			 ['1Story' 'SFoyer' 'SLvl' '2Story' '1.5Fin' '1.5Unf' '2.5Fin' '2.5Unf']
RoofStyle: 			 ['Gable' 'Hip' 'Mansard' 'Gambrel' 'Flat' 'Shed']
RoofMatl: 			 ['CompShg' 'Membran' 'Tar&Grv' 'Roll' 'ClyTile' 'WdShngl' 'Metal'
 'WdShake']
ExterQual: 			 ['TA' 'Gd' 'Ex' 'Fa']
ExterCond: 			 ['TA' 'Gd' 'Fa' 'Ex']
Foundation: 			 ['CBlock' 'PConc' 'Slab' 'Wood' 'BrkTil' 'Stone']
Heating: 			 ['GasA' 'GasW' 'Grav' 'Wall' 'Floor' 'OthW']
HeatingQC: 			 ['TA' 'Ex' 'Gd' 'F