In [1]:
from Library.DataAnalyzer import DataAnalyzer
from Library.DataPreprocessor import DataPreprocessor
from Library.DataVisualizer import DataVisualizer
from Library.RegressionEngine import RegressionEngine

from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

import pandas as pd
import numpy as np

In [2]:
# Project helper objects used by the cells below.
preprocessor = DataPreprocessor()
analyzer = DataAnalyzer()
visualizer = DataVisualizer()
trainer = RegressionEngine()

# One seed for every estimator that accepts `random_state`, so that
# Restart & Run All reproduces the same metrics table.
SEED = 42

lr = LinearRegression()
r = Ridge()
l = Lasso()
E = ElasticNet()
ETR = ExtraTreeRegressor(random_state=SEED)  # chooses split thresholds at random
GBR = GradientBoostingRegressor(random_state=SEED)
KNR = KNeighborsRegressor()
XGB = XGBRegressor(random_state=SEED)

# Add the algorithms you define to the array
myAlgorithmArray = [lr, r, l, E, ETR, GBR, KNR, XGB]

# Read File

In [3]:
# Load the training data from a path relative to the notebook's working directory.
df_train = pd.read_csv("Data/train.csv")

# Exploratory Data Analysis

In [4]:
# (rows, columns) of the raw training frame.
df_train.shape

(1460, 81)

In [5]:
# Column names available in the raw data.
df_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
# Per-column dtype and non-null count; a quick way to spot columns with missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [8]:
# Number of missing values per column (one integer per column, as shown in the output).
analyzer.calculateNullValuesSum(df_train)

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [9]:
# Peek at five random rows. No random_state is passed, so the rows shown differ between runs.
df_train.sample(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
26,27,20,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,134800
596,597,70,RM,60.0,3600,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,10,2006,WD,Normal,114504
1324,1325,20,RL,75.0,9986,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2007,New,Partial,147000
163,164,45,RL,55.0,5500,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,4,2007,WD,Normal,103200
619,620,60,RL,85.0,12244,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2008,WD,Normal,305000


In [10]:
# List (column, percentage) pairs for columns dominated by a single value; 85 is the
# percentage threshold. NOTE(review): exact threshold semantics live in DataAnalyzer — confirm there.
analyzer.findColumnsByTopValuePercentage(df_train, 85)

[('Street', 99.58904109589041),
 ('LandContour', 89.7945205479452),
 ('Utilities', 99.93150684931507),
 ('LandSlope', 94.65753424657535),
 ('Condition1', 86.3013698630137),
 ('Condition2', 98.97260273972603),
 ('RoofMatl', 98.21917808219179),
 ('ExterCond', 87.8082191780822),
 ('BsmtCond', 92.12930428671821),
 ('BsmtFinType2', 88.32630098452883),
 ('BsmtFinSF2', 88.56164383561645),
 ('Heating', 97.80821917808218),
 ('CentralAir', 93.4931506849315),
 ('Electrical', 91.43248800548321),
 ('LowQualFinSF', 98.21917808219179),
 ('BsmtHalfBath', 94.3835616438356),
 ('KitchenAbvGr', 95.34246575342465),
 ('Functional', 93.15068493150685),
 ('GarageQual', 95.06889050036258),
 ('GarageCond', 96.15663524292965),
 ('PavedDrive', 91.78082191780823),
 ('EnclosedPorch', 85.75342465753425),
 ('3SsnPorch', 98.35616438356163),
 ('ScreenPorch', 92.05479452054794),
 ('PoolArea', 99.52054794520548),
 ('MiscFeature', 90.74074074074075),
 ('MiscVal', 96.43835616438356),
 ('SaleType', 86.78082191780821)]

In [11]:
# List (column, missing-percentage) pairs for columns that are mostly empty; 80 is the
# percentage threshold. NOTE(review): exact threshold semantics live in DataAnalyzer — confirm there.
analyzer.findColumnsWithMissingValuesAboveThreshold(df_train, 80)

[('Alley', 93.76712328767123),
 ('PoolQC', 99.52054794520548),
 ('Fence', 80.75342465753424),
 ('MiscFeature', 96.30136986301369)]

In [12]:
# Columns flagged by the two checks above: either one value dominates the column
# or most of its entries are missing. The order matches the original call.
low_information_columns = [
    "Street",
    "Utilities",
    "Condition2",
    "RoofMatl",
    "Heating",
    "LowQualFinSF",
    "GarageCond",
    "3SsnPorch",
    "PoolArea",
    "MiscVal",
    "Alley",
    "PoolQC",
    "Fence",
    "MiscFeature",
    "LandSlope",
    "BsmtCond",
    "CentralAir",
    "Electrical",
    "BsmtHalfBath",
    "KitchenAbvGr",
    "Functional",
    "GarageQual",
    "PavedDrive",
    "ScreenPorch",
    "LandContour",
    "Condition1",
    "ExterCond",
    "BsmtFinType2",
    "BsmtFinSF2",
    "EnclosedPorch",
    "SaleType",
]
df_train = preprocessor.dropColumnsFromDataFrame(df_train, low_information_columns)

In [13]:
# "Age" is the number of years between the last remodel (YearRemodAdd) and 2022.
# NOTE(review): the reference year is hard-coded; YrSold may be a better anchor — confirm intent.
years_since_remodel = 2022 - df_train["YearRemodAdd"]
df_train = preprocessor.addColumnWithValue(df_train, "Age", years_since_remodel)

In [14]:
# Create the "isRestorated" column with a placeholder value of 0; the next cell
# overwrites every entry with "Yes"/"No".
df_train = preprocessor.addColumnWithValue(df_train, "isRestorated", 0)

In [15]:
# A house counts as restored ("Yes") when its remodel year differs from its build year,
# and as not restored ("No") when the two years are equal.
remodel_year_differs = df_train["YearBuilt"] != df_train["YearRemodAdd"]
df_train["isRestorated"] = np.where(remodel_year_differs, "Yes", "No")

In [16]:
# The two year columns have already been used to build "Age" and "isRestorated";
# they are removed here together with the "Id" column.
consumed_columns = ["YearBuilt", "YearRemodAdd", "Id"]
df_train = preprocessor.dropColumnsFromDataFrame(df_train, consumed_columns)

In [17]:
# Spot-check five random rows after the column drops and the two new features.
df_train.sample(5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,BldgType,HouseStyle,OverallQual,...,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,MoSold,YrSold,SaleCondition,SalePrice,Age,isRestorated
666,60,RL,,18450,IR1,Inside,NAmes,1Fam,2Story,6,...,2,596,0,265,8,2007,Abnorml,129000,43,Yes
1085,85,RL,73.0,9069,Reg,Inside,SawyerW,1Fam,SFoyer,6,...,2,564,120,0,4,2010,Normal,147000,30,No
1457,70,RL,66.0,9042,Reg,Inside,Crawfor,1Fam,2Story,7,...,1,252,0,60,5,2010,Normal,266500,16,Yes
756,60,RL,68.0,10769,IR1,Inside,CollgCr,1Fam,2Story,8,...,2,578,144,105,4,2009,Normal,212000,15,No
1265,160,FV,35.0,3735,Reg,FR3,Somerst,TwnhsE,2Story,7,...,2,506,0,34,3,2006,Normal,183900,23,No


In [18]:
# Fill the missing entries of each listed column with that column's most frequent value.
#
# `Series.mode()` returns a Series holding *every* tied mode, so `.head(1)` keeps only
# the first one. For a column with a single mode this passes exactly the same object
# as before; for a multi-modal column it avoids handing several replacement values
# to the helper.
# NOTE(review): the positional arguments are presumably
# (frame, column, condition, condition value, value if true, value if false) —
# confirm against DataPreprocessor.updateColumnValuesBasedOnCondition.
mode_imputed_columns = [
    "LotFrontage",
    "BsmtQual",
    "BsmtExposure",
    "BsmtFinType1",
    "GarageType",
    "GarageYrBlt",
    "GarageFinish",
    "MasVnrType",
    "MasVnrArea",
]

for column in mode_imputed_columns:
    df_train = preprocessor.updateColumnValuesBasedOnCondition(
        df_train,
        column,
        "isnull",
        None,
        df_train[column].mode().head(1),
        df_train[column],
    )

In [19]:
# Remove the FireplaceQu column instead of imputing it.
# NOTE(review): the reason is not shown in this notebook — presumably its share of missing values; confirm.
df_train = preprocessor.dropColumnsFromDataFrame(df_train, ["FireplaceQu"])

In [20]:
# Display the cleaned frame (pandas truncates the rendering of large frames).
df_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,BldgType,HouseStyle,OverallQual,...,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,MoSold,YrSold,SaleCondition,SalePrice,Age,isRestorated
0,60,RL,65.0,8450,Reg,Inside,CollgCr,1Fam,2Story,7,...,2,548,0,61,2,2008,Normal,208500,19,No
1,20,RL,80.0,9600,Reg,FR2,Veenker,1Fam,1Story,6,...,2,460,298,0,5,2007,Normal,181500,46,No
2,60,RL,68.0,11250,IR1,Inside,CollgCr,1Fam,2Story,7,...,2,608,0,42,9,2008,Normal,223500,20,Yes
3,70,RL,60.0,9550,IR1,Corner,Crawfor,1Fam,2Story,7,...,3,642,0,35,2,2006,Abnorml,140000,52,Yes
4,60,RL,84.0,14260,IR1,FR2,NoRidge,1Fam,2Story,8,...,3,836,192,84,12,2008,Normal,250000,22,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Reg,Inside,Gilbert,1Fam,2Story,6,...,2,460,0,40,8,2007,Normal,175000,22,Yes
1456,20,RL,85.0,13175,Reg,Inside,NWAmes,1Fam,1Story,6,...,2,500,349,0,2,2010,Normal,210000,34,Yes
1457,70,RL,66.0,9042,Reg,Inside,Crawfor,1Fam,2Story,7,...,1,252,0,60,5,2010,Normal,266500,16,Yes
1458,20,RL,68.0,9717,Reg,Inside,NAmes,1Fam,1Story,5,...,1,240,366,0,4,2010,Normal,142125,26,Yes


In [21]:
# Columns that remain after cleaning and feature engineering.
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape',
       'LotConfig', 'Neighborhood', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold', 'SaleCondition',
       'SalePrice', 'Age', 'isRestorated'],
      dtype='object')

In [22]:
# `x` is a second name for the same DataFrame object; no copy is made here.
x = df_train

In [23]:
# One-hot encode the non-numeric columns. drop_first=True omits the first level of
# each categorical column. The result is a new frame, so df_train is left unchanged.
x = pd.get_dummies(x, drop_first=True)

In [24]:
# Rank every column by its correlation with SalePrice, strongest first.
# NOTE(review): the output suggests absolute values are used — confirm in DataAnalyzer.
analyzer.sortCorrelationsWithColumn(x, "SalePrice")

SalePrice            1.000000
OverallQual          0.790982
GrLivArea            0.708624
GarageCars           0.640409
GarageArea           0.623431
                       ...   
Foundation_Stone     0.012103
LotConfig_FR2        0.006859
BldgType_TwnhsE      0.003804
Foundation_Wood      0.002711
RoofStyle_Mansard    0.000308
Name: SalePrice, Length: 149, dtype: float64

In [25]:
# The five columns at the bottom of the correlation ranking printed above.
weakest_columns = [
    "Foundation_Stone",
    "LotConfig_FR2",
    "BldgType_TwnhsE",
    "Foundation_Wood",
    "RoofStyle_Mansard",
]
x = preprocessor.dropColumnsFromDataFrame(x, weakest_columns)

In [26]:
# Re-rank the remaining columns by correlation with SalePrice after the drop above.
analyzer.sortCorrelationsWithColumn(x, "SalePrice")

SalePrice               1.000000
OverallQual             0.790982
GrLivArea               0.708624
GarageCars              0.640409
GarageArea              0.623431
                          ...   
LotConfig_FR3           0.018186
Exterior1st_Plywood     0.017719
Exterior2nd_Stone       0.016754
SaleCondition_Alloca    0.015525
Neighborhood_SawyerW    0.014560
Name: SalePrice, Length: 144, dtype: float64

In [27]:
# Absolute Pearson correlation of every column with SalePrice, strongest first.
saleprice_correlation = x.corr()["SalePrice"]
a = saleprice_correlation.abs().sort_values(ascending=False)

In [28]:
# Show the 50 strongest correlations; the feature list in the next cell is picked from this ranking.
a.head(50)

SalePrice                1.000000
OverallQual              0.790982
GrLivArea                0.708624
GarageCars               0.640409
GarageArea               0.623431
TotalBsmtSF              0.613581
1stFlrSF                 0.605852
ExterQual_TA             0.589044
FullBath                 0.560664
TotRmsAbvGrd             0.533723
KitchenQual_TA           0.519298
GarageFinish_Unf         0.513906
Age                      0.507101
BsmtQual_TA              0.498545
Foundation_PConc         0.497734
MasVnrArea               0.472614
Fireplaces               0.466929
ExterQual_Gd             0.452466
BsmtFinType1_GLQ         0.434597
Neighborhood_NridgHt     0.402149
GarageYrBlt              0.397778
BsmtFinSF1               0.386420
MasVnrType_None          0.367456
GarageType_Detchd        0.354141
SaleCondition_Partial    0.352060
Foundation_CBlock        0.343263
MasVnrType_Stone         0.330476
Neighborhood_NoRidge     0.330424
LotFrontage              0.329220
WoodDeckSF    

In [29]:
# Keep the 25 features most strongly correlated with SalePrice, listed in the
# order of the ranking printed by the previous cell.
selected_features = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "ExterQual_TA",
    "FullBath",
    "TotRmsAbvGrd",
    "KitchenQual_TA",
    "GarageFinish_Unf",
    "Age",
    "BsmtQual_TA",
    "Foundation_PConc",
    "MasVnrArea",
    "Fireplaces",
    "ExterQual_Gd",
    "BsmtFinType1_GLQ",
    "Neighborhood_NridgHt",
    "GarageYrBlt",
    "BsmtFinSF1",
    "MasVnrType_None",
    "GarageType_Detchd",
    "SaleCondition_Partial",
    "Foundation_CBlock",
    # Next-ranked candidates, currently excluded; uncomment to widen the feature set:
    # "MasVnrType_Stone",
    # "Neighborhood_NoRidge",
    # "LotFrontage",
    # "WoodDeckSF",
    # "KitchenQual_Gd",
    # "BsmtExposure_No",
    # "2ndFlrSF",
    # "OpenPorchSF",
    # "HeatingQC_TA",
    # "BsmtExposure_Gd",
    # "Exterior2nd_VinylSd",
    # "Exterior1st_VinylSd",
    # "MSZoning_RM",
    # "HalfBath",
    # "LotShape_Reg",
    # "LotArea",
    # "MSZoning_RL",
    # "HouseStyle_2Story",
    # "RoofStyle_Hip",
    # "GarageType_BuiltIn",
    # "BsmtQual_Gd",
    # "GarageType_Attchd",
    # "BsmtFullBath",
    # "RoofStyle_Gable"
]
x = x[selected_features]

In [30]:
# Target variable. The double brackets make `y` a one-column DataFrame rather than a Series.
y = df_train[["SalePrice"]]

In [31]:
# Display the final feature matrix passed to the models.
x

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,ExterQual_TA,FullBath,TotRmsAbvGrd,KitchenQual_TA,...,Fireplaces,ExterQual_Gd,BsmtFinType1_GLQ,Neighborhood_NridgHt,GarageYrBlt,BsmtFinSF1,MasVnrType_None,GarageType_Detchd,SaleCondition_Partial,Foundation_CBlock
0,7,1710,2,548,856,856,0,2,8,0,...,0,1,1,0,2003.0,706,0,0,0,0
1,6,1262,2,460,1262,1262,1,2,6,1,...,1,0,0,0,1976.0,978,1,0,0,1
2,7,1786,2,608,920,920,0,2,6,0,...,1,1,1,0,2001.0,486,0,0,0,0
3,7,1717,3,642,756,961,1,1,7,0,...,1,0,0,0,1998.0,216,1,1,0,0
4,8,2198,3,836,1145,1145,0,2,9,0,...,1,1,1,0,2000.0,655,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,1647,2,460,953,953,1,2,7,1,...,1,0,0,0,1999.0,0,1,0,0,0
1456,6,2073,2,500,1542,2073,1,2,7,1,...,2,0,0,0,1978.0,790,0,0,0,1
1457,7,2340,1,252,1152,1188,0,2,9,0,...,2,0,1,0,1941.0,275,1,0,0,0
1458,5,1078,1,240,1078,1078,1,1,5,0,...,0,0,1,0,1950.0,49,1,0,0,1


In [32]:
# Fit every estimator in myAlgorithmArray on (x, y) and print one row of R², RMSE and MAE per algorithm.
# NOTE(review): random_state=42 presumably seeds the train/test split inside RegressionEngine — confirm.
trainer.fitAndPredictAll(myAlgorithmArray, x, y, random_state=42)

Algorithm                    R_Squared     RMSE      MAE
-------------------------  -----------  -------  -------
LinearRegression              0.831296  35972.4  23605.3
Ridge                         0.831205  35982.1  23591
Lasso                         0.831299  35972.1  23603.4
ElasticNet                    0.814894  37680.5  23352.4
ExtraTreeRegressor            0.754175  43423    28222.1
GradientBoostingRegressor     0.881071  30203.1  19156.6
KNeighborsRegressor           0.798103  39352.5  24832.6
XGBRegressor                  0.87596   30845.2  20379.5
