### House Prices - Advanced Regression Techniques
- #### Predict sales prices and practice feature engineering, RFs, and gradient boosting

#### Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

#### With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

### Import required modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from xgboost import XGBRegressor
from sklearn.linear_model import ARDRegression, LassoCV, LinearRegression,RANSACRegressor, Ridge,BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor,RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [3]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [4]:
import warnings
warnings.filterwarnings('ignore')

### Import the train and test data

In [5]:
train_df = pd.read_csv('train.csv')

In [6]:
test_df = pd.read_csv('test.csv')

### Read the Data

In [7]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


#### Saleprice is the dependent variable y while other variables are independent variables

### Let us check out the data to see if there are null values and calculate the percentage of null values for each variable

In [8]:
def null_values(df):
    total_null = df.isnull().sum().sort_values(ascending = False)
    count_null = df.isnull().count().sort_values(ascending = False)
    percent_null = round(total_null/count_null * 100,2)
    missing_df = pd.concat([total_null, percent_null], axis = 1, keys=['Total','Percent'])
    return missing_df

In [9]:
null_df = null_values(train_df)

In [10]:
null_df[null_df['Percent'] >= 20]

Unnamed: 0,Total,Percent
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
FireplaceQu,690,47.26


### PoolQC, MiscFeature, Alley, Fence and FireplaceQu has high percentage of missing values so we drop them

In [11]:
train_df.drop(['PoolQC','MiscFeature','Alley','Fence','Fence'], axis = 1, inplace = True)
test_df.drop(['PoolQC','MiscFeature','Alley','Fence','Fence'], axis = 1, inplace = True)

### Fill the missing values of the numeric variables with the median for both test and train data.

In [12]:
median_train = train_df.median()
median_test = test_df.median()

In [13]:
train_df.fillna(train_df.median(), inplace = True)
test_df.fillna(test_df.median(), inplace = True)

In [14]:
null_values(train_df)

Unnamed: 0,Total,Percent
FireplaceQu,690,47.26
GarageCond,81,5.55
GarageType,81,5.55
GarageFinish,81,5.55
GarageQual,81,5.55
...,...,...
BsmtFinSF2,0,0.00
BsmtUnfSF,0,0.00
TotalBsmtSF,0,0.00
MSSubClass,0,0.00


In [15]:
train_df.fillna(method = 'bfill', inplace = True)
test_df.fillna(method = 'bfill', inplace = True)

In [16]:
null_values(train_df)

Unnamed: 0,Total,Percent
FireplaceQu,2,0.14
Id,0,0.00
HalfBath,0,0.00
Fireplaces,0,0.00
Functional,0,0.00
...,...,...
MasVnrArea,0,0.00
MasVnrType,0,0.00
Exterior2nd,0,0.00
Exterior1st,0,0.00


In [17]:
train_df = train_df.dropna()

In [18]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


### Separate the dependent variable from the independent variable. The dependent variable y is SalePrice

In [19]:
X = train_df
Y = X.pop('SalePrice')
X1 = X.copy()
Y1 = Y.copy()

### Encode the string variables that are categorical for both train and test data

In [27]:
# Label encoding for categoricals
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X.dtypes == int

In [28]:
# Label encoding for categoricals for test data
for colname in test_df.select_dtypes("object"):
    test_df[colname], _ = test_df[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = test_df.dtypes == int

In [29]:
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,0,80.0,11622,0,0,0,0,0,...,0,0,0,120,0,0,6,2010,0,0
1,1462,20,1,81.0,14267,0,1,0,0,1,...,36,0,0,0,0,12500,6,2010,0,0
2,1463,60,1,74.0,13830,0,1,0,0,0,...,34,0,0,0,0,0,3,2010,0,0
3,1464,60,1,78.0,9978,0,1,0,0,0,...,36,0,0,0,0,0,6,2010,0,0
4,1465,120,1,43.0,5005,0,1,1,0,0,...,82,0,0,144,0,0,1,2010,0,0


#### Creating mutual information scores to determine the correlation of the independent variable with the dependent variable

def make_mi_scores(X, Y, discrete_features):
    mi_scores = mutual_info_regression(X, Y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, Y, discrete_features)
mi_scores[::3]

plt.figure(figsize = (15,12))
matrix = np.triu(updated_train_df.corr())
sns.heatmap(updated_train_df.corr(), annot=True, mask = matrix)

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")


plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores[::3])

### Check for Multicollinearity
- Multicollinearity occurs when predictor variables in a regression model are correlated. This correlation is a problem because predictor variables should be independent. If the correlation between variables is high, it can cause problems when we fit the model and interpret the results. When we have multicollinearity in the linear model, the coefficients that the model suggests are unreliable.

- There are different ways of detecting (or testing) multi-collinearity, one such way is the Variation Inflation Factor.

- Variance Inflation factor: Variance inflation factors measures the inflation in the variances of the regression parameter estimates due to collinearity that exists among the predictors. It is a measure of how much the variance of the estimated regression coefficient βk is "inflated" by the existence of correlation among the predictor variables in the model.

- General Rule of thumb: If VIF is 1 then there is no correlation between the kth predictor and the remaining predictor variables, and hence the variance of β̂k is not inflated at all. Whereas if VIF exceeds 5 or is close to exceeding 5, we say there is moderate VIF and if it is 10 or exceeding 10, it shows signs of high multi-collinearity.

In [30]:
#Calculating the variance inflation factor for each variable. We remove any variable with VIF greater than 5

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [31]:
# Function to check VIF
def checking_vif(train):
    vif = pd.DataFrame()
    vif["feature"] = train.columns

    # Calculating VIF for each feature
    vif["VIF"] = [
        variance_inflation_factor(train.values, i) for i in range(len(train.columns))
    ]
    return vif


print(checking_vif(X))

          feature           VIF
0              Id      4.210693
1      MSSubClass     14.889202
2        MSZoning      1.635868
3     LotFrontage     19.442787
4         LotArea      3.577002
..            ...           ...
71        MiscVal      1.126065
72         MoSold      6.829391
73         YrSold  43017.999492
74       SaleType      1.207007
75  SaleCondition      1.399052

[76 rows x 2 columns]


In [32]:
X_train_df = X.drop('YrSold', axis = 1)

#check VIF again
print(checking_vif(X_train_df))

          feature        VIF
0              Id   4.206284
1      MSSubClass  14.873563
2        MSZoning   1.635474
3     LotFrontage  19.424597
4         LotArea   3.563499
..            ...        ...
70       PoolArea   1.199411
71        MiscVal   1.123704
72         MoSold   6.828649
73       SaleType   1.206592
74  SaleCondition   1.385889

[75 rows x 2 columns]


In [33]:
new_v = checking_vif(X_train_df)

In [34]:
new_v = new_v[new_v['VIF']<=5].reset_index(drop=True)

In [35]:
new_v.groupby('feature')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022F80DB65C0>

In [36]:
new_v.set_index('feature', inplace=True)

In [37]:
new_v

Unnamed: 0_level_0,VIF
feature,Unnamed: 1_level_1
Id,4.206284
MSZoning,1.635474
LotArea,3.563499
Street,1.171446
LotShape,2.084463
LandContour,1.628604
Utilities,1.10028
LotConfig,1.592607
LandSlope,1.879693
Neighborhood,4.809147


In [38]:
new_v1 = new_v.transpose()

In [39]:
new_v1.columns

Index(['Id', 'MSZoning', 'LotArea', 'Street', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'MasVnrType', 'MasVnrArea', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath',
       'HalfBath', 'KitchenQual', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'SaleType', 'SaleCondition'],
      dtype='object', name='feature')

In [40]:
X.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodD

In [41]:
X = X.filter(new_v1.columns)
# Add the intercept term
X = sm.add_constant(X)

In [42]:
y = Y

Split data into train and test data

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [44]:
X_train.shape

(1020, 51)

Now, we will create the linear regression model as the VIF is less than 5 for all the independent variables, and we can assume that multicollinearity has been removed between the variables.

Preporcess the data

In [45]:
scale = MinMaxScaler()

In [46]:

data_scaled=pd.DataFrame(scale.fit_transform(X_train), columns=X_train.columns)

data_scaled.head()

Unnamed: 0,const,Id,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SaleType,SaleCondition
0,0.0,0.092656,0.0,0.042534,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.114352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.995196,0.0,0.035991,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.03734,0.082267,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.717914,0.0,0.068971,0.0,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.249708,0.197441,0.0,0.0,0.0,0.0,0.0,0.125,0.4
3,0.0,0.474262,0.0,0.09458,0.0,0.333333,0.0,0.0,0.5,0.0,...,0.0,0.445741,0.091408,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.298559,0.0,0.043782,0.0,0.666667,0.0,0.0,0.75,0.0,...,0.0,0.184364,0.111517,0.0,0.0,0.0,0.0,0.0,0.75,0.0


In [47]:
X_train_new = data_scaled.copy()

In [48]:
X_train_new.shape ,y_train.shape

((1020, 51), (1020,))

### **Creating linear regression model using statsmodels OLS**

In [49]:
# Create the model
model1 = sm.OLS(y_train, X_train).fit()

# Get the model summary
model1.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.644
Model:,OLS,Adj. R-squared:,0.625
Method:,Least Squares,F-statistic:,35.0
Date:,"Sun, 28 Aug 2022",Prob (F-statistic):,1.7900000000000002e-181
Time:,20:44:39,Log-Likelihood:,-12416.0
No. Observations:,1020,AIC:,24930.0
Df Residuals:,969,BIC:,25190.0
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.342e+05,6868.817,19.532,0.000,1.21e+05,1.48e+05
Id,-1.5889,3.651,-0.435,0.664,-8.754,5.576
MSZoning,-3231.5475,2018.754,-1.601,0.110,-7193.182,730.087
LotArea,0.9099,0.175,5.189,0.000,0.566,1.254
Street,-5.299e+04,2.6e+04,-2.035,0.042,-1.04e+05,-1887.206
LotShape,7367.8869,3035.932,2.427,0.015,1410.128,1.33e+04
LandContour,9612.3413,2796.639,3.437,0.001,4124.174,1.51e+04
Utilities,-1.002e+05,5.12e+04,-1.958,0.051,-2.01e+05,228.408
LotConfig,633.0298,1601.996,0.395,0.693,-2510.752,3776.811

0,1,2,3
Omnibus:,256.764,Durbin-Watson:,1.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1741.295
Skew:,0.974,Prob(JB):,0.0
Kurtosis:,9.097,Cond. No.,513000.0


If P>|t| ia less than 0.05, the variable is significant

Drop the columns with pvalues greater than 0.05 and run the model again

In [50]:
X = X.drop(columns = {'Id', 'MSZoning', 'Street','LotConfig','LandSlope', 'Condition1','Condition2', 'ExterCond','Exterior1st', 'BsmtCond','BsmtFinType2','CentralAir',
                      'Electrical','BsmtFullBath','Heating','BsmtHalfBath','KitchenQual','Functional','GarageType','GarageCond',
                       'EnclosedPorch','3SsnPorch','PoolArea','MiscVal'})
X = sm.add_constant(X)

# Splitting the data in 70:30 ratio of train to test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30 , random_state = 1)

# Create the model
model2 = sm.OLS(y_train, X_train).fit()

# Get the model summary
model2.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.641
Model:,OLS,Adj. R-squared:,0.631
Method:,Least Squares,F-statistic:,68.15
Date:,"Sun, 28 Aug 2022",Prob (F-statistic):,4.1700000000000005e-200
Time:,20:44:43,Log-Likelihood:,-12429.0
No. Observations:,1020,AIC:,24910.0
Df Residuals:,993,BIC:,25050.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.418e+05,5599.875,25.329,0.000,1.31e+05,1.53e+05
LotArea,1.0091,0.165,6.121,0.000,0.686,1.333
LotShape,6716.7765,2956.339,2.272,0.023,915.387,1.25e+04
LandContour,4663.2655,2566.503,1.817,0.070,-373.126,9699.657
Utilities,-1.148e+05,4.92e+04,-2.334,0.020,-2.11e+05,-1.83e+04
Neighborhood,-1095.3412,267.383,-4.097,0.000,-1620.041,-570.641
HouseStyle,-2557.5832,1260.160,-2.030,0.043,-5030.465,-84.701
RoofStyle,1.552e+04,3005.272,5.164,0.000,9620.331,2.14e+04
RoofMatl,-1.695e+04,3150.211,-5.379,0.000,-2.31e+04,-1.08e+04

0,1,2,3
Omnibus:,259.425,Durbin-Watson:,1.915
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2157.052
Skew:,0.923,Prob(JB):,0.0
Kurtosis:,9.881,Cond. No.,489000.0


In [51]:
# Defining the Random forest regressor
rf = RandomForestRegressor(n_estimators = 200, max_depth = 4, min_samples_split = 2)

In [53]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1458 entries, 0 to 1457
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1458 non-null   int64  
 1   MSSubClass     1458 non-null   int64  
 2   MSZoning       1458 non-null   object 
 3   LotFrontage    1458 non-null   float64
 4   LotArea        1458 non-null   int64  
 5   Street         1458 non-null   object 
 6   LotShape       1458 non-null   object 
 7   LandContour    1458 non-null   object 
 8   Utilities      1458 non-null   object 
 9   LotConfig      1458 non-null   object 
 10  LandSlope      1458 non-null   object 
 11  Neighborhood   1458 non-null   object 
 12  Condition1     1458 non-null   object 
 13  Condition2     1458 non-null   object 
 14  BldgType       1458 non-null   object 
 15  HouseStyle     1458 non-null   object 
 16  OverallQual    1458 non-null   int64  
 17  OverallCond    1458 non-null   int64  
 18  YearBuil

In [54]:
# Label encoding for categoricals
for colname in X1.select_dtypes("object"):
    X1[colname], _ = X1[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X1.dtypes == int

In [55]:
X1.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,0,65.0,8450,0,0,0,0,0,...,61,0,0,0,0,0,2,2008,0,0
1,2,20,0,80.0,9600,0,0,0,0,1,...,0,0,0,0,0,0,5,2007,0,0
2,3,60,0,68.0,11250,0,1,0,0,0,...,42,0,0,0,0,0,9,2008,0,0
3,4,70,0,60.0,9550,0,1,0,0,2,...,35,272,0,0,0,0,2,2006,0,1
4,5,60,0,84.0,14260,0,1,0,0,1,...,84,0,0,0,0,0,12,2008,0,0


In [56]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size = 0.30 , random_state = 1)

In [57]:
# Hyperparameters, we have randomly choosen them for now but we can tune these hyperparameters and get the best model. 

# Fitting the model
rf.fit(X_train1,y_train1)

In [None]:
X_train1.shape, X_test1.shape

In [58]:
rf.score(X_train1,y_train1)

0.8681940153154202

check performance of the model on train and test data

In [60]:
# RMSE
def rmse(predictions, targets):
    return np.sqrt(((targets - predictions) ** 2).mean())


# MAPE
def mape(predictions, targets):
    return np.mean(np.abs((targets - predictions)) / targets) * 100


# MAE
def mae(predictions, targets):
    return np.mean(np.abs((targets - predictions)))


# Model Performance on test and train data
def model_pref(olsmodel, x_train, x_test, y_train, y_test):

    # Insample Prediction
    y_pred_train = olsmodel.predict(x_train)
    y_observed_train = y_train

    # Prediction on test data
    y_pred_test = olsmodel.predict(x_test)
    y_observed_test = y_test

    print(
        pd.DataFrame(
            {
                "Data": ["Train", "Test"],
                "RMSE": [
                    rmse(y_pred_train, y_observed_train),
                    rmse(y_pred_test, y_observed_test),
                ],
                "MAE": [
                    mae(y_pred_train, y_observed_train),
                    mae(y_pred_test, y_observed_test),
                ],
                "MAPE": [
                    mape(y_pred_train, y_observed_train),
                    mape(y_pred_test, y_observed_test),
                ],
            }
        )
    )


# Checking model performance
model_pref(model2, X_train, X_test, y_train, y_test)  

    Data          RMSE           MAE       MAPE
0  Train  47405.135602  33502.234776  20.135764
1   Test  50225.665092  35065.965716  21.922875


In [61]:
model_pref(rf, X_train1, X_test1, y_train1, y_test1)  

    Data          RMSE           MAE       MAPE
0  Train  28718.047860  20024.807153  12.393776
1   Test  30418.172872  20934.914151  13.676543


In [None]:
from xgboost import XGBRegressor
from sklearn.linear_model import ARDRegression, LassoCV, LinearRegression,RANSACRegressor, Ridge,BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor,RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
dict_regressors = { 'XGB': XGBRegressor(),
              'ARDRegression':ARDRegression(),
              'LassoCV':LassoCV(),
              'LinearRegression':LinearRegression(),
              'RANSACRegressor':RANSACRegressor(),
              'RandomForestRegressor':RandomForestRegressor(),
              'DecisionTreeRegressor':DecisionTreeRegressor(),
              'ExtraTreesRegressor':ExtraTreesRegressor(),
              'GradientBoostingRegressor':GradientBoostingRegressor()     
                   
             }
              
              
    
    
    
    
    
    


In [None]:
def evaluation_matrix(model, Xval, yval):
   
    yhat_probs = model.predict(Xval)
    error = (yhat_probs - yval)**2
    mse = error.mean()
    score = model.score(X_val,y_val)
    return mse, score

In [None]:
def batch_regressor(X_train, y_train, X_val, y_val, no_regressor = 9, verbose = True):

    dict_models = {}
    for regressor_name,regressor in list(dict_regressors.items())[:no_regressor]:
        regressor.fit(X_train, y_train)
        mse,score = evaluation_matrix(regressor, X_val, y_val)
        dict_models[regressor_name] = {'model': regressor, 'mse':mse, 'score':score}
        
    return dict_models

In [None]:
def display_dict_models(dict_models, sort_by='score'):

    cls = [key for key in dict_models.keys()]
    mse = [dict_models[key]['mse'] for key in cls]
    score = [dict_models[key]['score'] for key in cls]
    
   
    df_ = pd.DataFrame(data=np.zeros(shape=(len(cls),3)), columns = ['regressor', 'mse', 'score'])

    for x in range(0,len(cls)):
        df_.loc[x, 'regressor'] = cls[x]
        df_.loc[x, 'mse'] = mse[x]
        df_.loc[x, 'score'] = score[x]
    display(df_.sort_values(by=sort_by, ascending=False))
    return df_

In [None]:
models = batch_regressor(X_train, y_train, X_val, y_val)

In [None]:
models

In [None]:
df = display_dict_models(models)

In [None]:
test_df.head()

In [None]:
model_gb = DecisionTreeRegressor()

In [None]:
test_df['SalePrice'] = model_gb.fit(X_train1,y_train1).predict(test_df)

In [None]:
predicted = test_df[['Id','SalePrice']]

In [None]:
predicted['SalePrice']= round(predicted['SalePrice'],2)

In [None]:
predicted.head()

In [None]:
predicted.to_csv('submission1.csv', index_label=False, index=False)

In [63]:
model_rf = RandomForestRegressor(n_estimators = 200, max_depth = 4, min_samples_split = 2)

In [64]:
test_df['SalePrice'] = model_rf.fit(X_train1,y_train1).predict(test_df)

In [65]:
predicted_rf = test_df[['Id','SalePrice']]

In [67]:
predicted_rf['SalePrice']= round(predicted_rf['SalePrice'],2)

In [68]:
predicted_rf.head()

Unnamed: 0,Id,SalePrice
0,1461,126286.1
1,1462,154837.44
2,1463,175548.36
3,1464,178882.77
4,1465,225171.52
