# import all required libraries

In [1]:
import xgboost
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# Load data from csv file

In [2]:
housing_train = pd.read_csv('train.csv')
housing_test = pd.read_csv('test.csv')

# Feature Engineering

## Drop low corelated features

In [3]:
housing_train = housing_train.drop(['PoolArea'], axis=1)
housing_test = housing_test.drop(['PoolArea'], axis=1)
housing_train = housing_train.drop(['MoSold'], axis=1)
housing_test = housing_test.drop(['MoSold'], axis=1)
housing_train = housing_train.drop(['3SsnPorch'], axis=1)
housing_test = housing_test.drop(['3SsnPorch'], axis=1)
housing_train = housing_train.drop(['BsmtFinSF2'], axis=1)
housing_test = housing_test.drop(['BsmtFinSF2'], axis=1)
housing_train = housing_train.drop(['BsmtHalfBath'], axis=1)
housing_test = housing_test.drop(['BsmtHalfBath'], axis=1)
housing_train = housing_train.drop(['MiscVal'], axis=1)
housing_test = housing_test.drop(['MiscVal'], axis=1)
housing_train = housing_train.drop(['Id'], axis=1)
housing_test = housing_test.drop(['Id'], axis=1)
housing_train = housing_train.drop(['LowQualFinSF'], axis=1)
housing_test = housing_test.drop(['LowQualFinSF'], axis=1)
housing_train = housing_train.drop(['YrSold'], axis=1)
housing_test = housing_test.drop(['YrSold'], axis=1)
housing_train = housing_train.drop(['MSSubClass'], axis=1)
housing_test = housing_test.drop(['MSSubClass'], axis=1)
housing_train = housing_train.drop(['EnclosedPorch'], axis=1)
housing_test = housing_test.drop(['EnclosedPorch'], axis=1)
housing_train = housing_train.drop(['KitchenAbvGr'], axis=1)
housing_test = housing_test.drop(['KitchenAbvGr'], axis=1)
housing_test = housing_test.drop(['Utilities'], axis=1)
housing_train = housing_train.drop(['Utilities'], axis=1)
housing_test = housing_test.drop(['LandSlope'], axis=1)
housing_train = housing_train.drop(['LandSlope'], axis=1)
housing_test = housing_test.drop(['Street'], axis=1)
housing_train = housing_train.drop(['Street'], axis=1)

## Drop label column 

In [4]:
Y_label =  housing_train['SalePrice'].copy()
housing_train = housing_train.drop(['SalePrice'], axis=1)
all_data = pd.concat([housing_train, housing_test])

## Fill empty data

In [5]:
for column in ['Alley',
                       'BsmtQual',
                       'BsmtCond',
                       'BsmtExposure',
                       'BsmtFinType1',
                       'BsmtFinType2',
                       'FireplaceQu',
                       'GarageType',
                       'GarageFinish',
                       'GarageQual',
                       'GarageCond',
                       'PoolQC',
                       'Fence',
                       'MiscFeature'
                ]:
    all_data[column]=all_data[column].fillna('None')
    
for column in ['MSZoning',
                       'Exterior1st',
                       'Exterior2nd',
                       'MasVnrType',
                       'Electrical',
                       'KitchenQual',
                       'Functional',
                       'SaleType',
                       'GarageYrBlt']:
    all_data[column]=all_data[column].fillna(all_data[column].mode()[0])
for column in ['LotFrontage',
                        'MasVnrArea',
                        'BsmtFinSF1',
                        'BsmtUnfSF',
                        'TotalBsmtSF',
                        'BsmtFullBath',
                        'GarageCars',
                        'GarageArea']:
    all_data[column]=all_data[column].fillna(all_data[column].median())

## Handle categorical data using Ordinal encoder and One hot encoder

In [6]:
column_transformer = ColumnTransformer(transformers=[("OE", OrdinalEncoder(categories=[
                                                                                             ['Po','Fa','TA','Gd','Ex'], #ExterQual
                                                                                             ['Po','Fa','TA','Gd','Ex'], #ExterCond
                                                                                             ['None','Po','Fa','TA','Gd','Ex'], #BsmtQual
                                                                                             ['None','Po','Fa','TA','Gd','Ex'], #BsmtCond
                                                                                             ['None','No','Mn','Av','Gd'], #BsmtExposure
                                                                                             ['None','Unf','LwQ','Rec','BLQ','ALQ','GLQ'], #BsmtFinType1
                                                                                             ['None','Unf','LwQ','Rec','BLQ','ALQ','GLQ'], #BsmtFinType2
                                                                                             ['Po','Fa','TA','Gd','Ex'], #HeatingQC
                                                                                             ['N','Y'],#CentralAir
                                                                                             ['Po','Fa','TA','Gd','Ex'], #KitchenQual
                                                                                             ['None','Po','Fa','TA','Gd','Ex'],#FireplaceQu
                                                                                             ['None','Unf','RFn','Fin'], #GarageFinish
                                                                                             ['None','Po','Fa','TA','Gd','Ex'], #GarageQual
                                                                                             ['N','P','Y'], #PavedDrive
                                                                                             ['None','Po','Fa','TA','Gd','Ex'], #PoolQC
                                                                                      ]
                                                                           ), 
                                                       ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2','HeatingQC', 'CentralAir', 'KitchenQual',  'FireplaceQu', 'GarageFinish', 'GarageQual', 'PavedDrive','PoolQC']
                                                     ),
                                                    ('OHE', OneHotEncoder(sparse_output=False, drop='first'), ['Functional','MSZoning','Alley','LotShape','LandContour','LotConfig','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','GarageCond','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Foundation','Heating','Electrical','MiscFeature','SaleType','SaleCondition','Fence', 'GarageType'])], remainder='passthrough')
column_transformer.set_output(transform='pandas')
all_data_trans = column_transformer.fit_transform(all_data)
all_data_trans = all_data_trans.loc[:,~all_data_trans.columns.duplicated()].copy()

# Divide the data into training and testing data

In [7]:
housing_train = all_data_trans.iloc[:1460,:]
housing_test = all_data_trans.iloc[1460:,:]
X_test=housing_train.iloc[:200]
Y_test = Y_label.iloc[:200]


# Using Standrd Scaler

In [8]:
scaler = StandardScaler()
housing_train_tr = scaler.fit_transform(housing_train)
housing_test_tr = scaler.transform(housing_test)
prepared_data =scaler.fit_transform(X_test)

# Predictions using different ML algorithms

## 1. Linear Regression

In [9]:
model_linear_regression = LinearRegression()
model_linear_regression.fit(housing_train_tr,Y_label)
Y_pred_linear_regression = model_linear_regression.predict(prepared_data)
r2=r2_score(Y_test, Y_pred_linear_regression)
print('MAE:', mean_absolute_error(Y_test, Y_pred_linear_regression))
print('RMSE:', np.sqrt(mean_squared_error(Y_test, Y_pred_linear_regression)))
print('R2:', r2)

MAE: 20245.053424429196
RMSE: 42539.22131505131
R2: 0.6902588520345366


## 2. Decision Tree regressor

In [10]:
model_decision_tree = DecisionTreeRegressor()
model_decision_tree.fit(housing_train_tr,Y_label)
Y_pred_decision_tree = model_decision_tree.predict(prepared_data)
r2=r2_score(Y_test, Y_pred_decision_tree)
print('MAE:', mean_absolute_error(Y_test, Y_pred_decision_tree))
print('RMSE:', np.sqrt(mean_squared_error(Y_test, Y_pred_decision_tree)))
print('R2:', r2)

MAE: 3268.575
RMSE: 10434.057018964388
R2: 0.9813651353244505


## 3. Random Forest Regressor

In [11]:
model_random_forest = RandomForestRegressor(n_estimators=1000)
model_random_forest.fit(housing_train_tr,Y_label)
Y_pred_random_forest = model_random_forest.predict(prepared_data)
r2=r2_score(Y_test, Y_pred_random_forest)
print('MAE:', mean_absolute_error(Y_test, Y_pred_random_forest))
print('RMSE:', np.sqrt(mean_squared_error(Y_test, Y_pred_random_forest)))
print('R2:', r2)

MAE: 8361.4688
RMSE: 11921.568477943463
R2: 0.9756731079822598


## 4. XGBRegressor

In [12]:
model_xgb=xgboost.XGBRegressor(learning_rate=0.12)
model_xgb.fit(housing_train_tr,Y_label)
Y_pred_xgb = model_xgb.predict(prepared_data)

In [13]:
r2=r2_score(Y_test, Y_pred_xgb)
print('MAE:', mean_absolute_error(Y_test, Y_pred_xgb))
print('RMSE:', np.sqrt(mean_squared_error(Y_test, Y_pred_xgb)))
print('R2:', r2)

MAE: 7005.5425390625
RMSE: 9422.234761373058
R2: 0.9848040563824535
