# import all required libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Load data from csv file

In [2]:
housing_train = pd.read_csv('train.csv')
housing_test = pd.read_csv('test.csv')
housing_train.drop('Id', axis=1, inplace=True)
housing_test.drop('Id', axis=1, inplace=True)

# Feature Engineering

## Fill empty data

### Fill empty train set

In [3]:
housing_train["GarageYrBlt"].fillna(0, inplace=True)
housing_train["MasVnrArea"].fillna(0, inplace=True)
housing_train["LotFrontage"].fillna(0, inplace=True)
housing_train["Alley"].fillna('None', inplace=True)
housing_train["MasVnrType"].fillna('None', inplace=True)
housing_train["BsmtQual"].fillna('None', inplace=True)
housing_train["BsmtCond"].fillna('None', inplace=True)
housing_train["BsmtExposure"].fillna('None', inplace=True)
housing_train["BsmtFinType1"].fillna('None', inplace=True)
housing_train["BsmtFinType2"].fillna('None', inplace=True)
housing_train["Electrical"].fillna(housing_train["Electrical"].mode()[0], inplace=True)
housing_train["FireplaceQu"].fillna('None', inplace=True)
housing_train["GarageType"].fillna('None', inplace=True)
housing_train["GarageFinish"].fillna('None', inplace=True)
housing_train["GarageQual"].fillna('None', inplace=True)
housing_train["GarageCond"].fillna('None', inplace=True)
housing_train["PoolQC"].fillna('None', inplace=True)
housing_train["Fence"].fillna('None', inplace=True)
housing_train["MiscFeature"].fillna('None', inplace=True)
print(housing_train['MSZoning'].value_counts())

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64


### Fill empty test set

In [4]:
housing_test["GarageYrBlt"].fillna(0, inplace=True)
housing_test["MasVnrArea"].fillna(0, inplace=True)
housing_test["LotFrontage"].fillna(0, inplace=True)
housing_test["Alley"].fillna('None', inplace=True)
housing_test["MasVnrType"].fillna('None', inplace=True)
housing_test["BsmtQual"].fillna('None', inplace=True)
housing_test["BsmtCond"].fillna('None', inplace=True)
housing_test["BsmtExposure"].fillna('None', inplace=True)
housing_test["BsmtFinType1"].fillna('None', inplace=True)
housing_test["BsmtFinType2"].fillna('None', inplace=True)
housing_test["Electrical"].fillna(housing_test["Electrical"].mode()[0], inplace=True)
housing_test["FireplaceQu"].fillna('None', inplace=True)
housing_test["GarageType"].fillna('None', inplace=True)
housing_test["GarageFinish"].fillna('None', inplace=True)
housing_test["GarageQual"].fillna('None', inplace=True)
housing_test["GarageCond"].fillna('None', inplace=True)
housing_test["PoolQC"].fillna('None', inplace=True)
housing_test["Fence"].fillna('None', inplace=True)
housing_test["MiscFeature"].fillna('None', inplace=True)
housing_test["MSZoning"].fillna(housing_test["MSZoning"].mode()[0], inplace=True)
housing_test["Utilities"].fillna(housing_test["Utilities"].mode()[0], inplace=True)
housing_test["Exterior1st"].fillna(housing_test["Exterior1st"].mode()[0], inplace=True)
housing_test["Exterior2nd"].fillna(housing_test["Exterior2nd"].mode()[0], inplace=True)
housing_test["BsmtFinSF2"].fillna(housing_test["BsmtFinSF2"].mean(), inplace=True)
housing_test["BsmtUnfSF"].fillna(housing_test["BsmtUnfSF"].mean(), inplace=True)
housing_test["TotalBsmtSF"].fillna(housing_test["TotalBsmtSF"].mean(), inplace=True)
housing_test["BsmtFinSF1"].fillna(housing_test["BsmtFinSF1"].mean(), inplace=True)
housing_test["BsmtFullBath"].fillna(housing_test["BsmtFullBath"].mode()[0], inplace=True)
housing_test["BsmtHalfBath"].fillna(housing_test["BsmtHalfBath"].mode()[0], inplace=True)
housing_test["KitchenQual"].fillna(housing_test["KitchenQual"].mode()[0], inplace=True)
housing_test["GarageArea"].fillna(0, inplace=True)
housing_test["GarageCars"].fillna(0, inplace=True)
housing_test["Functional"].fillna(housing_test["Functional"].mode()[0], inplace=True)
housing_test["SaleType"].fillna(housing_test["SaleType"].mode()[0], inplace=True)
print(housing_test['SaleType'].value_counts())
print(housing_test['SaleType'].isnull().sum())
housing_test.info()

WD       1259
New       117
COD        44
ConLD      17
CWD         8
Oth         4
ConLI       4
Con         3
ConLw       3
Name: SaleType, dtype: int64
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1459 non-null   object 
 2   LotFrontage    1459 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          1459 non-null   object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1459 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-nul

## Handle categorical data

### Ordinal encoder and One hot encoder

In [5]:
column_transformer = ColumnTransformer(transformers=[("OE", OrdinalEncoder(categories=[
                                                                                         ['ELO','NoSeWa','NoSewr','AllPub'],
                                                                                         ['Sev', 'Mod','Gtl'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','No','Mn','Av','Gd'],
                                                                                         ['None','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
                                                                                         ['None','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['N','Y'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Detchd','CarPort','BuiltIn','Basment','Attchd','2Types'],
                                                                                         ['None','Unf','RFn','Fin'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['N','P','Y'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','MnWw','GdWo','MnPrv','GdPrv']
                                                                                      ]
                                                                           ), 
                                                       ['Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'PavedDrive', 'PoolQC','Fence']
                                                     ),
                                                    ('OHE', OneHotEncoder(sparse_output=False, drop='first'), ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','LotConfig','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Foundation','Heating','Electrical','MiscFeature','SaleType','SaleCondition'])], remainder='passthrough')
column_transformer.set_output(transform='pandas')
housing_train_trans = column_transformer.fit_transform(housing_train)
housing_test_trans = column_transformer.fit_transform(housing_test)

In [6]:
final_housing_train = housing_train_trans.drop(['remainder__SalePrice'], axis=1)
Y_label =  housing_train_trans['remainder__SalePrice']

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
my_pipeline = Pipeline([('scaler', StandardScaler())])

In [8]:
housing_num_tr = my_pipeline.fit_transform(final_housing_train)

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
model = DecisionTreeRegressor()
#model = RandomForestRegressor()
#model = LinearRegression()
model.fit(housing_num_tr,Y_label)

In [10]:
some_data=final_housing_train.iloc[:100]
some_label=Y_label.iloc[:100]

In [11]:
prepared_data =my_pipeline.transform(some_data)

In [12]:
model.predict(prepared_data)

array([208500., 181500., 223500., 140000., 250000., 143000., 307000.,
       200000., 129900., 118000., 129500., 345000., 144000., 279500.,
       157000., 132000., 149000.,  90000., 159000., 139000., 325300.,
       139400., 230000., 129900., 154000., 256300., 134800., 306000.,
       207500.,  68500.,  40000., 149350., 179900., 165500., 277500.,
       309000., 145000., 153000., 109000.,  82000., 160000., 170000.,
       144000., 130250., 141000., 319900., 239686., 249700., 113000.,
       127000., 177000., 114500., 110000., 385000., 130000., 180500.,
       172500., 196500., 438780., 124900., 158000., 101000., 202500.,
       140000., 219500., 317000., 180000., 226000.,  80000., 225000.,
       244000., 129500., 185000., 144900., 107400.,  91000., 135750.,
       127000., 136500., 110000., 193500., 153500., 245000., 126500.,
       168500., 260000., 174000., 164500.,  85000., 123600., 109900.,
        98600., 163500., 133900., 204750., 185000., 214000.,  94750.,
        83000., 1289

In [13]:
some_label

0     208500
1     181500
2     223500
3     140000
4     250000
       ...  
95    185000
96    214000
97     94750
98     83000
99    128950
Name: remainder__SalePrice, Length: 100, dtype: int64

In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np
housing_predictions = model.predict(housing_num_tr)
lin_mse = mean_squared_error(Y_label,housing_predictions)
mse=np.sqrt(lin_mse)

In [15]:
mse

0.0

In [16]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr,Y_label,scoring="neg_mean_squared_error",cv=10)
rsme_scores=np.sqrt(-scores)

In [17]:
rsme_scores

array([37249.92074709, 39261.2398147 , 37094.09387708, 42934.48014676,
       40039.45290371, 28205.46332316, 32761.93092433, 33302.62787741,
       57333.96128387, 37526.1067149 ])

In [18]:
print("mean:", rsme_scores.mean())
print("std dev:", rsme_scores.std())

mean: 38570.9277613008
std dev: 7400.575268340803


In [19]:
housing_test_trans.shape

(1459, 198)

In [20]:
housing_train_trans.shape

(1460, 213)