# import all required libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Load data from csv file

In [2]:
housing_train = pd.read_csv('train.csv')
housing_test = pd.read_csv('test.csv')
Y_label =  housing_train['SalePrice'].copy()
housing_train = housing_train.drop(['SalePrice'], axis=1)
all_data = pd.concat([housing_train, housing_test])

# Feature Engineering

## Fill empty data

In [3]:
all_data["GarageYrBlt"].fillna(0, inplace=True)
all_data["MasVnrArea"].fillna(0, inplace=True)
all_data["LotFrontage"].fillna(0, inplace=True)
all_data["Alley"].fillna('None', inplace=True)
all_data["MasVnrType"].fillna('None', inplace=True)
all_data["BsmtQual"].fillna('None', inplace=True)
all_data["BsmtCond"].fillna('None', inplace=True)
all_data["BsmtExposure"].fillna('None', inplace=True)
all_data["BsmtFinType1"].fillna('None', inplace=True)
all_data["BsmtFinType2"].fillna('None', inplace=True)
all_data["Electrical"].fillna(housing_train["Electrical"].mode()[0], inplace=True)
all_data["FireplaceQu"].fillna('None', inplace=True)
all_data["GarageType"].fillna('None', inplace=True)
all_data["GarageFinish"].fillna('None', inplace=True)
all_data["GarageQual"].fillna('None', inplace=True)
all_data["GarageCond"].fillna('None', inplace=True)
all_data["PoolQC"].fillna('None', inplace=True)
all_data["Fence"].fillna('None', inplace=True)
all_data["MiscFeature"].fillna('None', inplace=True)
all_data["MSZoning"].fillna(housing_test["MSZoning"].mode()[0], inplace=True)
all_data["Utilities"].fillna(housing_test["Utilities"].mode()[0], inplace=True)
all_data["Exterior1st"].fillna(housing_test["Exterior1st"].mode()[0], inplace=True)
all_data["Exterior2nd"].fillna(housing_test["Exterior2nd"].mode()[0], inplace=True)
all_data["BsmtFinSF2"].fillna(housing_test["BsmtFinSF2"].mean(), inplace=True)
all_data["BsmtUnfSF"].fillna(housing_test["BsmtUnfSF"].mean(), inplace=True)
all_data["TotalBsmtSF"].fillna(housing_test["TotalBsmtSF"].mean(), inplace=True)
all_data["BsmtFinSF1"].fillna(housing_test["BsmtFinSF1"].mean(), inplace=True)
all_data["BsmtFullBath"].fillna(housing_test["BsmtFullBath"].mode()[0], inplace=True)
all_data["BsmtHalfBath"].fillna(housing_test["BsmtHalfBath"].mode()[0], inplace=True)
all_data["KitchenQual"].fillna(housing_test["KitchenQual"].mode()[0], inplace=True)
all_data["GarageArea"].fillna(0, inplace=True)
all_data["GarageCars"].fillna(0, inplace=True)
all_data["Functional"].fillna(housing_test["Functional"].mode()[0], inplace=True)
all_data["SaleType"].fillna(housing_test["SaleType"].mode()[0], inplace=True)

## Handle categorical data

### Ordinal encoder and One hot encoder

In [4]:
column_transformer = ColumnTransformer(transformers=[("OE", OrdinalEncoder(categories=[
                                                                                         ['ELO','NoSeWa','NoSewr','AllPub'],
                                                                                         ['Sev', 'Mod','Gtl'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','No','Mn','Av','Gd'],
                                                                                         ['None','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
                                                                                         ['None','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['N','Y'],
                                                                                         ['Po','Fa','TA','Gd','Ex'],
                                                                                         ['Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Detchd','CarPort','BuiltIn','Basment','Attchd','2Types'],
                                                                                         ['None','Unf','RFn','Fin'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['N','P','Y'],
                                                                                         ['None','Po','Fa','TA','Gd','Ex'],
                                                                                         ['None','MnWw','GdWo','MnPrv','GdPrv']
                                                                                      ]
                                                                           ), 
                                                       ['Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'PavedDrive', 'PoolQC','Fence']
                                                     ),
                                                    ('OHE', OneHotEncoder(sparse_output=False, drop='first'), ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','LotConfig','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Foundation','Heating','Electrical','MiscFeature','SaleType','SaleCondition'])], remainder='passthrough')
column_transformer.set_output(transform='pandas')
all_data_trans = column_transformer.fit_transform(all_data)
all_data_trans = all_data_trans.loc[:,~all_data_trans.columns.duplicated()].copy()

In [5]:
housing_train = all_data_trans.iloc[:1460,:]
housing_test = all_data_trans.iloc[1460:,:]

In [6]:
my_pipeline = Pipeline([('scaler', StandardScaler())])

In [7]:
housing_train_tr = my_pipeline.fit_transform(housing_train)
housing_test_tr = my_pipeline.transform(housing_test)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost
#model = DecisionTreeRegressor()
model=xgboost.XGBRegressor(learning_rate=0.12)
#model = RandomForestRegressor()
#model = LinearRegression()
model.fit(housing_train_tr,Y_label)

In [9]:
some_data=housing_train.iloc[:100]
some_label=Y_label.iloc[:100]

In [10]:
prepared_data =my_pipeline.fit_transform(some_data)

In [11]:
model.predict(prepared_data)

array([216021.81 , 180335.7  , 224827.7  , 157389.53 , 286006.2  ,
       162824.16 , 313949.56 , 232297.38 , 135516.47 , 118316.18 ,
       133445.7  , 361362.6  , 142733.92 , 232887.33 , 154419.3  ,
       126271.016, 153456.78 , 102859.195, 165245.1  , 127488.75 ,
       329905.3  , 135581.92 , 238728.69 , 137345.6  , 142663.6  ,
       258397.19 , 127272.99 , 298421.2  , 209503.12 ,  60590.492,
        59001.195, 142159.28 , 190506.88 , 175530.34 , 291116.72 ,
       318531.22 , 136547.   , 146097.8  , 127506.73 ,  80599.516,
       152596.4  , 179462.17 , 139521.08 , 130776.01 , 131256.77 ,
       302888.75 , 275368.44 , 254763.55 , 117586.77 , 123300.1  ,
       185058.42 , 114890.695, 107295.52 , 395943.47 , 135514.17 ,
       161187.88 , 182615.3  , 212756.02 , 444424.62 , 122426.59 ,
       178288.8  ,  94597.586, 204810.92 , 144884.52 , 249111.97 ,
       348537.4  , 215740.52 , 232862.   ,  76154.54 , 254830.89 ,
       280397.44 , 120494.586, 194428.22 , 148060.5  , 112670.

some_label

In [12]:
from sklearn.metrics import mean_squared_error
import numpy as np
housing_predictions = model.predict(housing_train_tr)
lin_mse = mean_squared_error(Y_label,housing_predictions)
mse=np.sqrt(lin_mse)

In [13]:
mse

5362.012819208833

In [14]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_train_tr,Y_label,scoring="neg_mean_squared_error",cv=10)
rsme_scores=np.sqrt(-scores)

In [15]:
rsme_scores

array([22651.22305137, 25875.87469137, 21420.35714047, 40946.8520479 ,
       28951.75603502, 26491.40684343, 24666.10278891, 20180.94055236,
       28588.11630181, 27243.20156616])

In [16]:
print("mean:", rsme_scores.mean())
print("std dev:", rsme_scores.std())

mean: 26701.583101880744
std dev: 5509.337670219389


In [17]:
y_pred=model.predict(housing_test_tr)

In [18]:
pred=pd.DataFrame(y_pred)
sub_df=pd.read_csv('sample_submission.csv')
datasets=pd.concat([sub_df['Id'],pred], axis=1)
datasets.column=[['Id','SalePrice']]
datasets.to_csv('sample_submission2.csv', index=False)

  datasets.column=[['Id','SalePrice']]


0.12
mean: 26701.583101880744
std dev: 5509.337670219389