# Data Preprocessing

In [1]:
# Dependencies

import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Loading training data
housing_train_data = pd.read_csv('data/train.csv')
housing_train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Dropping ID - not a valuable feature
housing_train_data = housing_train_data.drop('Id', axis=1)
housing_train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Look at missing values
housing_train_data_missing = housing_train_data.isnull().sum().sort_values(ascending=False)
housing_train_data_missing = pd.DataFrame(housing_train_data_missing)
housing_train_data_missing = housing_train_data_missing.loc[housing_train_data_missing[0] != 0]
housing_train_data_missing

Unnamed: 0,0
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageType,81
GarageCond,81
GarageFinish,81
GarageQual,81


In [5]:
# Imputing Missing Values

housing_processed = housing_train_data

# Categorical columns:
cat_attributes_fill_none = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                     'GarageCond', 'GarageQual', 'GarageFinish', 'GarageType',
                     'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtQual', 'BsmtCond',
                     'MasVnrType']

# Replace missing values for categorical columns with None
for cat in cat_attributes_fill_none:
    housing_processed[cat] = housing_processed[cat].fillna("None")
    
# Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
housing_processed['LotFrontage'] = housing_processed.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))    

# Garage: GarageYrBlt, GarageArea and GarageCars these are numerical columns, replace with zero
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars']:
    housing_processed[col] = housing_processed[col].fillna(int(0))
    
# MasVnrArea : replace with zero
housing_processed['MasVnrArea'] = housing_processed['MasVnrArea'].fillna(int(0))

# Use the mode value 
housing_processed['Electrical'] = housing_processed['Electrical'].fillna(housing_processed['Electrical']).mode()[0]

# There is no need of Utilities so let's just drop this column
housing_processed = housing_processed.drop(['Utilities'], axis=1)



In [6]:
# Get the count again to verify no more missing values
housing_processed.isnull().apply(sum).max()

0

In [7]:
# Viewing datatype and total of each column 
housing_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 no

In [8]:
# Removing outliers

num_attributes = housing_processed.select_dtypes(exclude='object')

high_quant = housing_processed.quantile(.999)

for i in num_attributes.columns:
    housing_processed = housing_processed.drop(housing_processed[i][housing_processed[i]>high_quant[i]].index)

housing_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1422 entries, 0 to 1458
Data columns (total 79 columns):
MSSubClass       1422 non-null int64
MSZoning         1422 non-null object
LotFrontage      1422 non-null float64
LotArea          1422 non-null int64
Street           1422 non-null object
Alley            1422 non-null object
LotShape         1422 non-null object
LandContour      1422 non-null object
LotConfig        1422 non-null object
LandSlope        1422 non-null object
Neighborhood     1422 non-null object
Condition1       1422 non-null object
Condition2       1422 non-null object
BldgType         1422 non-null object
HouseStyle       1422 non-null object
OverallQual      1422 non-null int64
OverallCond      1422 non-null int64
YearBuilt        1422 non-null int64
YearRemodAdd     1422 non-null int64
RoofStyle        1422 non-null object
RoofMatl         1422 non-null object
Exterior1st      1422 non-null object
Exterior2nd      1422 non-null object
MasVnrType       1422 no

In [9]:
# Correlation of numerical attributes
corr = housing_processed.corr()

# View the correlation numbers of all the attributes to the target variable(SalePrice)
# Note to remember: 1 indicates positive correlation, -1 negative correlation and 0 means no relationship
corr['SalePrice'].sort_values(ascending=False)


SalePrice        1.000000
OverallQual      0.798677
GrLivArea        0.715305
GarageCars       0.652625
TotalBsmtSF      0.641265
GarageArea       0.639995
1stFlrSF         0.612457
FullBath         0.571596
YearBuilt        0.559036
TotRmsAbvGrd     0.537897
YearRemodAdd     0.534553
MasVnrArea       0.472000
Fireplaces       0.461922
BsmtFinSF1       0.397979
LotFrontage      0.351594
OpenPorchSF      0.351362
WoodDeckSF       0.318615
2ndFlrSF         0.293866
LotArea          0.290421
HalfBath         0.285076
GarageYrBlt      0.271666
BsmtFullBath     0.230715
BsmtUnfSF        0.211267
BedroomAbvGr     0.156268
ScreenPorch      0.102803
3SsnPorch        0.063138
MoSold           0.060741
PoolArea        -0.002637
YrSold          -0.014042
BsmtFinSF2      -0.031948
BsmtHalfBath    -0.033910
MiscVal         -0.052786
LowQualFinSF    -0.068193
OverallCond     -0.081357
MSSubClass      -0.085999
KitchenAbvGr    -0.142625
EnclosedPorch   -0.152033
Name: SalePrice, dtype: float64

In [10]:
# Remove highly correlated features - only one attribute is better to use for similar features

attributes_drop = ['MiscVal', 'MoSold', 'YrSold', 'BsmtFinSF2','BsmtHalfBath','MSSubClass',
                   'GarageArea', 'GarageYrBlt', '3SsnPorch' ]

housing_processed = housing_processed.drop(attributes_drop, axis=1)

In [11]:
# One-hot encoding - transforming categorical attributes to numbers

from sklearn.preprocessing import OneHotEncoder
categorical_encoder = OneHotEncoder()
housing_processed_categorical = categorical_encoder.fit_transform(housing_processed)
housing_processed_categorical

<1422x7333 sparse matrix of type '<class 'numpy.float64'>'
	with 99540 stored elements in Compressed Sparse Row format>

# Data Transformation

In [12]:
# Import modules
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Separate features and target variable
housing_X = housing_processed.drop("SalePrice", axis=1)
housing_y = housing_processed["SalePrice"].copy()


In [13]:
# Get the list of names for numerical and categorical attributes separately
num_attributes = housing_X.select_dtypes(exclude='object')
cat_attributes = housing_X.select_dtypes(include='object')

num_attribs = list(num_attributes)
cat_attribs = list(cat_attributes)

# Numerical Pipeline to impute any missing values with the median and scale attributes
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])


# Description before applying transforms
print(housing_y.describe())
print(" ")

# Apply log-transform to SalePrice
housing_y_prepared  = np.log(housing_y)

# Run the transformation pipeline on all the other attributes
housing_X_prepared = full_pipeline.fit_transform(housing_X)

# Description before applying transforms
print(housing_y_prepared.describe())


count      1422.000000
mean     178405.042897
std       74506.926127
min       35311.000000
25%      129600.000000
50%      161500.000000
75%      211750.000000
max      611657.000000
Name: SalePrice, dtype: float64
 
count    1422.000000
mean       12.014792
std         0.389594
min        10.471950
25%        11.772207
50%        11.992260
75%        12.263160
max        13.323927
Name: SalePrice, dtype: float64


# Machine Learning Model

In [14]:
# !conda install py-xgboost

In [15]:
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import xgboost


In [16]:
# Split data into train and test 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(housing_X_prepared, housing_y_prepared, test_size=0.2, random_state=7)

In [17]:
# Invert the log-transformed value
def inv_y(transformed_y):
    return np.exp(transformed_y)

# Series to collect RMSE for the different algorithms: "algorithm name + rmse"
rmse_compare = pd.Series()
rmse_compare.index.name = 'Model'

# Series to collect accuracy scores for the different algorithms: "algorithm name + score"
scores_compare = pd.Series()
scores_compare.index.name = 'Model'

In [18]:
# Model 1: Linear Regression =================================================
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_val_predictions = linear_model.predict(X_test)
linear_val_rmse = mean_squared_error(inv_y(linear_val_predictions), inv_y(y_test))
linear_val_rmse = np.sqrt(linear_val_rmse)
rmse_compare['LinearRegression'] = linear_val_rmse

lr_score = linear_model.score(X_test, y_test)*100
scores_compare['LinearRegression'] = lr_score

In [19]:
# Model 2: Decision Trees. Define the model. =============================
dtree_model = DecisionTreeRegressor(random_state=5)
dtree_model.fit(X_train, y_train)

dtree_val_predictions = dtree_model.predict(X_test)
dtree_val_rmse = mean_squared_error(inv_y(dtree_val_predictions), inv_y(y_test))
dtree_val_rmse = np.sqrt(dtree_val_rmse)
rmse_compare['DecisionTree'] = dtree_val_rmse

dtree_score = dtree_model.score(X_test, y_test)*100
scores_compare['DecisionTree'] = dtree_score

In [20]:
# Model 3: Random Forest. Define the model. =============================
rf_model = RandomForestRegressor(random_state=5)
rf_model.fit(X_train, y_train)

rf_val_predictions = rf_model.predict(X_test)
rf_val_rmse = mean_squared_error(inv_y(rf_val_predictions), inv_y(y_test))
rf_val_rmse = np.sqrt(rf_val_rmse)
rmse_compare['RandomForest'] = rf_val_rmse

rf_score = rf_model.score(X_test, y_test)*100
scores_compare['RandomForest'] = rf_score



In [21]:
# Model 4: Gradient Boosting Regression ==========================================
gbr_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, 
                                      max_depth=4, random_state=5)
gbr_model.fit(X_train, y_train)

gbr_val_predictions = gbr_model.predict(X_test)
gbr_val_rmse = mean_squared_error(inv_y(gbr_val_predictions), inv_y(y_test))
gbr_val_rmse = np.sqrt(gbr_val_rmse)
rmse_compare['GradientBoosting'] = gbr_val_rmse

gbr_score = gbr_model.score(X_test, y_test)*100
scores_compare['GradientBoosting'] = gbr_score

In [22]:
# Checking performance for all models

print('RMSE values for different algorithms:')
rmse_compare.sort_values(ascending=True).round()


RMSE values for different algorithms:


Model
LinearRegression    24663.0
GradientBoosting    27212.0
RandomForest        31491.0
DecisionTree        37872.0
dtype: float64

In [23]:
# Checking performance for all models
print('Accuracy scores for different algorithms:')
scores_compare.sort_values(ascending = False).round(3)


Accuracy scores for different algorithms:


Model
GradientBoosting    89.567
LinearRegression    89.546
RandomForest        84.796
DecisionTree        72.805
dtype: float64

# Fine-Tuning Model

### Gradient Boosting and GridSearchCV

In [24]:
# Gradient Boosting and GridSearchCV


from sklearn.model_selection import GridSearchCV

# Define the parameters for exploration
param_grid = param_grid = [
    {'n_estimators': [10, 50, 100, 150], 'max_features': [10, 20, 30, 40, 50, 100, 150]}
  ]


# The model for which we are finding params values
grad_boost = GradientBoostingRegressor()


grad_grid_search = GridSearchCV(grad_boost, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grad_grid_search.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                

In [25]:
# Finding the best values for the passed hyperparameters
grad_grid_search.best_params_

{'max_features': 100, 'n_estimators': 150}

In [26]:
# Finding the best values for the passed hyperparameters
grad_grid_search.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=100, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=150,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [27]:
# Evaluate Using the Fine-Tuned Model 

grad_model_final = GradientBoostingRegressor(max_features=40, n_estimators=150)

grad_model_final.fit(X_train, y_train)
grad_final_val_predictions = grad_model_final.predict(X_test)

# Get RMSE 
grad_final_val_rmse = mean_squared_error(inv_y(grad_final_val_predictions), inv_y(y_test))
np.sqrt(grad_final_val_rmse)

# Get Accuracy
grad_model_final.score(X_test, y_test)*100

90.02421951296297

# Save model

In [28]:
# Save model - Gradient Boosting Regression
 
from sklearn.externals import joblib

# Save model
joblib.dump(grad_model_final, "model/grad_model_final.pkl")
 
# Load saved model
grad_model_final = joblib.load("model/grad_model_final.pkl")



# Prediction