# Objective

Predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable.

Competition Link: https://www.kaggle.com/c/home-data-for-ml-course/overview/description

# Data Preprocessing

In [1]:
# Dependencies

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Loading testing data
housing_test_data = pd.read_csv('data/test.csv')
housing_test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [3]:
# Loading training data
housing_train_data = pd.read_csv('data/train.csv')
housing_train_data = housing_train_data.drop(housing_train_data.index[-4])
housing_train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1455,20,FV,62.0,7500,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,10,2009,WD,Normal,185000
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
# Dropping ID - not a valuable feature
housing_train_data = housing_train_data.drop('Id', axis=1)
housing_train_data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,20,FV,62.0,7500,Pave,Pave,Reg,Lvl,AllPub,Inside,...,0,,,,0,10,2009,WD,Normal,185000
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
# Look at missing values
housing_train_data_missing = housing_train_data.isnull().sum().sort_values(ascending=False)
housing_train_data_missing = pd.DataFrame(housing_train_data_missing)
housing_train_data_missing = housing_train_data_missing.loc[housing_train_data_missing[0] != 0]
housing_train_data_missing

Unnamed: 0,0
PoolQC,1452
MiscFeature,1405
Alley,1368
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageType,81
GarageCond,81
GarageFinish,81
GarageQual,81


In [6]:
# Imputing Missing Values

housing_processed = housing_train_data

# Categorical columns:
cat_attributes_fill_none = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                     'GarageCond', 'GarageQual', 'GarageFinish', 'GarageType',
                     'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtQual', 'BsmtCond',
                     'MasVnrType']

# Replace missing values for categorical columns with None
for cat in cat_attributes_fill_none:
    housing_processed[cat] = housing_processed[cat].fillna("None")
    
# Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
housing_processed['LotFrontage'] = housing_processed.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))    

# Garage: GarageYrBlt, GarageArea and GarageCars these are numerical columns, replace with zero
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars']:
    housing_processed[col] = housing_processed[col].fillna(int(0))
    
# MasVnrArea : replace with zero
housing_processed['MasVnrArea'] = housing_processed['MasVnrArea'].fillna(int(0))

# Use the mode value 
housing_processed['Electrical'] = housing_processed['Electrical'].fillna(housing_processed['Electrical']).mode()[0]

# There is no need of Utilities so let's just drop this column
housing_processed = housing_processed.drop(['Utilities'], axis=1)


In [7]:
# Get the count again to verify no more missing values
housing_processed.isnull().apply(sum).max()

0

In [8]:
# Viewing datatype and total of each column 
housing_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1459
Data columns (total 80 columns):
MSSubClass       1459 non-null int64
MSZoning         1459 non-null object
LotFrontage      1459 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            1459 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1459 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1459 non-null object
Exterior2nd      1459 no

In [9]:
# Correlation of numerical attributes
corr = housing_processed.corr()

# View the correlation numbers of all the attributes to the target variable(SalePrice)
# Note to remember: 1 indicates positive correlation, -1 negative correlation and 0 means no relationship
corr['SalePrice'].sort_values(ascending=False)


SalePrice        1.000000
OverallQual      0.791037
GrLivArea        0.708664
GarageCars       0.640382
GarageArea       0.623432
TotalBsmtSF      0.613588
1stFlrSF         0.606443
FullBath         0.560611
TotRmsAbvGrd     0.533689
YearBuilt        0.522874
YearRemodAdd     0.507090
MasVnrArea       0.472615
Fireplaces       0.467152
BsmtFinSF1       0.386323
LotFrontage      0.349778
WoodDeckSF       0.324378
2ndFlrSF         0.319617
OpenPorchSF      0.316101
HalfBath         0.284368
LotArea          0.263795
GarageYrBlt      0.261323
BsmtFullBath     0.226950
BsmtUnfSF        0.214477
BedroomAbvGr     0.168181
ScreenPorch      0.111522
PoolArea         0.092425
MoSold           0.046877
3SsnPorch        0.044615
BsmtFinSF2      -0.011562
BsmtHalfBath    -0.016785
MiscVal         -0.021169
LowQualFinSF    -0.025577
YrSold          -0.029364
OverallCond     -0.077959
MSSubClass      -0.084091
EnclosedPorch   -0.128499
KitchenAbvGr    -0.135863
Name: SalePrice, dtype: float64

In [10]:
# Remove highly correlated features - only one attribute is better to use for similar features

attributes_drop = ['MiscVal', 'MoSold', 'YrSold', 'BsmtFinSF2','BsmtHalfBath','MSSubClass',
                   'GarageArea', 'GarageYrBlt', '3SsnPorch' ]

housing_processed = housing_processed.drop(attributes_drop, axis=1)

In [11]:
# One-hot encoding - transforming categorical attributes to numbers

from sklearn.preprocessing import OneHotEncoder
categorical_encoder = OneHotEncoder()
housing_processed_categorical = categorical_encoder.fit_transform(housing_processed)
housing_processed_categorical

<1459x7541 sparse matrix of type '<class 'numpy.float64'>'
	with 102130 stored elements in Compressed Sparse Row format>

# Data Transformation

In [12]:
# Import modules
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Separate features and target variable
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
housing_X = housing_processed[features]
housing_y = housing_processed["SalePrice"]


In [13]:
# Get the list of names for numerical and categorical attributes separately
num_attributes = housing_X.select_dtypes(exclude='object')
cat_attributes = housing_X.select_dtypes(include='object')

num_attribs = list(num_attributes)
cat_attribs = list(cat_attributes)

# Numerical Pipeline to impute any missing values with the median and scale attributes
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

# Description before applying transforms
print(housing_y.describe())
print(" ")

# Apply log-transform to SalePrice
housing_y_prepared  = np.log(housing_y)

# Run the transformation pipeline on all the other attributes
housing_X_prepared = full_pipeline.fit_transform(housing_X)

# Description before applying transforms
print(housing_y_prepared.describe())


count      1459.00000
mean     180901.26525
std       79466.09037
min       34900.00000
25%      129950.00000
50%      163000.00000
75%      214000.00000
max      755000.00000
Name: SalePrice, dtype: float64
 
count    1459.000000
mean       12.023893
std         0.399543
min        10.460242
25%        11.774905
50%        12.001505
75%        12.273731
max        13.534473
Name: SalePrice, dtype: float64


# Machine Learning Model

In [14]:
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import xgboost
# !conda install py-xgboost

In [15]:
# Split data into train and test 
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(housing_X_prepared, housing_y_prepared, random_state=7)

X_train, X_test, y_train, y_test = train_test_split(housing_X_prepared, housing_y_prepared, test_size=0.2)



In [16]:
# Invert the log-transformed value
def inv_y(transformed_y):
    return np.exp(transformed_y)

# Series to collect RMSE for the different algorithms: "algorithm name + rmse"
rmse_compare = pd.Series()
rmse_compare.index.name = 'Model'

# Series to collect accuracy scores for the different algorithms: "algorithm name + score"
scores_compare = pd.Series()
scores_compare.index.name = 'Model'

In [17]:
# Model 1: Linear Regression =================================================
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_val_predictions = linear_model.predict(X_test)
linear_val_rmse = mean_squared_error(inv_y(linear_val_predictions), inv_y(y_test))
linear_val_rmse = np.sqrt(linear_val_rmse)
rmse_compare['LinearRegression'] = linear_val_rmse

lr_score = linear_model.score(X_test, y_test)*100
scores_compare['LinearRegression'] = lr_score

In [18]:
# Model 2: Decision Trees. Define the model. =============================
dtree_model = DecisionTreeRegressor(random_state=5)
dtree_model.fit(X_train, y_train)

dtree_val_predictions = dtree_model.predict(X_test)
dtree_val_rmse = mean_squared_error(inv_y(dtree_val_predictions), inv_y(y_test))
dtree_val_rmse = np.sqrt(dtree_val_rmse)
rmse_compare['DecisionTree'] = dtree_val_rmse

dtree_score = dtree_model.score(X_test, y_test)*100
scores_compare['DecisionTree'] = dtree_score

In [19]:
# Model 3: Random Forest. Define the model. =============================
rf_model = RandomForestRegressor(random_state=5)
rf_model.fit(X_train, y_train)

rf_val_predictions = rf_model.predict(X_test)
rf_val_rmse = mean_squared_error(inv_y(rf_val_predictions), inv_y(y_test))
rf_val_rmse = np.sqrt(rf_val_rmse)
rmse_compare['RandomForest'] = rf_val_rmse

rf_score = rf_model.score(X_test, y_test)*100
scores_compare['RandomForest'] = rf_score



In [20]:
# Model 4: Gradient Boosting Regression ==========================================
gbr_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, 
                                      max_depth=4, random_state=5)
gbr_model.fit(X_train, y_train)

gbr_val_predictions = gbr_model.predict(X_test)
gbr_val_rmse = mean_squared_error(inv_y(gbr_val_predictions), inv_y(y_test))
gbr_val_rmse = np.sqrt(gbr_val_rmse)
rmse_compare['GradientBoosting'] = gbr_val_rmse

gbr_score = gbr_model.score(X_test, y_test)*100
scores_compare['GradientBoosting'] = gbr_score

In [21]:
# Checking performance for all models

print('RMSE values for different algorithms:')
rmse_compare.sort_values(ascending=True).round()


RMSE values for different algorithms:


Model
GradientBoosting    36463.0
RandomForest        41738.0
DecisionTree        49062.0
LinearRegression    68684.0
dtype: float64

In [22]:
# Checking performance for all models
print('Accuracy scores for different algorithms:')
scores_compare.sort_values(ascending = False).round(3)


Accuracy scores for different algorithms:


Model
GradientBoosting    75.065
RandomForest        69.382
LinearRegression    66.330
DecisionTree        52.324
dtype: float64

# Fine-Tuning Model

### Gradient Boosting and GridSearchCV

In [23]:
# Gradient Boosting and GridSearchCV


from sklearn.model_selection import GridSearchCV

# Define the parameters for exploration
param_grid = param_grid = [
    {'n_estimators': [10, 50, 100, 150], 'max_features': [0.1, 0.25, 0.5, 0.75, 1.0]}
  ]

# The model for which we are finding params values
grad_boost = GradientBoostingRegressor()


grad_grid_search = GridSearchCV(grad_boost, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grad_grid_search.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                

In [24]:
# Finding the best values for the passed hyperparameters
grad_grid_search.best_params_

{'max_features': 1.0, 'n_estimators': 150}

In [25]:
# # Finding the best values for the passed hyperparameters
# grad_grid_search.best_estimator_

In [26]:
# Evaluate Using the Fine-Tuned Model 

grad_model_final = GradientBoostingRegressor(max_features= 1, n_estimators= 100)

grad_model_final.fit(X_train, y_train)
grad_final_val_predictions = grad_model_final.predict(X_test)

# Get RMSE 
grad_final_val_rmse = mean_squared_error(inv_y(grad_final_val_predictions), inv_y(y_test))
np.sqrt(grad_final_val_rmse)

# Get Accuracy
grad_model_final.score(X_test, y_test)*100

72.56607934228425

# Save model

In [27]:
# Save model - Gradient Boosting Regression without GridSearchCV
 
from sklearn.externals import joblib

# Save model
joblib.dump(gbr_model, "model/gbr_model.pkl")
 
# Load saved model
gbr_model_load = joblib.load("model/gbr_model.pkl")



# Prediction

### Gradient Boosting

In [28]:
# Prediction - Gradient Boosting Regression without GridSearchCV

# Fitting model 
gbr_model = GradientBoostingRegressor(max_features= 7)
gbr_model.fit(housing_X_prepared, housing_y_prepared)

# Reading test data
test_data_path = 'data/test.csv'
test_data = pd.read_csv(test_data_path)

# Prediction
X_test = housing_X_prepared
test_preds = gbr_model.predict(X_test)

# Displaying prediction in DataFrame
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})

# Saving prediction to file
output.to_csv('third_submission.csv', index=False)

output

Unnamed: 0,Id,SalePrice
0,1461,12.208774
1,1462,11.956280
2,1463,12.261254
3,1464,11.938823
4,1465,12.530479
...,...,...
1454,2915,12.134874
1455,2916,12.097075
1456,2917,12.177043
1457,2918,11.697338
