# Prediction of Sale Price

## Objective 
Develop and assess a regression model to predict SalePrice, addressing Business Requirement 2.

## Inputs
outputs/datasets/cleaned/clean_set.csv

## Outputs
* Train set (features and target)
* Test set (features and target)
* Data cleaning and feature engineering pipeline
* Feature importance plot

## CRISP-DM
Modelling and evaluation.


# Change working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [65]:
import os
# Capture the directory the kernel is currently running in; the bare
# expression on the last line displays it as the cell output.
current_dir = os.getcwd()
current_dir

'/workspaces/Heritage-Housing'

We want to make the project root (the parent of the notebook's folder) the new current directory
* os.chdir() defines the new current directory

In [66]:
# Locate the project root instead of hard-coding an absolute path: walk up
# from the current directory until a folder containing "outputs" is found
# (the dataset loaded below is read from a path relative to that folder).
# Re-running this cell is a no-op once the kernel is already at the root.
project_root = os.getcwd()
while not os.path.isdir(os.path.join(project_root, "outputs")):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # Reached the filesystem root without finding the project folder.
        raise FileNotFoundError(
            "Could not find a parent directory containing an 'outputs' folder"
        )
    project_root = parent_dir

os.chdir(project_root)
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [67]:
# Re-read the working directory after os.chdir() so the displayed value
# confirms where relative paths will now be resolved from.
current_dir = os.getcwd()
current_dir

'/workspaces/Heritage-Housing'

## Load Data

In [68]:
import pandas as pd
%matplotlib inline
# Relative path, resolved against the working directory set earlier.
# NOTE: despite the variable name, this is the full cleaned dataset; the
# train/test split is performed further down in this notebook.
train_set_path = "outputs/datasets/cleaned/clean_set.csv"
df = pd.read_csv(train_set_path)
# Preview the first three rows only.
df.head(3)

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,GarageArea,GarageFinish,GarageYrBlt,...,KitchenQual,LotArea,LotFrontage,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,YearBuilt,YearRemodAdd,SalePrice
0,856,854.0,3.0,No,706,GLQ,150,548,RFn,2003.0,...,Gd,8450,65.0,61,5,7,856,2003,2003,208500
1,1262,0.0,3.0,Gd,978,ALQ,284,460,RFn,1976.0,...,TA,9600,80.0,0,8,6,1262,1976,1976,181500
2,920,866.0,3.0,Mn,486,GLQ,434,608,RFn,2001.0,...,Gd,11250,68.0,42,5,7,920,2001,2002,223500


## Machine Learning Pipeline

* We first create an ML pipeline for our data cleaning and feature engineering

In [69]:
from sklearn.pipeline import Pipeline

### Feature Engineering
from feature_engine import transformation as vt
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection

### Feat Scaling
from sklearn.preprocessing import StandardScaler

### Feat Selection
from sklearn.feature_selection import SelectFromModel

### ML algorithms
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


# Settings consumed by the SmartCorrelatedSelection step below.
selection_method = "variance"
corr_method = "spearman"


def PipelineOptimization(model):
    """Assemble the preprocessing + modelling pipeline around ``model``.

    Steps, in order: ordinal encoding of the categorical variables, numeric
    transformations (log, power, Yeo-Johnson), removal of correlated
    features, standard scaling, model-based feature selection and finally
    the estimator itself.

    Parameters
    ----------
    model : estimator
        Used both as the estimator inside ``SelectFromModel`` and as the
        final ``"model"`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    encoded_vars = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
    log_vars = ['1stFlrSF', 'LotArea', 'GrLivArea', 'LotFrontage']
    power_vars = ['TotalBsmtSF', 'OpenPorchSF']
    # NOTE(review): 'TotalBsmtSF' is also in ``power_vars`` above, so it is
    # transformed twice (power, then Yeo-Johnson) - confirm this is intended.
    yeo_johnson_vars = ['TotalBsmtSF']

    encoder = OrdinalEncoder(encoding_method='arbitrary', variables=encoded_vars)
    correlated_selector = SmartCorrelatedSelection(
        variables=None,
        method=corr_method,
        threshold=0.8,
        selection_method=selection_method,
    )

    steps = [
        ("OrdinalCategoricalEncoder", encoder),
        ("NumericLogTransform", vt.LogTransformer(variables=log_vars)),
        ("NumericPowerTransform", vt.PowerTransformer(variables=power_vars)),
        ("NumericYeoJohnsonTransform",
         vt.YeoJohnsonTransformer(variables=yeo_johnson_vars)),
        ("SmartCorrelatedSelection", correlated_selector),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

**Hyperparameter Optimization**

In [70]:
from sklearn.model_selection import GridSearchCV
import numpy as np

class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate estimator and tabulate the CV scores.

    Parameters
    ----------
    models : dict
        Maps an estimator label to an estimator instance.
    params : dict
        Maps the same labels to the parameter grid passed to GridSearchCV.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Filled by fit(): label -> fitted GridSearchCV.
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every estimator in ``self.models``.

        Each estimator is wrapped with ``PipelineOptimization`` before the
        search. Fitted searches are stored in ``self.grid_searches``.

        Note: ``refit`` is accepted for signature compatibility but is not
        forwarded to GridSearchCV, which therefore uses its own default.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            model = PipelineOptimization(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise the per-split test scores of every fitted search.

        Reads the ``split{i}_test_score`` entries of ``cv_results_`` (i.e.
        the single-metric result keys).

        Parameters
        ----------
        sort_by : str
            Column used to sort the summary, in descending order.

        Returns
        -------
        (pandas.DataFrame, dict)
            One row per (estimator, parameter combination) with min / mean /
            max / std of the split scores followed by the parameter columns,
            and the dict of fitted searches.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError(
                "No fitted grid searches to summarise; call fit() first.")

        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            results = gs.cv_results_
            params = results['params']
            # Use the fitted number of splits rather than the ``cv`` argument,
            # which is only usable with range() when it is an int (it may
            # also be None or a splitter object).
            split_scores = [results["split{}_test_score".format(i)]
                            for i in range(gs.n_splits_)]
            # Shape (n_candidates, n_splits): one row of scores per candidate.
            all_scores = np.column_stack(split_scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches

**Split the data into train and test sets**

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible. 'SalePrice' is the target, every other column a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='SalePrice'),
    df['SalePrice'],
    test_size=0.2,
    random_state=0,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1168, 20) (1168,) (292, 20) (292,)


### Grid Search CV - Sklearn

In [72]:
# Candidate estimators for a first pass; seeds are fixed where the estimator
# accepts one so the comparison is reproducible.
models_quick_search = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
}

# An empty grid per estimator: each one is evaluated once, as configured above.
params_quick_search = {label: {} for label in models_quick_search}


In [73]:

# Quick search: 5-fold cross-validation scored with R2, using all available
# cores (n_jobs=-1), over the default-configured estimators.
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)



Running GridSearchCV for LinearRegression 

Fitting 5 folds for each of 1 candidates, totalling 5 fits



Running GridSearchCV for DecisionTreeRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for RandomForestRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for ExtraTreesRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [74]:
# score_summary returns the score table (best mean score first) together with
# the dict of fitted grid searches keyed by estimator label.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score
3,ExtraTreesRegressor,0.687809,0.819848,0.864335,0.066621
0,LinearRegression,0.728158,0.807558,0.855802,0.04733
2,RandomForestRegressor,0.688819,0.770973,0.817628,0.04748
1,DecisionTreeRegressor,0.494971,0.579547,0.667303,0.063698


### Extensive Search

In [75]:
# Estimators taken forward to the extensive hyperparameter search.
models_search = {
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "LinearRegression": LinearRegression(),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
}

# Grid keys carry the "model__" prefix so they address the pipeline step
# named "model". LinearRegression keeps an empty grid (single candidate).
params_search = {
    "RandomForestRegressor": {
        "model__n_estimators": [100, 700],
        "model__max_depth": [6, 18, None],
    },
    "LinearRegression": {},
    "ExtraTreesRegressor": {
        "model__n_estimators": [100, 50, 150],
        "model__max_depth": [None, 3, 15],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 50],
    },
}

In [76]:
# Extensive search: same CV setup as the quick search (5 folds, R2), now over
# the hyperparameter grids defined above. Rebinds `search`.
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)


Running GridSearchCV for RandomForestRegressor 

Fitting 5 folds for each of 6 candidates, totalling 30 fits

Running GridSearchCV for LinearRegression 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for ExtraTreesRegressor 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [77]:
# Rebinds the summary and fitted-search dict to the extensive-search results;
# rows are sorted so the best mean CV score comes first.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,model__max_depth,model__n_estimators,model__min_samples_leaf,model__min_samples_split
8,ExtraTreesRegressor,0.704606,0.822258,0.859771,0.059228,,50.0,1.0,2.0
7,ExtraTreesRegressor,0.687809,0.819848,0.864335,0.066621,,100.0,1.0,2.0
9,ExtraTreesRegressor,0.69008,0.819544,0.865691,0.065412,,150.0,1.0,2.0
36,ExtraTreesRegressor,0.762416,0.815853,0.853106,0.038874,15.0,150.0,1.0,50.0
12,ExtraTreesRegressor,0.762187,0.815833,0.853234,0.038962,,150.0,1.0,50.0
33,ExtraTreesRegressor,0.679856,0.815207,0.867005,0.068832,15.0,150.0,1.0,2.0
34,ExtraTreesRegressor,0.760013,0.814748,0.852223,0.039479,15.0,100.0,1.0,50.0
10,ExtraTreesRegressor,0.759661,0.814613,0.851899,0.039516,,100.0,1.0,50.0
31,ExtraTreesRegressor,0.676688,0.814008,0.867144,0.069934,15.0,100.0,1.0,2.0
35,ExtraTreesRegressor,0.762338,0.813599,0.852982,0.039672,15.0,50.0,1.0,50.0


### Best Model

In [78]:
# The summary is sorted by mean score (descending), so the first row holds
# the label of the best-performing estimator.
best_model = grid_search_summary.iloc[0]['estimator']
best_model

'ExtraTreesRegressor'

In [79]:
# Look up the fitted grid search for the winning estimator and read the
# parameter combination it selected.
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

{'model__max_depth': None,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__n_estimators': 50}

In [80]:
# Pipeline refitted by the grid search with the selected parameters.
# NOTE(review): best_estimator_ relies on GridSearchCV's refit behaviour; the
# searches are built without an explicit refit argument - confirm the library
# default still refits.
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline