## House Price Prediction

The goal for this project is to build the linear regression model, decision tree regression model, and xgboost regression model to predict the house price based on the selected features and see which model performs good.

In [2]:
# python libraries
import pandas as pd
import numpy as np
import pandas_profiling as pp

# regressors
import xgboost
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# pre-processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [9]:
#Load Data
#https://www.kaggle.com/kamelyounes/house-prices-prediction/?select=train.csv
df = pd.read_csv("Data/train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Pandas Profiling
# Print the Pandas dataframe profile
profile = pp.ProfileReport(df)
profile.to_file("df_profile.html")

(using `df.profile_report(correlations={"cramers": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/pandas-profiling/pandas-profiling/issues
(include the error message: 'No data; `observed` has size 0.')
Summarize dataset: 100%|██████████| 94/94 [02:44<00:00,  1.75s/it, Completed]
Generate report structure: 100%|██████████| 1/1 [00:27<00:00, 27.50s/it]
Render HTML: 100%|██████████| 1/1 [00:28<00:00, 28.60s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  6.41it/s]


In [6]:
# Check to see if we have any missing data rows
print(df.isnull().sum())

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [12]:
# Handle the missing data
for key, values in df.iteritems():
    if (pd.api.types.is_numeric_dtype(df[key])):
        df[key].fillna(value= df[key].mean(), inplace=True)
    else :
        df[key].fillna(value= "Missing", inplace=True) 

    
pd.Series(df.isnull().sum())   

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [14]:
# Use label encoder to handle the non-numeric data and split data for test and training.abs
one_hot_enc = LabelEncoder()
for key, values in df.iteritems():
    if (pd.api.types.is_string_dtype(df[key])):        
           df[key] = one_hot_enc.fit_transform(df[key])


scaler = StandardScaler()
scaler.fit(df)
scaler.transform(df)

X = df.drop('SalePrice', 1)
y = df.SalePrice

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Build Regression Models

Build different types of regression models and compare them.

#### 1. Linear Regression

In [20]:
# linear regression grid search
grid_search_linear_reg = GridSearchCV(LinearRegression(), { 'fit_intercept': [True, False],
                                                        'normalize': [True, False], 
                                                        'copy_X': [True, False] 
                                                        }, cv=5)
grid_search_linear_reg.fit(X_train, y_train)
print("linear regression score :", grid_search_linear_reg.best_score_)

linear regression score : 0.7372746490015494


#### 2. Decision tree grid search

In [16]:
# decision tree grid search
decision_tree_param_grid = {'criterion': ['mse', 'mae'],
              'min_samples_split': [10, 20, 40],
              'max_depth': [2, 6, 8],
              'min_samples_leaf': [20, 40, 100],
              'max_leaf_nodes': [5, 20, 100],
              }

grid_search_decision_trees = GridSearchCV(DecisionTreeRegressor(), decision_tree_param_grid, cv=5)
grid_search_decision_trees.fit(X_train, y_train)
print("decision trees score :", grid_search_decision_trees.best_score_)

decision trees score : 0.7426745240489036


#### 3. XGBoost grid search

In [17]:
# xgboost grid search
xgb_param_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth': [2,3,5,10,15],
    'learning_rate': [0.05, 0.1, 0.15, 0.2],
    'min_child_weight': [1,2,3,4],
    'booster': ['gbtree','gblinear'],
    'base_score': [0.25, 0.5, 0.75, 1]
}
grid_search_xgb = RandomizedSearchCV(xgboost.XGBRegressor(), param_distributions = xgb_param_grid,
                              cv=5, n_iter=50,
                              scoring = 'neg_mean_absolute_error', n_jobs = 4,
                              verbose = 5,
                              return_train_score = True,
                              random_state = 42)
grid_search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                          validate_parameters=None,
                                          verbosity=None),
                   n_iter=50, n_jobs=4,
                   para

In [18]:
# Evaluate the xgboost regression model
xgb_best_estimator = grid_search_xgb.best_estimator_
print("xgb score :",cross_val_score(xgb_best_estimator, X_train, y_train, cv=5).mean())

xgb score : 0.8812320408719397


In [19]:
xgb_best_estimator.fit(X_train, y_train)
xgb_best_estimator.score(X_test, y_test)

0.8822907013437441

### Conclusion  
- Out of the three models we have tried, xgboost model seem to have performed better which higher score. The score on training set and testing set are very close indicating that there is no overfitting of the training data in the model.

### References
- https://www.kaggle.com/kamelyounes/house-prices-prediction#data
- https://www.kaggle.com/kamelyounes/house-prices-prediction