## KAGGLE COMPETITION  
### Predict House Prices  

I am predicting house prices with several open-source algorithms to see which one scores best.



In [None]:

# Loading in several python packages that will be used. 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import pandas_profiling 

import seaborn as sns
import matplotlib.style as style
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p
from scipy.stats import norm, skew
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
house_train = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
house_test = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)
print(
    "Dataset shape:",
    'house_train', house_train.shape,
    'house_test', house_test.shape,
)

In [None]:
# SalePrice correlation with every numeric feature.
# Only numeric columns are handed to corr(): pandas >= 2.0 no longer drops
# object columns silently and raises on the string-valued features instead.
plt.figure(figsize=(8, 12))
numeric_train = house_train.select_dtypes(include='number')
numeric_train.corr()['SalePrice'].sort_values().plot(kind='barh')

In [None]:
# Heatmap of the k features most strongly (positively) correlated with SalePrice.
k = 10
# Restrict to numeric columns first: pandas >= 2.0 raises when corr() meets
# object columns instead of silently ignoring them.
numeric_corr = house_train.select_dtypes(include='number').corr()
cols = numeric_corr.nlargest(k, 'SalePrice')['SalePrice'].index
k_corr_matrix = house_train[cols].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(k_corr_matrix, annot=True, cmap=plt.cm.RdBu_r)

> **Correlation** only measures how strongly two variables are related *linearly*, so the linearity of these relationships should be checked.

In [None]:
# Pairwise scatterplots to check whether the relationships look linear.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was deprecated in seaborn 0.9 and is rejected by newer releases.
sns.pairplot(house_train[cols], height=2)
plt.show()

In [None]:
# Is the target variable normal? Overlay three candidate distributions
# (Johnson SU, normal, log-normal) on the SalePrice histogram, one per panel.
target = house_train['SalePrice']
f, axes = plt.subplots(1, 3, figsize=(15, 4))
candidate_fits = (stats.johnsonsu, stats.norm, stats.lognorm)
for panel, candidate in zip(axes, candidate_fits):
    sns.distplot(target, kde=False, fit=candidate, ax=panel)

It is apparent that SalePrice ***does not follow a normal distribution***, so it has to be transformed before performing regression.

In [None]:
# Replace SalePrice by log(1 + SalePrice); predictions are mapped back with expm1 later.
house_train['SalePrice'] = house_train['SalePrice'].pipe(np.log1p)

In [None]:
# Distribution histogram with a fitted normal curve, followed by a normal probability plot.
(mu, sigma) = norm.fit(house_train['SalePrice'])
sns.distplot(house_train['SalePrice'], fit=norm)
# Raw string: '\m' and '\s' are invalid escape sequences in an ordinary literal
# (a warning today, an error in future Python versions). The text is unchanged.
plt.legend([r'Normal dist ($\mu=${:.2f}, $\sigma=${:.2f})'.format(mu, sigma)])

fig = plt.figure()
stats.probplot(house_train['SalePrice'], plot=plt)
plt.show()

**Finding Outliers in Data**

In [None]:
def detect_outliers(x, y, top=5, plot=True, n_neighbors=40, contamination=0.1):
    """Rank the observations of a single feature by Local Outlier Factor.

    Parameters
    ----------
    x : array-like
        Values of one feature; reshaped to a single column before fitting.
    y : array-like
        Values plotted against ``x`` on the vertical axis. Only used when
        ``plot`` is true; it does not influence the outlier scores.
    top : int, default 5
        Number of most outlying observations to return.
    plot : bool, default True
        When true, draw a scatter plot of ``x`` vs ``y`` coloured by
        ``exp(negative_outlier_factor_)``.
    n_neighbors : int, default 40
        Passed to ``LocalOutlierFactor``.
    contamination : float, default 0.1
        Passed to ``LocalOutlierFactor``.

    Returns
    -------
    pandas.Index
        Zero-based positions (not the labels of ``x``) of the ``top``
        observations with the lowest negative outlier factor.
    """
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    x_ = np.asarray(x).reshape(-1, 1)
    # Only the scores are needed, so fit() is enough; the inlier/outlier labels
    # that fit_predict() would return were never used.
    lof.fit(x_)
    lof_scr = lof.negative_outlier_factor_
    # pd.Series(lof_scr) gets a default RangeIndex, hence positional results.
    out_idx = pd.Series(lof_scr).sort_values()[:top].index
    if plot:
        _, ax = plt.subplots(figsize=(9, 6))
        ax.scatter(x=x, y=y, c=np.exp(lof_scr), cmap='RdBu')
    return out_idx

Standardize the data and see whether there are any outlier points.

In [None]:
# Outlier detection on GrLivArea, plotted against SalePrice.
outs = detect_outliers(
    x=house_train['GrLivArea'],
    y=house_train['SalePrice'],
    top=5,
)
outs

In [None]:
# Split the columns by dtype: object columns are qualitative (categorical),
# every other column is quantitative (continuous).
quantitative, qualitative = [], []
for feature in house_train.columns:
    is_object = house_train.dtypes[feature] == 'object'
    (qualitative if is_object else quantitative).append(feature)

# The target and the row identifier are not predictors.
for not_a_feature in ('SalePrice', 'Id'):
    quantitative.remove(not_a_feature)

In [None]:
# Outliers for all quantitative features
from collections import Counter
all_outliers = []
skipped_features = []  # (feature, error) pairs for features that could not be scored

for feature in quantitative:
    try:
        outs = detect_outliers(house_train[feature], house_train['SalePrice'], top=5, plot=False)
    except Exception as err:
        # Stay best-effort (e.g. the estimator may reject columns that still hold
        # NaN), but record the failure instead of silently hiding it. Unlike a
        # bare `except`, this lets KeyboardInterrupt/SystemExit through.
        skipped_features.append((feature, repr(err)))
        continue
    all_outliers.extend(outs)

if skipped_features:
    print('Features skipped during outlier detection:', skipped_features)

# How often each row position was flagged across all features.
print(Counter(all_outliers).most_common())

# Hand-picked row positions to remove; print those that were also flagged above.
outliers = [30, 88, 462, 523, 632, 1298, 1324] #
flagged_positions = set(all_outliers)
for i in outliers:
    if i in flagged_positions:
        print(i)

In [None]:
# Remove the selected outlier rows (given as row positions) from the training set.
outlier_labels = house_train.index[outliers]
house_train = house_train.drop(index=outlier_labels)
house_train.shape

In [None]:
# Renumber the rows after the outlier removal, then separate target and predictors.
house_train = house_train.reset_index(drop=True)
y_train = house_train['SalePrice']
# drop() returns a new frame, so X_train does not follow later changes to house_train.
X_train = house_train.drop(columns=['SalePrice'])
# X_test is another name for house_test, not a copy.
X_test = house_test

print(
    "Dataset shape:",
    'X_train', X_train.shape,
    'y_train', y_train.shape,
    'X_test', X_test.shape,
)

In [None]:
# Replace missing values with 0 in every numeric column of both datasets.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics_train = [col for col in house_train.columns if house_train[col].dtype in numeric_dtypes]
numerics_test = [col for col in house_test.columns if house_test[col].dtype in numeric_dtypes]

# update() writes the filled values back into the existing frames in place.
for frame, numeric_cols in ((house_train, numerics_train), (house_test, numerics_test)):
    frame.update(frame[numeric_cols].fillna(0))

In [None]:
# Skewness check and Box-Cox correction for the quantitative/continuous features.
# Skewness is measured on both datasets before anything is transformed.
skew_train = house_train[quantitative].apply(skew).sort_values(ascending=False)
high_skew_train = skew_train[skew_train > 0.5]  # skewness threshold

skew_test = house_test[quantitative].apply(skew).sort_values(ascending=False)
# The features to transform are chosen on the training data; select the same
# features from the test skew table explicitly rather than through a boolean
# mask that carries a different index order.
high_skew_test = skew_test[skew_test.index.isin(high_skew_train.index)]

# Estimate each Box-Cox lambda once, on the training data, and apply that same
# lambda to the test data. Fitting a second lambda on the test set would map
# identical raw values to different transformed values in train and test.
boxcox_lambdas = {}
for i in high_skew_train.index:
    boxcox_lambdas[i] = boxcox_normmax(house_train[i] + 1)  # +1 keeps the data strictly positive
    house_train[i] = boxcox1p(house_train[i], boxcox_lambdas[i])
    house_test[i] = boxcox1p(house_test[i], boxcox_lambdas[i])

In [None]:
# Standardize the (log-transformed) SalePrice and inspect both tails of its distribution.
# Reshape the underlying array to one column: indexing a Series with
# [:, np.newaxis] is no longer supported by pandas.
sale_price_column = house_train['SalePrice'].values.reshape(-1, 1)
sale_price_scaled = StandardScaler().fit_transform(sale_price_column)

sns.distplot(sale_price_scaled, fit=norm)

# Sort once and take the five smallest and five largest standardized values.
sorted_positions = sale_price_scaled[:, 0].argsort()
low_range = sale_price_scaled[sorted_positions[:5]]
high_range = sale_price_scaled[sorted_positions[-5:]]
print(f'outer range (low) of the distribution: \n{low_range}')
print(f'outer range (high) of the distribution: \n{high_range}')

In [None]:
# Combining train and test datasets
# The training part is taken from the current house_train. X_train was copied out
# of house_train before the numeric NA filling and the Box-Cox step, so reusing it
# here would stack untransformed training rows on top of transformed test rows.
train_features = house_train.drop(['SalePrice'], axis=1)
all_data = pd.concat([train_features, house_test], axis=0, sort=False)
all_data.drop(['Id'], axis=1, inplace=True)
all_data.shape

In [None]:
# These predictors are categorical codes stored as numbers; turn them into strings
# so that they are treated as categories.
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
for sale_date_col in ('YrSold', 'MoSold'):
    all_data[sale_date_col] = all_data[sale_date_col].astype(str)

In [None]:
# Missing values per column: absolute count and share of all rows, largest first.
na_counts = all_data.isnull().sum()
na_total = na_counts.sort_values(ascending=False)
na_ratio = (na_counts / all_data.shape[0]).sort_values(ascending=False)
missing_data = pd.concat([na_total, na_ratio], axis=1, keys=['Total', 'Ratio'])
missing_data.head(10)

In [None]:
# Remove four columns that were judged to carry no usable pattern.
dropped_columns = ['PoolQC', 'Utilities', 'Street', 'MiscFeature']
all_data = all_data.drop(columns=dropped_columns)

In [None]:
# Filling NA with the category 'None' for categorical features.
# A string placeholder keeps these columns purely string-valued; filling with the
# integer 0 (as before) contradicted the stated intent and mixed ints into
# string columns.
for col in ('Alley', 'Fence', 'FireplaceQu', 'GarageQual', 'GarageFinish', 'GarageCond', 'GarageType',
            'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType2', 'BsmtFinType1'):
    all_data[col] = all_data[col].fillna('None')

In [None]:
# Show the rows with a missing GarageCars value, then treat a missing garage size as 0.
print(all_data[all_data['GarageCars'].isnull()][['GarageArea', 'GarageCars', 'GarageType', 'GarageYrBlt', 'GarageQual']])
# Assign the result back: calling fillna(inplace=True) on a selected column acts on
# a temporary object and does not reliably update all_data (it has no effect under
# pandas copy-on-write).
for col in ('GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)

In [None]:
# Show the rows with a missing TotalBsmtSF value, then set missing basement
# areas and bathroom counts to 0.
bsmt_numeric_cols = ['TotalBsmtSF', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFinSF1', 'BsmtFullBath','BsmtHalfBath']
print(all_data[all_data['TotalBsmtSF'].isnull()][bsmt_numeric_cols])
all_data[bsmt_numeric_cols] = all_data[bsmt_numeric_cols].fillna(0)

In [None]:
# A missing masonry veneer type is treated as the category 'None'.
# Assign the result back: fillna(inplace=True) on a selected column acts on a
# temporary object and does not reliably update all_data (it has no effect under
# pandas copy-on-write), which would leave NaN rows flagged as having a veneer.
all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')
# 1 when the house has any masonry veneer, 0 otherwise.
all_data['HasMasVnr'] = all_data['MasVnrType'].apply(lambda x: 0 if x == 'None' else 1)

In [None]:
# Split the combined frame back into its training and test parts (training rows come first).
# Take explicit copies: columns are assigned to both frames afterwards, and assigning
# into a slice of all_data is ambiguous in pandas (SettingWithCopyWarning).
X_train = all_data.iloc[:len(y_train), :].copy()
X_test = all_data.iloc[len(y_train):, :].copy()
print("Dataset shape:",'X_train', X_train.shape, 'y_train', y_train.shape, 'X_test', X_test.shape)

In [None]:
# Fill missing categorical values with the most frequent category, computed
# separately for the training and the test frame.
mode_filled_cols = ('Functional','Exterior1st','Electrical','KitchenQual','SaleType','Exterior2nd')
for frame in (X_train, X_test):
    # MSZoning: most frequent zoning within the same MSSubClass.
    frame['MSZoning'] = frame.groupby(['MSSubClass'])['MSZoning'].transform(
        lambda zoning: zoning.fillna(zoning.mode()[0]))
    # Remaining columns: most frequent value of the whole column.
    for col in mode_filled_cols:
        frame[col] = frame[col].fillna(frame[col].mode()[0])

In [None]:
# Group-wise imputations, applied to the training and the test frame independently.
for frame in (X_train, X_test):
    # Missing LotFrontage -> median LotFrontage of the same neighborhood.
    frame['LotFrontage'] = frame.groupby('Neighborhood')['LotFrontage'].transform(
        lambda frontage: frontage.fillna(frontage.median()))
    # GarageYrBlt is replaced for every row (not only missing ones) by the
    # midpoint of the construction and remodelling years.
    frame['GarageYrBlt'] = (frame['YearBuilt'] + frame['YearRemodAdd']) /2
    # Missing MasVnrArea -> median area of houses with the same veneer type.
    frame['MasVnrArea'] = frame.groupby(['MasVnrType'])['MasVnrArea'].transform(
        lambda area: area.fillna(area.median()))

In [None]:
# Derived features, built the same way for the training and the test frame.
for frame in (X_train, X_test):
    # Sum of construction year and remodelling year.
    frame['YrBltAndRemod'] = frame['YearBuilt'] + frame['YearRemodAdd']
    # Basement plus both floors.
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
    # Finished basement areas plus both floors.
    frame['TotalSqrFootage'] = (frame['BsmtFinSF1'] + frame['BsmtFinSF2'] +
                                frame['1stFlrSF'] + frame['2ndFlrSF'])
    # Half baths count as 0.5.
    frame['TotalBathrooms'] = (frame['FullBath'] + (0.5 * frame['HalfBath']) +
                               frame['BsmtFullBath'] + (0.5 * frame['BsmtHalfBath']))
    # All porch types plus the wood deck.
    frame['TotalPorchSF'] = (frame['OpenPorchSF'] + frame['3SsnPorch'] +
                             frame['EnclosedPorch'] + frame['ScreenPorch'] +
                             frame['WoodDeckSF'])

In [None]:
# Binary indicator columns: 1 when the source column is greater than 0, else 0.
presence_flags = {
    'has2ndfloor': '2ndFlrSF',
    'hasgarage': 'GarageArea',
    'hasbsmt': 'TotalBsmtSF',
    'hasfireplace': 'Fireplaces',
}
for frame in (X_train, X_test):
    for flag_name, source_col in presence_flags.items():
        frame[flag_name] = frame[source_col].apply(lambda x: 1 if x > 0 else 0)

In [None]:
# Report the shapes once the engineered features have been added.
print(
    "Dataset shape:",
    'X_train', X_train.shape,
    'y_train', y_train.shape,
    'X_test', X_test.shape,
)

In [None]:
import category_encoders as ce

# One-hot encode train and test together so both end up with the same columns,
# then split them again at the original boundary.
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
stacked_features = pd.concat([X_train, X_test], axis=0, sort=False)
encoded_data = ohe.fit_transform(stacked_features).reset_index(drop=True)

n_train_rows = len(y_train)
X_train = encoded_data.iloc[:n_train_rows, :]
X_test = encoded_data.iloc[n_train_rows:, :]
print(
    "Dataset shape:",
    'X_train', X_train.shape,
    'y_train', y_train.shape,
    'X_test', X_test.shape,
)

Testing for null values

In [None]:
# Remaining missing values per test column, largest first.
X_test.isna().sum().sort_values(ascending=False)

In [None]:
# Remove near-constant columns: a column is dropped when its most frequent value
# covers more than 99.94 % of the training rows.
len_X_train = len(X_train)
overfit = [
    column for column in X_train.columns
    if X_train[column].value_counts().iloc[0] / len_X_train * 100 > 99.94
]
# This column is always dropped as well.
overfit.append('MSZoning_C (all)')

# Convert to numpy arrays to avoid the XGB feature mismatch error - https://github.com/dmlc/xgboost/issues/2334
X_train = np.array(X_train.drop(overfit, axis=1).copy())
y_train = np.array(y_train)
X_test = np.array(X_test.drop(overfit, axis=1).copy())

print(
    "Dataset shape:",
    'X_train', X_train.shape,
    'y_train', y_train.shape,
    'X_test', X_test.shape,
)

Import the open-source algorithms to try out.

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import ElasticNet, Lasso, Ridge, ElasticNetCV, LassoCV, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from mlxtend.regressor import StackingCVRegressor

In [None]:
# Shared 10-fold splitter (shuffled, fixed seed) used for every cross-validated score below.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

def rmsle(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    The notebook's target is already log-transformed, so the RMSE of these
    values is the competition's root mean squared *log* error.

    Parameters
    ----------
    y, y_pred : array-like of numbers with the same shape

    Returns
    -------
    float
        ``sqrt(mean((y - y_pred) ** 2))``.
    """
    # Computed directly with numpy: the previous body called `mse`, a name that is
    # never defined or imported, so any call raised NameError.
    residuals = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(np.square(residuals))))

def cv_rmse(model, X_train=X_train):
    """Cross-validated RMSE of ``model``, one value per fold.

    Scores ``model`` on ``X_train`` against the notebook-level ``y_train`` with the
    shared ``kfolds`` splitter and converts the negated MSE of each fold to an RMSE.
    The default for ``X_train`` is the array bound when this function is defined.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train,
        scoring="neg_mean_squared_error",
        cv=kfolds,
    )
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# Candidate hyper-parameter values searched by the *CV linear models.
# Ridge regularisation strengths
alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9, 15,
    15.1, 15.2, 15.3, 15.4, 15.5,
]
# Lasso regularisation strengths
alphas2 = [
    5e-05, 0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007, 0.0008,
]
# Elastic net regularisation strengths and L1/L2 mixing ratios
e_alphas = [
    0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007,
]
e_l1ratio = [
    0.8, 0.85, 0.9,
    0.95, 0.99, 1,
]

In [None]:
# Base learners. The linear models and the SVR are wrapped in a pipeline with a
# RobustScaler; the tree ensembles are used on the unscaled features.

#ridge
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

#lasso
# max_iter must be an integer: recent scikit-learn releases validate constructor
# parameters and reject the float 1e7.
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=10000000, alphas=alphas2, random_state=42, cv=kfolds))

#elastic net
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=10000000, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))

#svm
svr = make_pipeline(RobustScaler(), SVR(
    C=20,
    epsilon=0.009,
    gamma=0.0003,
))

#GradientBoosting
gbr = GradientBoostingRegressor(n_estimators=3000,
                                learning_rate=0.05,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)

#lightgbm
lightgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
    #min_data_in_leaf=2,
    #min_sum_hessian_in_leaf=11
)

#xgboost reg:squarederror replacing reg:linear
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=3460,
                       max_depth=5,
                       min_child_weight=0,
                       gamma=0,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006)

In [None]:
# StackingCVRegressor: a 'Stacking Cross-Validation' regressor for scikit-learn estimators.
# Six base regressors feed an XGBoost meta-regressor, which also receives the original features.
base_regressors = (ridge, lasso, elasticnet, gbr, xgboost, lightgbm)
stack_gen = StackingCVRegressor(
    regressors=base_regressors,
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)

In [None]:
print('TEST score on CV')

# Each model with its report line; the line shows the mean and the standard
# deviation of the per-fold RMSE.
score_reports = [
    (ridge, "Ridge score: {:.4f} ({:.4f})"),
    (lasso, "Lasso score: {:.4f} ({:.4f})"),
    (elasticnet, "ElasticNet score: {:.4f} ({:.4f})"),
    (svr, "SVR score: {:.4f} ({:.4f})"),
    (lightgbm, "Lightgbm score: {:.4f} ({:.4f})"),
    (gbr, "GradientBoosting score: {:.4f} ({:.4f})"),
    (xgboost, "Xgboost score: {:.4f} ({:.4f})"),
]
for model, report_line in score_reports:
    score = cv_rmse(model)
    print(report_line.format(score.mean(), score.std()))
    # Blank line between reports.
    if model is not xgboost:
        print()

In [None]:
# Train the stacking model:
#   1.1 learn the first-level models
#   1.2 construct a training set for the second-level model
#   2.  train the second-level model
#   3.  re-learn the first-level models on the entire training set
print('Training Model')
# Fits the ensemble regressors and the meta-regressor.
stack_gen_model = stack_gen.fit(
    X_train,
    y_train,
)
print('Model Trained')

In [None]:
# Predict on the test set and undo the log1p transform of the target,
# rounding the prices down to whole numbers.
print('Predict submission')
log_price_predictions = stack_gen_model.predict(X_test)
result = np.floor(np.expm1(log_price_predictions))

Submit results to Kaggle

In [None]:
# Assemble the submission table (test Ids with the predicted prices) and write it to CSV.
submission = pd.DataFrame({
    'Id': house_test['Id'],
    'SalePrice': result,
})
submission.head()
submission.to_csv("houseprice_submission.csv", index=False)