In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [2]:
# Load DataSet

In [3]:
# Read the Kaggle train/test CSVs, using the 'Id' column as the row index.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv',index_col='Id')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv',index_col='Id')

# Show both shapes as the cell output.
train.shape, test.shape

In [4]:
# Preview the first rows of the training frame.
train.head()

In [5]:
# Concat the train and test
# Train rows come first, then test rows; the modelling cells later slice the
# combined data by len(train) to separate them again.
df = pd.concat((train,test))
df.shape

# Exploratory Data Analysis (EDA)

In [6]:
# Preview the first rows of the combined frame.
df.head()

In [7]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [8]:
# Summary statistics of the combined frame.
df.describe()

In [9]:
# Names of the int64/float64 columns.
df.select_dtypes(include=['int64','float64']).columns

In [10]:
# Names of the object-dtype columns.
df.select_dtypes(include=['object']).columns

In [11]:
#Show the null values using heatmap
# df.isnull() is a boolean frame with the same shape as df, so each heatmap
# cell corresponds to one (row, column) entry.
plt.figure(figsize=(16,9))
sns.heatmap(df.isnull())

The white cells in the heatmap mark the null values.

In [12]:
# Get the percentages of null value
# Per-column null count divided by the number of rows in df, times 100.
null_percent = df.isnull().sum()/df.shape[0]*100
null_percent

In [13]:
# Drop every column whose share of missing values is strictly greater than 20%.
# NOTE(review): `df` stacks train and test. If the test rows carry no SalePrice,
# that column is roughly half null here and is dropped as well, which would be
# what keeps the target out of the feature matrix built later. Confirm this is
# intended rather than accidental.
col_for_drop = null_percent[null_percent > 20].keys()
# Pass the labels through the `columns=` keyword: the positional axis argument
# (`df.drop(labels, "columns")`) is no longer accepted by recent pandas.
df = df.drop(columns=col_for_drop)
df.shape

In [14]:
# Print "<column>\t<number of distinct values>" for every column.
# NaN is counted as a distinct value.
for col in df.columns:
    print(f"{col}\t{df[col].nunique(dropna=False)}")

In [15]:
# Plot the distplot of target
plt.figure(figsize=(10,8))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; check the
# installed version before relying on it.
bar = sns.distplot(train["SalePrice"])
# Use the sample skewness of SalePrice as the legend label.
bar.legend(["Skewness: {:.2f}".format(train['SalePrice'].skew())])

In [16]:
# correlation heatmap
# Correlate the numeric columns only. `train` also holds object columns, and
# recent pandas raises on them instead of silently skipping them in .corr().
plt.figure(figsize=(25,25))
ax = sns.heatmap(train.select_dtypes(include='number').corr(), cmap = "coolwarm", annot=True, linewidth=2)

# to fix the bug "first and last row cut in half of heatmap plot"
# (widens the y-limits by half a cell on each side)
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [51]:
# correlation heatmap of highly correlated features with SalePrice
# Correlate the numeric columns only. `train` also holds object columns, and
# recent pandas raises on them instead of silently skipping them in .corr().
hig_corr = train.select_dtypes(include='number').corr()
# Keep features whose absolute correlation with SalePrice is at least 0.4
# (SalePrice itself is included, since it correlates 1.0 with itself).
hig_corr_features = hig_corr.index[abs(hig_corr["SalePrice"]) >= 0.4]
print(hig_corr_features)

plt.figure(figsize=(10,8))
ax = sns.heatmap(train[hig_corr_features].corr(), cmap = "coolwarm", annot=True, linewidth=3)
# to fix the bug "first and last row cut in half of heatmap plot"
# (widens the y-limits by half a cell on each side)
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [18]:
# Regression plots of the first ten highly correlated features against
# SalePrice, laid out on a 3x4 grid.
plt.figure(figsize=(16,9))
plt.subplots_adjust(hspace = 0.5, wspace = 0.5)
for position, feature in enumerate(hig_corr_features[:10], start=1):
    plt.subplot(3, 4, position)
    sns.regplot(data=train, x = feature, y = 'SalePrice')

# Handling Missing Value

Handling missing values of LotFrontage, the masonry veneer (MasVnr) features and the basement (Bsmt) features

In [19]:
# Null count per remaining column, before imputation.
df.isnull().sum()

In [20]:
# Impute missing LotFrontage with the column mean (computed over train + test rows).
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: that
# form operates on an intermediate Series and does not reliably update `df`
# (it is a no-op under pandas copy-on-write).
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

In [21]:
def fill_null(values):
    """Fill a (type, area) row whose type is missing with ("None", 0).

    Intended for ``DataFrame.apply(..., axis=1)`` on a two-column frame.

    Parameters
    ----------
    values : pandas.Series
        One row; position 0 is the type, position 1 is the area.

    Returns
    -------
    pandas.Series
        ``values`` unchanged when the type is present, otherwise a new Series
        with the same labels holding ``"None"`` and ``0``.

    Both branches return a Series so that ``apply`` expands the result into
    columns no matter which row comes first; returning a bare tuple for some
    rows makes the output shape depend on the first row.
    """
    # Positional access via .iloc: the row is labelled by column names, so
    # plain `values[0]` relies on a deprecated integer-position fallback.
    if pd.isnull(values.iloc[0]):
        return pd.Series(["None", 0], index=values.index)
    return values

# Run fill_null on every (MasVnrType, MasVnrArea) row and write both columns back.
df[['MasVnrType','MasVnrArea']] = df[['MasVnrType','MasVnrArea']].apply(fill_null, axis=1)

In [22]:
# Impute these categorical columns with their most frequent value.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: that
# form operates on an intermediate Series and does not reliably update `df`
# (it is a no-op under pandas copy-on-write).
for col in ['BsmtQual', 'BsmtCond', 'BsmtExposure',
            'BsmtFinType1', 'BsmtFinType2', 'Electrical']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [23]:
# A missing garage attribute is treated as "no garage": the categorical columns
# get the label 'No' and the build year gets 0.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: that
# form operates on an intermediate Series and does not reliably update `df`
# (it is a no-op under pandas copy-on-write).
for col, fill_value in [('GarageType', 'No'),
                        ('GarageYrBlt', 0),
                        ('GarageFinish', 'No'),
                        ('GarageQual', 'No')]:
    df[col] = df[col].fillna(fill_value)
# GarageCond is similar to GarageQual, so there is no point in keeping both; it is dropped later.
# Exterior2nd is similar to Exterior1st, so it is dropped later as well.

In [24]:
# Remove the two columns judged redundant with GarageQual / Exterior1st.
df.drop(columns=['GarageCond', 'Exterior2nd'], inplace=True)

Handle the remaining features that contain null values

In [25]:
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: that
# form operates on an intermediate Series and does not reliably update `df`
# (it is a no-op under pandas copy-on-write).

# Categorical or count-like columns: fill with the most frequent value.
for col in ['MSZoning', 'Utilities', 'Exterior1st', 'BsmtFullBath',
            'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageCars',
            'SaleType']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Area-like columns: fill with the column mean.
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            'GarageArea']:
    df[col] = df[col].fillna(df[col].mean())



In [26]:
# Re-check the per-column null counts after imputation.
df.isnull().sum()

Now convert all object columns into numeric columns

In [27]:
# Collect the object-dtype columns; the next cell one-hot encodes them.
object_columns = df.select_dtypes(include=['object']).columns
object_columns

In [28]:
# One-hot encode all object columns in a single call. Encoding one column per
# loop iteration rebuilds the whole frame each time; a single call yields the
# same dummy columns, appended in the same order.
df = pd.get_dummies(df, columns=list(object_columns))

In [29]:
# Inspect dtypes and column count after one-hot encoding.
df.info()

# Scaling the dataset with RobustScaler, since the data may contain outliers

In [30]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the combined frame and replace `df` with the scaled values.
# After this cell `df` is the scaler's output, no longer the original DataFrame.
scaler = RobustScaler()
df = scaler.fit_transform(df)

# Machine Learning Model Building

In [31]:
# Number of training rows; used to split the stacked data back into train and test.
train_len = train.shape[0]

In [32]:
# The first `train_len` rows of the stacked data are training rows; the rest are test rows.
X_train, X_test = df[:train_len], df[train_len:]
# The target comes from the original training frame.
y_train = train['SalePrice']

X_train.shape, X_test.shape, y_train.shape

# Cross Validation

In [33]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, r2_score

def test_model(model, X_train=X_train, y_train=y_train):
    """Cross-validate ``model`` and return its mean R^2 in a one-element list.

    Uses a shuffled 3-fold split with a fixed seed (45), so repeated calls
    score every model on the same folds. R^2 = 1 - SS(res) / SS(tot).

    The default ``X_train`` / ``y_train`` are bound when this cell is executed;
    re-run the cell after rebuilding those variables.
    """
    folds = KFold(n_splits=3, shuffle=True, random_state=45)
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=folds, scoring=make_scorer(r2_score)
    )
    return [fold_scores.mean()]

# Linear Regression

In [34]:
import sklearn.linear_model as linear_model
# Ordinary least squares with default settings, scored with the CV helper.
LR = linear_model.LinearRegression()
test_model(LR)

# L2, Ridge regression

In [35]:
# Ridge regression with default settings, scored with the CV helper.
rdg = linear_model.Ridge()
test_model(rdg)

# L1, Lasso Regression

In [36]:
# Lasso regression with alpha=1e-4, scored with the CV helper.
lasso = linear_model.Lasso(alpha=1e-4)
test_model(lasso)

# Support Vector Machine

In [37]:
from sklearn.svm import SVR
# RBF-kernel SVR with otherwise default hyperparameters, scored with the CV helper.
svr_reg = SVR(kernel= 'rbf')
test_model(svr_reg)

The R² score is very low, so let's tune the hyperparameters of this model.

In [38]:
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 
# params = {'kernel': ['linear', 'rbf', 'sigmoid'], 
#           'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
#           'C': [0.1, 1, 10, 100, 1000], 
#           'epsilon': [1, 0.2, 0.1, 0.01, 0.001, 0.0001]}

# rand_search = RandomizedSearchCV(svr_reg, param_distributions=params, n_jobs=-1, cv=11) 
# rand_search.fit(X_train, y_train) 
# rand_search.best_params_

Note: The search took a long time to run, so it is commented out. The parameters used below are the ones it returned.

In [39]:
# SVR with hand-entered hyperparameters (the search that produced them is commented out above).
svr_reg= SVR(kernel='rbf', C=1000, epsilon=0.1, gamma=0.01)
test_model(svr_reg)

Even after tuning, the score did not improve much.

# Decision Tree Regressor

In [40]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with a fixed seed, scored with the CV helper.
dt_reg = DecisionTreeRegressor(random_state=21)
test_model(dt_reg)

# Random Forest Regressor

In [41]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 1000 trees and a fixed seed, scored with the CV helper.
rf_reg = RandomForestRegressor(n_estimators = 1000, random_state=51)
test_model(rf_reg)

# Bagging and Boosting

In [43]:
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
# Bagging ensemble with 1000 estimators and a fixed seed.
br_reg = BaggingRegressor(n_estimators=1000, random_state=51)
# Gradient boosting with the default (squared-error) loss. The explicit
# `loss='ls'` spelling was removed from scikit-learn; since it named the
# default loss, omitting it keeps the model unchanged on every version.
gbr_reg = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, random_state=51)

In [44]:
# Mean cross-validated R^2 of the bagging regressor.
test_model(br_reg)

In [45]:
# Mean cross-validated R^2 of the gradient boosting regressor.
test_model(gbr_reg)

# XG Boost

In [46]:
import xgboost

# XGBoost regressor with default parameters, scored with the CV helper.
xgb_reg = xgboost.XGBRegressor()
test_model(xgb_reg)

Gradient boosting regression has given the best score so far, so let's submit its predictions for now.

# Submitting Prediction

In [47]:
# Refit gradient boosting on all training rows, then predict the test rows,
# rounded to two decimals.
gbr_reg.fit(X_train, y_train)
y_pred = gbr_reg.predict(X_test).round(2)
y_pred

In [48]:
# Test-set Ids as a Series with a fresh 0..n-1 index.
ID = pd.Series(test.index)
ID

In [49]:
# Pair every test Id with its predicted price in a two-column frame.
submit_result = pd.DataFrame({'Id': ID, 'SalePrice': y_pred})
submit_result

In [50]:
# Write the submission file without the positional index column.
submit_result.to_csv('submission.csv', index=False)
submit_result