# Ames House Price Prediction

In this project, I predict the sale prices of houses in the Ames Housing dataset.

The dataset contains 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa.

In [None]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
def ignore_warn(*_args, **_kwargs):
    """Accept any positional or keyword arguments, do nothing, and return None."""
    return None

# Swap the no-op in for warnings.warn so that calls made through warnings.warn
# print nothing (the original note cites annoying sklearn and seaborn warnings).
warnings.warn = ignore_warn

from scipy import stats
from scipy.stats import norm, skew #for some statistics

In [None]:
# Read the training split, then print its column dtypes and non-null counts.
train_csv = '../input/train.csv'
df_train = pd.read_csv(train_csv)
df_train.info()

In [None]:
# Display pandas' default summary statistics for the training frame.
df_train.describe()

In [None]:
# Read the test split, then print its column dtypes and non-null counts.
test_csv = '../input/test.csv'
df_test = pd.read_csv(test_csv)
df_test.info()

In [None]:
# Pull the Id column out of each frame. pop() returns the column and removes it
# from the frame in one step, so the ids are kept for the final export while no
# longer being available as a predictive feature.
train_id = df_train.pop('Id')
test_id = df_test.pop('Id')

# Data Visualization and Processing

## 1) Visualizing and Removing Outliers

In [None]:
# Exploring outliers: above-ground living area against sale price

fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
ax.set_xlabel('GrLivArea', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)
plt.show()

The houses with GrLivArea > 4000 are extreme outliers; see the dataset documentation at http://ww2.amstat.org/publications/jse/v19n3/Decock/DataDocumentation.txt

In [None]:
# Removing outliers: drop every training row whose GrLivArea exceeds 4000

too_large = df_train['GrLivArea'] > 4000
df_train = df_train.drop(df_train[too_large].index)

# Redraw the scatter plot to confirm the extreme points are gone
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
ax.set_xlabel('GrLivArea', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)
plt.show()

## 2) Predicting Nature of Target Variable

In [None]:
# Histogram of SalePrice with a fitted normal curve overlaid
sns.distplot(df_train['SalePrice'] , fit=norm);

# Mean and standard deviation of the normal distribution fitted to SalePrice
(mu, sigma) = norm.fit(df_train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Label the fitted curve. The legend text is a raw string because '\m' and '\s'
# are not valid Python escape sequences; the backslashes belong to the mathtext
# markup ($\mu$, $\sigma$) and must reach matplotlib unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

### We can make two observations from the above plot:

1) SalePrice data is right-skewed and unimodal.

2) Most of the houses have SalePrice around 150000.

In [None]:
# Probability plot or QQ plot to see the linear fit of the SalePrice
# (scipy's probplot draws the sample quantiles and a fitted line onto `plt`)

fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()

The points do not follow the straight reference line, which shows that SalePrice is not normally distributed. We therefore apply a log transformation to the target variable so that it is closer to normal and better suited to linear models.

In [None]:
# Log Transformation of the Target Variable
# NOTE: SalePrice is overwritten in place, so running this cell a second time
# applies the transform twice. Re-run from the data-loading cell instead.
# From here on df_train['SalePrice'] holds log(1 + price), not the price itself.
df_train["SalePrice"]=np.log1p(df_train["SalePrice"]) # log(1+x)

In [None]:
# Plots after transformation

# Histogram of the log-transformed SalePrice with a fitted normal curve overlaid
sns.distplot(df_train['SalePrice'] , fit=norm);

# Mean and standard deviation of the normal distribution fitted to SalePrice
(mu, sigma) = norm.fit(df_train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Label the fitted curve. The legend text is a raw string because '\m' and '\s'
# are not valid Python escape sequences; the backslashes belong to the mathtext
# markup ($\mu$, $\sigma$) and must reach matplotlib unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Probability plot or QQ plot of the transformed SalePrice, on its own figure
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()

## 3) Handling Missing Data

In [None]:
# Stack the training rows on top of the test rows, renumber the rows from zero,
# remove the target column, and convert the remaining columns with get_dummies.
stacked = pd.concat((df_train, df_test)).reset_index(drop=True)
all_data = pd.get_dummies(stacked.drop(['SalePrice'], axis=1))
all_data.shape

In [None]:
# Fill every missing value with the mean of the column it sits in.
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

# Modelling the data

In [None]:
# Generating train and test sets
#
# all_data was built by stacking df_train on top of df_test, so the first
# n_train rows are the training houses and everything after them is the test
# set. (Slicing the first len(df_test) rows, as before, returned training rows
# again and the model was never applied to the real test houses.)

n_train = df_train.shape[0]
X_train = all_data[:n_train]
X_test = all_data[n_train:]

# Guard against the split drifting out of step with the source frames.
assert X_test.shape[0] == df_test.shape[0], "X_test must have one row per test house"

# Target for the training rows; SalePrice was log1p-transformed earlier.
y_train = df_train.SalePrice

In [None]:
# Importing libraries for modelling

from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, LinearRegression, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [None]:
# Cross Validation Strategy to pick the best model

def rmse_cv(model):
    """Return the per-fold RMSE of `model` under 5-fold cross-validation.

    The model is scored on the notebook-level X_train / y_train with
    scikit-learn's "neg_mean_squared_error" scorer; the scores are negated and
    square-rooted, giving one RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_squared_error", cv=5
    )
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# Linear regression: fit on the full training set, then report the mean CV RMSE
model_LinearReg = LinearRegression().fit(X_train, y_train)
rmse_LinearReg = np.mean(rmse_cv(model_LinearReg))
rmse_LinearReg

In [None]:
# RidgeCV: fit on the full training set, then report the mean CV RMSE
model_RidgeCV = RidgeCV().fit(X_train, y_train)
rmse_RidgeCV = np.mean(rmse_cv(model_RidgeCV))
rmse_RidgeCV

In [None]:
# ElasticNet: fit on the full training set, then report the mean CV RMSE
model_ElasticNet = ElasticNet().fit(X_train, y_train)
rmse_ElasticNet = np.mean(rmse_cv(model_ElasticNet))
rmse_ElasticNet

In [None]:
# ElasticNetCV: fit on the full training set, then report the mean CV RMSE
model_ElasticNetCV = ElasticNetCV().fit(X_train, y_train)
rmse_ElasticNetCV = np.mean(rmse_cv(model_ElasticNetCV))
rmse_ElasticNetCV

In [None]:
# LassoCV: fit on the full training set, then report the mean CV RMSE
model_lassoCV = LassoCV().fit(X_train, y_train)
rmse_lassoCV = np.mean(rmse_cv(model_lassoCV))
rmse_lassoCV

In [None]:
# LassoLarsCV: fit on the full training set, then report the mean CV RMSE
model_LassoLarsCV = LassoLarsCV().fit(X_train, y_train)
rmse_LassoLarsCV = np.mean(rmse_cv(model_LassoLarsCV))
rmse_LassoLarsCV

In [None]:
# KernelRidge: fit on the full training set, then report the mean CV RMSE
model_KernelRidge = KernelRidge().fit(X_train, y_train)
rmse_KernelRidge = np.mean(rmse_cv(model_KernelRidge))
rmse_KernelRidge

In [None]:
# RandomForestRegressor
# random_state is pinned so that the fitted forest and its cross-validated RMSE
# are identical on every Restart & Run All. Without a seed the forest is built
# from fresh random draws each run and the reported score changes.

model_RandomForest=RandomForestRegressor(random_state=42)
model_RandomForest.fit(X_train,y_train)
rmse_RandomForest=rmse_cv(model_RandomForest).mean()
rmse_RandomForest

In [None]:
# XGBoost
# Wrap the feature matrices in xgboost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Cross-validate a shallow booster; `model` receives whatever xgb.cv returns
# (its evaluation history is read in the next cell).
params = dict(max_depth=2, eta=0.1)
model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100)

# Final regressor fitted on all training rows. The original note states that
# these settings were tuned using xgb.cv.
model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
model_xgb.fit(X_train, y_train)

In [None]:
# Held-out (cross-validated) RMSE from the last boosting round kept by xgb.cv.
# The column is selected by name because its position is not stable across
# xgboost versions: in recent releases column 0 is 'train-rmse-mean', i.e. the
# training error, which is not comparable with the CV scores of other models.
# NOTE(review): the xgb.cv call above does not pass nfold, so this score uses
# xgboost's default fold count rather than the cv=5 used by rmse_cv.
rmse_xgb = model['test-rmse-mean'].iloc[-1]
rmse_xgb

###  Comparing RMSE of each Model

In [None]:
# Collect each model's RMSE under its display label, in plotting order.
# (rmse_LinearReg is computed above but is not part of this comparison.)
rmse_by_model = {
    'RidgeCV': rmse_RidgeCV,
    'ElasticNet': rmse_ElasticNet,
    'ElasticNetCV': rmse_ElasticNetCV,
    'LassoCV': rmse_lassoCV,
    'LassoLarsCV': rmse_LassoLarsCV,
    'KernelRidge': rmse_KernelRidge,
    'RandomForest': rmse_RandomForest,
    'XGBoost': rmse_xgb,
}
rmse_dict = {'data': list(rmse_by_model.values())}

# One-column frame indexed by model name
rmse_df = pd.DataFrame(data=rmse_dict, index=list(rmse_by_model.keys()))

# Bar chart of the scores (lower is better)
rmse_df.plot.bar(legend=False, title='Root Mean Square Error')

### Predicting Test Data

In [None]:
# Predict with the fitted XGBoost regressor. The model was trained on the
# log1p-transformed SalePrice, so these predictions are on that log scale too.
y_test = model_xgb.predict(X_test)

In [None]:
# Pair each saved test id with its prediction and preview the first rows.
hprice = pd.DataFrame({"id": test_id, "SalePrice": y_test})
hprice.head()

In [None]:
# Add the predictions and the saved ids back onto the test frame as the
# SalePrice and Id columns (appended in that order).
df_test = df_test.assign(SalePrice=y_test, Id=test_id)

# Data Visualization for Validation of the Predicted Test Data

In [None]:
# Predicted SalePrice against GrLivArea for the test frame

fig, ax = plt.subplots()
ax.scatter(df_test['GrLivArea'], df_test['SalePrice'])
ax.set_xlabel('GrLivArea', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)
plt.show()

In [None]:
# Histogram of the training SalePrice (log1p scale) with a fitted normal curve
sns.distplot(df_train['SalePrice'] , fit=norm);

# Mean and standard deviation of the normal distribution fitted to SalePrice
(mu, sigma) = norm.fit(df_train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Label the fitted curve. The legend text is a raw string because '\m' and '\s'
# are not valid Python escape sequences; the backslashes belong to the mathtext
# markup ($\mu$, $\sigma$) and must reach matplotlib unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

In [None]:
# Histogram of the SalePrice column of df_test (the model's predictions) with a
# fitted normal curve
sns.distplot(df_test['SalePrice'] , fit=norm);

# Mean and standard deviation of the normal distribution fitted to SalePrice
(mu, sigma) = norm.fit(df_test['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Label the fitted curve. The legend text is a raw string because '\m' and '\s'
# are not valid Python escape sequences; the backslashes belong to the mathtext
# markup ($\mu$, $\sigma$) and must reach matplotlib unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

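The histograms of the training SalePrice and the predicted test SalePrice have a similar shape. This is a sanity check rather than a measure of accuracy: matching distributions show that the predictions fall in a plausible range, while the cross-validated RMSE above is the estimate of how accurate they are.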

# Exporting the results to external CSV file

In [None]:
# The model was trained on log1p(SalePrice), so the SalePrice column of df_test
# holds log-scale predictions. Invert the transform with expm1 on a copy, so the
# exported file contains real prices while df_test itself is left unchanged.
submission = df_test[['Id', 'SalePrice']].copy()
submission['SalePrice'] = np.expm1(submission['SalePrice'])
submission.to_csv('Predicted_House_Price.csv', index=False)