# Table of Contents

[01. Import Library](#01)<br>
[02. Load Data](#02)<br>
[03. Exploratory Data Analysis (EDA)](#03)<br>
&nbsp;&nbsp;&nbsp;[3.1. Dependent Variable](#3.1)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Same listing as above, but starting from the /kaggle root instead of /kaggle/input.
for folder, _, entries in os.walk('/kaggle'):
    full_paths = [os.path.join(folder, entry) for entry in entries]
    for full_path in full_paths:
        print(full_path)


# 01. Import Library<a id='01'></a>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Keep seaborn's current colour palette in `color`.
color = sns.color_palette()
# Use seaborn's 'darkgrid' style for the plots in this notebook.
sns.set_style('darkgrid')

import plotly_express as px

from scipy import stats
from scipy.stats import norm, skew 

import pandas_profiling

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# 02. Load Data <a id='02'></a>

In [None]:
# Show which files ship with the competition dataset.
print("List of files:", os.listdir('/kaggle/input/house-prices-advanced-regression-techniques'))

# Train data
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
print("\nTrain data length:",df_train.shape)
print("\nTrain data columns:",df_train.columns)
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() emitted the report first and then "... None".
# Print the heading, then let info() write its own output.
print("\nTrain data info:")
df_train.info()
print("\nTrain data:\n\n",df_train.head())

In [None]:
# Read the competition test split and report its (rows, columns) shape.
test_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
df_test = pd.read_csv(test_csv_path)
print("\nTest data length:", df_test.shape)

# 03. Exploratory Data Analysis (EDA)<a id='03'></a>

In [None]:
# Pairwise correlation between the numeric columns of the training data.
# The frame also holds object (string) columns; recent pandas versions no
# longer drop those silently inside corr() and raise instead, so restrict the
# frame to numeric columns explicitly. This works on old and new pandas alike.
df_train_corr = df_train.select_dtypes(include='number').corr()
df_train_corr

In [None]:
# Render the correlation table with a 'coolwarm' background gradient;
# axis=None applies a single colour scale across the whole table.
df_train_corr.style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# SalePrice has highest corr with OverallQual
# Show only those two columns of the correlation table, colour-coded.
focus_columns = ['SalePrice', 'OverallQual']
df_train_corr[focus_columns].style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# Use the pandas-profiling report
# df_train.profile_report()

## 3.1. Dependent Variable<a id='3.1'></a>

In [None]:
# Summary statistics of the dependent variable.
df_train['SalePrice'].describe()

In [None]:
# Histogram of SalePrice through the pandas plotting interface.
ax = df_train['SalePrice'].plot(kind='hist', bins=100, alpha=0.6)

### Use matplotlib

In [None]:
# Histogram of SalePrice drawn with matplotlib's pyplot interface.

# plt.style.use('ggplot')
plt.hist(df_train['SalePrice'], bins = 100)

# Add title and axis names.
# In a histogram the measured value (price) runs along x and the bin counts
# along y; the original labels were swapped.
plt.title('Sales Price')
plt.xlabel('Price')
plt.ylabel('Frequency')


plt.show()

In [None]:
# Scatter plot of living area against sale price, using the Axes API.
fig, ax = plt.subplots()
ax.scatter(x=df_train['GrLivArea'], y=df_train['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=12)
ax.set_xlabel('GrLivArea', fontsize=12)
ax.set_title('Sale Price', fontsize=16)
plt.show()

In [None]:
# Probability (QQ) plot of SalePrice; scipy draws it through pyplot.
fig, ax = plt.subplots()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()

### Use plotly_express

In [None]:
# GrLivArea against SalePrice, with points coloured by OverallQual.
px.scatter(
    data_frame=df_train,
    x='GrLivArea',
    y='SalePrice',
    color='OverallQual',
)

In [None]:
# TotalBsmtSF against SalePrice, with points coloured by OverallQual.
px.scatter(
    data_frame=df_train,
    x='TotalBsmtSF',
    y='SalePrice',
    color='OverallQual',
)

In [None]:
# SalePrice distribution per OverallQual level; rows are sorted by
# OverallQual before plotting.
overallqual_prices = df_train[['OverallQual', 'SalePrice']].sort_values(by='OverallQual')
px.box(
    overallqual_prices,
    x='OverallQual',
    y='SalePrice',
    color='OverallQual',
)

In [None]:
# SalePrice distribution per SaleCondition; rows are sorted by SaleCondition
# before plotting.
salecondition_prices = df_train[['SaleCondition', 'SalePrice']].sort_values(by='SaleCondition')
px.box(
    salecondition_prices,
    x='SaleCondition',
    y='SalePrice',
    color='SaleCondition',
)

In [None]:
# SalePrice distribution per ExterQual; rows are sorted by the ExterQual
# label before plotting.
exterqual_prices = df_train[['ExterQual', 'SalePrice']].sort_values(by='ExterQual')
px.box(
    exterqual_prices,
    x='ExterQual',
    y='SalePrice',
    color='ExterQual',
)

### Use seaborn

In [None]:
# Distribution of SalePrice with a fitted normal curve overlaid.
# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept here
# because the call relies on its `fit=` argument.
# distplot returns the Axes it drew on; keep it so the legend, label and title
# below are applied to that same Axes.
ax = sns.distplot(df_train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df_train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Plot the distribution
# Raw string: `\m` and `\s` are not valid Python escapes, and mathtext needs
# the backslashes to reach matplotlib unchanged.
ax.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
          loc='upper right')

# The original called plt.axes() here, which on current matplotlib adds a new
# empty Axes to the figure, so the label and title were not applied to the
# distribution plot.
ax.set_ylabel('Frequency')
ax.set_title('SalePrice distribution')


In [None]:
# Heatmap of the correlation matrix, with every column name as a tick label
# on both axes.
corr_labels = df_train_corr.columns.values
sns.heatmap(df_train_corr, xticklabels=corr_labels, yticklabels=corr_labels)

# Independent Variables

In [None]:
# Summary statistics of every column except the target, one row per column.
df_train.drop(columns=['SalePrice']).describe().transpose()

In [None]:
# Clean outliers in two passes, reporting the row count after each pass.
print("Length of data before dropping outliers:", len(df_train))

# Pass 1: living area above 4000 combined with a sale price below 300000.
big_area_low_price = (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)
df_train = df_train[~big_area_low_price]
print("Length of data after dropping outliers:", len(df_train))

# Pass 2: living area above 5000 or sale price above 500000.
very_big_or_expensive = (df_train['GrLivArea'] > 5000) | (df_train['SalePrice'] > 500000)
df_train = df_train[~very_big_or_expensive]
print("Length of data after dropping outliers:", len(df_train))

In [None]:
# Split the column names by dtype: non-object columns are treated as
# quantitative, object columns as qualitative.
column_dtypes = df_train.dtypes

# Quantitative Variables (the target and the row identifier are excluded)
quan_var = [col for col, kind in column_dtypes.items() if kind != 'object']
for excluded in ('SalePrice', 'Id'):
    quan_var.remove(excluded)
print("Quantitative Variables:\n", quan_var)

# Qualitative Variables
qual_var = [col for col, kind in column_dtypes.items() if kind == 'object']
print("\nQualitative Variables:\n", qual_var)

In [None]:
# Combine all data: stack train and test rows into one frame (target removed)
# so that later encoding produces the same columns for both splits.
ntrain = df_train.shape[0]
ntest = df_test.shape[0]
y_train = df_train.SalePrice.values
df_all_data = (
    pd.concat((df_train, df_test))
    .reset_index(drop=True)
    .drop(columns=['SalePrice'])
)
print("all_data size is : {}".format(df_all_data.shape))

# Calculate missing data ratio (percent of rows that are null, per column).
# Columns without any missing value are dropped, and at most the 50 columns
# with the highest ratio are kept.
df_all_data_na = (df_all_data.isnull().sum() / len(df_all_data)) * 100
df_all_data_na = df_all_data_na.drop(df_all_data_na[df_all_data_na == 0].index).sort_values(ascending=False)[:50]
missing_data = pd.DataFrame({'Missing Ratio' :df_all_data_na})
print('Missing data percentage:\n',missing_data.head(50))

# Plot
f, ax = plt.subplots(figsize=(15, 12))
# Rotation must be numeric: current matplotlib rejects the string '90'.
ax.tick_params(axis='x', labelrotation=90)
ax.set_facecolor("white")
# The original called sns.color_palette('pastel') after plotting; that only
# returns a palette object and never affected the figure, so it is dropped.
sns.barplot(x=df_all_data_na.index, y=df_all_data_na, ax=ax)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Percent of missing values', fontsize=12)
ax.set_title('Percent missing data by feature', fontsize=15)

# Prediction

In [None]:
# Empty results table; each model cell below adds one row with its metrics.
df_result = pd.DataFrame(columns=['Model','RMSE','MSE','Summary'])
df_result

## Baseline Model

### Linear Regression

In [None]:
# Run Linear Regression on a single variable that has the highest corr with dependent variable
X = df_train[['OverallQual']]
y = df_train['SalePrice']

# Train Test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# MSE is computed once; RMSE is derived from it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
df_result = pd.concat([
    df_result,
    pd.DataFrame([['Linear Regression', rmse, mse, 'Baseline model']],
                 columns=df_result.columns),
])
print(df_result)

### Random Forest

In [None]:
# RandomForestRegressor baseline, fitted on the X_train / y_train split that
# is current when this cell runs.
rf = RandomForestRegressor(random_state=10)
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)

# Compute this model's own MSE. The original reused the `mse` left over from
# the linear regression cell, so the wrong value was recorded for this model.
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
df_result = pd.concat([
    df_result,
    pd.DataFrame([['RandomForestRegressor', rmse, mse, 'Baseline model']],
                 columns=df_result.columns),
])
print(df_result)

### Features - Missing Ratio

In [None]:
# Get the list of variable based on missing data ratio
# NOTE(review): `missing_data` appears to list only columns that have at least
# one missing value, so columns with no missing data would be left out here
# even though they also satisfy "less than 50% missing" - confirm intended.
features_for_reg = missing_data[missing_data['Missing Ratio']<50].index.values.tolist()


# Get Dummies (one-hot encode object columns), then replace remaining NaNs with 0
X_all = pd.get_dummies(df_all_data[features_for_reg]).fillna(0)

# The first len(df_train) rows of the combined frame are the training rows.
X = X_all[0:len(df_train)]
y = df_train['SalePrice']

# Initiate train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

rf = RandomForestRegressor(random_state=3)
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)

# Compute this model's own MSE; the original reused a stale `mse` from an
# earlier cell, so the recorded MSE did not belong to this model.
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
df_result = pd.concat([
    df_result,
    pd.DataFrame([['RandomForestRegressor', rmse, mse,
                   'Features with less than 50% missing data']],
                 columns=df_result.columns),
])


# Calculate feature importances
importances = rf.feature_importances_
# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]
# Rearrange feature names so they match the sorted feature importances
names = X_train.columns[indices].tolist()

print(names)


### Features - Importance

In [None]:
# Get the list of variable based on rf feature importance: the first 45
# entries of `names`, which is ordered by descending importance.
features_for_reg = names[:45]


# Run Random Forest on the selected columns only.
# fillna returns a new frame, which avoids an in-place fill on a column subset.
X_all = X_all[features_for_reg].fillna(0)

# The first len(df_train) rows of the combined frame are the training rows.
X = X_all[0:len(df_train)]
y = df_train['SalePrice']

# Initiate train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

rf = RandomForestRegressor(random_state=3)
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)

# Compute this model's own MSE; the original reused a stale `mse` from an
# earlier cell, so the recorded MSE did not belong to this model.
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
df_result = pd.concat([
    df_result,
    pd.DataFrame([['RandomForestRegressor', rmse, mse,
                   'Important features based on RF']],
                 columns=df_result.columns),
])


# Calculate feature importances
importances = rf.feature_importances_
# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]
# Rearrange feature names so they match the sorted feature importances
names = X_train.columns[indices].tolist()

print(names)


### Feature Engineering

In [None]:
# New feature: product of overall quality, garage area and living area.
# Rows where any factor is missing get NaN here and are filled with 0 below.
df_all_data["OverallQual_Garage_GrLivArea"] = df_all_data["OverallQual"] * df_all_data["GarageArea"] * df_all_data["GrLivArea"]

# Get Dummies over every column of the combined frame, then replace NaNs with 0
X_all = pd.get_dummies(df_all_data).fillna(0)

# The first len(df_train) rows of the combined frame are the training rows.
X = X_all[0:len(df_train)]
y = df_train['SalePrice']

# Initiate train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

rf = RandomForestRegressor(random_state=3)
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)

# Compute this model's own MSE; the original reused a stale `mse` from an
# earlier cell, so the recorded MSE did not belong to this model.
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on both
# old and new pandas versions.
df_result = pd.concat([
    df_result,
    pd.DataFrame([['RandomForestRegressor', rmse, mse, 'Features engineering']],
                 columns=df_result.columns),
])


# Calculate feature importances
importances = rf.feature_importances_
# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]
# Rearrange feature names so they match the sorted feature importances
names = X_train.columns[indices].tolist()

print(len(names))
print(names)

In [None]:
# Show the metrics collected from the model cells above.
df_result

### Prediction Submission

In [None]:
# Predict on the rows of X_all that follow the training rows, using the most
# recently fitted random forest.
first_test_row = len(df_train)
X_test = X_all.iloc[first_test_row:]
y_pred_rf = rf.predict(X_test)

In [None]:
# Submission: pair each test Id with its predicted SalePrice and write a CSV
# without the index column.
sub = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': y_pred_rf,
})
sub.to_csv('submission.csv', index=False)