In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
import warnings

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Loading The Data

In [None]:
# Locations of the competition CSV files.
train_filepath = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_filepath = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

# Read both files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_filepath, test_filepath)
)

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

# 2. Inspect the Dataset

In [None]:
# Summary statistics of the training set.
train_data.describe()

In [None]:
# Summary statistics of the test set.
test_data.describe()

In [None]:
# List the column names of the training set.
train_data.columns

In [None]:
# List the column names of the test set.
test_data.columns

### Inspect the label 

# Before we start, we set aside the training label using a copy of train_data, so it will not be affected by the data preprocessing

In [None]:
# Take a snapshot of the training frame and read the target (Y) from it, so the
# column drops applied to train_data later do not touch the labels.
train_data_copy = train_data.copy(deep=True)
Y = train_data_copy['SalePrice']

In [None]:
# Plot the label distribution with a fitted normal curve overlaid.
# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept here
# because it draws the fitted curve directly through `fit=norm`.
sns.distplot(train_data['SalePrice'] , fit=norm);

# Fit a normal distribution to SalePrice to get the parameters shown in the legend
(mu, sigma) = norm.fit(train_data['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Label the plot. The legend text is a raw string: "\m" and "\s" are not valid
# Python escape sequences, so a plain string literal triggers an
# invalid-escape warning even though the resulting text is the same.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Draw the QQ-plot on its own figure
fig = plt.figure()
res = stats.probplot(train_data['SalePrice'], plot=plt)
plt.show()

Investigate the dataset to see which features correlate with SalePrice

In [None]:
# Keep only the non-object columns of both frames to make the process easier.
train_data, test_data = (
    frame.select_dtypes(exclude=['object']) for frame in (train_data, test_data)
)

In [None]:
#Look for correlation between feature and label using regression plot

def get_plot(a, b):
    """Draw one regression plot of SalePrice against each selected feature.

    The features are the columns of the global ``train_data`` at positions
    ``a`` (inclusive) to ``b`` (exclusive). Each feature gets its own
    ``sns.lmplot`` figure with the feature on the x-axis.
    """
    selected_columns = train_data.columns[a:b]
    for column_name in selected_columns:
        sns.lmplot(data=train_data, x=column_name, y='SalePrice')
        plt.xlabel(column_name)
        plt.ylabel('SalePrice')
        
# Plot the columns at positions 0 through 9.
get_plot(0, 10)

In [None]:
# Plot the columns at positions 10 through 19. The range starts at 10 because
# position 9 is already covered by the preceding get_plot(0, 10) call.
get_plot(10, 20)

In [None]:
# Plot the columns at positions 20 up to (but not including) 37.
get_plot(20,37)

From the plots above, we can decide which features to drop from the training data

In [None]:
# List the columns that remain after keeping only the non-object dtypes.
train_data.columns

In [None]:
# Drop the identifier, the target and the features selected for removal from
# the regression plots. The target was already stored separately in Y.
train_columns_to_drop = [
    'Id', 'SalePrice', 'OverallCond', 'BsmtFinSF2', 'LowQualFinSF',
    'BsmtHalfBath', 'HalfBath', 'PoolArea', 'MiscVal',
]
train_data = train_data.drop(columns=train_columns_to_drop)


In [None]:
# Preview the training features after the column drops.
train_data.head()

## Using the same method, we drop the same features from the test data

In [None]:
# Keep a copy that still contains the Id column; it is read again when the
# submission file is built.
test_data_copy = test_data.copy()

In [None]:
# Drop the identifier and the same features that were removed from the
# training data (the test set has no SalePrice column to drop).
test_columns_to_drop = [
    'Id', 'OverallCond', 'BsmtFinSF2', 'LowQualFinSF',
    'BsmtHalfBath', 'HalfBath', 'PoolArea', 'MiscVal',
]
test_data = test_data.drop(columns=test_columns_to_drop)

In [None]:
# Preview the test features after the column drops.
test_data.head()

# 3. Feature Engineering on Train data

Inspect the features with missing values

In [None]:
# Build the missing-value mask once, then summarise it per column:
# a boolean flag (any value missing?) and the count of missing values.
train_missing_mask = train_data.isnull()
col_with_missing = train_missing_mask.any()
col_with_missing_sum = train_missing_mask.sum()

In [None]:
# Per-column flag: True where the column has at least one missing value.
col_with_missing

In [None]:
# Per-column count of missing values.
col_with_missing_sum

Two ways of handling missing values are dropping and imputing features. We will drop the LotFrontage feature and impute MasVnrArea and GarageYrBlt

In [None]:
# Drop the LotFrontage feature from the training data.
train_data = train_data.drop(columns=['LotFrontage'])

In [None]:
# Impute the remaining missing values (MasVnrArea and GarageYrBlt) with the column mean
from sklearn.impute import SimpleImputer

myimputer = SimpleImputer(strategy='mean')

# The fit_transform result is wrapped in a new DataFrame, so the original
# column names are re-attached at construction time.
imputed_train_values = myimputer.fit_transform(train_data)
imputed_train_data = pd.DataFrame(imputed_train_values, columns=train_data.columns)



## After dropping and imputing the missing values, we have a clean training set. We still need to scale the training data, which can help improve the model accuracy

In [None]:
# Show the shape of the imputed training frame, then preview its first rows.
print(imputed_train_data.shape)
imputed_train_data.head()

In [None]:
#Labels data
# Preview the first rows of the target Series. The earlier bare
# `pd.DataFrame(Y)` expression was removed: its result was neither stored nor
# displayed, so it had no effect.
Y.head()

Scale the data using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

#scaling the numerical features
def ScaleNumerical(data, head_only=True):
    """Min-max scale every column of ``data`` and return the result as a DataFrame.

    A fresh ``MinMaxScaler`` is fitted on ``data`` itself on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all passed to ``MinMaxScaler``.
    head_only : bool, default True
        When True (the original behaviour) only the first rows of the scaled
        frame are returned, which is useful as a quick preview. Pass False to
        get the complete scaled frame.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column names as ``data``. The frame is
        built from the scaler's output without an explicit index, so it does
        not carry over the index of ``data``.
    """
    scale = MinMaxScaler()
    data_scaled = scale.fit_transform(data)
    scaled_frame = pd.DataFrame(data_scaled, columns=data.columns)
    return scaled_frame.head() if head_only else scaled_frame

# Displays a scaled preview only; the return value is not stored, so
# imputed_train_data itself is left unscaled.
ScaleNumerical(imputed_train_data)

### -------------------------------------------------------------------------------------

Feature Engineering for the Test Data

In [None]:
# Build the test-set missing-value mask once, then summarise it per column:
# a boolean flag (any value missing?) and the count of missing values.
test_missing_mask = test_data.isnull()
col_with_missing_test = test_missing_mask.any()
col_with_missing__test_sum = test_missing_mask.sum()

In [None]:
# Drop the LotFrontage feature from the test data.
test_data = test_data.drop(columns=['LotFrontage'])

In [None]:
# Impute the missing values in the test data.
# Reuse `myimputer`, which was fitted on the training data, and only call
# transform() here. Fitting a second imputer on the test set would fill test
# rows with test-set means, so train and test would be preprocessed with
# different statistics.
imputed_test_values = myimputer.transform(test_data)

# transform() output is wrapped in a new DataFrame; re-attach the column names.
imputed_test_data = pd.DataFrame(imputed_test_values, columns=test_data.columns)


In [None]:
# Scaling the numerical features of the test data.
# ScaleNumerical is already defined in the training section, so it is reused
# here instead of being defined a second time.
# Displays a scaled preview only; the return value is not stored, so
# imputed_test_data itself is left unscaled.
ScaleNumerical(imputed_test_data)

# 4. Split the data

We need a validation dataset, split from the training data, to validate the model before jumping right into the test set

In [None]:
# Feature matrices for modelling: the imputed frames are used as they are
# (the ScaleNumerical previews were never assigned back to them).
X = imputed_train_data
X_test = imputed_test_data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data for validation, using train_test_split's
# default split sizes and a fixed seed so the split is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, random_state=0)

# Report the shape of each piece of the split.
for shape_label, split_part in (
    ('X_train shape: ', X_train),
    ('X_val shape: ', X_val),
    ('Y_train shape: ', Y_train),
    ('Y_val shape: ', Y_val),
):
    print(shape_label, split_part.shape, '\n')

# 5. Build the model

Using Random Forest Regressor

In [None]:
#Fitting random forest regressor to the data
#Import the model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

#Create the model
forest_model = RandomForestRegressor(n_estimators = 150, max_depth=15, random_state = 0)

# Fit on the training split only. Fitting on the full X, Y here would let the
# model see the validation rows, making the validation error below
# misleadingly low.
forest_model.fit(X_train, Y_train)

#Evaluate the model with validation dataset
Y_val_pred = forest_model.predict(X_val)
print(mean_absolute_error(Y_val, Y_val_pred))

# Refit on all labelled data so the model used for the test-set predictions
# is trained on every available row.
forest_model.fit(X, Y)

In [None]:
#Predict the house value in test data using the regressor model
# X_test is the imputed test feature frame.
prediction = forest_model.predict(X_test)

In [None]:
# Pair each test Id (taken from the copy made before Id was dropped) with its
# predicted price and write the submission file.
submission_ids = test_data_copy['Id']
output = pd.DataFrame({'Id': submission_ids, 'SalePrice': prediction})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")