In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#ML
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import *
# train_test_split comes from sklearn.model_selection (same module as KFold and
# cross_val_score below); the legacy sklearn.cross_validation module was removed
# in scikit-learn 0.20 and makes this cell fail on current installs.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import KFold # import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn import datasets, linear_model

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))
pd.set_option('display.max_columns', None)

# Data Analysis

## 1.1 Basic analysis of training data

In [None]:
# Training set
train = pd.read_csv("../input/train.csv")

In [None]:
# Data Type of each column
train.info()

In [None]:
# Stats such as mean, max, min for train data
train.describe()

In [None]:
# For categorical features
train.describe(include='O')

In [None]:
# All columns in the train data
train.columns

In [None]:
# Total number of columns
len(train.columns)

In [None]:
## Number of numerical features
numberic_features=train.select_dtypes(include=[np.number])
numberic_features.columns

In [None]:
len(numberic_features.columns)

In [None]:
numberic_features

In [None]:
## Number of categorical features
# Select by the 'object' dtype name: the np.object alias is deprecated and has
# been removed from recent NumPy releases, where np.object raises AttributeError.
categorical_features=train.select_dtypes(include=['object'])
categorical_features.columns

In [None]:
len(categorical_features.columns)

In [None]:
categorical_features

In [None]:
# Gives an idea of what the training data looks like
train.head()

## 1.2 Figure out how many missing values exist for each column

In [None]:
# Figure out how many null types exist
null_columns=train.columns[train.isnull().any()]
train[null_columns].isnull().sum()

#OUT OF 1460

In [None]:
# Percent of nulls
percent_null = ((train[null_columns].isnull().sum()/len(train[null_columns])) * 100).sort_values(ascending=False)

In [None]:
# plot missing + null vals
f, ax = plt.subplots(figsize=(15, 12))
sns.barplot(x=percent_null.index, y=percent_null)
# Rotate the feature names once the bars (and their tick labels) exist.
# The angle must be a number: the string '90' is rejected by current matplotlib.
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of null values + missing values', fontsize=15)

In [None]:
# You can also use .describe to check
train.describe()

## 1.3 Evaluate your target variable

In [None]:
# Stats for your target var
train['SalePrice'].describe()

In [None]:
# Evaluate the distribution of the target Variable
sns.distplot(train['SalePrice']);

In [None]:
# Measure skewness and Kurtosis -> Definition can be found here(http://www.pythonforfinance.net/2016/04/04/python-skew-kurtosis/)
print("Skewness: %f" % train['SalePrice'].skew())
print("Kurtosis: %f" % train['SalePrice'].kurt())

In [None]:
# A symmetrical dataset has a skewness of 0, so a normal distribution has a skewness of 0.
# A normal distribution has a kurtosis of 3, i.e. an excess kurtosis of 0 (pandas' .kurt() reports excess kurtosis).
print("Above data is not symmetrical")


## 1.4 Data Visualization

### There are 11 useful plots that you can use to visualize your data 
1. Scatter Plot
2. Box Plot
3. Histogram
4. Multivariate plots
5. Violin plots
6. Pair plots
7. kdeplot
8. Heat Map
9. Andrews curves
10. RadViz
11. Joint plot


### 1.4.1 Scatter Plots: Used to find the relationship between 2 different quantitative features

In [None]:
columns = numberic_features.columns
columns

In [None]:
# One scatter layer per OverallQual level (hue), drawn a single time.
# The grid used to be mapped twice and given two legends; the white-edged
# layer is the one that ended up on top, so that is the one kept here.
g = sns.FacetGrid(train[columns], hue="OverallQual", size=10)
g = g.map(plt.scatter, "OverallQual", "SalePrice", edgecolor="w").add_legend()
plt.show()
# Tells me that as OverallQual increases, Sale Prices increases in general
# Tells me that as OverallQual increases, Sale Prices increases in general

### 1.4.2 JointPlot - an adaptation of the scatter plot. Includes a fit line to see the best fit for your data.

In [None]:
for name in columns:
    sns.jointplot(x=name, y="SalePrice", data=train[columns], size=5,kind="reg",ratio=10,color='green')
    plt.show()

#### Overall Observation:
1. Seems like sale price has few outliers above 700,000
2. OverallQual seems to be increasing with SalePrice
3. TotalBsmtSF has an outlier at 6000, but seems to be increasing with SalePrice
4. GrLivArea seems to be increasing with SalePrice, but has outliers at 4000+
5. Garage Area seems to have few NaN and few outliers at 1250+
6. FullBath increases with SalePrice
7. YearBuilt seems to be increasing with SalePrice, but could have outliers before 1900 and above 200000 (Sale Price)
8. YearRemodAdd seems to be increasing with SalePrice
9. Id is not correlated with SalePrices
10. MSSubClass should be one hot encoded -> No correlation, maybe drop it
11. LotFrontage seems to increase with Sale Price, drop outliers
12. Similar to LotFrontage for LotArea
13. OverallCond should probably be dropped or one hot encoded
14. MasVnrArea seems to have lot of NaNs and few outliers
15. BsmtFinSF1 seems to be correlated with Sale Price; just remove the outlier
16. BsmtFinSF2 should probably be dropped due to large presence of NaN
17. BsmtUnfSF seems to have lot of 0s or NaN
18. TotalBsmtSF increase with SalePrice, remove outlier
19. 1stFlrSF similar to SalePrice and remove outlier
20. 2ndFlrSF has many NaNs
21. Drop LowQualFinSF
22. BsmtFullBath doesn't tell me much. Maybe drop it
23. BsmtHalfBath doesn't tell me much. Maybe drop it
24. HalfBath doesn't tell me much. Maybe drop it
25. BedroomAbvGrade. Maybe split it to read less than 4 and above 4
26. KitchenAbvGrade. Maybe try dropping
27. TotalRmsAboveGrade seems to be increasing with SalePrice.
28. FirePlaces Also seems to be increasing with SalePrice
29. GarageYearBuilt increases with SalePrice
30. GarageCar increases with salePrice
31. GarageArea increases with SalePrice, remove outliers, maybe deal with NaNs
32. WoodDeckSF slightly increasing with SalePrice, but lot of NaNs

### 1.4.3 Box Plot: Used to see the distribution of data of categorical features with target variable

In [None]:
cat_columns = categorical_features.columns
cat_columns

In [None]:
for name in cat_columns:
    plt.subplots(figsize=(30,5))
    sns.boxplot(x=name, y="SalePrice", data=train)
    sns.stripplot(x=name, y="SalePrice", data=train,  size=5, jitter=True, edgecolor="gray")
    plt.show()

### 1.4.4 Violin plots: Used to see the distribution of data of categorical features with target variable. Better representation of box plots

In [None]:
for name in cat_columns:
    plt.subplots(figsize=(30,5))
    sns.violinplot(x=name, y="SalePrice", data=train)
    sns.stripplot(x=name, y="SalePrice", data=train,  size=5, jitter=True, edgecolor="gray")
    plt.show()

### 1.4.5 Histogram: We can also create a histogram of each input variable to get an idea of the distribution

In [None]:
train.hist(figsize=(50,20))
plt.figure()

### 1.4.6  Multivariate Plots: Used to see the interactions between two different features

In [None]:
# Split a sliceable sequence into consecutive pieces of at most n items.
def chunks(l, n):
    """Lazily yield successive slices of `l`, each holding up to `n` items.

    `l` must support len() and slicing; the last slice may be shorter than `n`.
    """
    # Every slice starts at a multiple of n; slicing past the end simply truncates.
    yield from (l[start:start + n] for start in range(0, len(l), n))
        
new_cols = list(chunks(columns, 10))
new_cols

In [None]:
pd.plotting.scatter_matrix(train[new_cols[0]],figsize=(50,20))
plt.figure()

In [None]:
pd.plotting.scatter_matrix(train[new_cols[1]],figsize=(50,20))
plt.figure()

In [None]:
pd.plotting.scatter_matrix(train[new_cols[2]],figsize=(50,20))
plt.figure()

In [None]:
pd.plotting.scatter_matrix(train[new_cols[3]],figsize=(50,20))
plt.figure()

## 2. Feature Engineering

### 2.1 Set up Cross Validation

In [None]:
kf = KFold(n_splits=3) # Define the split - into 3 folds 

#### Performance Metrics for cross Val

In [None]:
import math

# Root Mean Squared Logarithmic Error (RMSLE) between two equally long value collections
def rmsle(y_pred, y_test) :
    """Return sqrt(mean((log1p(y_pred) - log1p(y_test))**2)).

    The formula is symmetric in its two arguments, so the order in which
    predictions and ground truth are passed does not change the result.
    Raises AssertionError when the two inputs differ in length.
    """
    assert len(y_test) == len(y_pred)
    log_pred = np.log1p(y_pred)
    log_true = np.log1p(y_test)
    squared_log_error = (log_pred - log_true)**2
    return np.sqrt(np.mean(squared_log_error))

rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [None]:
from math import (exp, expm1)

# RMSLE for targets that are already on the log1p scale
def rmsle_log(y_pred, y_test) :
    """Score log1p-scaled values: undo the transform with expm1, then apply the RMSLE formula.

    Also prints the plain mean squared difference of the two inputs on every call.
    Raises AssertionError when the two inputs differ in length.
    """
    assert len(y_test) == len(y_pred)
    print(np.mean((y_pred - y_test)**2))
    # Back to the original scale first, so the log1p below matches rmsle().
    restored_pred = np.expm1(y_pred)
    restored_true = np.expm1(y_test)
    log_gap = np.log1p(restored_pred) - np.log1p(restored_true)
    return np.sqrt(np.mean(log_gap**2))

rmsle_score_log = make_scorer(rmsle_log, greater_is_better=False)

### 2.2 Set up a constant model to test it against - Here I chose Linear Regression

In [None]:
# Linear regression baseline, cross-validated with the RMSLE scorer
def linearRegressionPerformance(X, y):
    """Print the 3-fold cross-validated RMSLE of a plain LinearRegression on (X, y).

    Prints the per-fold scores and their mean; returns None.
    The printed label says "mean_squared_log_error", but the value comes from
    rmsle_score, i.e. it is the root of that quantity.
    """
    estimator = linear_model.LinearRegression()
    fold_scores = cross_val_score(estimator, X, y, cv=3, scoring=rmsle_score)
    # rmsle_score is built with greater_is_better=False, so flip the sign back.
    scores_accuracy = -fold_scores
    print(scores_accuracy)
    print('K-fold cross-validation results:')
    model_name = type(estimator).__name__
    print(model_name + " average mean_squared_log_error is %2.3f" % scores_accuracy.mean())



In [None]:
# Linear regression baseline for a log1p-transformed target
def linearRegressionPerformanceLog(X, y):
    """Print the 3-fold cross-validated score of a plain LinearRegression using rmsle_score_log.

    Prints the per-fold scores and their mean; returns None.
    The printed label says "mean_squared_log_error", but the value comes from
    rmsle_score_log, i.e. it is the root of that quantity.
    """
    estimator = linear_model.LinearRegression()
    fold_scores = cross_val_score(estimator, X, y, cv=3, scoring=rmsle_score_log)
    # rmsle_score_log is built with greater_is_better=False, so flip the sign back.
    scores_accuracy = -fold_scores
    print(scores_accuracy)
    print('K-fold cross-validation results:')
    model_name = type(estimator).__name__
    print(model_name + " average mean_squared_log_error is %2.3f" % scores_accuracy.mean())

In [None]:
# Min-max normalized linear regression
def normalizedlinearRegressionPerformance(X, y):
    """Print the 3-fold cross-validated RMSLE of a LinearRegression on min-max scaled features.

    X must be a DataFrame (its .values and .columns are used); y is the target.
    Prints the per-fold scores and their mean; returns None.

    Note: the scaler is fitted on every row of X before cross-validation splits
    the data, so all folds share the same scaling statistics.
    """
    x = X.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled, columns=X.columns)
    predictionModel = linear_model.LinearRegression()
    # rmsle_score is built with greater_is_better=False, so cross_val_score
    # returns the negated error. Negate it back, as linearRegressionPerformance
    # does; without this the printed RMSLE values had the wrong sign.
    scores_accuracy = -cross_val_score(predictionModel, df, y, cv=3, scoring=rmsle_score)
    print(scores_accuracy)
    print('K-fold cross-validation results:')
    print(predictionModel.__class__.__name__+" average mean_squared_log_error is %2.3f" % scores_accuracy.mean())




In [None]:
# Min-max normalized linear regression for a log1p-transformed target
def normalizedlinearRegressionPerformanceLog(X, y):
    """Print the 3-fold cross-validated score (rmsle_score_log) of a LinearRegression on min-max scaled features.

    X must be a DataFrame (its .values and .columns are used); y is the target.
    Prints the per-fold scores and their mean; returns None.

    Note: the scaler is fitted on every row of X before cross-validation splits
    the data, so all folds share the same scaling statistics.
    """
    x = X.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled, columns=X.columns)
    predictionModel = linear_model.LinearRegression()
    # (A leftover debug print of the whole target used to sit here.)
    # rmsle_score_log is built with greater_is_better=False, so cross_val_score
    # returns the negated error. Negate it back, as linearRegressionPerformanceLog
    # does; without this the printed values had the wrong sign.
    scores_accuracy = -cross_val_score(predictionModel, df, y, cv=3, scoring=rmsle_score_log)
    print(scores_accuracy)
    print('K-fold cross-validation results:')
    print(predictionModel.__class__.__name__+" average mean_squared_log_error is %2.3f" % scores_accuracy.mean())




### 2.3 Evaluate a Baseline Model to see how your new features are doing relative to the baseline model

In [None]:
# Fixed seed so the 80/20 split, and the baseline score computed from it,
# is the same on every Restart & Run All.
modelTrainData,modelValidationData =  train_test_split(train.copy(), test_size=0.2, random_state=42)

In [None]:
meanSalePrice = modelTrainData['SalePrice'].mean()
print(meanSalePrice)

In [None]:
modelValidationData['predictedSalePrice'] = meanSalePrice 

In [None]:
rmsle(modelValidationData['SalePrice'], modelValidationData['predictedSalePrice'])

### 2.4 Feature Engineering Performance

#### 2.4.1.1 Basic Model: Filling missing values(Method 1) + Existing features + One hot encoded categorical features + Drop any features less than 50% of data

In [None]:
# Figure out how many null + missing vals types exist
null_columns=train.columns[train.isnull().any()]
train[null_columns].isnull().sum()

#OUT OF 1460

In [None]:
# Percent of nulls
percent_null = ((train[null_columns].isnull().sum()/len(train[null_columns])) * 100).sort_values(ascending=False)
percent_null

In [None]:
# plot missing + null vals
f, ax = plt.subplots(figsize=(15, 12))
sns.barplot(x=percent_null.index, y=percent_null)
# Rotate the feature names once the bars (and their tick labels) exist.
# The angle must be a number: the string '90' is rejected by current matplotlib.
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of null values + missing values', fontsize=15)

In [None]:
# One-hot encoded train set: drop Id plus the 5 sparsest features (Alley, FireplaceQu, PoolQC, Fence, MiscFeature)
trainDropped = train.drop(columns=['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']).copy()

In [None]:
TrainOneHot = trainDropped.copy()

In [None]:
# Simple imputation: numeric columns get their median, categorical columns their most frequent category
median_filled = ['MasVnrArea', 'LotFrontage', 'GarageYrBlt']
mode_filled = ['MasVnrType', 'Electrical', 'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure',
               'BsmtCond', 'BsmtQual', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

for col in median_filled:
    TrainOneHot[col] = TrainOneHot[col].fillna(TrainOneHot[col].median())

for col in mode_filled:
    # value_counts() sorts by frequency, so index[0] is the most common category
    TrainOneHot[col] = TrainOneHot[col].fillna(TrainOneHot[col].value_counts().index[0])

In [None]:
TrainOneHot = pd.get_dummies(TrainOneHot, columns=TrainOneHot.columns[TrainOneHot.dtypes == 'object'],drop_first=True)
TrainOneHot = pd.get_dummies(TrainOneHot, columns=['MSSubClass'], drop_first=True)
TrainOneHot.info()

In [None]:
y = TrainOneHot['SalePrice'].copy()
X = TrainOneHot.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformance(X,y)

#### Basic Model RMSLE - 0.186

#### 2.4.1.2 Test Model 2: Filling in missing values(Method 2) + one hot + dropping any feature with less than 50% data

In [None]:
test_model_copy = trainDropped.copy()

In [None]:
test_model_copy["LotFrontage"] = test_model_copy.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x_v: x_v.fillna(x_v.median()))
test_model_2 = test_model_copy.copy()

In [None]:
# Numeric gaps -> column median (LotFrontage was already filled per neighborhood above)
for col in ['MasVnrArea', 'GarageYrBlt']:
    test_model_2[col] = test_model_2[col].fillna(test_model_2[col].median())

# Categorical gaps -> most frequent category.
# MasVnrType used to be filled twice in this cell; the second pass was a no-op.
for col in ['MasVnrType', 'Electrical', 'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure',
            'BsmtCond', 'BsmtQual', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    # value_counts() sorts by frequency, so index[0] is the most common category
    test_model_2[col] = test_model_2[col].fillna(test_model_2[col].value_counts().index[0])

In [None]:
test_model_2["LotFrontage"] = test_model_2.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x_v: x_v.fillna(x_v.median()))

In [None]:
test_model_2 = pd.get_dummies(test_model_2, columns=test_model_2.columns[test_model_2.dtypes == 'object'],drop_first=True)
test_model_2 = pd.get_dummies(test_model_2, columns=['MSSubClass'], drop_first=True)
test_model_2.info()

In [None]:
y_2 = test_model_2['SalePrice'].copy()
X_2 = test_model_2.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformance(X_2,y_2)

#### Filling in the neighborhood median for LotFrontage does slightly better - 0.185

#### 2.4.1.3 Test Model 3: Filling in missing values(Method 3 - fill in 0s instead of median) + one hot + dropping any feature with less than 50% data

In [None]:
test_model_3_copy = test_model_copy.copy()

In [None]:
test_model_3_copy['MasVnrArea'] = test_model_3_copy['MasVnrArea'].fillna(0)
test_model_3_copy['MasVnrType'] = test_model_3_copy['MasVnrType'].fillna('None')
test_model_3_copy['GarageYrBlt'] = test_model_3_copy['GarageYrBlt'].fillna(0)
test_model_3_copy['Electrical'] = test_model_3_copy['Electrical'].fillna(test_model_3_copy['Electrical'].value_counts().index[0])
test_model_3_copy['BsmtFinType2'] = test_model_3_copy['BsmtFinType2'].fillna('None')
test_model_3_copy['BsmtFinType1'] = test_model_3_copy['BsmtFinType1'].fillna('None')
test_model_3_copy['BsmtExposure'] = test_model_3_copy['BsmtExposure'].fillna('None')
test_model_3_copy['BsmtCond'] = test_model_3_copy['BsmtCond'].fillna('None')
test_model_3_copy['BsmtQual'] = test_model_3_copy['BsmtQual'].fillna('None')
test_model_3_copy['GarageType'] = test_model_3_copy['GarageType'].fillna('None')
test_model_3_copy['GarageFinish'] = test_model_3_copy['GarageFinish'].fillna('None')
test_model_3_copy['GarageQual'] = test_model_3_copy['GarageQual'].fillna('None')
test_model_3_copy['GarageCond'] = test_model_3_copy['GarageCond'].fillna('None')

In [None]:
test_model_3_copy = pd.get_dummies(test_model_3_copy, columns=test_model_3_copy.columns[test_model_3_copy.dtypes == 'object'],drop_first=True)
test_model_3_copy = pd.get_dummies(test_model_3_copy, columns=['MSSubClass'], drop_first=True)
test_model_3_copy.info()

In [None]:
y_3 = test_model_3_copy['SalePrice'].copy()
X_3 = test_model_3_copy.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformance(X_3,y_3)

#### Using None instead of median makes the model worse

#### 2.4.1.4 Not dropping the features 

In [None]:
test_model_4 = train.copy()

In [None]:
# LotFrontage: fill with the median LotFrontage of the same Neighborhood
test_model_4["LotFrontage"] = test_model_4.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x_v: x_v.fillna(x_v.median()))

# Numeric gaps -> column median
for col in ['MasVnrArea', 'GarageYrBlt']:
    test_model_4[col] = test_model_4[col].fillna(test_model_4[col].median())

# Categorical gaps -> most frequent category.
# MasVnrType used to be filled twice in this cell; the second pass was a no-op.
for col in ['MasVnrType', 'Electrical', 'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure',
            'BsmtCond', 'BsmtQual', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    # value_counts() sorts by frequency, so index[0] is the most common category
    test_model_4[col] = test_model_4[col].fillna(test_model_4[col].value_counts().index[0])

# The sparse features that the earlier experiments dropped are kept here;
# their gaps become an explicit 'None' category.
for col in ['MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'Alley']:
    test_model_4[col] = test_model_4[col].fillna('None')

In [None]:
test_model_4 = pd.get_dummies(test_model_4, columns=test_model_4.columns[test_model_4.dtypes == 'object'],drop_first=True)
test_model_4 = pd.get_dummies(test_model_4, columns=['MSSubClass'], drop_first=True)
test_model_4.info()

In [None]:
y_4 = test_model_4['SalePrice'].copy()
X_4 = test_model_4.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformance(X_4,y_4)

#### Not Dropping is terrible

### 2.4.2 Normalized Model

#### For some reason normalized model performs pretty bad

In [None]:
x = normalizedlinearRegressionPerformance(X_2,y_2)

#### Normalized Model RMSLE 2.691 -> Doing worse than expected

### 2.4.3 Applying Log of Sale Price

In [None]:
from scipy.stats import norm

# Histogram of the raw target with a fitted normal curve on top
sns.distplot(y_2 , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(y_2)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
# Raw string: '\m' and '\s' are not valid escapes in a normal string literal
# (newer Python versions warn about them); the rendered label is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(y_2, plot=plt)
plt.show()

   ##### The target variable is right skewed. As (linear) models love normally distributed data , we need to transform this variable and make it more normally distributed.

In [None]:
#We use the numpy function log1p which applies log(1+x) to all elements of the column.
#The transform is computed from y_2 (the copy of SalePrice taken before this cell),
#so re-running the cell does not apply the log a second time.
test_model_2['SalePrice'] = np.log1p(y_2)

#Check the new distribution 
sns.distplot(test_model_2['SalePrice'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(test_model_2['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
# Raw string: '\m' and '\s' are not valid escapes in a normal string literal
# (newer Python versions warn about them); the rendered label is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(test_model_2['SalePrice'], plot=plt)
plt.show()

In [None]:
y_log = test_model_2['SalePrice'].copy()
X_log = test_model_2.drop(columns = ['SalePrice']).copy()

In [None]:
x = linearRegressionPerformanceLog(X_log,y_log)

#### Pretty good improvement after predicting log SalePrices

### 2.4.4 Further feature engineering

In [None]:
new_features = test_model_2.copy()

#### 2.4.4.1 LotFrontage - remove outlier

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="LotFrontage", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 600), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.loc[new_features['LotFrontage'] < 300].copy()
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="LotFrontage", y="SalePrice", data=feature, kind="reg",
                  xlim=(0, 400), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
x = linearRegressionPerformanceLog(X,y)

In [None]:
new_features = feature.copy()

#### Removing outlier improves the model significantly - 0.138

#### 2.4.4.2 LotArea - No improvement

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="LotArea", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 80000), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.loc[new_features['LotArea'] < 70000].copy()
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
feature.head()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="LotArea", y="SalePrice", data=feature, kind="reg",
                  xlim=(0, 80000), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
x = linearRegressionPerformanceLog(X,y)

#### No improvement after removing outlier

#### 2.4.4.3 MasVnrArea - No improvement

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="MasVnrArea", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 1000), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.copy()
feature_name = 'MasVnrArea'

In [None]:
feature.drop(feature_name, axis=1, inplace=True)
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformanceLog(X,y)

#### No improvement to dropping MasVnrArea

#### 2.4.4.4 BsmtFinSF2 - No Improvement

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="BsmtFinSF2", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 1000), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.copy()
feature_name = 'BsmtFinSF2'

In [None]:
feature = feature.drop(columns = ['BsmtFinSF2'])
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformanceLog(X,y)

#### No improvement to dropping feature

In [None]:
feature = new_features.copy()
feature_name = 'BsmtFinSF2'

In [None]:
feature[feature_name] = np.log((1+ feature[feature_name]))
feature[feature_name] = feature[feature_name].replace(0,feature[feature_name].loc[feature[feature_name] > 0].mean())
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="BsmtFinSF2", y="SalePrice", data=feature, kind="reg",
                  xlim=(0, 10), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
linearRegressionPerformanceLog(X,y)

#### No improvement to log of this feature

#### 2.4.4.5 BsmtUnfSF - No improvement

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="BsmtUnfSF", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0,2500), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.copy()
feature_name = 'BsmtUnfSF'

In [None]:
feature[feature_name] = feature[feature_name].replace(0,feature[feature_name].loc[feature[feature_name] > 0].mean())
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="BsmtUnfSF", y="SalePrice", data=feature, kind="reg",
                  xlim=(0,2500), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
linearRegressionPerformanceLog(X,y)

#### No improvement to shifting the '0' to mean

In [None]:
feature = new_features.copy()
feature[feature_name] = np.log((1+ feature[feature_name]))
feature[feature_name] = feature[feature_name].replace(0,feature[feature_name].loc[feature[feature_name] > 0].mean())
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="BsmtUnfSF", y="SalePrice", data=feature, kind="reg",
                  xlim=(0,10), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
linearRegressionPerformanceLog(X,y)

#### No improvement to log of feature

In [None]:
feature = new_features.copy()
feature_name = 'BsmtUnfSF'
# Drop expensive houses that have less than 1000 sq ft of unfinished basement.
# SalePrice was log1p-transformed earlier in the notebook, so the 600000 cut-off
# has to be put on the same scale; compared against raw dollars the condition
# was true for every row and nothing was filtered. Rows with exactly 1000 sq ft
# are now kept instead of being silently dropped.
feature = feature.loc[(feature[feature_name] >= 1000) | (feature['SalePrice'] < np.log1p(600000))]
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="BsmtUnfSF", y="SalePrice", data=feature, kind="reg",
                  xlim=(0,4000), ylim=(10, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
linearRegressionPerformanceLog(X,y)

#### No improvement to outlier removal

#### 2.4.4.6 TotalBsmtSF - Improvement to outlier removal - 0.132

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="TotalBsmtSF", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 3500), ylim=(0, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.copy()
feature_name = 'TotalBsmtSF'

In [None]:
feature[feature_name] = feature[feature_name].replace(0,feature[feature_name].loc[feature[feature_name] > 0].mean())
feature = feature.loc[(feature[feature_name] < 3000) | ((feature[feature_name] > 3000) & (feature['SalePrice'] > 14))]
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="TotalBsmtSF", y="SalePrice", data=feature, kind="reg",
                  xlim=(0, 3500), ylim=(0, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
linearRegressionPerformanceLog(X,y)

In [None]:
new_features = feature.copy()

#### Improvement in outlier removal

#### 2.4.4.7 1stFlrSF - No improvement

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="1stFlrSF", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 4000), ylim=(0, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.copy()
feature_name = '1stFlrSF'
# Drop houses with a very large first floor (2750+ sq ft) unless they are expensive.
# SalePrice was log1p-transformed earlier in the notebook, so the 450000 cut-off
# has to be put on the same scale; compared against raw dollars the price test
# was never true and every large-floor house was dropped. Rows with exactly
# 2750 sq ft now follow the same price rule instead of always being dropped.
feature = feature.loc[(feature[feature_name] < 2750) | (feature['SalePrice'] > np.log1p(450000))]
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="1stFlrSF", y="SalePrice", data=feature, kind="reg",
                  xlim=(0, 4000), ylim=(0, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
linearRegressionPerformanceLog(X,y)

#### No major improvement 

#### 2.4.4.8 2ndFlrSF - No improvement

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="2ndFlrSF", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 3500), ylim=(0, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.copy()
feature_name = '2ndFlrSF'
feature[feature_name] = feature[feature_name].replace(0,feature[feature_name].loc[feature[feature_name] > 0].mean())
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="2ndFlrSF", y="SalePrice", data=feature, kind="reg",
                  xlim=(0, 3500), ylim=(0, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
linearRegressionPerformanceLog(X,y)

In [None]:
feature = new_features.copy()
feature_name = '2ndFlrSF'
feature.drop(feature_name, axis=1, inplace=True)
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformanceLog(X,y)

### No improvement

#### 2.4.4.8 LowQualFinSF - No improvement

In [None]:
sns.set(style="darkgrid")
# x/y are passed by keyword (as in the earlier jointplot loop); recent seaborn
# releases no longer accept them positionally.
g = sns.jointplot(x="LowQualFinSF", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 700), ylim=(0, 20), color="m")
g.fig.set_size_inches(20,20)

In [None]:
feature = new_features.copy()
feature_name = 'LowQualFinSF'
feature.drop(feature_name, axis=1, inplace=True)
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
linearRegressionPerformanceLog(X,y)

### No Improvement

#### 2.4.4.9 GarageArea - No improvement

In [None]:
# Joint regression plot of GarageArea vs SalePrice on the unmodified new_features frame.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="GarageArea", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 3500), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Experiment: drop GarageArea outliers (large area, low price) and replace zero
# areas with the mean of the non-zero areas.
feature = new_features.copy()
feature_name = 'GarageArea'

# Keep rows below the 1200 cut-off, plus rows at/above it whose price exceeds 300000.
# `>=` (rather than `>`) ensures rows exactly equal to 1200 are judged by price
# instead of being dropped unconditionally.
# NOTE(review): the plots in this section use ylim=(0, 20) for SalePrice, which
# suggests the column may be log-scaled; if so, the 300000 threshold can never
# match and every row at/above the cut-off is removed — confirm.
keep_rows = (feature[feature_name] < 1200) | (
    (feature[feature_name] >= 1200) & (feature['SalePrice'] > 300000)
)
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger SettingWithCopyWarning.
feature = feature.loc[keep_rows].copy()

garage_area = feature[feature_name]
feature[feature_name] = garage_area.replace(0, garage_area.loc[garage_area > 0].mean())

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop(columns = ['SalePrice']).copy()

In [None]:
# Joint regression plot of GarageArea vs SalePrice after filtering and zero replacement.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="GarageArea", y="SalePrice", data=feature, kind="reg",
                  xlim=(0, 3500), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Re-run the regression performance check on the filtered / imputed GarageArea frame.
linearRegressionPerformanceLog(X, y)

#### No improvement

#### 2.4.4.a WoodDeckSF - No improvement

In [None]:
# Joint regression plot of WoodDeckSF vs SalePrice on new_features.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="WoodDeckSF", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 1000), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Ablation: build the predictors without the WoodDeckSF column.
# drop() returns a new frame, so new_features is not modified.
feature_name = 'WoodDeckSF'
feature = new_features.drop(feature_name, axis=1)

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop('SalePrice', axis=1).copy()

In [None]:
# Re-run the regression performance check with WoodDeckSF removed.
linearRegressionPerformanceLog(X, y)

#### Dropping WoodDeckSF hurts the model. Don't drop it.

#### 2.4.4.b OpenPorchSF - No improvement

In [None]:
# Joint regression plot of OpenPorchSF vs SalePrice on new_features.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="OpenPorchSF", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 350), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Ablation: build the predictors without the OpenPorchSF column.
# drop() returns a new frame, so new_features is not modified.
feature_name = 'OpenPorchSF'
feature = new_features.drop(feature_name, axis=1)

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop('SalePrice', axis=1).copy()

In [None]:
# Re-run the regression performance check with OpenPorchSF removed.
linearRegressionPerformanceLog(X, y)

#### No improvement

#### 2.4.4.c EnclosedPorch - No improvement

In [None]:
# Joint regression plot of EnclosedPorch vs SalePrice on new_features.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="EnclosedPorch", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 350), ylim=(10, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Ablation: build the predictors without the EnclosedPorch column.
# drop() returns a new frame, so new_features is not modified.
feature_name = 'EnclosedPorch'
feature = new_features.drop(feature_name, axis=1)

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop('SalePrice', axis=1).copy()

In [None]:
# Re-run the regression performance check with EnclosedPorch removed.
linearRegressionPerformanceLog(X, y)

#### Don't drop this feature

#### 2.4.4.d 3SsnPorch - No improvement

In [None]:
# Joint regression plot of 3SsnPorch vs SalePrice on new_features.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="3SsnPorch", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 500), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Ablation: build the predictors without the 3SsnPorch column.
# drop() returns a new frame, so new_features is not modified.
feature_name = '3SsnPorch'
feature = new_features.drop(feature_name, axis=1)

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop('SalePrice', axis=1).copy()

In [None]:
# Re-run the regression performance check with 3SsnPorch removed.
linearRegressionPerformanceLog(X, y)

#### No improvement

#### 2.4.4.e ScreenPorch - No improvement

In [None]:
# Joint regression plot of ScreenPorch vs SalePrice on new_features.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="ScreenPorch", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 350), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Ablation: build the predictors without the ScreenPorch column.
# drop() returns a new frame, so new_features is not modified.
feature_name = 'ScreenPorch'
feature = new_features.drop(feature_name, axis=1)

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop('SalePrice', axis=1).copy()

In [None]:
# Re-run the regression performance check with ScreenPorch removed.
linearRegressionPerformanceLog(X, y)

#### No improvement

#### 2.4.4.f PoolArea - No improvement

In [None]:
# Joint regression plot of PoolArea vs SalePrice on new_features.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="PoolArea", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 1000), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Ablation: build the predictors without the PoolArea column.
# drop() returns a new frame, so new_features is not modified.
feature_name = 'PoolArea'
feature = new_features.drop(feature_name, axis=1)

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop('SalePrice', axis=1).copy()

In [None]:
# Re-run the regression performance check with PoolArea removed.
linearRegressionPerformanceLog(X, y)

#### No Improvement

#### 2.4.4.g MiscVal - No improvement

In [None]:
# Joint regression plot of MiscVal vs SalePrice on new_features.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; keywords work on older versions too.
sns.set(style="darkgrid")
g = sns.jointplot(x="MiscVal", y="SalePrice", data=new_features, kind="reg",
                  xlim=(0, 4000), ylim=(0, 20), color="m")
# jointplot creates its own figure, so it is resized after the fact.
g.fig.set_size_inches(20, 20)

In [None]:
# Ablation: build the predictors without the MiscVal column.
# drop() returns a new frame, so new_features is not modified.
feature_name = 'MiscVal'
feature = new_features.drop(feature_name, axis=1)

# Split into target and predictors for the evaluation cell.
y = feature['SalePrice'].copy()
X = feature.drop('SalePrice', axis=1).copy()

In [None]:
# Re-run the regression performance check with MiscVal removed.
linearRegressionPerformanceLog(X, y)

#### No improvement

## Final Eval

In [None]:
# Final evaluation: use new_features as-is (no column dropped or altered in this cell),
# split it into predictors and target, and run the performance check once more.
feature = new_features.copy()
X = feature.drop('SalePrice', axis=1).copy()
y = feature['SalePrice'].copy()
linearRegressionPerformanceLog(X, y)