In [1]:
import os
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

%load_ext autoreload
%autoreload 2

import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import linear_rainbow, het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from src.eda import *
from src.regression import *
from src.visualization import *

## 1) Data Preparation and Understanding
> ### Make a function call to read the input files and merge them into a consolidated dataframe.
> ### Function: `consolidate_data(year, create)`
>> ####    Parameters: 1) `year`: Year to analyze (as yyyy)
>> ####               2) `create`: `True` -- Create `consolidated.csv` by reading all input files in the data directory
>> ####                          `False` -- Read from the existing `consolidated.csv`
>> ####    Returns: dataframe of consolidated attributes to use in the model

In [2]:
# Build the merged dataframe (selected columns from each input file) for the 2019 analysis year.
# Per the notes above, create=False is meant to load ./data/consolidated.csv instead of rebuilding it.
# NOTE(review): the saved output of this cell shows a FileNotFoundError for
# ./data/raw/EXTR_RPSale.csv, so the raw sales file was still being read — confirm the data
# is in place (and how consolidate_data treats create=False) before running the cells below.
df_merged = consolidate_data(year=2019, create=False)

Reading Sales Data from ./data/raw/EXTR_RPSale.csv ...


FileNotFoundError: [Errno 2] File b'./data/raw/EXTR_RPSale.csv' does not exist: b'./data/raw/EXTR_RPSale.csv'

In [None]:
# Clean up the merged data before building the regression models.
# NOTE(review): df_merged is rebound to the cleaned frame, so re-running this cell passes
# already-cleaned data back into data_cleanup — confirm that is harmless.
df_merged = data_cleanup(df_merged)

In [None]:
# Correlation of 'SalePrice' with the other factors, plus the complete correlation matrix.
# NOTE(review): data_correlation comes from the src star imports; confirm what exactly it returns.
corr = data_correlation(df_merged)
corr  # bare last expression so the notebook renders the result

In [None]:
# Heatmap across all factors of the merged frame to get a general idea of the relationships.
# 'merged_heatmap' is presumably the name the chart is stored under — confirm in the src helpers.
create_heatmap(df_merged, 'merged_heatmap' )

In [None]:
# Optionally create a pairplot as well; it is not very informative, but it can help spot some visual clues.
# create_pairplot(df_merged, 'merged_pairplot')

# Create regression models

In [None]:
# Column groups used to carve df_merged into one frame per analysis.
# 'SalePrice' is listed first in each numeric group because it is the regression target.

# Square-footage measurements
x_sqft = [
    'SalePrice', 'SqFtLot', 'SqFt1stFloor', 'SqFtHalfFloor', 'SqFt2ndFloor', 'SqFtUpperFloor',
    'SqFtUnfinFull', 'SqFtUnfinHalf', 'SqFtTotLiving', 'SqFtTotBasement', 'SqFtFinBasement',
    'SqFtGarageAttached', 'SqFtOpenPorch', 'SqFtEnclosedPorch', 'SqFtDeck',
]

# Amenity features
x_amenity = [
    'SalePrice', 'NbrLivingUnits', 'Stories', 'HeatSystem', 'Bedrooms', 'DaylightBasement',
    'BathHalfCount', 'Bath3qtrCount', 'BathFullCount', 'FpSingleStory', 'FpMultiStory',
    'YrRenovated', 'PcntComplete',
]

# Predictors chosen for the main regression model
x_reg = [
    'SalePrice', 'PrincipalUse', 'SqFtTotLiving', 'NbrLivingUnits', 'LakeWashington', 'SqFt1stFloor',
    'TidelandShoreland', 'BathFullCount', 'SqFtOpenPorch', 'SqFtEnclosedPorch', 'Stories', 'Bedrooms',
    'Area', 'PropertyClass', 'SqFtLot', 'WfntLocation',
    'SqFtTotBasement', 'SqFtFinBasement',
]

# Categorical predictors to encode.
# Other candidates considered: 'PropType', 'WfntAccessRights', 'WfntProximityInfluence',
# 'PowerLines', 'OtherNuisances'
x_cat = ['PowerLines', 'DaylightBasement']

# For nuisance analysis
x_nuisance = ['SalePrice', 'TrafficNoise', "PowerLines", 'OtherNuisances']

# One sub-frame per column group, in the same order as the groups above.
df_sqft, df_amenity, df_reg, df_cat, df_nuisance = (
    df_merged[columns] for columns in (x_sqft, x_amenity, x_reg, x_cat, x_nuisance)
)

# Porch-only view for the porch analysis
df_porch = df_merged[['SalePrice', 'SqFtOpenPorch', 'SqFtEnclosedPorch']]

> ### Create regression model for all factors that have SqFt characteristics

In [None]:
# Create heatmap for all factors that have SqFt characteristics
create_heatmap(df_sqft, 'sqft_heatmap' )

# Create pairplot of all factors that have SqFt characteristics (disabled)
# NOTE(review): the disabled call passes df_merged rather than df_sqft — confirm before re-enabling.
# create_pairplot(df_merged, 'sqft_pairplot')

# Run the OLS regression with 'SalePrice' as the target column
run_ols_regression(df_sqft, 'SalePrice')

> ### Create regression model using the factors with a high degree of correlation

In [None]:
# Heatmap for the highly correlated regression factors (df_reg).
# Uses its own label: the previous 'sqft_heatmap' duplicated the label already used for the
# df_sqft heatmap above, so the two charts could not be told apart (and, assuming the label is
# used as the output name, the earlier chart would be overwritten — confirm in the src helpers).
create_heatmap(df_reg, 'reg_heatmap' )

# Pairplot of the regression factors (disabled)
# create_pairplot(df_reg, 'reg_pairplot')

# Run the OLS regression with 'SalePrice' as the target column
run_ols_regression(df_reg, 'SalePrice')

> ### Looks good. We can add a few categorical predictors and check whether we get a better R-squared

In [None]:
# Add the categorical predictors to the regression frame via one-hot encoding.
# drop='first' removes one level per categorical feature: when every level is kept, the dummy
# columns of each feature sum to 1 and are perfectly collinear with a model intercept
# (the dummy-variable trap), which makes the individual coefficients unidentifiable.
# NOTE(review): assumes run_ols_regression fits an intercept — confirm in the src helpers.
ohe = OneHotEncoder(drop='first', sparse=False)
df_cat_trans = ohe.fit_transform(df_cat)

# Passing the source column names gives readable labels ('<column>_<level>') in the heatmap and
# regression summary instead of the anonymous 'x0_<level>' / 'x1_<level>' defaults.
df_cat_t = pd.DataFrame(
    df_cat_trans,
    index=df_cat.index,
    columns=ohe.get_feature_names(list(df_cat.columns)),
)
df_cat_new = df_reg.join(df_cat_t)

# Heatmap of the regression factors plus the encoded categorical columns
create_heatmap(df_cat_new, 'reg_cat_heatmap' )

# Pairplot of the same frame (disabled)
# create_pairplot(df_cat_new, 'reg_cat_pairplot')

# Run the OLS regression with 'SalePrice' as the target column
run_ols_regression(df_cat_new, 'SalePrice')

In [None]:
# Bar charts of sale price against each amenity feature, laid out on a 3x2 grid
# and saved as a single image.

f = plt.figure(figsize=(10, 10))
gs = f.add_gridspec(3,2)

# The "Year Renovated" panel only looks at homes renovated in 2010 or later.
df_t = df_amenity[df_amenity['YrRenovated'] >= 2010]

# One entry per panel: (grid slot, source frame, x column, title, x-axis label)
amenity_panels = [
    (gs[0, 0], df_amenity, 'NbrLivingUnits', 'Sale Price vs Number of Living Units', 'Number of Living Units'),
    (gs[0, 1], df_amenity, 'Stories', 'Sale Price vs Stories', 'Stories'),
    (gs[1, 0], df_amenity, 'DaylightBasement', 'Sale Price vs Daylight Basement', 'Daylight Basement'),
    (gs[1, 1], df_amenity, 'HeatSystem', 'Sale Price vs Heat Systems', 'Heat System'),
    (gs[2, 0], df_amenity, 'Bedrooms', 'Single Family: Sale Price vs Bedrooms', 'Bedrooms'),
    (gs[2, 1], df_t, 'YrRenovated', 'Sale Price vs Year Renovated', 'Year Renovated'),
]

# The darkgrid style is picked up when each subplot is created.
with sns.axes_style("darkgrid"):
    for slot, frame, column, panel_title, x_label in amenity_panels:
        ax = f.add_subplot(slot)
        sns.barplot(x=frame[column], y=frame['SalePrice'], ax=ax)
        ax.set(title=panel_title, xlabel=x_label, ylabel='Sale Price')

f.tight_layout()
f.savefig("./visualization/amenities.png")

In [None]:
# Regression (lmplot) charts of SalePrice against selected square-footage features.
#
# sns.lmplot is a figure-level function: it creates its own FacetGrid and figure and does not
# draw on axes prepared beforehand. Each chart is therefore titled and saved through the grid
# that lmplot returns. Saving through the `f` figure from the amenities cell (as before) wrote
# the amenities figure, with an extra empty subplot, to every reg-* file instead of the chart.

# One entry per chart: (x column, title, x-axis label, output file)
# NOTE(review): 'reg_SqFt1stFroor.png' looks like a typo for '1stFloor'; the path is kept
# unchanged in case other documents reference it.
reg_charts = [
    ("SqFtTotLiving", 'Sale Price vs SqFt Total Living', 'SqFt Total Living',
     "./visualization/reg-SqFtTotLiving.png"),
    ("SqFtLot", 'Sale Price vs Lot SqFt', 'Lot SqFt',
     "./visualization/reg-SqFtLot.png"),
    ("SqFtTotBasement", 'Sale Price vs Total Basement(SqFt)', 'Total Basement(SqFt)',
     "./visualization/reg-SqFtBasement.png"),
    ("SqFtFinBasement", 'Sale Price vs Finished Basement(SqFt)', 'Finished Basement(SqFt)',
     "./visualization/reg_SqFtFinBasement.png"),
    # Title/label corrected: this chart plots SqFt1stFloor, not the number of living units.
    ("SqFt1stFloor", 'Sale Price vs 1st Floor SqFt', '1st Floor SqFt',
     "./visualization/reg_SqFt1stFroor.png"),
]

for x_col, chart_title, x_label, out_path in reg_charts:
    with sns.axes_style("darkgrid"):
        grid = sns.lmplot(x=x_col, y="SalePrice", data=df_sqft)
        grid.set(title=chart_title, xlabel=x_label, ylabel='Sale Price')
        # Save before showing so the file is written from the fully drawn chart.
        grid.savefig(out_path)
        plt.show()

# 2. Having a porch increases home sale price

In [None]:
# Porch-only frame for the porch analysis.
# Taken as an independent copy so the TotalPorch column added later is written to this frame
# itself and does not trigger pandas' SettingWithCopyWarning.
df_porch = df_reg[['SalePrice','SqFtOpenPorch', 'SqFtEnclosedPorch']].copy()

In [None]:
# Summary statistics of the enclosed-porch area column
df_porch['SqFtEnclosedPorch'].describe()

In [None]:
# Summary statistics of the open-porch area column
df_porch['SqFtOpenPorch'].describe()

In [None]:
# Scatter plots for the porch frame.
# "porch" is presumably the label/name used for the saved chart(s) — confirm in the src helpers.
create_scatterplot_df(df_porch, "porch")

In [None]:
# Box plot of the open-porch area
create_boxplot(df_porch, 'SqFtOpenPorch')

In [None]:
# Box plot of the enclosed-porch area
create_boxplot(df_porch, 'SqFtEnclosedPorch')

In [None]:
# Total porch area = enclosed + open porch area.
# assign() returns a new frame, so the column is not written into a slice of another dataframe
# (which raises SettingWithCopyWarning) and re-running the cell gives the same result.
df_porch = df_porch.assign(
    TotalPorch=df_porch['SqFtEnclosedPorch'] + df_porch['SqFtOpenPorch']
)

In [None]:
# Scatter plot of total porch area against sale price.
# NOTE(review): the last argument ends in 'TotalPorcjh', which looks like a typo for 'TotalPorch';
# left unchanged because it is presumably the output name and may be referenced elsewhere.
create_scatterplot(df_porch, 'TotalPorch', 'SalePrice', 'scatter_SalePrice_TotalPorcjh')

In [None]:
# Scatter plot of enclosed-porch area against sale price
create_scatterplot(df_porch, "SqFtEnclosedPorch", "SalePrice", 'saleprice_sqftEnclosedPorch' )

In [None]:
# Rank the porch data from largest to smallest, first by open-porch area and then by
# enclosed-porch area. A notebook cell renders only its last bare expression, so the first
# ranking is shown explicitly with display(); previously it was computed and silently discarded.
display(df_porch.sort_values(by=['SqFtOpenPorch'], ascending = False))
df_porch.sort_values(by=['SqFtEnclosedPorch'], ascending = False)


In [None]:
# Scatter plot of enclosed-porch area against open-porch area
create_scatterplot(df_porch, "SqFtEnclosedPorch", "SqFtOpenPorch", 'SqFtEnc_SqFtOpen' )

In [None]:
# Porch-versus-sale-price analysis on the porch frame.
# porch_vs_house_price comes from the src star imports; its output is not visible here.
porch_vs_house_price(df_porch)

# 3) Having a beachfront or lakefront increases home sale price

In [None]:
# Waterfront analysis on the full merged frame.
# water_front_sales comes from the src star imports; its output is not visible here.
water_front_sales(df_merged)

## 4) Nuisance factors reduce the sale price of houses

In [None]:
# Create heatmap for the nuisance factors
#create_heatmap(df_nuisance, 'nuisance_heatmap' )

# Create pairplot of the nuisance factors
#create_pairplot(df_nuisance, 'nuisance_pairplot')

