# Some potential audiences are:

1. Homeowners who want to increase the sale price of their homes through home improvement projects
2. Advocacy groups who want to promote affordable housing
3. Local elected officials who want to understand how their policy ideas (e.g. zoning changes, permitting) might impact home prices
4. Real estate investors looking for potential "fixer-uppers" or "tear-downs"

# Three things to be sure you establish during this phase are:

1. **Objectives:** what questions are you trying to answer, and for whom?
2. **Project plan:** you may want to establish more formal project management practices, such as daily stand-ups or using a Trello board, to plan the time you have remaining. Regardless, you should determine the division of labor, communication expectations, and timeline.
3. **Success criteria:** what does a successful project look like? How will you know when you have achieved it?

# READ THIS: Import the following data files from https://info.kingcounty.gov/assessor/DataDownload/default.aspx
## Download the files to the local repo's data directory
> 1) Real Property Sales (.ZIP, csv) <BR>
> 2) Parcel (.ZIP, csv) <BR>
> 3) Residential Building (.ZIP, csv) <BR>
> 4) Unit Breakdown (.ZIP)<BR>


In [2]:
import os
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import linear_rainbow, het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor

from eda.eda import *
# from lr_model.build_lr import *

sns.set_style("whitegrid")
sns.set_context("poster")


ValueError: attempted relative import beyond top-level package

### Andrew's scratchwork below:
____

In [None]:
# Build the merged 2019 dataset with the project's consolidate_data helper.
# NOTE(review): consolidate_data is not defined in this notebook — presumably it
# comes from the `eda.eda` star import; confirm what `create=True` does there.
df_merged = consolidate_data(year=2019, create=True)

In [None]:
# Keep a positional subset of the merged columns in `df`.
# NOTE(review): the slices are positional, so the selection silently shifts
# whenever the column order of df_merged changes.
column_slices = [
    slice(2, 4), slice(7, 8), slice(10, 11), slice(14, 18),
    slice(27, 30), slice(38, 41), slice(42, 43), slice(44, 47),
]
merged_columns = list(df_merged.columns)
cols = [name for piece in column_slices for name in merged_columns[piece]]
df = df_merged[cols]

In [None]:
# Working frame: every merged row without missing values, excluding rows whose
# SalePrice equals -300 or -100, re-indexed from zero.
df = df_merged.dropna()
df = df[~df['SalePrice'].isin([-300, -100])]
df = df.reset_index(drop=True)
print(df.shape)
df.head()


In [None]:
# Print the column dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Pairwise correlations of the numeric/boolean columns. They are selected
# explicitly because newer pandas versions raise on text columns instead of
# silently skipping them.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Collect (and print) every column whose correlation with SalePrice is at least
# 0.10 in absolute value. The absolute value is taken of the correlation itself,
# so strongly negative predictors are kept as well; SalePrice is included too
# because it correlates perfectly with itself.
numeric_merged = df_merged.select_dtypes(include=['number', 'bool'])
dic = numeric_merged.corr()['SalePrice'].to_dict()
col = [x for x in dic if abs(dic[x]) >= 0.10]
for x in col:
    print(x)

In [None]:
# First simple model: OLS of SalePrice on hand-picked size and room-count
# features, fitted on the complete rows of the columns selected in `col`.
fsm_df = df_merged[col].dropna()
fsm = ols(formula="SalePrice ~ NbrLivingUnits + Stories + SqFt1stFloor + SqFt2ndFloor + SqFtTotLiving + SqFtGarageAttached + SqFtOpenPorch + Bedrooms + BathFullCount", data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# OLS of SalePrice on floor-area, porch and room-count features.
# Each predictor is listed exactly once: selecting the same column twice creates
# duplicate labels in fsm_df, and repeating a term in the formula adds nothing.
feature_cols = [
    'SqFt1stFloor', 'SqFt2ndFloor', 'SqFtTotLiving', 'SqFtGarageAttached',
    'SqFtOpenPorch', 'SqFtEnclosedPorch', 'Bedrooms', 'BathHalfCount',
    'BathFullCount',
]
fsm_df = df_merged[['SalePrice'] + feature_cols].dropna()
fsm = ols(formula="SalePrice ~ " + " + ".join(feature_cols), data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# Display the working frame.
df

In [None]:
# Preview the object-dtype columns, i.e. the ones still needing encoding.
df.select_dtypes(include = 'object').head()

In [None]:
def dummying(df, col_name, keep=True):
    """Label-encode one column of ``df`` into ``<col_name>_Encoded``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    col_name : hashable
        Label of the column to encode. Nothing is encoded when it is absent.
    keep : bool, default True
        When falsy, the original column is dropped afterwards (a missing
        column is ignored).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if col_name in df.columns:
        # Imported here so the helper works even when no earlier cell has
        # imported LabelEncoder.
        from sklearn.preprocessing import LabelEncoder

        # Index with the label itself: formatting it into a string first would
        # raise KeyError for non-string column labels.
        df[f"{col_name}_Encoded"] = LabelEncoder().fit_transform(df[col_name])
    if not keep:
        df.drop(columns=[col_name], inplace=True, errors='ignore')
    return df

In [None]:
# Encode WfntAccessRights through the dummying() helper instead of repeating its
# body inline; keep=False drops the raw column once WfntAccessRights_Encoded exists.
# NOTE: unlike the inline version, the helper does nothing if the column is absent.
df = dummying(df, "WfntAccessRights", keep=False)

In [None]:
# Inspect the encoded access-rights column.
df["WfntAccessRights_Encoded"]

In [None]:
# Add WfntProximityInfluence_Encoded via the dummying() helper (raw column kept)
# rather than duplicating the encoder boilerplate in this cell.
df = dummying(df, "WfntProximityInfluence")

In [None]:
# Replace PowerLines in place with integer label codes.
# NOTE(review): the neighbouring cells write to a new "<name>_Encoded" column
# instead of overwriting — confirm the in-place overwrite is intended.
label_encoder = LabelEncoder().fit(df["PowerLines"])
status_labels = label_encoder.transform(df["PowerLines"])
df["PowerLines"] = status_labels

In [None]:
# Add OtherNuisances_Encoded via the dummying() helper (raw column kept)
# rather than duplicating the encoder boilerplate in this cell.
df = dummying(df, "OtherNuisances")

In [None]:
# Add BuildingNumber_Encoded via the dummying() helper (raw column kept)
# rather than duplicating the encoder boilerplate in this cell.
df = dummying(df, "BuildingNumber")

In [None]:
# Add ZipCode_Encoded via the dummying() helper (raw column kept)
# rather than duplicating the encoder boilerplate in this cell.
df = dummying(df, "ZipCode")

In [None]:
# Add DaylightBasement_Encoded via the dummying() helper (raw column kept)
# rather than duplicating the encoder boilerplate in this cell.
df = dummying(df, "DaylightBasement")

In [None]:
# Correlations after encoding. Only numeric/boolean columns are passed to corr()
# because newer pandas versions raise on the remaining text columns.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# List the columns that are still object dtype.
df.select_dtypes(include = 'object').columns

In [None]:
def dummying(df, col_name, keep=True):
    """Label-encode one column of ``df`` into ``<col_name>_Encoded``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    col_name : hashable
        Label of the column to encode. Nothing is encoded when it is absent.
    keep : bool, default True
        When falsy, the original column is dropped afterwards (a missing
        column is ignored).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if col_name in df.columns:
        # Imported here so the helper works even when no earlier cell has
        # imported LabelEncoder.
        from sklearn.preprocessing import LabelEncoder

        # Index with the label itself: formatting it into a string first would
        # raise KeyError for non-string column labels.
        df[f"{col_name}_Encoded"] = LabelEncoder().fit_transform(df[col_name])
    if not keep:
        df.drop(columns=[col_name], inplace=True, errors='ignore')
    return df

In [None]:
def dummying_df(df, keep=True):
    """Label-encode every object-dtype column of ``df``.

    Each object column ``c`` gains an integer companion ``c_Encoded``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    keep : bool, default True
        When falsy, each original object column is dropped once encoded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Snapshot the object columns before any "_Encoded" columns are added.
    cols = df.select_dtypes(include='object').columns
    if cols.empty:
        # Nothing to encode, so no encoder is needed.
        return df

    # Imported here so the helper works even when no earlier cell has imported
    # LabelEncoder.
    from sklearn.preprocessing import LabelEncoder

    label_encoder = LabelEncoder()
    for col in cols:
        # Index with the label itself; an f-string would stringify non-string labels.
        df[f"{col}_Encoded"] = label_encoder.fit_transform(df[col])
        if not keep:
            df.drop(columns=[col], inplace=True, errors='ignore')
    return df

In [None]:
# Object-dtype columns before the bulk encoding below.
df.select_dtypes(include = 'object').columns

In [None]:
# Label-encode every object column; the second argument is keep=False, so the
# original text columns are dropped after encoding.
df = dummying_df(df, False)

In [None]:
# Check which object-dtype columns remain after the bulk encoding.
df.select_dtypes(include = 'object').columns

In [None]:
# Display the encoded working frame.
df

In [None]:
# List the column labels of the encoded frame.
df.columns

# 2. Having a porch increases home sale price

In [None]:
# Independent copy of the sale price and the two porch-area columns.
porch_columns = ['SalePrice', 'SqFtOpenPorch', 'SqFtEnclosedPorch']
df_porch = df.loc[:, porch_columns].copy()
df_porch.head()

In [None]:
# Summary statistics of the enclosed-porch area.
df_porch['SqFtEnclosedPorch'].describe()

In [None]:
# Summary statistics of the open-porch area.
df_porch['SqFtOpenPorch'].describe()

In [None]:
# Enclosed-porch area (x) against open-porch area (y).
sns.scatterplot(data=df_porch, x="SqFtEnclosedPorch", y="SqFtOpenPorch")

In [None]:
# Horizontal box plot of open-porch area. The series is passed as x= because a
# positional argument is interpreted differently across seaborn versions.
sns.boxplot(x=df_porch['SqFtOpenPorch'])

In [None]:
# Horizontal box plot of enclosed-porch area. The series is passed as x= because
# a positional argument is interpreted differently across seaborn versions.
sns.boxplot(x=df_porch['SqFtEnclosedPorch'])

In [None]:
# OLS of SalePrice on the two porch areas, using complete rows only.
fsm_df = df_porch[['SalePrice', 'SqFtEnclosedPorch', 'SqFtOpenPorch']].dropna()
fsm = ols(formula="SalePrice ~ SqFtEnclosedPorch + SqFtOpenPorch", data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# Sale price against enclosed-porch area.
sns.scatterplot(data=df_porch, x="SqFtEnclosedPorch", y="SalePrice")

In [None]:
# Sale price against open-porch area.
sns.scatterplot(data=df_porch, x="SqFtOpenPorch", y="SalePrice")

In [None]:
# The five rows with the largest open-porch area.
df_porch.sort_values(by=['SqFtOpenPorch'], ascending = False).head()

In [None]:
# The five rows with the largest enclosed-porch area.
df_porch.sort_values(by=['SqFtEnclosedPorch'], ascending = False).head()

In [None]:
# Enclosed-porch area (x) against open-porch area (y); same plot as earlier in this section.
sns.scatterplot(data=df_porch, x="SqFtEnclosedPorch", y="SqFtOpenPorch")

In [None]:
# Summary statistics of the sale price.
df_porch['SalePrice'].describe()

In [None]:
# Smallest sale price, to spot placeholder values.
df_porch['SalePrice'].min()

In [None]:
# Discard rows whose SalePrice is one of the two excluded values (-300, -100).
excluded_prices = [-300, -100]
df_porch = df_porch[~df_porch['SalePrice'].isin(excluded_prices)]

In [None]:
# Sale-price summary after removing the excluded values.
df_porch['SalePrice'].describe()

In [None]:
def sale_tier(s, quartiles=None):
    """Return the price-tier label for sale price ``s``.

    Parameters
    ----------
    s : number
        Sale price to classify.
    quartiles : mapping, optional
        Object indexable by ``'25%'``, ``'50%'`` and ``'75%'`` (for example the
        result of ``Series.describe()``). When omitted, the cut-offs are taken
        from ``df_porch['SalePrice'].describe()`` on every call; pass a
        precomputed value to avoid that repeated work when classifying many
        prices.

    Returns
    -------
    str
        ``'Luxury Housing'`` at or above the 75% cut-off, ``'High-End Housing'``
        at or above 50%, ``'Medium-End Housing'`` at or above 25%, otherwise
        ``'Low-End Housing'``.
    """
    d = df_porch['SalePrice'].describe() if quartiles is None else quartiles
    # Checked from the highest cut-off downwards, so the first match wins.
    if s >= d['75%']:
        return 'Luxury Housing'
    if s >= d['50%']:
        return 'High-End Housing'
    if s >= d['25%']:
        return 'Medium-End Housing'
    return 'Low-End Housing'

In [None]:
# Label every sale with its price tier. The quartile cut-offs are computed once
# and applied to the whole column, instead of calling sale_tier() per row (which
# re-runs describe() on the full column for every single sale).
sale_prices = df_porch['SalePrice']
q25, q50, q75 = sale_prices.quantile([0.25, 0.50, 0.75])
df_porch['SaleTier'] = np.select(
    [sale_prices >= q75, sale_prices >= q50, sale_prices >= q25],
    ['Luxury Housing', 'High-End Housing', 'Medium-End Housing'],
    default='Low-End Housing',
)

In [None]:
# Enclosed vs. open porch area coloured by SaleTier, drawn with the "notebook" context.
sns.set_context("notebook")
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_porch, x="SqFtEnclosedPorch", y="SqFtOpenPorch", hue = 'SaleTier', s=100)

In [None]:
# Same scatter plot drawn with the "paper" context, for comparing text sizes.
sns.set_context("paper")
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_porch, x="SqFtEnclosedPorch", y="SqFtOpenPorch", hue = 'SaleTier', s=100)

In [None]:
# Same scatter plot drawn with the "talk" context, for comparing text sizes.
sns.set_context("talk")
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_porch, x="SqFtEnclosedPorch", y="SqFtOpenPorch", hue = 'SaleTier', s=100)

In [None]:
# Same scatter plot drawn with the "poster" context; this context stays active
# for the following cells until it is changed again.
sns.set_context("poster")
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_porch, x="SqFtEnclosedPorch", y="SqFtOpenPorch", hue = 'SaleTier', s=100)

In [None]:
# Combined porch area: open plus enclosed square footage.
open_area = df_porch['SqFtOpenPorch']
enclosed_area = df_porch['SqFtEnclosedPorch']
df_porch['TotalPorch'] = open_area.add(enclosed_area)
df_porch.head()

In [None]:
# OLS of SalePrice on the combined porch area, using complete rows only.
fsm_df = df_porch[['SalePrice', 'TotalPorch']].dropna()
fsm = ols(formula="SalePrice ~ TotalPorch", data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# Sale price against combined porch area, coloured by SaleTier.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_porch, x="TotalPorch", y="SalePrice", hue = 'SaleTier', alpha = 0.8, s=500)

In [None]:
# Open (x) vs. enclosed (y) porch area, coloured by SaleTier with the marker
# size mapped to SalePrice.
# NOTE(review): both size= and a fixed s= are passed — confirm which one takes effect.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_porch, y="SqFtEnclosedPorch", x="SqFtOpenPorch", size = 'SalePrice', hue = 'SaleTier', s=1000)

In [None]:
# Presentation scatter plot: open vs. enclosed porch area, coloured by price tier.
colors = ['#f47a60', '#ced7d8', '#7fe7dc', '#316879']

# Tick positions are pinned explicitly so that every label is guaranteed to sit
# on the value it names, instead of depending on matplotlib's automatic ticks.
tick_positions = np.arange(0, 3001, 500)
x_ticks = [f"{position / 1000:.1f}k" for position in tick_positions]
y_ticks = list(x_ticks)

fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_porch, x="SqFtOpenPorch", y="SqFtEnclosedPorch", hue = 'SaleTier', alpha = 0.8, s=500, palette = colors)
ax.set_title("House Price compared to Porch Area")
# Slightly negative lower limits keep the large markers at 0 fully visible.
plt.xlim(-90,3000)
plt.ylim(-90,3000)
ax.set_xlabel('Open Porch Area (SqFt)')
ax.set_xticks(tick_positions)
ax.set_xticklabels(x_ticks)
ax.set_ylabel('Enclosed Porch Area (SqFt)')
ax.set_yticks(tick_positions)
ax.set_yticklabels(y_ticks)

# Order the legend by tier name rather than by handle position: positions depend
# on the order tiers appear in the data and on whether seaborn adds a title entry.
tier_order = ['Luxury Housing', 'High-End Housing', 'Medium-End Housing', 'Low-End Housing']
handles, labels = ax.get_legend_handles_labels()
handle_by_label = dict(zip(labels, handles))
labels = [tier for tier in tier_order if tier in handle_by_label]
handles = [handle_by_label[tier] for tier in labels]
ax.legend(handles, labels)

# figure = ax.get_figure().savefig("porch_porch_area", dpi = 400, bbox_inches = "tight")

In [None]:
# Scratch check: comparing False with itself evaluates to True.
False == False

In [None]:
# Houses that have BOTH an open and an enclosed porch: of the rows with at least
# one porch, only those where the two "has porch" flags agree are kept.
has_open_porch = df['SqFtOpenPorch'] > 0
has_enclosed_porch = df['SqFtEnclosedPorch'] > 0
df_por = df[has_open_porch & has_enclosed_porch]

# Overlay sale price against each porch area on the same axes.
fig, ax = plt.subplots(figsize=(10, 10))
for porch_column in ("SqFtEnclosedPorch", "SqFtOpenPorch"):
    ax = sns.scatterplot(data=df_por, x=porch_column, y="SalePrice", alpha = 0.8, s=500)

In [None]:
# Houses with any porch at all: add the combined porch area to a copy of df and
# keep the rows where it is positive.
df_por = df.assign(TotalPorch=df['SqFtOpenPorch'] + df['SqFtEnclosedPorch'])
df_por = df_por[df_por['TotalPorch'] > 0]

# Sale price against combined porch area.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=df_por, x="TotalPorch", y="SalePrice", alpha = 0.8, s=500)

In [None]:
# Rows ordered from the largest combined porch area downwards.
df_por.sort_values(by='TotalPorch', ascending=False)

In [None]:
def porchyn(p):
    """Return 1 when the porch area ``p`` is greater than zero, otherwise 0."""
    return int(bool(p > 0))

In [None]:
# 1/0 porch indicator derived from the combined porch area.
total_porch_area = df_porch['TotalPorch']
df_porch['Porchyn'] = total_porch_area.apply(porchyn)
df_porch.head()

In [None]:
# Sale prices of the houses that have a porch (positive combined porch area).
df_p = df_porch[df_porch['TotalPorch'] > 0]['SalePrice']
df_p

In [None]:
# Trim high-price outliers separately for houses with and without a porch:
# within each group, keep sales at or below mean + 3 standard deviations.
df_p = df_porch[df_porch['TotalPorch'] > 0]['SalePrice']
df_np = df_porch[df_porch['TotalPorch'] == 0]['SalePrice']
p_outlier = ( 3 * df_p.std() ) + df_p.mean()
np_outlier = ( 3 * df_np.std() ) + df_np.mean()
df1 = df_porch[(df_porch['Porchyn'] == 0) & (df_porch['SalePrice'] <= np_outlier)]
df2 = df_porch[(df_porch['Porchyn'] == 1) & (df_porch['SalePrice'] <= p_outlier)]
# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide.
df_porch_cheap = pd.concat([df1, df2])
df_porch_cheap.shape

In [None]:
# OLS of SalePrice on the porch indicator, fitted on the outlier-trimmed data.
fsm_df = df_porch_cheap[['SalePrice', 'Porchyn']].dropna()
fsm = ols(formula="SalePrice ~ Porchyn", data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# Presentation bar chart: mean sale price of houses without vs. with a porch
# (outlier-trimmed data), with a line joining the two means.
fig, ax = plt.subplots(figsize=(10, 10))
bar_colors = ['#aed6dc', '#ff9a8d', '#4a536b']
x_ticks = ['House w/o Porch', 'House w/ Porch']
y_ticks = ['$0.0 mil', '$0.2 mil', '$0.4 mil', '$0.6 mil', '$0.8 mil', '$1.0 mil']
ax = sns.barplot(data = df_porch_cheap, x = "Porchyn", y = "SalePrice", alpha = 1, palette = bar_colors, ci = 80, zorder = 0)
sns.lineplot(data = df_porch_cheap, x = 'Porchyn', y = 'SalePrice', alpha = 1, color = bar_colors[2], zorder = 10)
ax.set_title("Average Price Increase of a House with a Porch")
plt.ylim(0,1000000)
ax.set_xlabel('')
# Pin tick positions before labelling so each label is tied to a known value
# rather than to whatever ticks matplotlib happens to choose.
ax.set_xticks([0, 1])
ax.set_xticklabels(x_ticks)
ax.set_ylabel('')
ax.set_yticks(np.linspace(0, 1000000, len(y_ticks)))
ax.set_yticklabels(y_ticks)
plt.show()
figure = ax.get_figure().savefig("poch_avg_price", dpi = 400, bbox_inches = "tight")

In [None]:
# Mean sale price without (m) and with (n) a porch in the trimmed data, followed
# by the with/without ratio.
without_porch = df_porch_cheap['Porchyn'] == 0
with_porch = df_porch_cheap['Porchyn'] == 1
m = df_porch_cheap.loc[without_porch, 'SalePrice'].mean()
n = df_porch_cheap.loc[with_porch, 'SalePrice'].mean()
print(m, n)
n/m

In [None]:
# Pairwise relationship grid for the porch frame.
sns.pairplot(df_porch)

In [None]:
# OLS of SalePrice on the porch indicator, this time on the untrimmed porch data.
fsm_df = df_porch[['SalePrice', 'Porchyn']].dropna()
fsm = ols(formula="SalePrice ~ Porchyn", data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# Sale prices split by porch presence, each re-indexed from zero.
porch_area = df_porch['TotalPorch']
df_p = df_porch.loc[porch_area > 0, 'SalePrice'].reset_index(drop = True)
df_np = df_porch.loc[porch_area == 0, 'SalePrice'].reset_index(drop = True)

In [None]:
# Horizontal box plot of sale prices for houses with a porch. The series is
# passed as x= (the x-limits below assume prices run along x); a positional
# argument is interpreted differently across seaborn versions.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(x=df_p)
ax.set(xlim=(0, 10000000), ylim = (-0.5,0.5))

In [None]:
# Horizontal box plot of sale prices for houses without a porch. The series is
# passed as x= (the x-limits below assume prices run along x); a positional
# argument is interpreted differently across seaborn versions.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(x=df_np)
ax.set(xlim=(0, 10000000), ylim = (-0.5,0.5))

In [None]:
# One SalePrice box plot per Porchyn value, stacked as rows of a FacetGrid and
# drawn with the "notebook" context.
# NOTE(review): order=[0,1] is forwarded to sns.boxplot although only one
# variable is mapped — confirm it has any effect.
# fig, ax = plt.subplots(figsize=(10, 10))
# ax = sns.boxplot(df_p)
# ax.set(xlim=(0, 10000000), ylim = (-0.5,0.5))
sns.set_context('notebook')
g = sns.FacetGrid(df_porch, row = "Porchyn")
g.map(sns.boxplot, "SalePrice", order = [0,1])

In [None]:
# Stacked box plots of sale prices with (top) and without (bottom) a porch,
# outliers hidden.
sns.set_context('poster')
fig, axs = plt.subplots(2, 1, figsize = (10,10))
# Prices are passed as x= so the boxes really are horizontal; when only y= is
# supplied seaborn warns and ignores orient='h'.
sns.boxplot(x=df_p,  orient='h' , ax=axs[0], showfliers = False)
sns.boxplot(x=df_np,  orient='h' , ax=axs[1], showfliers = False)

In [None]:
# Two-sample t-test comparing sale prices with vs. without a porch;
# equal_var=False selects the unequal-variance (Welch) form of the test.
from scipy.stats import ttest_ind
ttest_result = ttest_ind(df_p, df_np, equal_var = False)
ttest_result

In [None]:
# p-value of the porch t-test.
p = ttest_result.pvalue
p

# The p-value is small, so we reject the null hypothesis in favor of the alternative hypothesis

# 3. Having a beachfront or lakefront property increases home sale price

In [None]:
# List the available columns before the waterfront analysis.
df.columns

In [None]:
# Number of sales per WfntLocation value.
df['WfntLocation'].value_counts()

In [None]:
# Independent copy of the sale price and waterfront-location columns.
df_wf = df[['SalePrice', 'WfntLocation']].copy()

In [None]:
def wfyn(w):
    """Return 1 when the waterfront value ``w`` is greater than zero, otherwise 0."""
    is_waterfront = w > 0
    return 1 if is_waterfront else 0

In [None]:
# 1/0 waterfront indicator derived from the WfntLocation value.
location_values = df_wf['WfntLocation']
df_wf['Wfntyn'] = location_values.apply(wfyn)
df_wf.head()

In [None]:
# Bar chart of SalePrice for each value of the waterfront indicator.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.barplot(data=df_wf, x="Wfntyn", y="SalePrice", alpha = 0.8)

In [None]:
# Box plot with SalePrice on x and WfntLocation on y, also coloured by WfntLocation.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(data=df_wf, y="WfntLocation", x="SalePrice",hue = 'WfntLocation')

In [None]:
# Same variables as the previous plot but with orient='v' requested.
# NOTE(review): SalePrice stays on x while a vertical orientation is requested —
# confirm this is the intended layout.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(data=df_wf, y="WfntLocation", x="SalePrice", orient = 'v', hue = 'WfntLocation')

In [None]:
# Vertical box plot of SalePrice per WfntLocation with outlier markers hidden.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(data=df_wf, x="WfntLocation", y="SalePrice", orient = 'v', hue = 'WfntLocation', showfliers = False)

In [None]:
# String version of WfntLocation so it can be plotted as a categorical variable.
df_wf['WfntLocation_cat'] = df_wf['WfntLocation'].astype('str')

In [None]:
# Presentation box plot: sale-price distribution per waterfront location.
fig, ax = plt.subplots(figsize=(10, 10))
colors = ['#4d5198', '#daf2dc', '#81b7d2', '#ffcce7']
x_tick_positions = np.arange(0, 12000001, 2000000)
x_ticks = ['$0 mil','$2 mil','$4 mil','$6 mil','$8 mil', '$10 mil', '$12 mil']
y_ticks = ['Non-Waterfront','Duwamish', 'Puget Sound', 'Ship Canal', 'Lake Wash', 'Lake Samm', 'Other Lakes', 'River/Slough']
# Draw the boxes in ascending code order. Without an explicit order seaborn uses
# the order in which the codes first appear in the data, so the fixed name list
# above could end up on the wrong boxes.
# NOTE(review): y_ticks presumably names the location codes present in the data
# in ascending order — verify against the assessor's code lookup.
location_order = sorted(df_wf['WfntLocation_cat'].unique(), key=float)
ax = sns.boxplot(x=df_wf['SalePrice'], y=df_wf['WfntLocation_cat'], order=location_order, orient ='h', showfliers = False, boxprops=dict(alpha=.8), palette = colors)
plt.xlim(-600000,12000000)
ax.set_title("House Price")
ax.set_xlabel('')
# Pin the x tick positions so each dollar label sits on the value it names.
ax.set_xticks(x_tick_positions)
ax.set_xticklabels(x_ticks)
ax.set_ylabel('')
ax.set_yticklabels(y_ticks)
figure = ax.get_figure().savefig("wf_avg_breakdown", dpi = 400, bbox_inches = "tight")

In [None]:
# Number of sales per waterfront-location category.
df_wf['WfntLocation_cat'].value_counts()

In [None]:
# Sale prices split into waterfront (location value > 0) and non-waterfront
# (location value == 0) groups, with the size of each group printed.
location_value = df['WfntLocation']
df_w = df.loc[location_value > 0, 'SalePrice'].copy()
df_nw = df.loc[location_value == 0, 'SalePrice'].copy()
print(len(df_w), len(df_nw))

In [None]:
# Horizontal box plot of waterfront sale prices. The series is passed as x=
# (the x-limits below assume prices run along x); a positional argument is
# interpreted differently across seaborn versions.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(x=df_w)
ax.set(xlim=(0, 10000000), ylim = (-0.5,0.5))

In [None]:
# Horizontal box plot of non-waterfront sale prices. The series is passed as x=
# (the x-limits below assume prices run along x); a positional argument is
# interpreted differently across seaborn versions.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(x=df_nw)
ax.set(xlim=(0, 10000000), ylim = (-0.5,0.5))

In [None]:
# Pairwise relationship grid for the waterfront frame, with regression fits.
sns.pairplot(df_wf, kind = 'reg')

In [None]:
# Number of waterfront (1) and non-waterfront (0) sales.
df_wf['Wfntyn'].value_counts()

In [None]:
# OLS of SalePrice on the waterfront indicator, using complete rows only.
fsm_df = df_wf[['SalePrice', 'Wfntyn']].dropna()
fsm = ols(formula="SalePrice ~ Wfntyn", data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# Unequal-variance t-test of waterfront vs. non-waterfront sale prices.
# ttest_ind is imported in an earlier cell of this notebook.
ttest_result = ttest_ind(df_w, df_nw, equal_var = False)
ttest_result

In [None]:
# p-value of the waterfront t-test.
p = ttest_result.pvalue
p

In [None]:
# Draft bar chart of mean sale price for normal vs. waterfront houses on the
# untrimmed data, with a line joining the two means.
# NOTE(review): the y tick labels are fixed strings applied without setting tick
# positions, so they may not match the values actually on the axis — verify
# before reusing this figure.
fig, ax = plt.subplots(figsize=(10, 10))
bar_colors = ['#c6d7eb', '#d9a5b3', '#1868ae']
x_ticks = ['Normal House', 'Waterfront House']
y_ticks = ['$0k', '$25k', '$50k', '$75k', '$100k', '$125k', '$150k', '$175k', '$200k']
ax = sns.barplot(data = df_wf, x = "Wfntyn", y = "SalePrice", alpha = 0.9, palette = bar_colors, ci = 93, zorder = 0)
sns.lineplot(data = df_wf, x = 'Wfntyn', y = 'SalePrice', alpha = 1, color = bar_colors[2], zorder = 10)
ax.set_title("Average Price Increase of a Waterfront House")
ax.set_xlabel('')
ax.set_xticklabels(x_ticks)
ax.set_ylabel('')
ax.set_yticklabels(y_ticks)
# Bare literal left over from scratch work; it only becomes the cell's displayed value.
250000

In [None]:
# Trim high-price outliers separately for waterfront and non-waterfront sales:
# within each group, keep sales at or below mean + 3 standard deviations.
w_outlier = ( 3 * df_w.std() ) + df_w.mean()
nw_outlier = ( 3 * df_nw.std() ) + df_nw.mean()
df1 = df_wf[(df_wf['Wfntyn'] == 0) & (df_wf['SalePrice'] <= nw_outlier)]
df2 = df_wf[(df_wf['Wfntyn'] == 1) & (df_wf['SalePrice'] <= w_outlier)]
# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide.
df_wf_cheap = pd.concat([df1, df2])
df_wf_cheap.shape

In [None]:
# Difference between the waterfront and non-waterfront mean sale price (trimmed data).
df_wf_cheap[df_wf_cheap['Wfntyn'] == 1]['SalePrice'].mean() - df_wf_cheap[df_wf_cheap['Wfntyn'] == 0]['SalePrice'].mean()

In [None]:
# OLS of SalePrice on the waterfront indicator, fitted on the outlier-trimmed data.
fsm_df = df_wf_cheap[['SalePrice', 'Wfntyn']].dropna()
fsm = ols(formula="SalePrice ~ Wfntyn", data=fsm_df)
fsm_results = fsm.fit()
fsm_results.summary()

In [None]:
# Presentation bar chart: mean sale price of normal vs. waterfront houses
# (outlier-trimmed data), with a line joining the two means.
fig, ax = plt.subplots(figsize=(10, 10))
bar_colors = ['#aed6dc', '#ff9a8d', '#4a536b']
x_ticks = ['Normal House', 'Waterfront House']
y_ticks = ['$0.0 mil', '$0.2 mil', '$0.4 mil', '$0.6 mil', '$0.8 mil', '$1.0 mil', '$1.2 mil', '$1.4 mil', '$1.6 mil', '$1.8 mil']
ax = sns.barplot(data = df_wf_cheap, x = "Wfntyn", y = "SalePrice", alpha = 0.9, palette = bar_colors, ci = 95, zorder = 0)
sns.lineplot(data = df_wf_cheap, x = 'Wfntyn', y = 'SalePrice', alpha = 1, color = bar_colors[2], zorder = 10)
ax.set_title("Average Price Increase of a Waterfront House")
plt.ylim(0,1800000)
ax.set_xlabel('')
# Pin tick positions before labelling so each label is tied to a known value
# rather than to whatever ticks matplotlib happens to choose.
ax.set_xticks([0, 1])
ax.set_xticklabels(x_ticks)
ax.set_ylabel('')
ax.set_yticks(np.linspace(0, 1800000, len(y_ticks)))
ax.set_yticklabels(y_ticks)
plt.show()
figure = ax.get_figure().savefig("wf_avg_price", dpi = 400, bbox_inches = "tight")

In [None]:
# Mean sale price of non-waterfront (m) and waterfront (n) houses in the trimmed
# data, followed by the waterfront/non-waterfront ratio.
not_waterfront = df_wf_cheap['Wfntyn'] == 0
is_waterfront = df_wf_cheap['Wfntyn'] == 1
m = df_wf_cheap.loc[not_waterfront, 'SalePrice'].mean()
n = df_wf_cheap.loc[is_waterfront, 'SalePrice'].mean()
print(m, n)
n/m

# Waterfront houses are far more expensive

In [None]:
# Non-waterfront mean sale price before vs. after outlier trimming.
print(df_wf[df_wf['Wfntyn'] == 0]['SalePrice'].mean(), df_wf_cheap[df_wf_cheap['Wfntyn'] == 0]['SalePrice'].mean())

In [None]:
# Mean non-waterfront sale price in the trimmed data.
df_wf_cheap[df_wf_cheap['Wfntyn'] == 0]['SalePrice'].mean()

# The p-value is small, so we reject the null hypothesis in favor of the alternative hypothesis

In [None]:
# Preview the categorical frame.
# NOTE(review): df_cat is first assigned in the following cell, so running the
# notebook top to bottom fails here with a NameError.
df_cat.head()

In [None]:
# Object-dtype columns of df, minus the listed columns that should not be
# encoded (names that are absent are ignored). drop() is chained instead of used
# in place, so a frame derived from df is never mutated.
df_cat = df.select_dtypes(include = 'object').drop(
    columns = ['Merged_Key', 'DocumentDate', 'PropType', 'Address', 'DaylightBasement'],
    errors = 'ignore',
)
df_cat.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# transformed = jobs_encoder.transform(df_cat['WfntAccessRights'].to_numpy().reshape(-1, 1))
#Create a Pandas DataFrame of the hot encoded column
# ohe_df = pd.DataFrame(transformed, columns=jobs_
#                       .get_feature_names())
# #concat with original data
# data = pd.concat([data, ohe_df], axis=1).drop(['Profession'], axis=1)

In [None]:
# `enc` is not created anywhere in the visible notebook, so build the encoder
# (OneHotEncoder is imported in the cell above) before fitting it on df_cat.
enc = OneHotEncoder()
enc.fit(df_cat)

In [None]:
# Categories the fitted encoder found for each column.
enc.categories_

In [None]:
# Number of sales per NbrLivingUnits value.
df.NbrLivingUnits.value_counts()

In [None]:
# Rows whose NbrLivingUnits is 1 or 2.
df.loc[df['NbrLivingUnits'].isin([1,2])]

In [None]:
# Number of non-waterfront sales.
len(df_nw)

In [None]:
# Number of waterfront sales.
len(df_w)