## Data Exploration

In [10]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
# Load the raw house sales data that every later cell works on.
raw_csv_path = '../data/kc_house_data.csv'
df = pd.read_csv(raw_csv_path)

In [11]:
# Functions to help convert our object datatypes into int64 / float
# Convert 'waterfront' column to 0's and 1's to represent 'NO' and 'YES'
def map_waterfront_bool(arr):
    """Encode a single 'waterfront' label as an integer flag.

    Returns 1 for 'YES', 0 for 'NO', and None for any other value.
    """
    for label, flag in (('YES', 1), ('NO', 0)):
        if arr == label:
            return flag
    return None

# Convert 'condition' column into an int64 format from object
def condition_conv(vals):
    """Translate a 'condition' label into its ordinal score.

    'Poor' -> 1, 'Fair' -> 2, 'Average' -> 3, 'Good' -> 4, 'Very Good' -> 5.
    Any other value yields None.
    """
    # Labels listed from worst to best so the 1-based position is the score.
    ordered_labels = ('Poor', 'Fair', 'Average', 'Good', 'Very Good')
    for score, label in enumerate(ordered_labels, start=1):
        if vals == label:
            return score
    return None

# Convert the '?' from "sqft_basement" column to 0 (best to assume they do not have a basement)
def sqft_question(sqft):
    """Replace the '?' placeholder with 0; pass every other value through unchanged."""
    return 0 if sqft == '?' else sqft

# Convert all 'NONE' views to 0's in order to represent it in a boolean form
def view_bool(views):
    """Encode a single 'view' value as a 0/1 flag.

    Returns 0 when the value is 'NONE' or missing (NaN / None), and 1 for
    any other value.

    Missing values are treated as "no view" so that an unknown entry is not
    silently counted as having a view; this mirrors how missing 'waterfront'
    values are filled with 'NO' before encoding.
    """
    if pd.isna(views) or views == 'NONE':
        return 0
    return 1

def basement_conv(basement):
    """Return 1 when the basement area is strictly positive, otherwise 0."""
    return 1 if basement > 0 else 0
    
def renovate(yr_reno, cutoff_year=2001):
    """Flag whether a renovation happened after a cutoff year.

    Parameters
    ----------
    yr_reno : number
        Year of the last renovation. NaN compares False against the cutoff
        and therefore yields 0.
    cutoff_year : number, optional
        Renovations strictly later than this year count as renovated.
        Defaults to 2001.

    Returns
    -------
    int
        1 if ``yr_reno > cutoff_year``, otherwise 0.
    """
    return 1 if yr_reno > cutoff_year else 0

def bool_to_num(x):
    """Collapse the truthiness of ``x`` to an integer: 1 if truthy, 0 if falsy."""
    return int(bool(x))

def density(zipcode):
    """Classify a ZIP code into a city-type code.

    Returns 0 when the ZIP code is in ``seattle``, 1 when it is in
    ``suburbs``, and 2 for everything else.

    Relies on the module-level ``seattle`` and ``suburbs`` collections, which
    must be defined before this function is called.
    """
    if zipcode in seattle:
        return 0
    return 1 if zipcode in suburbs else 2

# Convert 'date' column into int64 holding only the year. This is largely
# unnecessary, as we don't really care about the posted date; the columns for
# renovation year and built year are much more relevant.
# The year is taken as the text after the last '/', not from a fixed character
# offset: slicing from position 5 only works when month and day together use at
# least three characters, so a date such as '5/2/2014' would be cut to '014'.
# NOTE(review): assumes slash-delimited dates with the year last — confirm
# against the raw CSV.
df['date'] = df['date'].str.split('/').str[-1].astype(np.int64)

In [12]:
# Call all functions to turn data into a manipulatable format
# First fill missing values in 'waterfront' column with 'NO', then encode as 0/1.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the column selection: that chained in-place form is deprecated in pandas 2.x
# and does not modify df when copy-on-write is enabled.
df['waterfront'] = df['waterfront'].fillna('NO')
df['waterfront'] = df['waterfront'].map(map_waterfront_bool)
df['condition'] = df['condition'].map(condition_conv)
# '?' placeholders become 0 so the column can be cast to float.
df['sqft_basement'] = df['sqft_basement'].map(sqft_question).astype(float)
# Keep only the leading characters of 'grade' and cast them to an integer.
df['grade'] = df['grade'].str[:2].astype(np.int64)
df['view'] = df['view'].map(view_bool)
# Collapse the basement area into a 0/1 "has a basement" flag.
df['sqft_basement'] = df['sqft_basement'].map(basement_conv)
# Rename column to more accurately label our new datatype
df = df.rename(columns={'sqft_basement': 'has_basement'})
df['yr_renovated'] = df['yr_renovated'].map(renovate)

# Defining a limitation so Folium doesn't crash this notebook
df_below_par = df[df['condition'] < 3]

## Feature Engineering

In [13]:
# Ratio of each property's living area and lot size to the matching '*15' column.
living_ratio = df['sqft_living'].div(df['sqft_living15'])
lot_ratio = df['sqft_lot'].div(df['sqft_lot15'])
df['relative_living_area'] = living_ratio
df['relative_lot_area'] = lot_ratio

In [14]:
# ZIP codes that density() maps to city type 0.
# NOTE(review): 98144 was listed twice here; the duplicate is dropped. Confirm
# that the second entry was not meant to be a different ZIP code.
seattle = [
    98133, 98125, 98117, 98103, 98115, 98105, 98107, 98199, 98119,
    98109, 98102, 98112, 98121, 98101, 98154, 98104, 98122, 98144, 98134, 98116,
    98136, 98126, 98106, 98118, 98108,
]
# ZIP codes that density() maps to city type 1; every other ZIP code maps to 2.
suburbs = [
    98177, 98160, 98155, 98028, 98011, 98072, 98052, 98034, 98033, 98039,
    98004, 98005, 98007, 98008, 98074, 98075, 98029, 98056, 98178, 98146,
    98166, 98148, 98158, 98188, 98057, 98055, 98031, 98032, 98198,
    98023, 98003, 98001, 98047, 98002, 98030, 98042, 98168, 98040, 98006,
    98059, 98058, 98077, 98053,
]

df['city_type'] = df['zipcode'].map(density)

In [15]:
# One-hot encode 'city_type'; drop='first' omits the column for the first category.
ohe = OneHotEncoder(drop='first')
city_type_trans = ohe.fit_transform(df[['city_type']])
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2, so build
# the same 'x0_<category>' labels directly from the fitted categories. The
# first category is skipped because drop='first' produces no column for it.
# These labels are what the later rename of 'x0_1' / 'x0_2' expects.
city_type_cols = [f'x0_{category}' for category in ohe.categories_[0][1:]]
# toarray() gives a plain ndarray instead of the np.matrix that todense() returns.
city_type_df = pd.DataFrame(city_type_trans.toarray(), columns=city_type_cols)

In [16]:
# Give both frames a fresh 0..n-1 index so the column-wise concat lines rows up
# by position.
df = df.reset_index(drop=True)
city_type_df = city_type_df.reset_index(drop=True)
df_fin = pd.concat([df, city_type_df], axis=1)

## Dropping Rows Outside Of Three Standard Deviations for Key Numerical Input Variables

In [17]:
# Filter one column at a time, in this order. Z-scores are recomputed on the
# rows that survived the previous filter, so the order matters.
outlier_columns = ['price', 'bedrooms', 'floors', 'sqft_living', 'bathrooms', 'sqft_lot']
df_prospects = df_fin.copy(deep=True)
for column in outlier_columns:
    within_three_sd = np.abs(stats.zscore(df_prospects[column])) < 3
    df_prospects = df_prospects[within_three_sd]

# Give the one-hot columns readable names and drop the now-redundant source column.
df_prospects = df_prospects.rename(columns={'x0_1': 'suburb', 'x0_2': 'rural'})
df_prospects = df_prospects.drop(columns=['city_type'])

## Exporting To CSV

In [18]:
# Write the cleaned frame to disk, then show its (rows, columns) shape.
cleaned_csv_path = '../data/cleaned_data.csv'
df_prospects.to_csv(cleaned_csv_path)
df_prospects.shape

(20530, 25)