In [1]:
import pandas as pd
import numpy as np
from acquire import get_zillow_data

In [2]:
def handle_missing_values(df, prop_required_column, prop_required_row):
    '''
    This function takes in a Dataframe, 
    proportion(0-1) of nulls required for a column
    and a proportion(0-1) of nulls required for rows
    then returns a dataframe without the nulls 
    under the threshold
    '''
    #setting threshold for row, only accepts integer
    thresh_row = int(round(prop_required_column*df.shape[0],0))
    #dropping nulls under threshold
    df.dropna(axis=1, thresh=thresh_row, inplace=True)
    #setting threshold for columns, only accepts integer
    thresh_col = int(round(prop_required_row*df.shape[1],0))
    #dropping nulls under threshold
    df.dropna(axis=0, thresh=thresh_col, inplace=True)
    return df 

In [3]:
df = get_zillow_data()

In [6]:
df.poolcnt = df.poolcnt.fillna(0)

In [7]:
df.fireplacecnt = df.fireplacecnt.fillna(0)

In [8]:
df.heatingorsystemdesc = df.heatingorsystemdesc.fillna('None')

In [9]:
#changing fips number to labeled county
df['county'] = df.fips.replace([6037, 6059, 6111],['los_angeles', 'orange', 'ventura'])

In [10]:
#filter out columns and rows with more than 40% null values
df = handle_missing_values(df, .6, .6)

In [11]:
#filter out bedrooms and bathrooms == 0
df = df[(df.bedroomcnt > 0) & (df.bedroomcnt <= 7) & (df.bathroomcnt > 0) & (df.bathroomcnt <= 7)]

In [12]:
#filter out houses less than 400 square feet
df = df[(df.calculatedfinishedsquarefeet > 400) & (df.calculatedfinishedsquarefeet < 7000)]

In [13]:
df.unitcnt = df.unitcnt.fillna(1)

In [14]:
#filter out all units not equal to 1
df = df[df.unitcnt == 1]

In [15]:
#drop duplicate or unnecessary columns
df = df.drop(columns = ['propertylandusetypeid', 'calculatedbathnbr', 'finishedsquarefeet12', 'heatingorsystemtypeid', 'id', 'fips', 'fullbathcnt', 'propertyzoningdesc', 'regionidcounty', 'id.1'])

In [16]:
def zillow_split(df):
    '''
    This function splits a dataframe into train, validate, and test sets
    '''
    train_and_validate, test = train_test_split(df, train_size=.8, random_state=123)
    train, validate = train_test_split(train_and_validate, train_size = .7, random_state=123)
    return train, validate, test

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
train, validate, test = zillow_split(df)

In [19]:
cols_fixed = ['buildingqualitytypeid', 'regionidcity', 'censustractandblock', 'regionidzip', 'yearbuilt']
for col in cols_fixed:
    mode = int(train[col].mode())
    train[col].fillna(value = mode, inplace = True)
    validate[col].fillna(value = mode, inplace = True)
    test[col].fillna(value = mode, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [20]:
#missing continuous values will be replaced with the median
cols_cont = ['lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'calculatedfinishedsquarefeet', 'taxamount', 'landtaxvaluedollarcnt', 'taxvaluedollarcnt']
for col in cols_cont:
    median = train[col].median()
    train[col].fillna(median, inplace=True)
    validate[col].fillna(median, inplace=True)
    test[col].fillna(median, inplace=True)