In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Impute missing values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas display settings: floats are shown comma-grouped with two decimals
# (padded to a width of 20), and no limit is placed on displayed rows/columns.
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import acquire
import wrangle_zillow

In [2]:
# Query the SQL server only when there is no local copy yet. The next cell
# always loads the data from zillow.csv, so once the file exists the query is
# unnecessary on re-runs. Delete zillow.csv to force a fresh pull.
import os

if not os.path.exists('zillow.csv'):
    zillow = acquire.get_zillow(acquire.sql)
    # Save file to csv so I can quit asking the server for this data
    zillow.to_csv('zillow.csv')

In [3]:
# Load the cached CSV, using its 'id' column as the dataframe index.
zillow = pd.read_csv('zillow.csv', index_col='id')
# Disabled: drop of the extra 'Unnamed: 0' column that can come in from csv files.
# NOTE(review): confirm whether that column is actually present before re-enabling.
# zillow = zillow.drop(columns='Unnamed: 0')

In [4]:
# Record the starting dimensions of the dataframe and report them
orig_rows, orig_cols = zillow.shape
print(f'There are {orig_rows} rows and {orig_cols} columns in the starting dataframe.')

There are 77575 rows and 67 columns in the starting dataframe.


# Project

### Goal: Improve our original estimate of the log error by using clustering methodologies.

## Acquisition, Prep, and Initial Exploration

Using the notebook and files you created during the exercises, make any changes, additions, etc. you want at this point. NOTE: You will NOT be splitting into train and test at this point.

In [None]:
# Drop rows:
# Keep only the properties whose land-use type id is in the single-use list
single_use = [261, 262, 263, 264, 266, 268, 273, 276, 279]
is_single_use = zillow['propertylandusetypeid'].isin(single_use)
zillow = zillow[is_single_use]

In [None]:
# Keep only the properties with more than zero bedrooms AND more than zero bathrooms
has_bedroom = zillow['bedroomcnt'] > 0
has_bathroom = zillow['bathroomcnt'] > 0
zillow = zillow[has_bedroom & has_bathroom]

In [None]:
# Report the number of rows currently in the dataframe
print (f'There are now {zillow.shape[0]} rows in the zillow dataframe.')

In [None]:
# Encode taxdelinquencyflag as 1 where it equals 'Y' and 0 everywhere else.
# Caution: run this cell only once - after the conversion no value equals 'Y',
# so a second run would set every row to 0.
is_delinquent = zillow['taxdelinquencyflag'] == 'Y'
zillow['taxdelinquencyflag'] = np.where(is_delinquent, 1, 0)
zillow['taxdelinquencyflag'].value_counts()

In [None]:
# Add column for counties: fips 6037 -> Los_Angeles, fips 6059 -> Orange,
# any other value -> Ventura.
# NOTE(review): rows with a missing or unexpected fips are also labeled Ventura.
fips_code = zillow['fips']
orange_or_ventura = np.where(fips_code == 6059, 'Orange', 'Ventura')
zillow['county'] = np.where(fips_code == 6037, 'Los_Angeles', orange_or_ventura)

In [None]:
# Look for zip codes with too many digits (values above 100000) and count each one
zillow.regionidzip[zillow.regionidzip > 100000].value_counts()

In [None]:
# Pretty sure it's just an extra digit.
# Check whether 99675 already appears as a zip code in the data: it does not,
# so the six-digit value is probably a typo.
zillow.regionidzip[zillow.regionidzip == 99675].value_counts()

In [None]:
# Replace the six-digit zip code 399675 with 99675
zip_fix = {399675: 99675}
zillow['regionidzip'] = zillow['regionidzip'].replace(zip_fix)

### Ideas:

1. Data types:

Write a function that takes in a dataframe and a list of column names and returns the dataframe with the datatypes of those columns changed to a non-numeric type.
Use this function to appropriately transform any numeric columns that should not be treated as numbers.

In [None]:
def numeric_to_object(df, num_cols):
    """
    Cast the listed columns of a dataframe to the object dtype.

    df: dataframe holding the columns; it is modified in place.
    num_cols: iterable of column labels to convert.

    Returns the same dataframe that was passed in.
    """
    for name in num_cols:
        as_object = df[name].astype(object)
        df[name] = as_object
    return df

In [None]:
# Makes sure numeric columns that should be integers, not floats, are stored as integers
def numeric_to_int(df, num_cols):
    """
    Cast the listed columns of a dataframe to the integer dtype.

    df: dataframe holding the columns; it is modified in place.
    num_cols: iterable of column labels to convert.

    The cast truncates any fractional part toward zero (2.9 -> 2) and
    raises if a column still contains missing values.

    Returns the same dataframe that was passed in.
    """
    for name in num_cols:
        as_int = df[name].astype(int)
        df[name] = as_int
    return df

2. Missing Values: Impute the values in land square feet.

For land square feet, the goal is to impute the missing values by creating a linear model where landtaxvaluedollarcnt is the x-variable and the output/y-variable is the estimated land square feet.
We'll then use this model to make predictions and fill in the missing values.

In [None]:
# Show the rows that have no landtaxvaluedollarcnt value
zillow[zillow['landtaxvaluedollarcnt'].isna()]

Write a function that accepts the zillow data frame and returns the data frame with the missing values filled in.

In [None]:
def fill_nulls_with_zero(df, col_names):
    """
    Replace the missing values in the listed columns with 0.

    df: dataframe holding the columns; it is modified in place.
    col_names: iterable of column labels whose nulls become 0.

    Returns the same dataframe that was passed in.
    """
    for name in col_names:
        zero_filled = df[name].fillna(0)
        df[name] = zero_filled
    return df

In [None]:
# Test function: columns whose missing values are filled with 0
# NOTE(review): lotsizesquarefeet is zero-filled here, whereas the notes above
# describe imputing land square feet with a linear model - confirm which is intended.
null_cols = [
    'airconditioningtypeid',
    'basementsqft',
    'decktypeid',
    'fireplacecnt',
    'garagecarcnt',
    'garagetotalsqft',
    'hashottuborspa',
    'lotsizesquarefeet',
    'poolcnt',
    'poolsizesum',
    'taxdelinquencyyear',
]

zillow = fill_nulls_with_zero(zillow, null_cols)

3. Missing Values: Of the remaining missing values, can they be imputed or otherwise estimated?

Impute those that can be imputed with the method you feel best fits the attribute.

In [None]:
def handle_other_nulls(df):
    """
    Fill the remaining missing values with fixed defaults:
    heatingorsystemtypeid -> 13, numberofstories -> 1, unitcnt -> 1.

    Each filled column is assigned back to the dataframe rather than calling
    fillna(..., inplace=True) on ``df.<column>``. The in-place form operates
    on an intermediate Series (chained assignment); pandas warns about it and,
    with Copy-on-Write enabled, it leaves the dataframe unchanged.

    df: dataframe holding the three columns; it is modified in place.

    Returns the same dataframe that was passed in.
    """
    defaults = {
        'heatingorsystemtypeid': 13,
        'numberofstories': 1,
        'unitcnt': 1,
    }
    for col, value in defaults.items():
        df[col] = df[col].fillna(value)
    return df

In [None]:
# Test function: apply the fixed-default fills from handle_other_nulls to the working dataframe

zillow = handle_other_nulls(zillow)

In [None]:
# Preparation for scikit-learn's IterativeImputer:
# pre-emptively drop unnecessary columns
# (wrangle_zillow.remove_columns is defined outside this notebook; presumably
# it returns the dataframe without the listed columns - verify in that module)

cols_to_remove = [
    'parcelid',
    'architecturalstyletypeid',
    'buildingclasstypeid',
    'finishedsquarefeet13',
    'finishedsquarefeet15',
    'finishedsquarefeet50',
    'finishedsquarefeet6',
    'finishedfloor1squarefeet',
    'pooltypeid10',
    'pooltypeid2',
    'pooltypeid7',
    'fireplaceflag',
    'airconditioningdesc',
    'storydesc',
    'heatingorsystemdesc',
    'architecturalstyledesc',
    'buildingclassdesc',
    'typeconstructiondesc',
    'yardbuildingsqft17',
    'yardbuildingsqft26',
    'calculatedbathnbr',
    'fullbathcnt',
    'threequarterbathnbr',
    'typeconstructiontypeid',
    'storytypeid',
    'propertyzoningdesc',
    'calculatedfinishedsquarefeet',
    'regionidneighborhood',
    'regionidcity',
    'regionidcounty',
    'propertylandusetypeid',
    'rawcensustractandblock',
    'propertylandusedesc',
    'assessmentyear',
]

zillow = wrangle_zillow.remove_columns(zillow, cols_to_remove)

In [None]:
# Count the missing values in each column, largest count first
zillow.isna().sum().sort_values(ascending = False)

In [None]:
# Try using imputer function here
# imputer function requires only numerical columns

num_vars = list(zillow.select_dtypes('number').columns)
null_df = zillow[num_vars]

imp = IterativeImputer(random_state=423)

imp.fit(null_df)
# transform() returns a bare array, so pass the original index explicitly.
# Without index= the new frame would get a fresh 0..n-1 index and its rows
# could no longer be matched back to the rows of zillow.
imputed_vals = pd.DataFrame(data=imp.transform(null_df),
                            columns=num_vars,
                            index=null_df.index)
imputed_vals.head()

In [None]:
# Build imputer function

def zillow_imputer(df):
    """
    Impute the missing values of every numeric column with IterativeImputer.

    An IterativeImputer (random_state=423) is fitted on the numeric columns
    of ``df`` and its transformed output is written back into those same
    columns. Columns that are not numeric are left as they are.

    df: dataframe to impute; it is modified in place.

    Returns the same dataframe that was passed in.
    """
    numeric_cols = [*df.select_dtypes('number').columns]
    numeric_part = df[numeric_cols]
    imputer = IterativeImputer(random_state=423).fit(numeric_part)
    df[numeric_cols] = imputer.transform(numeric_part)
    return df

In [None]:
# Test function: impute the numeric columns, then re-count the missing
# values per column (largest count first) to see what is left

zillow = zillow_imputer(zillow)
zillow.isna().sum().sort_values(ascending = False)

In [None]:
# Imputed values are floats - be sure to reset integers to int

# float cols to store as int
# NOTE(review): the int cast drops any fractional part (e.g. in bathroomcnt or
# taxamount, if those carry decimals) - confirm that is intended.
int_cols = [
    'basementsqft', 'bathroomcnt', 'bedroomcnt',
    'finishedsquarefeet12', 'fireplacecnt',
    'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
    'lotsizesquarefeet', 'poolcnt', 'poolsizesum',
    'roomcnt', 'unitcnt', 'yearbuilt', 'numberofstories',
    'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
    'landtaxvaluedollarcnt', 'taxamount',
    'taxdelinquencyflag', 'taxdelinquencyyear',
]

# float cols to store as object
obj_cols = [
    'regionidzip', 'airconditioningtypeid',
    'buildingqualitytypeid', 'decktypeid',
    'heatingorsystemtypeid', 'censustractandblock',
]

# apply the int cast first, then the object cast
zillow = numeric_to_object(numeric_to_int(zillow, int_cols), obj_cols)

Decide whether to remove the rows or columns of any that cannot be reasonably imputed.

Document your reasons for the decisions on how to handle each of those.

4. Outliers: Original from exercises. Adapt as you see fit.

Write a function that accepts a series (i.e. one column from a data frame) and summarizes how many outliers are in the series. This function should accept a second parameter that determines how outliers are detected, with the ability to detect outliers in 3 ways: IQR, standard deviations (z-score), and percentiles.

5. Use your function defined above to identify columns where you should handle the outliers.

Write a function that accepts the zillow data frame and removes the outliers. You should make a decision and document how you will remove outliers.

Is there erroneous data you have found that you need to remove or repair? If so, take action.

Are there outliers you want to "squeeze in" to a max value? (e.g. all bathrooms > 6 => bathrooms = 6). If so, make those changes.