In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from env import get_db_url
import os

# Default pandas float display: each value is rendered in a 20-character-wide
# field with a thousands separator and two decimal places.
pd.options.display.float_format = '{:20,.2f}'.format


In [2]:
def get_zillow_data():
    """Return the Zillow single-family-residential data as a DataFrame.

    A local file named 'zillow_data.csv' acts as a cache:

    * If the file exists, it is read back with its first column used as the
      index (the index that ``to_csv`` wrote when the cache was created).
    * Otherwise the SQL query below is executed against the connection given
      by ``get_db_url('zillow')``, the result is written to the cache file,
      and the freshly queried frame is returned.

    Returns
    -------
    pandas.DataFrame
        The cached or freshly queried data.
    """
    cache_path = 'zillow_data.csv'

    # Cache miss: query the database once and persist the result.
    if not os.path.isfile(cache_path):
        query = '''
        SELECT *
        FROM properties_2017
        JOIN propertylandusetype 
        USING (propertylandusetypeid)
        JOIN predictions_2017
        USING (parcelid)
        WHERE propertylandusedesc = 'Single Family Residential';
        '''
        zillow = pd.read_sql(query, get_db_url('zillow'))
        zillow.to_csv(cache_path)
        return zillow

    # Cache hit: column 0 of the file is the index saved by to_csv.
    return pd.read_csv(cache_path, index_col=0)

In [3]:
# Load the data (from the CSV cache if present, otherwise from the database).
df = get_zillow_data()
# df_original is an independent copy, so later in-place changes to df do not
# affect it; presumably kept as a pre-cleaning reference.
df_original = df.copy()
# Display (rows, columns) of the untouched data.
df_original.shape

(52442, 63)

In [4]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75):
    """Drop sparsely populated columns, then sparsely populated rows.

    Columns are filtered first; the row threshold is then computed from the
    number of columns that survived.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    prop_required_column : float, default .5
        Proportion of rows that must hold a non-null value for a column to be
        kept.
    prop_required_row : float, default .75
        Proportion of the remaining columns that must hold a non-null value
        for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after dropping, so the call can be assigned
        or chained.
    """
    # dropna's `thresh` is the minimum count of non-null values required.
    column_threshold = int(round(prop_required_column * len(df.index), 0))
    df.dropna(axis=1, thresh=column_threshold, inplace=True)

    # Computed after the column drop so it reflects the surviving columns.
    row_threshold = int(round(prop_required_row * len(df.columns), 0))
    df.dropna(axis=0, thresh=row_threshold, inplace=True)

    return df

In [5]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75):
    """Remove columns, then rows, that contain too few non-null values.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune.
    prop_required_column : float, default .5
        Minimum proportion of non-null entries a column needs to be kept.
    prop_required_row : float, default .75
        Minimum proportion of non-null entries (across the columns that
        remain after the column pass) a row needs to be kept.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after both passes.
    """
    # Column pass: require a minimum number of populated rows per column.
    min_rows_filled = int(round(prop_required_column * df.shape[0], 0))
    df.dropna(axis='columns', thresh=min_rows_filled, inplace=True)

    # Row pass: the column count is read only now, after the column pass.
    min_cols_filled = int(round(prop_required_row * df.shape[1], 0))
    df.dropna(axis='index', thresh=min_cols_filled, inplace=True)

    return df


In [6]:
# Drop columns with fewer than 80% non-null values, then rows with fewer than
# 90% non-null values across the remaining columns.
df = handle_missing_values(df, prop_required_column = .8, prop_required_row = .9)

In [7]:
# Ratio of two hard-coded row counts; 52442 matches the row count displayed
# for df_original above.
# NOTE(review): 52349 is presumably len(df) after cleaning — confirm, and
# consider computing len(df) / len(df_original) instead of literals.
52349/52442

0.998226612257351

In [8]:
# Count the missing values that remain in each column after cleaning.
df.isna().sum()

parcelid                           0
propertylandusetypeid              0
id                                 0
bathroomcnt                        0
bedroomcnt                         0
calculatedbathnbr                 61
calculatedfinishedsquarefeet       7
finishedsquarefeet12             171
fips                               0
fullbathcnt                       61
latitude                           0
longitude                          0
lotsizesquarefeet                347
propertycountylandusecode          0
rawcensustractandblock             0
regionidcity                    1013
regionidcounty                     0
regionidzip                        7
roomcnt                            0
yearbuilt                         41
structuretaxvaluedollarcnt        71
taxvaluedollarcnt                  1
assessmentyear                     0
landtaxvaluedollarcnt              1
taxamount                          4
censustractandblock               97
propertylandusedesc                0
i