In [1]:
import pandas as pd
import numpy as np
import wrangle_zillow as wz
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

In [2]:
# Load the raw Zillow data through the project-local wrangle_zillow module
# (imported as `wz` in the first cell). How that helper sources the data is
# defined in that module and is not visible in this notebook.
df = wz.get_zillow_data()

In [3]:
# Preview the first rows of the raw frame to sanity-check the columns loaded.
df.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,id.1,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,261.0,,,,,1727539,,...,0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,261.0,,,,,1387261,,...,1,0.055619,2017-01-01,,,,,Single Family Residential,,
2,14186244,,,261.0,,,,,11677,,...,2,0.005383,2017-01-01,,,,,Single Family Residential,,
3,12177905,,,261.0,2.0,,,,2288172,,...,3,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,10887214,,,266.0,2.0,,,1.0,1970746,,...,4,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [4]:
def single_unit_properties(df):
    """Filter the Zillow frame down to single-unit properties.

    Steps, applied in order:
      1. Keep rows whose ``propertylandusetypeid`` is 260, 261, 262 or 279.
      2. Keep rows with more than zero bedrooms and more than zero bathrooms.
      3. Treat a missing ``unitcnt`` as 1, then keep rows where it equals 1.
      4. Drop ``propertylandusetypeid``, ``heatingorsystemtypeid``,
         ``propertyzoningdesc`` and ``calculatedbathnbr``.
      5. Replace missing ``heatingorsystemdesc`` values with the string
         ``'none'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is not modified.

    Raises
    ------
    KeyError
        If one of the columns to drop is absent (``DataFrame.drop`` default).
    """
    is_single_unit_use = df["propertylandusetypeid"].isin([260, 261, 262, 279])
    has_bed_and_bath = (df["bedroomcnt"] > 0) & (df["bathroomcnt"] > 0)

    # assign() returns a new frame, so filling unitcnt never writes into a
    # slice of the caller's DataFrame (which triggers SettingWithCopyWarning
    # and is unreliable under pandas Copy-on-Write).
    candidates = df[is_single_unit_use & has_bed_and_bath].assign(
        unitcnt=lambda frame: frame["unitcnt"].fillna(1)
    )

    single_units = candidates[candidates["unitcnt"] == 1.0].drop(
        columns=[
            "propertylandusetypeid",
            "heatingorsystemtypeid",
            "propertyzoningdesc",
            "calculatedbathnbr",
        ]
    )

    # Assign the filled column back instead of calling an inplace method on
    # `df[col]`: the chained inplace form operates on a temporary Series and
    # silently does nothing when Copy-on-Write is enabled.
    return single_units.assign(
        heatingorsystemdesc=lambda frame: frame["heatingorsystemdesc"].fillna("none")
    )

def handle_missing_values(df, prop_required_column = .60, prop_required_row = .60):
    """Drop sparse columns, then sparse rows, from ``df``.

    A column is kept only if it has at least
    ``round(prop_required_column * number_of_rows)`` non-null values. After the
    sparse columns are removed, a row is kept only if it has at least
    ``round(prop_required_row * number_of_remaining_columns)`` non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    prop_required_column : float, default .60
        Minimum proportion of non-null values a column needs to be kept.
    prop_required_row : float, default .60
        Minimum proportion of non-null values a row needs to be kept,
        measured against the columns that survived the first step.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is not modified, so re-running the
        calling cell on the same input gives the same result.
    """
    min_non_null_per_column = int(round(prop_required_column * len(df.index), 0))
    dense_columns = df.dropna(axis=1, thresh=min_non_null_per_column)

    # The row threshold is computed from the columns that remain, not from the
    # original column count.
    min_non_null_per_row = int(round(prop_required_row * len(dense_columns.columns), 0))
    return dense_columns.dropna(axis=0, thresh=min_non_null_per_row)


def impute_missing_values(df):
    """Split ``df`` into train/validate/test and mode-impute selected columns.

    The frame is split with ``train_test_split`` twice (``test_size=.2`` for
    the test set, then ``test_size=.3`` of the remainder for validate), both
    with ``random_state=123``. For each column listed below, the most frequent
    value in *train* is used to fill missing values in all three splits, so
    no information from validate/test leaks into the imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``buildingqualitytypeid``, ``regionidcity``,
        ``regionidzip``, ``yearbuilt`` and ``censustractandblock``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` as independent copies.

    Raises
    ------
    IndexError
        If a listed column has no non-null value in train (no mode exists).
    """
    train_and_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_and_validate, test_size=.3, random_state=123)

    # Work on explicit copies so the column assignments below write to frames
    # this function owns rather than to slices of the caller's DataFrame.
    train, validate, test = train.copy(), validate.copy(), test.copy()

    mode_imputed_columns = [
        "buildingqualitytypeid",
        "regionidcity",
        "regionidzip",
        "yearbuilt",
        "censustractandblock",
    ]

    for col in mode_imputed_columns:
        # Series.mode() returns a Series (several values when there is a tie);
        # take the first entry explicitly instead of calling int() on the
        # Series, which fails for ties and is deprecated for length-1 Series.
        # The int cast is kept because these columns hold whole-number codes.
        mode = int(train[col].mode().iloc[0])
        for split in (train, validate, test):
            # Assign back rather than `split[col].fillna(..., inplace=True)`:
            # the chained inplace form is a no-op under Copy-on-Write.
            split[col] = split[col].fillna(mode)

    return train, validate, test

def impute_missing_values_1(train=None, validate=None, test=None):
    """Median-impute selected numeric columns across the three splits.

    For each column listed below, the median of *train* is used to fill
    missing values in train, validate and test.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame, optional
        The splits to impute. Any argument left as ``None`` is looked up by
        the same name in this module's global namespace; this keeps the
        original zero-argument call ``impute_missing_values_1()`` working.
        Passing the frames explicitly is preferred because it removes the
        dependency on notebook execution order.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` as new frames; the inputs are not modified.

    Raises
    ------
    NameError
        If a split is neither passed in nor defined as a global.
    """
    splits = []
    for name, frame in (("train", train), ("validate", validate), ("test", test)):
        if frame is None:
            if name not in globals():
                raise NameError("name %r is not defined" % name)
            frame = globals()[name]
        # Copy so the assignments below never write into the caller's frame.
        splits.append(frame.copy())
    train, validate, test = splits

    median_imputed_columns = [
        "structuretaxvaluedollarcnt",
        "taxamount",
        "taxvaluedollarcnt",
        "landtaxvaluedollarcnt",
        "finishedsquarefeet12",
        "calculatedfinishedsquarefeet",
        "fullbathcnt",
        "lotsizesquarefeet",
    ]

    for col in median_imputed_columns:
        # The statistic comes from train only and is reused for the other
        # splits so validate/test do not influence the fill value.
        median = train[col].median()
        for split in (train, validate, test):
            # Assign back rather than `split[col].fillna(..., inplace=True)`:
            # the chained inplace form is a no-op under Copy-on-Write.
            split[col] = split[col].fillna(median)

    return train, validate, test

In [5]:
# Narrow the raw frame to single-unit properties. Note this rebinds `df`, so
# re-running the cell operates on the already-filtered frame.
df= single_unit_properties(df)

In [6]:
# Drop columns, then rows, that have fewer than 60% non-null values.
df=handle_missing_values(df, prop_required_column = .60, prop_required_row = .60)

In [7]:
# Split into train/validate/test and fill the listed ID/year columns with the
# most frequent value observed in train.
train, validate, test =impute_missing_values(df)

In [9]:
# Fill the remaining numeric columns with the train median. This call passes
# no arguments: the function reads `train`, `validate` and `test` from the
# notebook's global namespace, so the split cell above must have run first.
train, validate, test = impute_missing_values_1()

In [10]:
# Count the nulls left in each train column after imputation.
train.isnull().sum()

parcelid                        0
id                              0
bathroomcnt                     0
bedroomcnt                      0
buildingqualitytypeid           0
calculatedfinishedsquarefeet    0
finishedsquarefeet12            0
fips                            0
fullbathcnt                     0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
unitcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
id.1                            0
logerror                        0
transactiondate                 0
heatingorsyste