In [1]:
import pandas as pd
import numpy as np
from acquire import get_zillow_data

In [2]:
def fill_nulls(df):
    
    df.poolcnt = df.poolcnt.fillna(0)
    df.fireplacecnt = df.fireplacecnt.fillna(0)
    df.heatingorsystemdesc = df.heatingorsystemdesc.fillna('None')
    df.unitcnt = df.unitcnt.fillna(1)
    return df

In [3]:
def remove_outliers(df):
    #filter out bedrooms and bathrooms == 0
    df = df[(df.bedroomcnt > 0) & (df.bedroomcnt <= 7) & (df.bathroomcnt > 0) & (df.bathroomcnt <= 7)]
    #filter out houses less than 400 square feet
    df = df[(df.calculatedfinishedsquarefeet > 400) & (df.calculatedfinishedsquarefeet < 7000)]
    #filter out all units not equal to 1
    df = df[df.unitcnt == 1]
    #removing heating or system source outliers
    df = df[~df.heatingorsystemdesc.isin(['Yes', 'Gravity', 'Radiant', 'Baseboard', 'Solar', 'Forced air'])]
    return df

In [4]:
def create_features(df):
    df['age'] = 2017 - df.yearbuilt
    # create taxrate variable
    df['taxrate'] = df.taxamount/df.taxvaluedollarcnt
    # create acres variable
    df['acres'] = df.lotsizesquarefeet/43560
    # dollar per square foot-structure
    df['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt/df.calculatedfinishedsquarefeet
    # dollar per square foot-land
    df['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet
    # ratio of beds to baths
    df['bed_bath_ratio'] = df.bedroomcnt/df.bathroomcnt
    #changing numbered labels into appropriate names
    df['county'] = df.fips.replace([6037, 6059, 6111],['los_angeles', 'orange', 'ventura'])
    df.heatingorsystemdesc = df.heatingorsystemdesc.replace(['Central', 'Floor/Wall', 'None'], ['central_heating', 'floor_wall_heating', 'no_heating'])
    #creating dummy variables
    county_df = pd.get_dummies(df.county)
    heating_or_system_df = pd.get_dummies(df.heatingorsystemdesc)
    #adding dummies back into main dataframe
    df = pd.concat([df, county_df, heating_or_system_df], axis=1)
    #duplicating logerror so it will be at the end of the list
    df['error'] = df.logerror
    #filter out outliers on new features
    df = df[(df.acres < 10) & (df.taxrate < .05)]
    #drop duplicate columns
    df = df.drop(columns = ['bathroomcnt', 'county', 'taxamount', 'taxvaluedollarcnt', 
                       'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 
                       'yearbuilt', 'lotsizesquarefeet', 'logerror', 'heatingorsystemdesc'])
    return df

In [5]:
def handle_missing_values(df, prop_required_column, prop_required_row):
    '''
    This function takes in a Dataframe, 
    proportion(0-1) of nulls required for a column
    and a proportion(0-1) of nulls required for rows
    then returns a dataframe without the nulls 
    under the threshold
    '''
    #setting threshold for row, only accepts integer
    thresh_row = int(round(prop_required_column*df.shape[0],0))
    #dropping nulls under threshold
    df.dropna(axis=1, thresh=thresh_row, inplace=True)
    #setting threshold for columns, only accepts integer
    thresh_col = int(round(prop_required_row*df.shape[1],0))
    #dropping nulls under threshold
    df.dropna(axis=0, thresh=thresh_col, inplace=True)
    return df 

In [6]:
df = get_zillow_data()

In [7]:
#filling nulls with appropriate values
df = fill_nulls(df)

In [8]:
df = remove_outliers(df)

In [9]:
df = create_features(df)

In [10]:
#filter out columns and rows with more than 40% null values
df = handle_missing_values(df, .6, .6)

In [11]:
#drop duplicate or unnecessary columns
df = df.drop(columns = ['propertylandusetypeid', 'propertycountylandusecode', 'propertylandusedesc',
                             'calculatedbathnbr', 'finishedsquarefeet12', 'heatingorsystemtypeid', 
                            'id', 'fips', 'fullbathcnt', 'propertyzoningdesc', 'unitcnt',
                            'regionidcounty', 'id.1', 'assessmentyear', 'censustractandblock', 'rawcensustractandblock'])

In [12]:
def zillow_split(df):
    '''
    This function splits a dataframe into train, validate, and test sets
    '''
    train_and_validate, test = train_test_split(df, train_size=.8, random_state=123)
    train, validate = train_test_split(train_and_validate, train_size = .7, random_state=123)
    return train, validate, test

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train, validate, test = zillow_split(df)

In [15]:
cols_fixed = ['buildingqualitytypeid', 'regionidcity', 'regionidzip', 'age']
for col in cols_fixed:
    mode = int(train[col].mode())
    train[col].fillna(value = mode, inplace = True)
    validate[col].fillna(value = mode, inplace = True)
    test[col].fillna(value = mode, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [16]:
#missing continuous values will be replaced with the median
cols_cont = ['taxrate', 'acres', 'structure_dollar_per_sqft', 'land_dollar_per_sqft']
for col in cols_cont:
    median = train[col].median()
    train[col].fillna(median, inplace=True)
    validate[col].fillna(median, inplace=True)
    test[col].fillna(median, inplace=True)

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28525 entries, 47592 to 42185
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      28525 non-null  int64  
 1   bedroomcnt                    28525 non-null  float64
 2   buildingqualitytypeid         28525 non-null  float64
 3   calculatedfinishedsquarefeet  28525 non-null  float64
 4   fireplacecnt                  28525 non-null  float64
 5   latitude                      28525 non-null  float64
 6   longitude                     28525 non-null  float64
 7   poolcnt                       28525 non-null  float64
 8   regionidcity                  28525 non-null  float64
 9   regionidzip                   28525 non-null  float64
 10  roomcnt                       28525 non-null  float64
 11  transactiondate               28525 non-null  object 
 12  age                           28525 non-null  float64
 1

In [18]:
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12226 entries, 17111 to 28039
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      12226 non-null  int64  
 1   bedroomcnt                    12226 non-null  float64
 2   buildingqualitytypeid         12226 non-null  float64
 3   calculatedfinishedsquarefeet  12226 non-null  float64
 4   fireplacecnt                  12226 non-null  float64
 5   latitude                      12226 non-null  float64
 6   longitude                     12226 non-null  float64
 7   poolcnt                       12226 non-null  float64
 8   regionidcity                  12226 non-null  float64
 9   regionidzip                   12226 non-null  float64
 10  roomcnt                       12226 non-null  float64
 11  transactiondate               12226 non-null  object 
 12  age                           12226 non-null  float64
 1

In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10188 entries, 31819 to 3026
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      10188 non-null  int64  
 1   bedroomcnt                    10188 non-null  float64
 2   buildingqualitytypeid         10188 non-null  float64
 3   calculatedfinishedsquarefeet  10188 non-null  float64
 4   fireplacecnt                  10188 non-null  float64
 5   latitude                      10188 non-null  float64
 6   longitude                     10188 non-null  float64
 7   poolcnt                       10188 non-null  float64
 8   regionidcity                  10188 non-null  float64
 9   regionidzip                   10188 non-null  float64
 10  roomcnt                       10188 non-null  float64
 11  transactiondate               10188 non-null  object 
 12  age                           10188 non-null  float64
 13

In [None]:
def scaled_zillow_columns(cached = True):
    '''
    This function uses a MinMaxScaler to scale numeric columns
    from the wrangle_zillow function
    '''
    train, validate, test = wrangle_zillow()
    columns_to_scale= ['bedroomcnt', 'buildingqualitytypeid', 'calculatedfinishedsquarefeet', 'fireplacecnt', 'latitude', 'longitude', 'poolcnt', 'regionidcity', 'regionidzip', 'roomcnt', 'age', 'taxrate', 'taxrate', 'acres', 'structure_dollar_per_sqft', 'land_dollar_per_sqft', 'bed_bath_ratio']
    #initialize scaler function
    scaler = sklearn.preprocessing.MinMaxScaler()
    #adds '_scaled' to columns that will be scaled
    new_column_names = [c + '_scaled' for c in columns_to_scale]
    #fitting columns to be scaled
    scaler.fit(train[columns_to_scale])
    #adding scaled columns back into their respective dataframes
    train = pd.concat([
        train,
        pd.DataFrame(scaler.transform(train[columns_to_scale]), columns=new_column_names, index=train.index),
    ], axis=1)
    validate = pd.concat([
        validate,
        pd.DataFrame(scaler.transform(validate[columns_to_scale]), columns=new_column_names, index=validate.index),
    ], axis=1)
    test = pd.concat([
        test,
        pd.DataFrame(scaler.transform(test[columns_to_scale]), columns=new_column_names, index=test.index),
    ], axis=1)
    
    train = train.drop(columns = columns_to_scale)
    validate= validate.drop(columns = columns_to_scale)
    test = test.drop(columns = columns_to_scale)

In [20]:
train_scaled, validate_scaled, test_scaled = scaled_zillow_columns()

In [21]:
train_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28525 entries, 47592 to 42185
Data columns (total 27 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   parcelid                             28525 non-null  int64  
 1   transactiondate                      28525 non-null  object 
 2   heatingorsystemdesc                  28525 non-null  object 
 3   los_angeles                          28525 non-null  uint8  
 4   orange                               28525 non-null  uint8  
 5   ventura                              28525 non-null  uint8  
 6   central_heating                      28525 non-null  uint8  
 7   floor_wall_heating                   28525 non-null  uint8  
 8   no_heating                           28525 non-null  uint8  
 9   error                                28525 non-null  float64
 10  bedroomcnt_scaled                    28525 non-null  float64
 11  buildingqualitytypeid_sc