# Environment

In [1]:
import pandas as pd
import numpy as np
import ddfloww as dd
import scipy.stats as stats

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression


# Acquisition

In [2]:
# Load the Zillow data through the project helper module `dd`.
# NOTE(review): the data source and the returned schema are defined inside `dd`,
# not in this notebook -- confirm there before relying on specific columns.
df = dd.get_zillow()

# Preparation

In [3]:
# Run the project's preparation step on the freshly loaded frame.
# NOTE(review): presumably preq_col / preq_row are the minimum share of non-missing
# values a column / row needs in order to be kept -- confirm against dd.prep_zillow.
df = dd.prep_zillow(
    df,
    preq_col=0.5,
    preq_row=0.6,
    cols_to_remove=['parcelid', 'id'],
    coords_to_validate=['latitude', 'longitude'],
)

In [4]:
# Fill missing values using the project helper.
# NOTE(review): the captured output of this cell prints a fitted LinearRegression whose
# summary is labelled "final_exam = b + m * exam1". Those labels do not match any Zillow
# column and look like leftovers inside dd.impute_missing -- confirm which columns are
# actually being regressed and fix the printed labels in the helper.
df = dd.impute_missing(df)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
[32683.87758287]
[[-0.01046837]]
Univariate - final_exam = b + m * exam1
    y-intercept (b): 32683.88
    coefficient (m): -0.01



In [5]:
# Display the per-column summary of missing, empty and NaN counts (with percentages)
# that remain after imputation.
dd.missing_values_col(df)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
logerror,0,0.0,0,0.0,0,0.0
transactiondate,0,0.0,0,0.0,0,0.0
bathroomcnt,0,0.0,0,0.0,0,0.0
bedroomcnt,0,0.0,0,0.0,0,0.0
buildingqualitytypeid,56590,34.80962,0,0.0,0,0.0
calculatedbathnbr,112,0.068893,0,0.0,0,0.0
calculatedfinishedsquarefeet,26,0.015993,0,0.0,0,0.0
finishedsquarefeet12,5699,3.505567,0,0.0,0,0.0
fips,0,0.0,0,0.0,0,0.0
fullbathcnt,112,0.068893,0,0.0,0,0.0


In [None]:
# At this point, none of the columns that still have missing values would benefit from being imputed with zeros.

In [None]:
# Discard every row that still holds at least one missing value; these gaps
# cannot be reasonably imputed.
# NOTE(review): the column summary captured earlier lists 56,590 missing values
# (~34.8%) for buildingqualitytypeid. If those are nulls in the pandas sense, this
# step drops at least that share of the rows -- confirm that is acceptable.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-display the per-column missing-value summary to check the result of dropping
# incomplete rows.
dd.missing_values_col(df)

In [None]:
# Display the project's row-level missing-value summary.
# NOTE(review): the exact output format is defined in dd.missing_values_row.
dd.missing_values_row(df)

In [None]:
# Inspect column dtypes and non-null counts before the type conversions that follow.
df.info()

In [None]:
# Columns handed to the project's string converter.
# NOTE(review): these look like identifier/code columns rather than quantities;
# the resulting dtype is decided by dd.convert_to_string -- confirm there.
string_cols = [
    'buildingqualitytypeid',
    'fips',
    'heatingorsystemtypeid',
    'propertylandusetypeid',
    'regionidcity',
    'regionidcounty',
    'regionidzip',
    'censustractandblock',
]
df = dd.convert_to_string(df, *string_cols)

In [None]:
# Year-valued columns handed to the project's integer converter.
int_cols = ['yearbuilt', 'assessmentyear']
df = dd.convert_to_int(df, *int_cols)

In [None]:
# Re-inspect dtypes to verify the string and integer conversions above took effect.
df.info()

In [None]:
# Remove outliers and nonsensical observations.
#
# Each entry maps a column to the largest value still treated as plausible; a row is
# kept only if it is at or below the limit for every listed column. Rows whose value
# is missing in a listed column fail the comparison and are dropped as well.
#
# NOTE(review): the original cell applied the bathroomcnt filter twice. The second
# application was a no-op, so it is listed once here. If the duplicate was meant to
# cap a different column (e.g. calculatedbathnbr), add that column to the table.
outlier_upper_bounds = {
    'bathroomcnt': 12.5,
    'calculatedfinishedsquarefeet': 14000,
    'fullbathcnt': 12.5,
    'taxvaluedollarcnt': 20000000.0,
    'taxamount': 200000,
    'bedroomcnt': 8,
}

for column, upper_bound in outlier_upper_bounds.items():
    df = df[df[column] <= upper_bound]

# Exploration

In [None]:
# Chi-square test of independence between fips and regionidcounty.
# A very small p-value indicates the two columns are not independent.
tbl = pd.crosstab(index=df['fips'], columns=df['regionidcounty'])

# chi2_contingency yields the statistic, p-value, degrees of freedom and the
# expected frequencies, in that order.
stat, p, dof, expected = stats.chi2_contingency(tbl)

# Show the p-value as the cell output.
p

In [None]:
# Chi-square test of independence between heatingorsystemtypeid and
# heatingorsystemdesc. A very small p-value indicates they are not independent.
tbl = pd.crosstab(index=df['heatingorsystemtypeid'], columns=df['heatingorsystemdesc'])

# chi2_contingency yields the statistic, p-value, degrees of freedom and the
# expected frequencies, in that order.
stat, p, dof, expected = stats.chi2_contingency(tbl)

# Show the p-value as the cell output.
p

In [None]:
# Columns dropped before exploration.
# NOTE(review): presumably these duplicate information carried by columns that are
# kept (see the chi-square checks on fips and heatingorsystemtypeid) -- confirm the
# rationale for each entry.
redundant_cols = [
    'finishedsquarefeet12',
    'fullbathcnt',
    'fips',
    'heatingorsystemtypeid',
    'propertylandusetypeid',
    'rawcensustractandblock',
    'regionidcity',
    'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'taxamount',
    'bathroomcnt',
    'bedroomcnt',
]
df = dd.remove_columns(df, cols_to_remove=redundant_cols)

In [None]:
# Names of all numeric columns currently in the frame.
numeric_frame = df.select_dtypes(include='number')
num_cols = list(numeric_frame.columns)
num_cols

In [None]:
# Names of every column that is not numeric.
non_numeric_frame = df.select_dtypes(exclude='number')
str_cols = list(non_numeric_frame.columns)
str_cols

In [None]:
# Full column list: numeric names first, followed by the non-numeric ones.
all_cols = [*num_cols, *str_cols]
all_cols

In [None]:
# Standardize the numeric features with the project helper.
#
# 'fullbathcnt' is in the requested list, but the earlier remove_columns cell drops
# it, so passing the list unfiltered would ask the helper to scale a column that no
# longer exists. Only columns still present in the frame are forwarded; when every
# requested column exists, the call is identical to the original.
# NOTE(review): decide whether fullbathcnt should be kept (and not removed earlier)
# or dropped from this list for good.
standardize_candidates = [
    'logerror', 'fullbathcnt', 'latitude',
    'longitude', 'lotsizesquarefeet', 'roomcnt',
    'unitcnt', 'yearbuilt', 'landtaxvaluedollarcnt',
]
cols_to_standardize = [col for col in standardize_candidates if col in df.columns]
df = dd.standardize_data(df, columns=cols_to_standardize)

In [None]:
# Plot histograms for the frame via the project helper.
# NOTE(review): which columns are plotted is decided inside dd.plot_hist.
dd.plot_hist(df)

In [None]:
# Pairwise plots of the numeric columns collected in num_cols, via the project helper.
dd.plot_pairs(df, num_cols)

In [None]:
# Restrict the frame to the numeric columns and pass it to the project's heatmap helper.
df_heat = df.loc[:, num_cols]
dd.plot_heat(df_heat)

In [None]:
# Frequency of each property land-use description among the remaining rows.
df.propertylandusedesc.value_counts()

In [None]:
# Final column pruning after exploration.
#
# Most of these names were already removed by the earlier remove_columns cell; only
# calculatedfinishedsquarefeet and calculatedbathnbr are new here. Requesting removal
# of columns that no longer exist can raise if the helper performs a strict drop, so
# the list is narrowed to columns still present before the call. When every listed
# column exists, the call is identical to the original.
# NOTE(review): whether dd.remove_columns tolerates missing names is not visible here.
final_removal_candidates = [
    'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
    'fips', 'heatingorsystemtypeid', 'propertylandusetypeid',
    'rawcensustractandblock', 'regionidcity', 'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt', 'taxamount', 'bathroomcnt', 'bedroomcnt',
    'calculatedbathnbr',
]
cols_still_present = [col for col in final_removal_candidates if col in df.columns]
df = dd.remove_columns(df, cols_to_remove=cols_still_present)