In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
import math

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from env import get_db_url
import os
from itertools import combinations
# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format
from sklearn.model_selection import train_test_split
import wrangle_zillow as wrangle
import utilities
from sklearn.preprocessing import MinMaxScaler


# Zillow Regression Project

# Executive Summary

<hr style="border-top: 5px groove LightCyan; margin-top: 1px; margin-bottom: 1px">

# The Gameplan:

- Identify which factors help explain (and potentially reduce) logerror:

- Acquire and prepare the data

- Explore the different factors that could contribute to logerror

- Try out a few different models

- Offer final recommendations

# Acquiring and Preparing the data

In [2]:
# Call wrangle_zillow() from the wrangle_zillow module (imported above as `wrangle`).
# Presumably this covers both the acquire and prepare steps named in this section's
# heading -- confirm against wrangle_zillow.py.
df = wrangle.wrangle_zillow()

In [4]:
# split_zillow_data() comes from the wrangle_zillow module (imported above as `wrangle`)
# and splits the data into train, validate, and test.
# The train split is what the exploration section below works with.
train, validate, test = wrangle.split_zillow_data(df)

In [23]:
# Scale the data now so the scaled splits are ready for the machine learning portion.
# The numeric columns listed here are left out of scaling; this includes logerror,
# the value the notebook is trying to explain.
columns_not_scaled = [
    'propertylandusetypeid',
    'fips',
    'regionidcity',
    'censustractandblock',
    'id.1',
    'latitude',
    'longitude',
    'regionidzip',
    'logerror',
]
columns_to_scale = df.select_dtypes('number').columns.drop(columns_not_scaled)

train_s, validate_s, test_s = wrangle.data_scaled(
    train, validate, test, columns_to_scale
)

# Show the shape of each scaled split as the cell output.
train_s.shape, validate_s.shape, test_s.shape

((26026, 22), (11155, 22), (9296, 22))

# Exploration


In [24]:
# Pairwise correlation of every numeric column in the train split.
# The numeric columns are selected explicitly: on pandas >= 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
columns_corr = train.select_dtypes('number').corr()

# Show how each numeric column correlates with logerror.
columns_corr['logerror']

propertylandusetypeid                           NaN
bathroomcnt                                    0.06
bedroomcnt                                     0.06
calculatedfinishedsquarefeet                   0.07
fips                                           0.02
latitude                                      -0.02
longitude                                      0.01
lotsizesquarefeet                              0.00
regionidcity                                  -0.00
regionidzip                                    0.01
yearbuilt                                      0.04
taxvaluedollarcnt                              0.01
taxamount                                      0.01
censustractandblock                            0.02
id.1                                          -0.00
logerror                                       1.00
age                                           -0.04
month                                         -0.01
Name: logerror, dtype: float64

Takeaway : 

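    - The closer a value's absolute value is to one, the stronger that column's linear relationship with log error (the sign gives the direction); values near zero indicate little or no linear relationship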
    
    - There doesn't seem to be anything that has a particularly strong relationship with log error right off the bat