In [1]:
# Standrad imports
import pandas as pd
import numpy as np
import os
import scipy.stats as stats

# For inserting random values into nulls if necessary
import random

# To acquire MYSQL Data
import acquire
from env import username, password, host
import wrangle

# For data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# For running modeling
from sklearn.model_selection import train_test_split
import sklearn.metrics as mtc
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

# Exercises
##### Let's set up an example scenario as perspective for our regression exercises using the Zillow dataset.

### As a Codeup data science graduate, you want to show off your skills to the Zillow data science team in hopes of getting an interview for a position you saw pop up on LinkedIn. You thought it might look impressive to build an end-to-end project in which you use some of their Kaggle data to predict property values using some of their available features; who knows, you might even do some feature engineering to blow them away. Your goal is to predict the values of single unit properties using the obervations from 2017.

## In these exercises, you will complete the first step toward the above goal: acquire and prepare the necessary Zillow data from the zillow database in the Codeup database server.

#### 1. Acquire bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, and fips from the zillow database for all 'Single Family Residential' properties.

#### 2. Using your acquired Zillow data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.

#### 3. Store all of the necessary functions to automate your process from acquiring the data to returning a cleaned dataframe with no missing values in your wrangle.py file. Name your final function wrangle_zillow.

In [2]:
# I've already looked into the zillow data and create a function in acquire.py to get it
#acquire.get_zillow_2017()

In [3]:
# Specify columns as laid out in the lesson exercises
#w_zillow = get_zillow_2017[['bedroomcnt','bathroomcnt','calculatedfinishedsquarefeet','taxvaluedollarcnt','yearbuilt','taxamount','fips']]
#w_zillow

In [4]:
wrangle.get_zillow_2017()

Unnamed: 0.1,Unnamed: 0,propertylandusetypeid,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,propertylandusedesc
0,0,269.0,0,10754147,,,,0.0,0.0,,...,,,9.0,2016.0,9.0,,,,,Planned Unit Development
1,1,261.0,1,10759547,,,,0.0,0.0,,...,,,27516.0,2015.0,27516.0,,,,,Single Family Residential
2,2,47.0,2,10843547,,,,0.0,0.0,5.0,...,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,,Store/Office (Mixed Use)
3,3,47.0,3,10859147,,,,0.0,0.0,3.0,...,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,,Store/Office (Mixed Use)
4,4,31.0,4,10879947,,,,0.0,0.0,4.0,...,,196751.0,440101.0,2016.0,243350.0,5725.17,,,,Commercial/Office/Residential Mixed Used
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2982280,2982280,263.0,2982280,11366340,,,,1.0,2.0,,...,,55200.0,469300.0,2016.0,414100.0,5764.45,,,,Mobile Home
2982281,2982281,266.0,2982281,13010327,1.0,,,3.0,3.0,,...,,356454.0,594022.0,2016.0,237568.0,7343.47,,,,Condominium
2982282,2982282,261.0,2982282,12385768,1.0,,,4.0,4.0,,...,,336548.0,554009.0,2016.0,217461.0,6761.20,Y,15.0,,Single Family Residential
2982283,2982283,266.0,2982283,11795063,1.0,,,1.0,1.0,,...,,371756.0,620284.0,2016.0,248528.0,7611.91,,,,Condominium


In [5]:
# Dropping obvious unnecessary columns right off the bat
zillow17 = wrangle.get_zillow_2017().drop(columns={'id', 'Unnamed: 0'})
zillow17.head(3)

Unnamed: 0,propertylandusetypeid,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,propertylandusedesc
0,269.0,10754147,,,,0.0,0.0,,,,...,,,9.0,2016.0,9.0,,,,,Planned Unit Development
1,261.0,10759547,,,,0.0,0.0,,,,...,,,27516.0,2015.0,27516.0,,,,,Single Family Residential
2,47.0,10843547,,,,0.0,0.0,5.0,,,...,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,,Store/Office (Mixed Use)


In [6]:
# Split the data and verify sizes by checking their shapes
train, validate, test = wrangle.split(zillow17)

df shape: (2982285, 59)
Train shape: (1789371, 59)
Validate shape: (596457, 59)
Test shape: (596457, 59)


In [10]:
# Sending the data to a .csv file for future use and quicker pulls
train.to_csv('zillow_train.csv')
validate.to_csv('zillow_validate.csv')
test.to_csv('zillow_test.csv')

In [7]:
# I am specifically looking for SFH in these exercises
train.propertylandusedesc.value_counts()

Single Family Residential                     1292340
Condominium                                    289724
Duplex (2 Units, Any Combination)               68338
Planned Unit Development                        36893
Mobile Home                                     35724
Quadruplex (4 Units, Any Combination)           24532
Triplex (3 Units, Any Combination)              24006
Commercial/Office/Residential Mixed Used         5696
Cluster Home                                     5675
Store/Office (Mixed Use)                         2634
Residential General                              1800
Cooperative                                      1100
Manufactured, Modular, Prefabricated Homes        729
Townhouse                                         156
Residential Common Area                            23
Inferred Single Family Residential                  1
Name: propertylandusedesc, dtype: int64

In [12]:
# Narrow down the data frame and reassign it back into the variable
train = train[(train.propertylandusedesc == 'Single Family Residential') | (train.propertylandusedesc == 'Inferred Single Family Residential')]

In [13]:
train.head()

Unnamed: 0,propertylandusetypeid,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,propertylandusedesc
2236567,261.0,11216329,1.0,,,3.0,3.0,,8.0,3.0,...,,212207.0,265257.0,2016.0,53050.0,5537.14,,,60379110000000.0,Single Family Residential
1804351,261.0,17147507,,,,3.0,5.0,,,3.0,...,,224000.0,642000.0,2016.0,418000.0,7906.32,,,61110030000000.0,Single Family Residential
1861128,261.0,14142821,,,,3.5,6.0,,,3.5,...,,168445.0,822352.0,2016.0,653907.0,9254.54,,,60590020000000.0,Single Family Residential
807,261.0,17181523,,,,2.0,4.0,,,2.0,...,,130302.0,217168.0,2016.0,86866.0,2532.56,,,61110050000000.0,Single Family Residential
1004951,261.0,11946788,,,,3.0,3.0,,8.0,3.0,...,,175837.0,219511.0,2016.0,43674.0,2712.17,,,60371950000000.0,Single Family Residential


In [14]:
sfh_train = train[['bedroomcnt','bathroomcnt','calculatedfinishedsquarefeet','taxvaluedollarcnt','yearbuilt','taxamount','fips']]

In [15]:
sfh_train.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
2236567,3.0,3.0,2988.0,265257.0,2006.0,5537.14,6037.0
1804351,5.0,3.0,2826.0,642000.0,2005.0,7906.32,6111.0
1861128,6.0,3.5,2850.0,822352.0,1966.0,9254.54,6059.0
807,4.0,2.0,1576.0,217168.0,1971.0,2532.56,6111.0
1004951,3.0,3.0,2395.0,219511.0,1939.0,2712.17,6037.0


In [16]:
# Looking for null values
sfh_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1292341 entries, 2236567 to 2594527
Data columns (total 7 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    1292337 non-null  float64
 1   bathroomcnt                   1292337 non-null  float64
 2   calculatedfinishedsquarefeet  1287285 non-null  float64
 3   taxvaluedollarcnt             1292055 non-null  float64
 4   yearbuilt                     1286778 non-null  float64
 5   taxamount                     1289667 non-null  float64
 6   fips                          1292341 non-null  float64
dtypes: float64(7)
memory usage: 78.9 MB


In [17]:
# Getting a sum of all the nulls in each column
sfh_train.isnull().sum()

bedroomcnt                         4
bathroomcnt                        4
calculatedfinishedsquarefeet    5056
taxvaluedollarcnt                286
yearbuilt                       5563
taxamount                       2674
fips                               0
dtype: int64

In [18]:
# Using this for loop to show the number of nulls in each column as well as the percentage of that number, 
# compared to the rest of the non-null values
for col in sfh_train:
    print(f'{col} null count: {sfh_train[col].isnull().sum()}')
    print(sfh_train[col].isnull().sum()/len(sfh_train))
    print('------')

bedroomcnt null count: 4
3.095158321217078e-06
------
bathroomcnt null count: 4
3.095158321217078e-06
------
calculatedfinishedsquarefeet null count: 5056
0.0039122801180183865
------
taxvaluedollarcnt null count: 286
0.0002213038199670211
------
yearbuilt null count: 5563
0.004304591435232651
------
taxamount null count: 2674
0.0020691133377336167
------
fips null count: 0
0.0
------


In [21]:
# Utilizing this function to find the info on each column that'll help direct what value to input for nulls if necessary
def v(df):    
    for col in df:
        print(f'Column: {col}')
        print(f'Min: {df[col].min()}')
        print(f'Max: {df[col].max()}')
        print(f'Mean: {df[col].mean()}')
        print(f'Mode: {df[col].mode()}')
        print(f'Median: {df[col].median()}')
        print(f'{df[col].value_counts()}')
        print('-------')

In [22]:
# Using the function for sfh_train
v(sfh_train)

Column: bedroomcnt
Min: 0.0
Max: 25.0
Mean: 3.2877709142429565
Mode: 0    3.0
Name: bedroomcnt, dtype: float64
Median: 3.0
3.0     578721
4.0     380599
2.0     201382
5.0      90682
6.0      15260
1.0      13936
0.0       7876
7.0       2919
8.0        668
9.0        167
10.0        79
11.0        20
13.0        10
12.0         7
14.0         4
15.0         4
25.0         1
18.0         1
16.0         1
Name: bedroomcnt, dtype: int64
-------
Column: bathroomcnt
Min: 0.0
Max: 32.0
Mean: 2.2312968676127047
Mode: 0    2.0
Name: bathroomcnt, dtype: float64
Median: 2.0
2.00     565842
3.00     253894
1.00     248718
2.50      85953
4.00      49561
1.50      18814
3.50      17167
5.00      17066
4.50      11687
0.00       7800
6.00       6526
5.50       3716
7.00       2610
8.00       1030
6.50        803
9.00        423
7.50        236
10.00       189
11.00        79
8.50         59
12.00        46
9.50         27
13.00        26
14.00        13
15.00         9
16.00         8
0.50        

In [23]:
#random.randint(0, 0)

In [29]:
# Determining the range of values to input for bedroomcnt
# 2-5 bedrooms seems to encompass most of the data, so the remaining nulls will fall within those values
(578721+380599+201382+90682)/len(sfh_train['bedroomcnt'])

0.968307900159478

In [30]:
sfh_train['bedroomcnt'].fillna(random.randint(2.0, 5.0), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfh_train['bedroomcnt'].fillna(random.randint(2.0, 5.0), inplace = True)


In [34]:
# Making sure it worked
sfh_train.isnull().sum()

bedroomcnt                         0
bathroomcnt                        4
calculatedfinishedsquarefeet    5056
taxvaluedollarcnt                286
yearbuilt                       5563
taxamount                       2674
fips                               0
dtype: int64

In [54]:
#sfh_train.bathroomcnt.value_counts()

In [None]:
# This will output an integer value between 1.0 and 3.0, but can be 
random.randint(1.0, 3.0)

In [48]:
sfh_train['bathroomcnt'].fillna(random.randint(1.0, 3.0), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfh_train['bathroomcnt'].fillna(random.randint(1.0, 3.0), inplace = True)


In [55]:
# I'm using 1624(median) as it falls between the mode and the mean values
sfh_train.calculatedfinishedsquarefeet.fillna(1624.0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfh_train.calculatedfinishedsquarefeet.fillna(1624.0, inplace = True)


In [None]:
#sfh_train.taxvaluedollarcnt.value_counts(bins=500)

In [56]:
sfh_train['taxvaluedollarcnt'].fillna(450000.0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfh_train['taxvaluedollarcnt'].fillna(450000.0, inplace = True)


In [58]:
sfh_train.yearbuilt.value_counts(bins=10)

(1951.5, 1973.0]      539232
(1930.0, 1951.5]      272467
(1973.0, 1994.5]      228289
(1908.5, 1930.0]      121538
(1994.5, 2016.0]      114506
(1887.0, 1908.5]       10508
(1865.5, 1887.0]         227
(1800.784, 1822.5]         5
(1822.5, 1844.0]           3
(1844.0, 1865.5]           3
Name: yearbuilt, dtype: int64

In [59]:
sfh_train['yearbuilt'].fillna(random.randint(1908.0, 1994.0), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfh_train['yearbuilt'].fillna(random.randint(1908.0, 1994.0), inplace = True)


In [60]:
sfh_train.taxamount.value_counts(bins=10)

(-1335.905, 133777.251]       1289092
(133777.251, 267552.652]          473
(267552.652, 401328.053]           66
(401328.053, 535103.454]           16
(535103.454, 668878.855]           10
(668878.855, 802654.256]            2
(802654.256, 936429.657]            2
(936429.657, 1070205.058]           2
(1070205.058, 1203980.459]          2
(1203980.459, 1337755.86]           2
Name: taxamount, dtype: int64

In [61]:
sfh_train['taxamount'].fillna(4108.0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfh_train['taxamount'].fillna(4108.0, inplace = True)


In [63]:
sfh_train.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
2236567,3.0,3.0,2988.0,265257.0,2006.0,5537.14,6037.0
1804351,5.0,3.0,2826.0,642000.0,2005.0,7906.32,6111.0
1861128,6.0,3.5,2850.0,822352.0,1966.0,9254.54,6059.0
807,4.0,2.0,1576.0,217168.0,1971.0,2532.56,6111.0
1004951,3.0,3.0,2395.0,219511.0,1939.0,2712.17,6037.0


In [65]:
# Checking one last time
sfh_train.isnull().sum()

bedroomcnt                      0
bathroomcnt                     0
calculatedfinishedsquarefeet    0
taxvaluedollarcnt               0
yearbuilt                       0
taxamount                       0
fips                            0
dtype: int64

In [69]:
# Now to create the function that will be added to wrangle.py (only for 'bedroomcnt','bathroomcnt',
# 'calculatedfinishedsquarefeet','taxvaluedollarcnt','yearbuilt','taxamount','fips')

def wrangle_zillow(df):
    df.bedroomcnt.fillna(random.randint(2.0, 5.0), inplace = True)
    df.bathroomcnt.fillna(random.randint(1.0, 3.0), inplace = True)
    df.calculatedfinishedsquarefeet.fillna(df.calculatedfinishedsquarefeet.median(), inplace = True)
    df.taxvaluedollarcnt.fillna(df.taxvaluedollarcnt.mode(), inplace = True)
    # For yearbuilt I'll use 1958 as it falls in the middle of the mean and mode and they are all fairly close in value
    df.yearbuilt.fillna(1958.0, inplace = True)
    df.taxamount.fillna(df.taxamount.median(), inplace = True)
    print(df)
    return df

In [71]:
# testing the function on validate
wrangle_zillow(validate)

         propertylandusetypeid  parcelid  airconditioningtypeid  \
142912                   261.0  12727667                    NaN   
1631671                  266.0  12033409                    NaN   
1607459                  261.0  14145481                    NaN   
2058368                  263.0  13131685                    NaN   
11207                    261.0  14054536                    NaN   
...                        ...       ...                    ...   
1361768                  261.0  11674477                    NaN   
953406                   261.0  17108995                    NaN   
175217                   261.0  14056876                    NaN   
379015                   261.0  12093741                    NaN   
2927612                  261.0  11820635                    NaN   

         architecturalstyletypeid  basementsqft  bathroomcnt  bedroomcnt  \
142912                        NaN           NaN          1.0         2.0   
1631671                       NaN          

Unnamed: 0,propertylandusetypeid,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,propertylandusedesc
142912,261.0,12727667,,,,1.0,2.0,,4.0,1.0,...,,50155.0,250787.0,2016.0,200632.0,3319.43,,,6.037503e+13,Single Family Residential
1631671,266.0,12033409,,,,1.0,1.0,,6.0,1.0,...,,93197.0,341725.0,2016.0,248528.0,4172.82,,,6.037190e+13,Condominium
1607459,261.0,14145481,,,,4.5,3.0,,,4.5,...,,147671.0,580002.0,2016.0,432331.0,6629.26,,,6.059002e+13,Single Family Residential
2058368,263.0,13131685,,,,0.0,0.0,,,,...,,,,2016.0,,69.91,,,,Mobile Home
11207,261.0,14054536,,,,3.0,3.0,,,3.0,...,,319686.0,800732.0,2016.0,481046.0,8879.04,,,6.059099e+13,Single Family Residential
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1361768,261.0,11674477,,,,2.0,2.0,,7.0,2.0,...,,272194.0,453653.0,2016.0,181459.0,5764.15,,,6.037800e+13,Single Family Residential
953406,261.0,17108995,,,,3.5,3.0,,,3.5,...,,245000.0,700000.0,2016.0,455000.0,7862.06,,,6.111001e+13,Single Family Residential
175217,261.0,14056876,,,,2.0,3.0,,,2.0,...,,143075.0,337936.0,2016.0,194861.0,3903.56,,,6.059099e+13,Single Family Residential
379015,261.0,12093741,,,,2.0,3.0,,8.0,2.0,...,,276306.0,516520.0,2016.0,240214.0,6154.61,,,6.037461e+13,Single Family Residential


In [72]:
validate.isnull().sum()

propertylandusetypeid                0
parcelid                             0
airconditioningtypeid           433117
architecturalstyletypeid        595185
basementsqft                    596103
bathroomcnt                          0
bedroomcnt                           0
buildingclasstypeid             593884
buildingqualitytypeid           207772
calculatedbathnbr                22860
decktypeid                      592978
finishedfloor1squarefeet        555815
calculatedfinishedsquarefeet         0
finishedsquarefeet12             52431
finishedsquarefeet13            594915
finishedsquarefeet15            558403
finishedsquarefeet50            555815
finishedsquarefeet6             592084
fips                                 0
fireplacecnt                    533890
fullbathcnt                      22860
garagecarcnt                    418473
garagetotalsqft                 418473
hashottuborspa                  586434
heatingorsystemtypeid           222303
latitude                 