In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from acquire import new_zillow_data, get_zillow_data

### 1. Acquire data from mySQL using the python module to connect and query. You will want to end with a single dataframe. Make sure to include: the logerror, all fields related to the properties that are available. You will end up using all the tables in the database.

- Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.
- Only include properties with a transaction in 2017, and include only the last transaction for each properity (so no duplicate property ID's), along with zestimate error and date of transaction.
- Only include properties that include a latitude and longitude value.

In [2]:
df = new_zillow_data()
df.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,261.0,,,,,1727539,,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,261.0,,,,,1387261,,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,14186244,,,261.0,,,,,11677,,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,12177905,,,261.0,2.0,,,,2288172,,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,10887214,,,266.0,2.0,,,1.0,1970746,,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


### 2. Summarize your data (summary stats, info, dtypes, shape, distributions, value_counts, etc.)

In [3]:
df.shape

(77580, 68)

In [4]:
df.longitude.isnull().sum()

0

In [5]:
# This will allow for all columns to be shown in your df
#pd.set_option('display.max_columns', None)

In [6]:
df.isna().sum()

parcelid                      0
typeconstructiontypeid    77357
storytypeid               77530
propertylandusetypeid         0
heatingorsystemtypeid     28008
                          ...  
buildingclassdesc         77565
heatingorsystemdesc       28008
propertylandusedesc           0
storydesc                 77530
typeconstructiondesc      77357
Length: 68, dtype: int64

In [7]:
df.isnull().sum()

parcelid                      0
typeconstructiontypeid    77357
storytypeid               77530
propertylandusetypeid         0
heatingorsystemtypeid     28008
                          ...  
buildingclassdesc         77565
heatingorsystemdesc       28008
propertylandusedesc           0
storydesc                 77530
typeconstructiondesc      77357
Length: 68, dtype: int64

In [8]:
df.describe()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,logerror
count,77580.0,223.0,50.0,77580.0,49572.0,15.0,207.0,25007.0,77580.0,50.0,...,17599.0,172.0,77465.0,77579.0,77580.0,77578.0,77575.0,2900.0,77333.0,77580.0
mean,13008280.0,6.040359,7.0,261.824465,3.921811,3.933333,7.386473,1.812013,1495404.0,679.72,...,1.434286,1.0,189279.6,490147.6,2016.0,301150.0,5995.927626,14.088276,60496670000000.0,0.0168
std,3519376.0,0.556035,0.0,5.141564,3.59477,0.258199,2.72803,2.965768,860970.0,689.703546,...,0.544515,0.0,230409.5,653794.2,0.0,492721.9,7628.81649,2.181281,1533329000000.0,0.170739
min,10711860.0,4.0,7.0,31.0,1.0,3.0,2.0,1.0,349.0,38.0,...,1.0,1.0,44.0,1000.0,2016.0,161.0,19.92,3.0,60371010000000.0,-4.65542
25%,11538200.0,6.0,7.0,261.0,2.0,4.0,7.0,1.0,752143.0,273.0,...,1.0,1.0,84171.0,206899.0,2016.0,85293.25,2712.65,14.0,60373110000000.0,-0.02431
50%,12530560.0,6.0,7.0,261.0,2.0,4.0,7.0,1.0,1498256.0,515.0,...,1.0,1.0,136402.0,358878.0,2016.0,203181.0,4448.23,15.0,60376030000000.0,0.006675
75%,14211350.0,6.0,7.0,266.0,7.0,4.0,7.0,1.0,2240950.0,796.5,...,2.0,1.0,218734.0,569000.0,2016.0,366739.8,6926.885,15.0,60590420000000.0,0.039291
max,167689300.0,13.0,7.0,275.0,24.0,4.0,21.0,13.0,2982274.0,3560.0,...,6.0,1.0,11421790.0,49061240.0,2016.0,48952200.0,586639.3,99.0,483030100000000.0,5.262999


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77580 entries, 0 to 77579
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      77580 non-null  int64  
 1   typeconstructiontypeid        223 non-null    float64
 2   storytypeid                   50 non-null     float64
 3   propertylandusetypeid         77580 non-null  float64
 4   heatingorsystemtypeid         49572 non-null  float64
 5   buildingclasstypeid           15 non-null     float64
 6   architecturalstyletypeid      207 non-null    float64
 7   airconditioningtypeid         25007 non-null  float64
 8   id                            77580 non-null  int64  
 9   basementsqft                  50 non-null     float64
 10  bathroomcnt                   77580 non-null  float64
 11  bedroomcnt                    77580 non-null  float64
 12  buildingqualitytypeid         49810 non-null  float64
 13  c

### 3. Write a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an atttribute name, the first column is the number of rows with missing values for that attribute, and the second column is percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [14]:
missing_row_value = df.isnull().sum()
missing_row_value

parcelid                      0
typeconstructiontypeid    77357
storytypeid               77530
propertylandusetypeid         0
heatingorsystemtypeid     28008
                          ...  
buildingclassdesc         77565
heatingorsystemdesc       28008
propertylandusedesc           0
storydesc                 77530
typeconstructiondesc      77357
Length: 68, dtype: int64

In [18]:
percent_row_missing = round(df.isnull().sum()/len(df),2)*100
percent_row_missing

parcelid                    0.0
typeconstructiontypeid    100.0
storytypeid               100.0
propertylandusetypeid       0.0
heatingorsystemtypeid      36.0
                          ...  
buildingclassdesc         100.0
heatingorsystemdesc        36.0
propertylandusedesc         0.0
storydesc                 100.0
typeconstructiondesc      100.0
Length: 68, dtype: float64

In [20]:
def missing_values(df):
    # Gives value counts of missing rows
    missing_row_value = df.isnull().sum()
    # Gives the percentage of rows missing
    percent_row_missing = round(df.isnull().sum()/len(df),2)*100
    # Creates a new df for the missing rows and percent missing
    missing_df = pd.DataFrame({'missing_rows' : missing_row_value, 'percent_missing' : percent_row_missing})
    return missing_df
missing_values(df)

Unnamed: 0,missing_rows,percent_missing
parcelid,0,0.0
typeconstructiontypeid,77357,100.0
storytypeid,77530,100.0
propertylandusetypeid,0,0.0
heatingorsystemtypeid,28008,36.0
...,...,...
buildingclassdesc,77565,100.0
heatingorsystemdesc,28008,36.0
propertylandusedesc,0,0.0
storydesc,77530,100.0


In [28]:
missing_values(df).head(30)

Unnamed: 0,missing_rows,percent_missing
parcelid,0,0.0
typeconstructiontypeid,77357,100.0
storytypeid,77530,100.0
propertylandusetypeid,0,0.0
heatingorsystemtypeid,28008,36.0
buildingclasstypeid,77565,100.0
architecturalstyletypeid,77373,100.0
airconditioningtypeid,52573,68.0
id,0,0.0
basementsqft,77530,100.0


In [29]:
missing_values(df).tail(30)

Unnamed: 0,missing_rows,percent_missing
rawcensustractandblock,0,0.0
regionidcity,1472,2.0
regionidcounty,0,0.0
regionidneighborhood,46606,60.0
regionidzip,50,0.0
roomcnt,0,0.0
threequarterbathnbr,67474,87.0
unitcnt,26876,35.0
yardbuildingsqft17,75187,97.0
yardbuildingsqft26,77510,100.0


There are a lot of rows missing data. Going to set a threshold if more than 5% is missing, to drop.

### 4.Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, and number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

In [45]:
# df.loc[ : ].count() means we're looking at every row to count the number of null values in each row 
# .isna() shows if there are booleans if there is nulls. .any() looks for true values from the isna()
# summing the count of trues with .count()
missing_cols = df.loc[:, df.isna().any()].count()
missing_cols

typeconstructiontypeid            223
storytypeid                        50
heatingorsystemtypeid           49572
buildingclasstypeid                15
architecturalstyletypeid          207
airconditioningtypeid           25007
basementsqft                       50
buildingqualitytypeid           49810
calculatedbathnbr               76964
decktypeid                        614
finishedfloor1squarefeet         6037
calculatedfinishedsquarefeet    77379
finishedsquarefeet12            73924
finishedsquarefeet13               42
finishedsquarefeet15             3027
finishedsquarefeet50             6037
finishedsquarefeet6               386
fireplacecnt                     8289
fullbathcnt                     76964
garagecarcnt                    25520
garagetotalsqft                 25520
hashottuborspa                   1539
lotsizesquarefeet               69322
poolcnt                         16174
poolsizesum                       869
pooltypeid10                      465
pooltypeid2 

In [51]:
len(df.index)

77580

In [48]:
# len(df.index) shows the number of rows
percent_cols_missing = round(missing_cols/ len(df.index) *100 ,2)
percent_cols_missing

typeconstructiontypeid            0.29
storytypeid                       0.06
heatingorsystemtypeid            63.90
buildingclasstypeid               0.02
architecturalstyletypeid          0.27
airconditioningtypeid            32.23
basementsqft                      0.06
buildingqualitytypeid            64.20
calculatedbathnbr                99.21
decktypeid                        0.79
finishedfloor1squarefeet          7.78
calculatedfinishedsquarefeet     99.74
finishedsquarefeet12             95.29
finishedsquarefeet13              0.05
finishedsquarefeet15              3.90
finishedsquarefeet50              7.78
finishedsquarefeet6               0.50
fireplacecnt                     10.68
fullbathcnt                      99.21
garagecarcnt                     32.90
garagetotalsqft                  32.90
hashottuborspa                    1.98
lotsizesquarefeet                89.36
poolcnt                          20.85
poolsizesum                       1.12
pooltypeid10             

In [52]:
def missing_cols(df):
    # df.loc[ : ].count() means we're looking at every row to count the number of null values in each row 
    # .isna() shows if there are booleans if there is nulls. .any() looks for true values from the isna()
    # summing the count of trues with .count()
    missing_cols = df.loc[:, df.isna().any()].count()
    # len(df.index) shows the number of rows
    percent_cols_missing = round(missing_cols/ len(df.index) *100 ,2)
    missing_cols_df = pd.DataFrame({'missing_columns' : missing_cols, 'percent_columns_missing' : percent_cols_missing})
    return missing_cols_df
missing_cols(df)

Unnamed: 0,missing_columns,percent_columns_missing
typeconstructiontypeid,223,0.29
storytypeid,50,0.06
heatingorsystemtypeid,49572,63.9
buildingclasstypeid,15,0.02
architecturalstyletypeid,207,0.27
airconditioningtypeid,25007,32.23
basementsqft,50,0.06
buildingqualitytypeid,49810,64.2
calculatedbathnbr,76964,99.21
decktypeid,614,0.79


Same observation on columns. Going to get rid of data that has over 5% missing

# Prepare

### 1. Remove any properties that are likely to be something other than single unit properties. (e.g. no duplexes, no land/lot, ...). There are multiple ways to estimate that a property is a single unit, and there is not a single "right" answer. But for this exercise, do not purely filter by unitcnt as we did previously. Add some new logic that will reduce the number of properties that are falsely removed. You might want to use # bedrooms, square feet, unit type or the like to then identify those with unitcnt not defined.