In [3]:
# Silence every warning for the rest of the session: "ignore" is installed with no
# category or module filter, so deprecation and pandas warnings are hidden as well.
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats
import pandas_profiling

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Show floats with thousands separators and two decimals, right-aligned in a
# 20-character field, wherever pandas renders them.
pd.set_option("display.float_format", '{:20,.2f}'.format)

import acquire
import summarize
import prepare
import env

In [None]:
# Reference copy of a SQL query, kept as comments only (nothing in this cell executes).
# NOTE(review): the loaded frame shows columns this query does not select explicitly
# (county, tax_rate, transactions), so acquire.get_zillow_data() presumably uses a
# different query — confirm against acquire.py.
# """SELECT p2.*, p1.logerror FROM predictions_2016 p1
# LEFT JOIN properties_2016 p2  USING(parcelid)
# WHERE (bedroomcnt > 0 AND bathroomcnt > 0 AND calculatedfinishedsquarefeet > 500
# AND latitude IS NOT NULL AND longitude IS NOT NULL)
# AND (unitcnt = 1 OR unitcnt IS NULL);"""

### Acquire df

- Use function from acquire.py to bring in df using sql query.

In [19]:
# Bring the data in through the project's acquire module.
df = acquire.get_zillow_data()
# Peek at one random row; the fixed seed keeps the displayed row the same on every
# Restart & Run All instead of changing with each execution.
df.sample(random_state=123)

Unnamed: 0,county,tax_rate,id,parcelid,airconditioningtypeid,airconditioningdesc,architecturalstyletypeid,architecturalstyledesc,basementsqft,bathroomcnt,...,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,typeconstructiontypeid,typeconstructiondesc,censustractandblock,transactiondate,logerror,transactions
7006,Los Angeles,0.01,1181571,11367981,,,,,,2.0,...,84058.0,2169.02,,,,,60379002011007.0,2017-05-22,-0.04,2


### Summarize df

In [5]:
# Print an overview of df; the output below includes its shape, the df.info() listing,
# a table of missing columns per row, and binned value counts for individual columns.
summarize.df_summary(df)

--- Shape: (52169, 72)
--- Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52169 entries, 0 to 52168
Data columns (total 72 columns):
county                          52169 non-null object
tax_rate                        52164 non-null float64
id                              52169 non-null int64
parcelid                        52169 non-null int64
airconditioningtypeid           13605 non-null float64
airconditioningdesc             13605 non-null object
architecturalstyletypeid        70 non-null float64
architecturalstyledesc          70 non-null object
basementsqft                    47 non-null float64
bathroomcnt                     52169 non-null float64
bedroomcnt                      52169 non-null float64
buildingclasstypeid             0 non-null object
buildingclassdesc               0 non-null object
buildingqualitytypeid           33628 non-null float64
calculatedbathnbr               52153 non-null float64
calculatedfinishedsquarefeet    52161 non-null float64
deckt

   num_cols_missing    pct_cols_missing  num_rows
0                23  31.944444444444443         2
1                24   33.33333333333333        12
2                25   34.72222222222222        11
3                26   36.11111111111111        30
4                27                37.5       177
5                28   38.88888888888889       389
6                29   40.27777777777778      2527
7                30   41.66666666666667      2194
8                31   43.05555555555556      5986
9                32   44.44444444444444      8880
10               33   45.83333333333333     11960
11               34   47.22222222222222     11151
12               35   48.61111111111111      3459
13               36                50.0      4121
14               37  51.388888888888886      1016
15               38   52.77777777777778       214
16               39  54.166666666666664        22
17               40   55.55555555555556        13
18               41   56.94444444444444         3


(33339141.852, 33488434.7]      753
(33488434.7, 33636249.4]       3203
(33636249.4, 33784064.1]       6059
(33784064.1, 33931878.8]      10912
(33931878.8, 34079693.5]       9002
(34079693.5, 34227508.2]      12404
(34227508.2, 34375322.9]       4596
(34375322.9, 34523137.6]       1816
(34523137.6, 34670952.3]       2183
(34670952.3, 34818767.0]       1241
Name: latitude, dtype: int64
longitude:
(-119477336.781, -119283338.0]      158
(-119283338.0, -119091260.0]       1291
(-119091260.0, -118899182.0]       1050
(-118899182.0, -118707104.0]       1776
(-118707104.0, -118515026.0]       3838
(-118515026.0, -118322948.0]       9297
(-118322948.0, -118130870.0]       9990
(-118130870.0, -117938792.0]      11197
(-117938792.0, -117746714.0]       9121
(-117746714.0, -117554636.0]       4451
Name: longitude, dtype: int64
lotsizesquarefeet:
(-6734.775000000001, 697313.4]    51796
(697313.4, 1394390.8]                 8
(1394390.8, 2091468.2]                2
(2091468.2, 2788545.6]         

In [None]:
# Disabled: would keep only the first occurrence of each duplicated column name.
#df = df.loc[:,~df.columns.duplicated()]

### Explore df

- Here I'll explore the dataframe to help me decide what needs to be prepped/cleaned for my model.

In [4]:
# Build a pandas-profiling ProfileReport of df as this cell's displayed result.
pandas_profiling.ProfileReport(df)



- Write a function that takes in a dataframe of observations and attributes and returns a df where each row is an attribute name, the first column is the number of rows with missing values for that attribute, and the second column is the percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [20]:
# Per-column null summary: num_rows_missing and pct_rows_missing for every column.
# In the output pct_rows_missing is a fraction (e.g. 0.74), not a 0-100 percentage.
summarize.nulls_by_col(df)

Unnamed: 0,num_rows_missing,pct_rows_missing
county,0,0.00
tax_rate,5,0.00
id,0,0.00
parcelid,0,0.00
airconditioningtypeid,38564,0.74
airconditioningdesc,38564,0.74
architecturalstyletypeid,52099,1.00
architecturalstyledesc,52099,1.00
basementsqft,52122,1.00
bathroomcnt,0,0.00


#### Takeaways from nulls in columns function

- I can see that there are columns that have no data in them, and those I will certainly drop. 


- There are others that are more than 50% NULL values, and I'm going to drop those as well. That is too high of a percentage of Nulls to make the data meaningful.

- Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

In [21]:
# Per-row null summary: for each count of missing columns, the share of all columns
# that represents (pct_cols_missing, on a 0-100 scale) and how many rows have it.
summarize.nulls_by_row(df)

Unnamed: 0,num_cols_missing,pct_cols_missing,num_rows
0,23,31.944444444444443,2
1,24,33.33333333333333,12
2,25,34.72222222222222,11
3,26,36.11111111111111,30
4,27,37.5,177
5,28,38.88888888888889,389
6,29,40.27777777777778,2527
7,30,41.66666666666667,2194
8,31,43.05555555555556,5986
9,32,44.44444444444444,8880


#### Takeaways from the nulls by row function

- 


- 

- This function drops columns that are less than 50% non-missing and rows that are less than 75% non-missing.

In [22]:
# Drop sparse columns and rows with handle_missing_values, called without explicit
# thresholds (the note above gives 50% non-missing for columns and 75% for rows).
# df is rebound, so re-running this cell operates on the already-trimmed frame.
df = prepare.handle_missing_values(df)

In [23]:
# Count the nulls left in each column after the missing-value pass.
df.isna().sum()

county                              0
tax_rate                            5
id                                  0
parcelid                            0
bathroomcnt                         0
bedroomcnt                          0
buildingqualitytypeid           18541
calculatedbathnbr                  16
calculatedfinishedsquarefeet        8
finishedsquarefeet12              166
fips                                0
state                               0
fullbathcnt                        16
heatingorsystemtypeid           18345
heatingorsystemdesc             18345
latitude                            0
longitude                           0
lotsizesquarefeet                 354
propertycountylandusecode           0
propertylandusetypeid               0
propertylandusedesc                 0
propertyzoningdesc              18479
rawcensustractandblock              0
regionidcity                     1028
regionidcounty                      0
regionidzip                        23
roomcnt     

- I am removing columns for the following reasons...

    - "unitcnt", "propertyzoningdesc", "heatingorsystemdesc", 
      "heatingorsystemtypeid", "buildingqualitytypeid": each missing roughly 18,000 values (e.g., 18,541 for "buildingqualitytypeid")
    
    - "assessmentyear": constant value of 2016
    
    - "calculatedbathnbr": has a correlation of 1.0 with bathroomcnt
    
    - "finishedsquarefeet12": has a correlation of 1.0 with calculatedfinishedsquarefeet
    
    - "propertylandusedesc", "propertylandusetypeid": these are all "261" or single family residential.
    
    - "rawcensustractandblock": is highly correlated with fips/county
    
    - "state": are all California; not useful in analysis
    
- I am removing the rows with missing values from the following columns...

    - "yearbuilt": has 40 missing values that I can't impute in a meaningful way. Dropping 40 rows out of the df should not be a problem.
    
    - "censustractandblock": has 112 missing values that I can't impute in a meaningful way. I'll drop these rows.
    
    - "fullbathcnt": has 16 missing values, so I will just drop those rows.
    
    - 

In [24]:
# Columns to drop, grouped by the reason given in the notes above.
cols_to_remove = [
    # still heavily null after the missing-value pass
    "unitcnt",
    "propertyzoningdesc",
    "heatingorsystemdesc",
    "heatingorsystemtypeid",
    "buildingqualitytypeid",
    # a single constant value
    "assessmentyear",
    # perfectly correlated with a column that is kept
    "calculatedbathnbr",
    "finishedsquarefeet12",
    # same land-use value on every row
    "propertylandusedesc",
    "propertylandusetypeid",
    # redundant with fips/county
    "rawcensustractandblock",
    # every row is California
    "state",
]

- This function removes the columns above from the dataframe for the reasons I listed above.

In [25]:
# Pass df and the cols_to_remove list to prepare.remove_columns; df is rebound to the result.
df = prepare.remove_columns(df, cols_to_remove)

- 

In [15]:
# Re-check the per-column null counts now that the listed columns are gone.
df.isna().sum()

county                             0
tax_rate                           5
id                                 0
parcelid                           0
bathroomcnt                        0
bedroomcnt                         0
calculatedfinishedsquarefeet       8
fips                               0
fullbathcnt                       16
latitude                           0
longitude                          0
lotsizesquarefeet                354
propertycountylandusecode          0
regionidcity                    1028
regionidcounty                     0
regionidzip                       23
roomcnt                            0
taxvaluedollarcnt                  1
yearbuilt                         40
structuretaxvaluedollarcnt        72
landtaxvaluedollarcnt              1
taxamount                          4
censustractandblock              112
transactiondate                    0
logerror                           0
transactions                       0
dtype: int64

In [16]:
# Frequency of each censustractandblock value, with NaN counted as its own bucket.
df["censustractandblock"].value_counts(dropna=False)

                 nan     112
60,379,201,162,006.00     32
60,379,203,391,054.00     27
60,590,320,571,003.00     24
60,371,417,002,000.00     24
60,590,320,421,002.00     23
60,590,320,581,007.00     23
60,590,320,231,024.00     22
60,371,943,002,001.00     20
60,379,005,042,000.00     20
60,379,200,281,007.00     20
60,372,360,001,001.00     19
61,110,058,022,011.00     19
60,372,622,001,000.00     19
60,379,203,134,000.00     18
60,590,320,531,008.00     18
60,590,218,151,007.00     18
60,590,524,263,000.00     18
61,110,058,022,000.00     18
60,590,320,463,024.00     18
60,590,320,592,003.00     17
60,590,423,303,000.00     17
60,590,320,232,001.00     17
60,590,626,331,001.00     17
60,590,626,432,011.00     17
60,590,524,271,001.00     17
60,372,611,022,004.00     16
60,379,010,043,001.00     16
60,590,423,332,009.00     16
60,379,107,052,000.00     16
                        ... 
60,590,633,012,007.00      1
60,372,213,021,001.00      1
60,378,003,291,038.00      1
60,371,097,001

In [None]:
# NOTE(review): seaborn is already imported as sns in the first cell; this repeat is redundant.
import seaborn as sns

In [None]:
# Disabled. NOTE(review): df still holds non-numeric columns (e.g. county is object dtype),
# so a heatmap would presumably need a numeric matrix such as df.corr() — confirm before enabling.
#sns.heatmap(df, annot=True)