In [24]:
import pandas as pd
import numpy as np

from env import get_db_url

import wrangle as w

In [2]:
sql_query = '''SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet,
                taxvaluedollarcnt, yearbuilt, taxamount, fips
               FROM properties_2017'''

In [3]:
url = get_db_url('zillow')

In [4]:
df = pd.read_sql(sql_query, url)

In [8]:
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,9.0,,,6037.0
1,0.0,0.0,,27516.0,,,6037.0
2,0.0,0.0,73026.0,1434941.0,1959.0,20800.37,6037.0
3,0.0,0.0,5068.0,1174475.0,1948.0,14557.57,6037.0
4,0.0,0.0,1776.0,440101.0,1947.0,5725.17,6037.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2985217 entries, 0 to 2985216
Data columns (total 7 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
dtypes: float64(7)
memory usage: 159.4 MB


## Initial goals for cleaning
### * Bedroom count
    * change the column name
    * handle nulls - median?
    * convert to int  
    
### *  Bathroom count
    * change the column name
    * handle nulls
    
### * Square Footage
    * change column name
    * handle nulls
    * convert to int
    
### *Year Built
    * change column name?
    * handle nulls
    * convert from float to datetime, year only
    
### *Tax Amount
    * handle nulls

### *Fips
    * FIPS is the county identification code. All 3 counties in this data are in California.
    * handle nulls
    * drop decimal points and convert to object dtype.


In [19]:
df.bedroomcnt.value_counts()

3.0     1172757
4.0      731475
2.0      606782
5.0      182765
0.0      118705
1.0       86941
6.0       48915
8.0       13542
7.0       12763
9.0        4279
10.0       1702
12.0        959
11.0        425
13.0         86
14.0         69
16.0         50
15.0         24
17.0         11
18.0          9
20.0          8
25.0          1
23.0          1
19.0          1
24.0          1
21.0          1
Name: bedroomcnt, dtype: int64

In [20]:
df.bathroomcnt.value_counts()

2.00     1219811
3.00      633089
1.00      499332
2.50      208809
4.00      133922
0.00      113470
1.50       45735
5.00       38514
3.50       31835
4.50       19864
6.00       16416
5.50        6275
7.00        6221
8.00        4548
6.50        1352
9.00        1341
10.00        496
7.50         385
12.00        269
11.00        200
8.50         113
13.00         53
9.50          50
14.00         39
16.00         25
15.00         21
0.50          16
10.50         14
18.00         12
17.00          8
20.00          8
1.75           4
19.00          3
12.50          3
11.50          3
19.50          1
14.50          1
32.00          1
31.00          1
Name: bathroomcnt, dtype: int64

In [32]:
# What are fips?
df.fips.value_counts()

6037.0    1970806
6059.0     717971
6111.0     213141
Name: fips, dtype: int64

In [30]:
# drop the nulls to do the calculation
df = df.dropna()

# testing to see if changing floats in 
(df.calculatedfinishedsquarefeet.astype(int) != df.calculatedfinishedsquarefeet).sum()

0

In [12]:
# see how many nulls are in each column
df.isnull().sum()

bedroomcnt                       2945
bathroomcnt                      2957
calculatedfinishedsquarefeet    45097
taxvaluedollarcnt               34266
yearbuilt                       47833
taxamount                       22752
fips                             2932
dtype: int64

In [17]:
df.size

20896519

In [23]:
# compare how much data would be lost if nulls are simply dropped
df.dropna().size

20313426

### For nulls, for initial cleaning I am going to just drop the nulls. Doing so only accounts for a loss of approximately 2.5% of the data. 

In [25]:
# testing acquire function
zillow = w.get_zillow_data()

In [26]:
zillow.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,9.0,,,6037.0
1,0.0,0.0,,27516.0,,,6037.0
2,0.0,0.0,73026.0,1434941.0,1959.0,20800.37,6037.0
3,0.0,0.0,5068.0,1174475.0,1948.0,14557.57,6037.0
4,0.0,0.0,1776.0,440101.0,1947.0,5725.17,6037.0


In [33]:
# renaming the columns 
df = df.rename(columns={'bedroomcnt':'bedcount',
                        'bathroomcnt':'bathcount',
                        'calculatedfinishedsquarefeet': 'sqfeet',
                        'taxvaluedollarcnt': 'taxvalue',})

In [34]:
df.head()

Unnamed: 0,bedcount,bathcount,sqfeet,taxvalue,yearbuilt,taxamount,fips
2,0.0,0.0,73026.0,1434941.0,1959.0,20800.37,6037.0
3,0.0,0.0,5068.0,1174475.0,1948.0,14557.57,6037.0
4,0.0,0.0,1776.0,440101.0,1947.0,5725.17,6037.0
5,0.0,0.0,2400.0,287634.0,1943.0,3661.28,6037.0
7,0.0,0.0,3611.0,698984.0,1946.0,7857.84,6037.0


In [38]:
df['bedcount'] = df.bedcount.astype(int)

In [35]:
df['sqfeet'] = df.sqfeet.astype(int)

In [48]:
# trying to convert yearbuilt to datetime
pd.to_datetime(df['yearbuilt'].astype(str).replace(r'),format='%Y')

ValueError: unconverted data remains: .0

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2901918 entries, 2 to 2982283
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   bedcount   int64  
 1   bathcount  float64
 2   sqfeet     int64  
 3   taxvalue   float64
 4   yearbuilt  float64
 5   taxamount  float64
 6   fips       float64
dtypes: float64(5), int64(2)
memory usage: 177.1 MB


In [40]:
df.head()

Unnamed: 0,bedcount,bathcount,sqfeet,taxvalue,yearbuilt,taxamount,fips
2,0,0.0,73026,1434941.0,1959.0,20800.37,6037.0
3,0,0.0,5068,1174475.0,1948.0,14557.57,6037.0
4,0,0.0,1776,440101.0,1947.0,5725.17,6037.0
5,0,0.0,2400,287634.0,1943.0,3661.28,6037.0
7,0,0.0,3611,698984.0,1946.0,7857.84,6037.0
