## King County Data Clean

This notebook reads in the raw data file downloaded from the King County website and saves a .csv file containing only the information we need moving forward.

In [None]:
# Libraries

import pandas as pd

In [None]:
# Widen the pandas display limits so more of the dataframe is visible in cell output

display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the raw King County csv into a dataframe

raw_csv_path = '../data/King_County.csv'
king_raw = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first five rows of the raw data
king_raw.head(n=5)

In [None]:
# Fill the zipcode nulls with 0 so we can turn it into an int.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `king_raw.ZIP5`: the in-place form operates on an
# intermediate Series, which pandas warns about and which does not update the
# frame when Copy-on-Write is enabled.
king_raw['ZIP5'] = king_raw['ZIP5'].fillna(0)

In [None]:
# Cast the zip code column to integers
king_raw['ZIP5'] = king_raw['ZIP5'].astype(int)

In [None]:
# Double check the column dtypes (ZIP5 was just cast to int)
king_raw.dtypes

In [None]:
# Use only the affected zip codes.
# These zip codes were found by combining the zipcode map and the lahar flow map. The affected zipcodes were
# pulled out manually. This was done to reduce the size of the data we were working with. This is not necessary
# or advised for the process to run in a production environment, unless we can find a way to programmatically
# determine affected zip codes.
AFFECTED_ZIPS = [98022, 98092, 98001, 98047, 98003]

# .isin() replaces the chain of == / | comparisons. .copy() makes king_zips an
# independent frame, so mutating it later does not raise SettingWithCopyWarning.
king_zips = king_raw[king_raw['ZIP5'].isin(AFFECTED_ZIPS)].copy()

In [None]:
# Columns we aren't interested in. Dropping them is only necessary to reduce file size.

dropcolumns = [
    'OBJECTID', 'MAJOR', 'MINOR', 'PIN', 'COMMENTS', 'SITETYPE', 'SITEID',
    'POINT_X', 'POINT_Y', 'COUNTY', 'KROLL',
    'KCTP_CITY', 'KCTP_STATE', 'PLSS', 'PROP_NAME', 'PLAT_NAME', 'PLAT_LOT',
    'PLAT_BLOCK', 'LOTSQFT', 'LEVYCODE', 'LEVY_JURIS', 'NEW_CONSTR', 'TAXVAL_RSN',
    'ACCNT_NUM', 'KCTP_TAXYR', 'UNIT_NUM', 'BLDG_NUM', 'CONDOSITUS', 'QTS',
    'SEC', 'TWP', 'RNG', 'PRIMARY_ADDR', 'ANNEXING_CITY', 'PAAUNIQUENAME',
    'PROPTYPE', 'ALIAS1', 'ALIAS2', 'KCA_ZONING', 'KCA_ACRES', 'LEGALDESC',
    'PREUSE_CODE', 'PREUSE_DESC', 'Shape_Length', 'Shape_Area', 'ADDR_SN',
    'ADDR_ST', 'ADDR_SD', 'ADDR_HN', 'ADDR_PD', 'ADDR_PT', 'ADDR_NUM', 'FULLNAME',
]

In [None]:
# Drop unwanted columns.
# The result is rebound instead of using inplace=True: king_zips is derived from
# king_raw by filtering, and mutating such a frame in place can raise
# SettingWithCopyWarning.
king_zips = king_zips.drop(columns=dropcolumns)

In [None]:
# Preview the first five rows after dropping the unwanted columns
king_zips.head(n=5)

In [None]:
# (rows, columns) of the zip-filtered frame after the column drop
king_zips.shape

In [None]:
# Count the missing values in each remaining column
king_zips.isna().sum()

In [None]:
# Drop rows with null tax values, as they're needed.
# The result is rebound instead of using inplace=True, which avoids mutating a
# frame derived from king_raw in place (a source of SettingWithCopyWarning).
king_zips = king_zips.dropna(subset=['APPR_IMPR', 'TAX_LNDVAL', 'TAX_IMPR'])

In [None]:
# Re-count the missing values per column after dropping rows with null tax values
king_zips.isna().sum()

In [None]:
# (rows, columns) after dropping rows with null tax values
king_zips.shape

The remaining nulls are not a concern: we have the latitude and longitude to determine whether each property is in the affected zones, and a value to go along with it.

In [None]:
# Drop any duplicates based on latitude and longitude; I don't need two of the same thing.
# drop_duplicates keeps the first row for each (LAT, LON) pair by default.
# The result is rebound instead of using inplace=True, which avoids mutating a
# frame derived from king_raw in place (a source of SettingWithCopyWarning).
king_zips = king_zips.drop_duplicates(subset=['LAT', 'LON'])

In [None]:
# Renumber the rows from 0 and discard the old index, which has gaps after the drops above
king_zips = king_zips.reset_index(drop=True)

In [None]:
# Preview the first five rows of the cleaned frame
king_zips.head(n=5)

In [None]:
# Final (rows, columns) of the cleaned frame
king_zips.shape

In [None]:
# Save the cleaned dataframe to csv.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# confirm that whatever reads this file expects it.
output_csv_path = '../data/kingvalues.csv'
king_zips.to_csv(output_csv_path)