# Initial Data Exploration
The goal is to understand the data structure of the files from both CNT and Eviction Lab, and to identify the cleaning and manipulation necessary to enable study of the data.

Files:
- htaindex_blkgrps_il_2017.csv
- evictions_blkgrps_il.csv

Both files are at the block group level and contain information specific to Illinois. The H+TA index data is based on calculations from 2017, while the Eviction Lab data covers a range of years (**exact range to be confirmed**).

In [1]:
# import packages

import pandas as pd

In [8]:
# raise the pandas column display limit to 999 so wide dataframes are not truncated

pd.options.display.max_columns = 999

In [2]:
# names of the two Illinois block-group CSV inputs (H+TA index, then evictions)

htindex_filename, evictions_filename = (
    'htaindex_blkgrps_il_2017.csv',
    'evictions_blkgrps_il.csv',
)

In [4]:
# load each CSV into its own raw dataframe (H+TA index first, then evictions)

htindex_raw_df, evictions_raw_df = (
    pd.read_csv(csv_path)
    for csv_path in (htindex_filename, evictions_filename)
)

### H+T Affordability Data

In [9]:
# preview the first five rows of the H+TA index dataframe

htindex_raw_df.head(n=5)

Unnamed: 0,blkgrp,cbsa,blkgrps,population,households,land_acres,ht_ami,ht_80ami,ht_nmi,h_ami,h_80ami,h_nmi,t_ami,t_80ami,t_nmi,co2_per_hh_local,co2_per_acre_local,autos_per_hh_ami,autos_per_hh_80ami,autos_per_hh_nmi,vmt_per_hh_ami,vmt_per_hh_80ami,vmt_per_hh_nmi,pct_transit_commuters_ami,pct_transit_commuters_80ami,pct_transit_commuters_nmi,t_cost_ami,t_cost_80ami,t_cost_nmi,auto_ownership_cost_ami,auto_ownership_cost_80ami,auto_ownership_cost_nmi,vmt_cost_ami,vmt_cost_80ami,vmt_cost_nmi,transit_cost_ami,transit_cost_80ami,transit_cost_nmi,transit_trips_ami,transit_trips_80ami,transit_trips_nmi,compact_ndx,emp_ovrll_ndx,res_density,gross_hh_density,hh_gravity,frac_sfd,emp_gravity,emp_ndx,block_size,intersection_density,avg_block_perimeter_meters,h_cost,median_smoc,median_gross_rent,pct_owner_occupied_hu,pct_renter_occupied_hu
0,"""170310609002""","""Chicago-Naperville-Elgin, IL-IN-WI""",1.0,0.0,0.0,63.22,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.2,,0.0,0.0,,,,,,,1252,,,,,
1,"""170070105002""","""Rockford, IL""",1.0,3550.0,1192.0,21395.38,71.0,85.0,67.0,40.0,50.0,37.0,31.0,34.0,29.0,11.6,0.65,2.19,2.0,2.0,25841.0,22947.0,26282.0,0.0,0.0,0.0,15600.0,13706.0,15858.0,11942.0,10457.0,12138.0,3657.0,3249.0,3720.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,3.1,0.94,0.06,3229.0,91.0,3668.0,84.0,189.0,5.0,3061,1676.0,1676.0,,99.0,1.0
2,"""170070104003""","""Rockford, IL""",1.0,986.0,381.0,20189.48,68.0,81.0,64.0,37.0,46.0,34.0,31.0,35.0,29.0,11.07,0.21,2.2,2.0,2.0,25754.0,23026.0,26177.0,0.0,0.0,0.0,15635.0,13798.0,15886.0,11990.0,10537.0,12181.0,3645.0,3260.0,3705.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,3.3,0.38,0.02,3223.0,94.0,3799.0,84.0,198.0,5.0,3419,1536.0,1532.0,1565.0,88.0,12.0
3,"""170070105003""","""Rockford, IL""",1.0,2023.0,763.0,6723.64,69.0,81.0,64.0,38.0,48.0,36.0,30.0,33.0,28.0,11.33,1.29,2.11,2.0,2.0,25276.0,22428.0,25716.0,0.0,0.0,0.0,15035.0,13150.0,15297.0,11457.0,9974.0,11657.0,3578.0,3175.0,3640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,4.2,1.05,0.11,5638.0,95.0,6776.0,85.0,79.0,14.0,2393,1601.0,1601.0,,96.0,4.0
4,"""170070104001""","""Rockford, IL""",1.0,1914.0,734.0,10058.02,62.0,73.0,58.0,34.0,43.0,32.0,28.0,31.0,27.0,10.56,0.77,1.96,2.0,2.0,24128.0,21066.0,24591.0,0.0,0.0,0.0,14067.0,12249.0,14323.0,10652.0,9266.0,10843.0,3415.0,2983.0,3481.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,5.1,2.21,0.07,5123.0,56.0,8460.0,86.0,73.0,9.0,2133,1417.0,1485.0,725.0,91.0,9.0


#### Notes & questions about H+T data
- Necessary to strip out '"' from both the blkgrp and cbsa columns
- Necessary to check that the max of the blkgrps column == 1
- Should check to see how closely the population in the H+TA index dataset matches the data from Eviction Lab
- Need to review and determine what fields Katie used in her analysis...

### Eviction Lab data

In [10]:
# preview the first five rows of the evictions dataframe
evictions_raw_df.head(n=5)

Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,median-property-value,rent-burden,pct-white,pct-af-am,pct-hispanic,pct-am-ind,pct-asian,pct-nh-pi,pct-multiple,pct-other,eviction-filings,evictions,eviction-rate,eviction-filing-rate,low-flag,imputed,subbed
0,170010001001,2000,1.1,"Adams County, Illinois",1058.0,10.14,41.0,10.25,459.0,44028.0,92500.0,15.5,93.76,3.12,1.32,0.28,0.95,0.0,0.57,0.0,1.0,1.0,2.44,2.44,1,0,0
1,170010001001,2001,1.1,"Adams County, Illinois",1058.0,10.14,42.0,10.25,459.0,44028.0,92500.0,15.5,93.76,3.12,1.32,0.28,0.95,0.0,0.57,0.0,0.0,0.0,0.0,0.0,1,0,0
2,170010001001,2002,1.1,"Adams County, Illinois",1058.0,10.14,43.0,10.25,459.0,44028.0,92500.0,15.5,93.76,3.12,1.32,0.28,0.95,0.0,0.57,0.0,5.0,2.0,4.63,11.57,1,0,0
3,170010001001,2003,1.1,"Adams County, Illinois",1058.0,10.14,44.0,10.25,459.0,44028.0,92500.0,15.5,93.76,3.12,1.32,0.28,0.95,0.0,0.57,0.0,1.0,1.0,2.26,2.26,1,0,0
4,170010001001,2004,1.1,"Adams County, Illinois",1058.0,10.14,45.0,10.25,459.0,44028.0,92500.0,15.5,93.76,3.12,1.32,0.28,0.95,0.0,0.57,0.0,1.0,1.0,2.2,2.2,1,0,0
