# Cleaning:

In [30]:
# Import libraries:

import pandas as pd
import numpy as np

## Demographic Data:

In [48]:
# Load Demographic Data:

demog = pd.read_csv('../data/tabular_data/acs_data/acs_demographic/ACSDP5Y2018.DP05_data_with_overlays_2020-05-13T160540.csv')
demog.head(2)

Unnamed: 0,GEO_ID,NAME,DP05_0031PM,DP05_0032E,DP05_0032M,DP05_0032PE,DP05_0032PM,DP05_0033E,DP05_0033M,DP05_0033PE,...,DP05_0029M,DP05_0029PE,DP05_0029PM,DP05_0030E,DP05_0030M,DP05_0030PE,DP05_0030PM,DP05_0031E,DP05_0031M,DP05_0031PE
0,id,Geographic Area Name,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!RACE!!Total population,Margin of Error!!RACE!!Total population,Percent Estimate!!RACE!!Total population,...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!65 ye...,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...
1,1400000US11001000100,"Census Tract 1, District of Columbia, District...",8.2,105.7,35.3,(X),(X),5160,480,5160,...,160,864,(X),444,106,51.4,8.2,420,107,48.6


In [41]:
# Keep demographic variables of interest:

demog = demog[['GEO_ID',
               'NAME',
               'DP05_0001E',
               'DP05_0002PE',
               'DP05_0018E',
               'DP05_0071PE',
               'DP05_0077PE',
               'DP05_0078PE',
               'DP05_0079PE',
               'DP05_0080PE',
               'DP05_0081PE',
               'DP05_0082PE']]

In [42]:
# Rename column headers to descriptives (names pulled from metadata table in 'data' folder):

demog = demog.rename(columns={'GEO_ID' : 'geo_id',
                              'NAME' : 'name',
                              'DP05_0001E' : 'total_pop',
                              'DP05_0002PE' : 'pct_male',
                              'DP05_0018E'  : 'median_age',
                              'DP05_0071PE' : 'pct_hisp_latino',
                              'DP05_0077PE' : 'pce_white',
                              'DP05_0078PE' : 'pct_black',
                              'DP05_0079PE' : 'pct_american_ind',
                              'DP05_0080PE' : 'pct_asian',
                              'DP05_0081PE' : 'pct_hawaiian_pacisldr',
                              'DP05_0082PE' : 'pct_other_race',
                              'DP05_0086E'  : 'total_housing_units'})

In [43]:
# Drop duplicate name row from original census data:

demog = demog.drop([0])
demog.head()

Unnamed: 0,geo_id,name,total_pop,pct_male,median_age,pct_hisp_latino,pce_white,pct_black,pct_american_ind,pct_asian,pct_hawaiian_pacisldr,pct_other_race
1,1400000US11001000100,"Census Tract 1, District of Columbia, District...",5160,47.2,41.3,14.6,76.1,0.7,0.0,6.2,0.0,0.0
2,1400000US11001000201,"Census Tract 2.01, District of Columbia, Distr...",3817,46.7,19.8,9.9,57.2,8.7,0.2,18.5,0.0,0.0
3,1400000US11001000202,"Census Tract 2.02, District of Columbia, Distr...",4541,50.8,27.5,8.9,74.3,5.9,0.9,6.3,0.3,0.0
4,1400000US11001000300,"Census Tract 3, District of Columbia, District...",6334,44.9,31.5,6.6,77.1,7.3,0.0,7.4,0.0,0.0
5,1400000US11001000400,"Census Tract 4, District of Columbia, District...",1428,42.6,45.7,15.6,72.5,3.0,0.0,6.2,0.0,0.0


In [45]:
# Observe columns:

demog.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181 entries, 1 to 181
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   geo_id                 181 non-null    object
 1   name                   181 non-null    object
 2   total_pop              181 non-null    object
 3   pct_male               181 non-null    object
 4   median_age             181 non-null    object
 5   pct_hisp_latino        181 non-null    object
 6   pce_white              181 non-null    object
 7   pct_black              181 non-null    object
 8   pct_american_ind       181 non-null    object
 9   pct_asian              181 non-null    object
 10  pct_hawaiian_pacisldr  181 non-null    object
 11  pct_other_race         181 non-null    object
dtypes: object(12)
memory usage: 18.4+ KB


## Economic Data:

In [49]:
# Load Data:

econ = pd.read_csv('../data/tabular_data/acs_data/acs_econ/ACSDP5Y2018.DP03_data_with_overlays_2020-05-13T155408.csv')
econ.head(2)

Unnamed: 0,GEO_ID,NAME,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,DP03_0135PE,DP03_0135PM,DP03_0136E,DP03_0136M,DP03_0136PE,DP03_0136PM,DP03_0137E,DP03_0137M,DP03_0137PE,DP03_0137PM
0,id,Geographic Area Name,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...,Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WH...,Margin of Error!!PERCENTAGE OF FAMILIES AND PE...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...,Estimate!!PERCENTAGE OF FAMILIES AND PEOPLE WH...,Margin of Error!!PERCENTAGE OF FAMILIES AND PE...,Percent Estimate!!PERCENTAGE OF FAMILIES AND P...,Percent Margin of Error!!PERCENTAGE OF FAMILIE...
1,1400000US11001000100,"Census Tract 1, District of Columbia, District...",4282,361,4282,(X),3323,322,77.6,5.6,...,2.5,3.9,(X),(X),2.6,3.1,(X),(X),11.0,9.5


In [None]:
# Keep Economic Variables of Interest:

demog = demog[['GEO_ID',
               'NAME',
               'DP03_0005PE' : 'pct_unemployed',
               'DP03_0025E' : 'avg_wrk_commute_mins',
               'DP03_0062PE' : 'median_hsld_income',
               'DP03_0088E' : 'per_cap_income'
              
              ]]