This file accesses, pre-processes, and merges multiple ACS (American Community Survey) datasets. The NPI table is then imported and merged with the ACS data.

In [1]:
## import libraries
import pandas as pd
import numpy as np
import re

### Import ACS Data

In [2]:
from ACSDataImport import getS1903, getS1501, getDP02, getB17001, getS0101

In [3]:
# Load each raw ACS table through its loader from the project module ACSDataImport.
# NOTE(review): the cleaning cells index these objects with .loc and read their
# GEO_ID / NAME columns, so each loader presumably returns a pandas DataFrame - confirm.
s1903 = getS1903()  # S1903: source of the median-income column
s1501 = getS1501()  # S1501: educational attainment
dp02 = getDP02()  # DP02: household type, marital status, nativity and veteran columns
b17001 = getB17001()  # B17001: poverty status
s0101 = getS0101()  # S0101: population by age group

### Clean ACS Data

In [4]:
## Function to convert Census Tract value to County
def getCounty(name):
    """Return the county part of a semicolon-delimited ACS NAME string.

    Two pieces are removed wherever they occur:
      * a leading ``Census Tract <id>; `` segment (including the trailing space)
      * a trailing ``; Colorado`` segment

    Example: 'Census Tract 85.23; Adams County; Colorado' -> 'Adams County'.
    Text matching neither piece (for instance a comma-delimited name) is
    returned unchanged.

    name: the NAME string to clean.
    """
    tract_or_state = re.compile(r'\b(Census Tract\s+[^;]+;) |;\s+Colorado')
    return tract_or_state.sub('', name)

In [5]:
##### Median Income
# Explicit copy: the column edits below then work on an independent frame
# instead of on a selection of the raw s1903 table.
s1903_table = s1903.loc[:, ['GEO_ID', 'NAME', 'S1903_C03_001E']].copy()

## Replace '+' and ',' with empty string so the values can be parsed as numbers
s1903_table['S1903_C03_001E'] = s1903_table['S1903_C03_001E'].str.replace(r'[\+,]', '', regex=True)
## Convert column to numeric; values that still cannot be parsed become NaN
s1903_table['S1903_C03_001E'] = pd.to_numeric(s1903_table['S1903_C03_001E'], errors='coerce')

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
s1903_table = s1903_table.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
s1903_table['County'] = s1903_table['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
s1903_table['County Name'] = s1903_table['County Name'].map(getCounty)

## Keep rows where income is over 0; NaN compares False, so unparseable rows are dropped too
s1903_table = s1903_table[s1903_table['S1903_C03_001E'] > 0]

# s1903_table.describe() # used to confirm max household income returns as 250000
# s1903_table.dtypes    # used to confirm column types
s1903_table

Unnamed: 0,County,County Name,S1903_C03_001E
0,08001,Adams County,44286
1,08001,Adams County,50646
2,08001,Adams County,67273
3,08001,Adams County,77679
4,08001,Adams County,50000
...,...,...,...
1442,08123,Weld County,88438
1443,08123,Weld County,77423
1444,08123,Weld County,91293
1445,08125,Yuma County,73656


In [6]:
##### Educational Attainment
# Only the S1501 estimates that feed one of the combined groups below are selected
# (S1501_C01_014E / S1501_C01_015E were never used, so they are no longer pulled in).
education_source_cols = [
    'S1501_C01_001E', 'S1501_C01_002E', 'S1501_C01_003E', 'S1501_C01_004E', 'S1501_C01_005E',
    'S1501_C01_006E', 'S1501_C01_007E', 'S1501_C01_008E', 'S1501_C01_009E', 'S1501_C01_010E',
    'S1501_C01_011E', 'S1501_C01_012E', 'S1501_C01_013E',
]
# Explicit copy so the edits below work on an independent frame.
s1501_table = s1501.loc[:, ['GEO_ID', 'NAME'] + education_source_cols].copy()

## Convert every source column to numeric once.
## errors='coerce' matches the other ACS cleaning cells: unparseable values become NaN
## (the row sums below skip NaN) instead of aborting the cell.
s1501_table[education_source_cols] = s1501_table[education_source_cols].apply(pd.to_numeric, errors='coerce')

# Merge into the following groups:
# 'S1501_C01_001E' - Total (18-24)
# 'S1501_C01_006E' - Total (25+)
s1501_table['001E_006E'] = s1501_table[['S1501_C01_001E', 'S1501_C01_006E']].sum(axis=1)

# 'S1501_C01_002E' - Less than high school (18-24)
# 'S1501_C01_007E' - Less than 9th (25+)
s1501_table['002E_007E'] = s1501_table[['S1501_C01_002E', 'S1501_C01_007E']].sum(axis=1)

# 'S1501_C01_008E' - 9th to 12th, no diploma (25+)
s1501_table['008E'] = s1501_table['S1501_C01_008E']

# 'S1501_C01_003E' - High school graduate (18-24)
# 'S1501_C01_009E' - High school graduate (25+)
s1501_table['003E_009E'] = s1501_table[['S1501_C01_003E', 'S1501_C01_009E']].sum(axis=1)

# 'S1501_C01_004E' - Some college or associates degree (18-24)
# 'S1501_C01_010E' - Some college, no degree (25+)
# 'S1501_C01_011E' - Associates Degree (25+)
s1501_table['004E_010E_011E'] = s1501_table[['S1501_C01_004E', 'S1501_C01_010E', 'S1501_C01_011E']].sum(axis=1)

# 'S1501_C01_005E' - Bachelor's degree or higher (18-24)
# 'S1501_C01_012E' - Bachelor's degree (25+)
s1501_table['005E_012E'] = s1501_table[['S1501_C01_005E', 'S1501_C01_012E']].sum(axis=1)

# 'S1501_C01_013E' - graduate or professional degree (25+)
s1501_table['013E'] = s1501_table['S1501_C01_013E']

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
s1501_table = s1501_table.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
s1501_table['County'] = s1501_table['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
s1501_table['County Name'] = s1501_table['County Name'].map(getCounty)

## Keep only the identifiers and the combined groups
s1501_table = s1501_table[['County', 'County Name', '001E_006E', '002E_007E', '008E', '003E_009E', '004E_010E_011E', '005E_012E', '013E']]
# s1501_table.describe()
# s1501_table.dtypes    # used to confirm column types
s1501_table


Unnamed: 0,County,County Name,001E_006E,002E_007E,008E,003E_009E,004E_010E_011E,005E_012E,013E
0,08001,Adams County,2869,699,339,847,831,107,46
1,08001,Adams County,3011,840,126,1134,583,250,78
2,08001,Adams County,4320,731,431,1305,1076,522,255
3,08001,Adams County,4402,788,284,1293,854,681,502
4,08001,Adams County,1599,19,10,238,381,563,388
...,...,...,...,...,...,...,...,...,...
1442,08123,Weld County,4939,456,224,1321,1554,1021,363
1443,08123,Weld County,4390,192,73,1512,1576,861,176
1444,08123,Weld County,5667,482,286,2087,1898,769,145
1445,08125,Yuma County,3207,181,276,737,971,690,352


In [7]:
##### Household - Total, Married-couple, Cohabiting couple, Male householder no spouse, Female householder no spouse
# The estimate columns are listed once so the selection and the numeric
# conversion below cannot drift apart.
dp02_household_cols = ['DP02_0001E', 'DP02_0002E', 'DP02_0004E', 'DP02_0006E', 'DP02_0010E']
# Explicit copy so the edits below work on an independent frame.
dp02_table = dp02.loc[:, ['GEO_ID', 'NAME'] + dp02_household_cols].copy()

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
dp02_table = dp02_table.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
dp02_table['County'] = dp02_table['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
dp02_table['County Name'] = dp02_table['County Name'].map(getCounty)
## Convert columns to numeric; unparseable values become NaN
dp02_table[dp02_household_cols] = dp02_table[dp02_household_cols].apply(pd.to_numeric, errors='coerce')
# dp02_table.describe()
# dp02_table.dtypes # used to confirm column types
dp02_table

Unnamed: 0,County,County Name,DP02_0001E,DP02_0002E,DP02_0004E,DP02_0006E,DP02_0010E
0,08001,Adams County,1332,276,153,391,512
1,08001,Adams County,1517,406,211,581,319
2,08001,Adams County,2264,764,347,595,558
3,08001,Adams County,2025,685,151,488,701
4,08001,Adams County,770,114,129,243,284
...,...,...,...,...,...,...,...
1442,08123,Weld County,2495,1516,47,482,450
1443,08123,Weld County,1952,1222,92,201,437
1444,08123,Weld County,2604,1656,144,379,425
1445,08125,Yuma County,1731,985,72,216,458


In [8]:
##### Household2 - Male Total, Female Total, Never married, now married except separated, separated, widowed, divorced
# The estimate columns are listed once so the selection and the numeric
# conversion below cannot drift apart (order is kept from the original selection).
dp02_marital_cols = [
    'DP02_0025E', 'DP02_0031E',
    'DP02_0026E', 'DP02_0027E', 'DP02_0028E', 'DP02_0029E', 'DP02_0030E',
    'DP02_0032E', 'DP02_0033E', 'DP02_0034E', 'DP02_0035E', 'DP02_0036E',
]
# Explicit copy so the edits below work on an independent frame.
dp02_table2 = dp02.loc[:, ['GEO_ID', 'NAME'] + dp02_marital_cols].copy()

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
dp02_table2 = dp02_table2.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
dp02_table2['County'] = dp02_table2['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
dp02_table2['County Name'] = dp02_table2['County Name'].map(getCounty)
## Convert columns to numeric; unparseable values become NaN
dp02_table2[dp02_marital_cols] = dp02_table2[dp02_marital_cols].apply(pd.to_numeric, errors='coerce')
# dp02_table2.describe()
# dp02_table2.dtypes # used to confirm column types
dp02_table2

Unnamed: 0,County,County Name,DP02_0025E,DP02_0031E,DP02_0026E,DP02_0027E,DP02_0028E,DP02_0029E,DP02_0030E,DP02_0032E,DP02_0033E,DP02_0034E,DP02_0035E,DP02_0036E
0,08001,Adams County,1508,1566,656,422,150,58,222,806,353,94,87,226
1,08001,Adams County,1744,1375,962,534,0,46,202,682,452,0,32,209
2,08001,Adams County,2464,2340,1213,879,87,107,178,949,828,27,106,430
3,08001,Adams County,2414,2280,1122,1000,131,18,143,825,908,33,136,378
4,08001,Adams County,858,741,485,189,7,24,153,443,166,0,73,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442,08123,Weld County,2698,2483,552,1678,6,82,380,496,1573,33,208,173
1443,08123,Weld County,2229,2329,621,1369,15,55,169,622,1291,21,147,248
1444,08123,Weld County,3091,2841,835,1816,48,94,298,706,1601,66,186,282
1445,08125,Yuma County,1725,1784,465,1072,14,43,131,343,1053,43,174,171


In [9]:
##### Household3 - Total population, Native born, foreign born
# The estimate columns are listed once so the selection and the numeric
# conversion below cannot drift apart.
dp02_nativity_cols = ['DP02_0088E', 'DP02_0089E', 'DP02_0094E']
# Explicit copy so the edits below work on an independent frame.
dp02_table3 = dp02.loc[:, ['GEO_ID', 'NAME'] + dp02_nativity_cols].copy()

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
dp02_table3 = dp02_table3.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
dp02_table3['County'] = dp02_table3['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
dp02_table3['County Name'] = dp02_table3['County Name'].map(getCounty)
## Convert columns to numeric; unparseable values become NaN
dp02_table3[dp02_nativity_cols] = dp02_table3[dp02_nativity_cols].apply(pd.to_numeric, errors='coerce')
# dp02_table3.describe()
# dp02_table3.dtypes # used to confirm column types
dp02_table3

Unnamed: 0,County,County Name,DP02_0088E,DP02_0089E,DP02_0094E
0,08001,Adams County,4145,2014,2131
1,08001,Adams County,4092,2393,1699
2,08001,Adams County,6173,4276,1897
3,08001,Adams County,5701,4287,1414
4,08001,Adams County,1615,1394,221
...,...,...,...,...,...
1442,08123,Weld County,6511,6102,409
1443,08123,Weld County,5914,5578,336
1444,08123,Weld County,7298,6443,855
1445,08125,Yuma County,4394,3968,426


In [10]:
##### Household4 - Civilian Pop over 18, Civilian veterans
# The estimate columns are listed once so the selection and the numeric
# conversion below cannot drift apart.
dp02_veteran_cols = ['DP02_0069E', 'DP02_0070E']
# Explicit copy so the edits below work on an independent frame.
dp02_table4 = dp02.loc[:, ['GEO_ID', 'NAME'] + dp02_veteran_cols].copy()

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
dp02_table4 = dp02_table4.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
dp02_table4['County'] = dp02_table4['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
dp02_table4['County Name'] = dp02_table4['County Name'].map(getCounty)
## Convert columns to numeric; unparseable values become NaN
dp02_table4[dp02_veteran_cols] = dp02_table4[dp02_veteran_cols].apply(pd.to_numeric, errors='coerce')
# dp02_table4.describe()
# dp02_table4.dtypes # used to confirm column types
dp02_table4

Unnamed: 0,County,County Name,DP02_0069E,DP02_0070E
0,08001,Adams County,2869,43
1,08001,Adams County,3011,237
2,08001,Adams County,4308,175
3,08001,Adams County,4402,377
4,08001,Adams County,1599,136
...,...,...,...,...
1442,08123,Weld County,4935,375
1443,08123,Weld County,4388,358
1444,08123,Weld County,5667,465
1445,08125,Yuma County,3207,235


In [11]:
##### Poverty - total, below poverty level
# The estimate columns are listed once so the selection and the numeric
# conversion below cannot drift apart.
b17001_cols = ['B17001_001E', 'B17001_002E']
# Explicit copy so the edits below work on an independent frame.
b17001_table = b17001.loc[:, ['GEO_ID', 'NAME'] + b17001_cols].copy()

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
b17001_table = b17001_table.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
b17001_table['County'] = b17001_table['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
b17001_table['County Name'] = b17001_table['County Name'].map(getCounty)

## Convert columns to numeric; unparseable values become NaN
b17001_table[b17001_cols] = b17001_table[b17001_cols].apply(pd.to_numeric, errors='coerce')
# b17001_table.describe()
# b17001_table.dtypes # used to confirm column types
b17001_table

Unnamed: 0,County,County Name,B17001_001E,B17001_002E
0,08001,Adams County,4110,1208
1,08001,Adams County,4070,1208
2,08001,Adams County,6173,1112
3,08001,Adams County,5667,621
4,08001,Adams County,1404,542
...,...,...,...,...
1442,08123,Weld County,6503,550
1443,08123,Weld County,5912,1398
1444,08123,Weld County,7278,788
1445,08125,Yuma County,4273,485


In [12]:
##### Age - population by age group
# Columns S0101_C01_001E .. S0101_C01_019E, in order:
#   001E Total population
#   002E Under 5        003E 5 to 9       004E 10 to 14     005E 15 to 19
#   006E 20 to 24       007E 25 to 29     008E 30 to 34     009E 35 to 39
#   010E 40 to 44       011E 45 to 49     012E 50 to 54     013E 55 to 59
#   014E 60 to 64       015E 65 to 69     016E 70 to 74     017E 75 to 79
#   018E 80 to 84       019E 85 years and over
# The names are generated once so the selection and the numeric conversion
# below cannot drift apart.
s0101_age_cols = [f'S0101_C01_{i:03d}E' for i in range(1, 20)]
# Explicit copy so the edits below work on an independent frame.
s0101_table = s0101.loc[:, ['GEO_ID', 'NAME'] + s0101_age_cols].copy()

## Change GEO_ID to County: characters 9-13 of GEO_ID are kept as the county code (e.g. '08001')
s0101_table = s0101_table.rename(columns={'GEO_ID': 'County', 'NAME': 'County Name'})
s0101_table['County'] = s0101_table['County'].str[9:14]
## Change Name to County Name (element-wise map instead of a row-wise DataFrame.apply)
s0101_table['County Name'] = s0101_table['County Name'].map(getCounty)

## Convert columns to numeric; unparseable values become NaN
s0101_table[s0101_age_cols] = s0101_table[s0101_age_cols].apply(pd.to_numeric, errors='coerce')
# s0101_table.describe()
# s0101_table.dtypes # used to confirm column types
s0101_table

Unnamed: 0,County,County Name,S0101_C01_001E,S0101_C01_002E,S0101_C01_003E,S0101_C01_004E,S0101_C01_005E,S0101_C01_006E,S0101_C01_007E,S0101_C01_008E,...,S0101_C01_010E,S0101_C01_011E,S0101_C01_012E,S0101_C01_013E,S0101_C01_014E,S0101_C01_015E,S0101_C01_016E,S0101_C01_017E,S0101_C01_018E,S0101_C01_019E
0,08001,Adams County,4145,262,509,300,396,408,320,298,...,97,525,222,64,127,73,51,38,10,10
1,08001,Adams County,4092,448,328,197,254,386,262,370,...,269,352,240,170,105,70,180,66,10,0
2,08001,Adams County,6173,560,424,385,657,401,564,575,...,475,551,212,117,248,259,113,21,2,26
3,08001,Adams County,5701,341,363,303,567,354,416,385,...,234,492,345,189,231,315,155,245,113,11
4,08001,Adams County,1615,16,0,0,22,249,368,317,...,56,38,87,59,86,80,38,53,15,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442,08123,Weld County,6511,582,309,439,329,237,572,439,...,338,405,408,386,501,427,190,172,65,230
1443,08123,Weld County,5914,366,331,659,313,182,375,273,...,632,382,294,262,359,463,249,128,80,64
1444,08123,Weld County,7298,500,437,429,520,359,412,519,...,391,644,326,431,707,412,280,115,63,82
1445,08125,Yuma County,4394,174,414,297,381,170,283,165,...,312,240,273,283,311,268,227,165,56,77


### Group DF rows based on County

In [13]:

# Every cleaned table is collapsed from one row per census tract to one row per
# county; the county code and the county name together form the group key.
county_keys = ['County', 'County Name']

##### Median Income - county value is the unweighted mean of the tract-level median household incomes
s1903_tableGrouped = s1903_table.groupby(county_keys, as_index=False).mean(numeric_only=True)

##### Educational Attainment - tract counts are summed per county
s1501_tableGrouped = s1501_table.groupby(county_keys, as_index=False).sum(numeric_only=True)

# The remaining tables are summed with numeric_only=False: every non-key column
# was already converted with pd.to_numeric in the cleaning cells.

##### Household - Total, Married-couple, Co-habitating couple, Male Householder no spouse, Female householder no spouse
dp02_tableGrouped = dp02_table.groupby(county_keys, as_index=False).sum(numeric_only=False)

##### Household2 - Male Total, Female Total, Never married, now married except separated, separated, widowed, divorced
dp02_table2Grouped = dp02_table2.groupby(county_keys, as_index=False).sum(numeric_only=False)

##### Household3 - Total population, Native born, foreign born
dp02_table3Grouped = dp02_table3.groupby(county_keys, as_index=False).sum(numeric_only=False)

##### Household4 - Civilian Pop over 18, Civilian veterans
dp02_table4Grouped = dp02_table4.groupby(county_keys, as_index=False).sum(numeric_only=False)

##### Poverty
b17001_tableGrouped = b17001_table.groupby(county_keys, as_index=False).sum(numeric_only=False)

##### Age
s0101_tableGrouped = s0101_table.groupby(county_keys, as_index=False).sum(numeric_only=False)


### Create Quantile Bins for ACS

In [14]:
##### Median Income
## Compare each county's income (S1903_C03_001E) with the median across counties (s1903_medIncome):
## flag = 1 when income >= median, 0 when income < median
county_income = s1903_tableGrouped['S1903_C03_001E']
s1903_medIncome = np.median(county_income)

#Binary
above_median = county_income >= s1903_medIncome
s1903_tableGrouped['householdIncomeAboveMedian'] = above_median.astype(int)
print(f"Median Household Income ($): {s1903_medIncome}")

#Very Low/Low/Medium/High/Very High bins - quintiles of the county incomes
income_bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
s1903_tableGrouped['householdIncomeBins'] = pd.qcut(county_income, q=5, labels=income_bin_labels)
s1903_tableGrouped

Median Household Income ($): 71420.08333333334


Unnamed: 0,County,County Name,S1903_C03_001E,householdIncomeAboveMedian,householdIncomeBins
0,08001,Adams County,94843.386792,1,Very High
1,08003,Alamosa County,55087.200000,0,Very Low
2,08005,Arapahoe County,108172.687500,1,Very High
3,08007,Archuleta County,74983.800000,1,Medium
4,08009,Baca County,39398.000000,0,Very Low
...,...,...,...,...,...
59,08117,Summit County,110643.833333,1,Very High
60,08119,Teller County,80627.625000,1,High
61,08121,Washington County,64720.000000,0,Low
62,08123,Weld County,94688.222222,1,High


In [15]:
##### Educational Attainment
# '001E_006E' - Total (18-24 plus 25+)
# '002E_007E' - Less than high school (18-24) + less than 9th grade (25+)
# '008E' - 9th to 12th grade, no diploma (25+)
# '003E_009E' - High school graduate
# '004E_010E_011E' - Some college or associates degree (18-24) + some college, no degree / associates degree (25+)
# '005E_012E' - Bachelor's degree or higher (18-24) + bachelor's degree (25+)
# '013E' - Graduate or professional degree (25+)

# Percents: share of the total population (001E_006E) in each group, rounded to 2 decimals
education_total = s1501_tableGrouped['001E_006E']
education_pct_sources = {
    'lessThanHS_P': '002E_007E',
    'HSNoDiploma_P': '008E',
    'HSGrad_P': '003E_009E',
    'SomeCollegeNoDeg_P': '004E_010E_011E',  # also contains associates degrees
    # NOTE: graduate/professional degrees of the 25+ population are NOT included
    # here; they are counted separately in GradOrProf_P.
    'BachOrHigher_P': '005E_012E',
    'GradOrProf_P': '013E',
}
for pct_col, count_col in education_pct_sources.items():
    s1501_tableGrouped[pct_col] = ((s1501_tableGrouped[count_col] / education_total) * 100).round(2)

# No HS vs HS Grad Percent
no_hs_degree = s1501_tableGrouped['002E_007E'] + s1501_tableGrouped['008E']
has_hs_degree = (
    s1501_tableGrouped['003E_009E']
    + s1501_tableGrouped['004E_010E_011E']
    + s1501_tableGrouped['005E_012E']
    + s1501_tableGrouped['013E']
)
s1501_tableGrouped['noHSDegree_P'] = ((no_hs_degree / education_total) * 100).round(2)
s1501_tableGrouped['hasHSDegree_P'] = ((has_hs_degree / education_total) * 100).round(2)

# Very low/Low/Medium/High/Very High on hasHSDegree_P (quintiles across counties)
s1501_tableGrouped['hsDegreeBins'] = pd.qcut(s1501_tableGrouped['hasHSDegree_P'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
s1501_tableGrouped


Unnamed: 0,County,County Name,001E_006E,002E_007E,008E,003E_009E,004E_010E_011E,005E_012E,013E,lessThanHS_P,HSNoDiploma_P,HSGrad_P,SomeCollegeNoDeg_P,BachOrHigher_P,GradOrProf_P,noHSDegree_P,hasHSDegree_P,hsDegreeBins
0,08001,Adams County,391408,31934,27663,119838,110927,70420,30626,8.16,7.07,30.62,28.34,17.99,7.82,15.23,84.77,Very Low
1,08003,Alamosa County,12613,807,494,3278,4962,1911,1161,6.40,3.92,25.99,39.34,15.15,9.20,10.31,89.69,Low
2,08005,Arapahoe County,505658,24292,19689,104878,141297,136188,79314,4.80,3.89,20.74,27.94,26.93,15.69,8.70,91.30,Medium
3,08007,Archuleta County,11364,260,365,2892,3389,2796,1662,2.29,3.21,25.45,29.82,24.60,14.63,5.50,94.50,High
4,08009,Baca County,2688,159,215,807,900,446,161,5.92,8.00,30.02,33.48,16.59,5.99,13.91,86.09,Very Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,08117,Summit County,25793,911,648,4549,6934,7955,4796,3.53,2.51,17.64,26.88,30.84,18.59,6.04,93.96,High
60,08119,Teller County,20804,399,618,4648,7624,4471,3044,1.92,2.97,22.34,36.65,21.49,14.63,4.89,95.11,High
61,08121,Washington County,3761,177,229,1084,1398,590,283,4.71,6.09,28.82,37.17,15.69,7.52,10.80,89.20,Low
62,08123,Weld County,253176,18368,12334,65618,82862,48186,25808,7.26,4.87,25.92,32.73,19.03,10.19,12.13,87.87,Very Low


In [16]:
##### Household1
# 'DP02_0001E' - Total
# 'DP02_0002E' - Married-couple households
# 'DP02_0004E' - Cohabiting couple households
# 'DP02_0006E' - Male householder, no spouse
# 'DP02_0010E' - Female householder, no spouse

# Percents: each household type as a share of that county's total households
# (a within-row distribution), rounded to 2 decimals
household_total = dp02_tableGrouped['DP02_0001E']
household_pct_sources = {
    'married_P': 'DP02_0002E',
    'cohabiting_P': 'DP02_0004E',
    'maleHouseholder_P': 'DP02_0006E',
    'femaleHouseholder_P': 'DP02_0010E',
}
for pct_col, count_col in household_pct_sources.items():
    dp02_tableGrouped[pct_col] = ((dp02_tableGrouped[count_col] / household_total) * 100).round(2)

# Rank the four household types within each row (1 = lowest count).
# method='dense' gives tied counts the same rank, so the top rank is 4 only when all four differ.
colsToRank = ['DP02_0002E', 'DP02_0004E', 'DP02_0006E', 'DP02_0010E']
newName = ['marriedRank', 'cohabitingRank', 'maleHouseholderRank', 'femaleHouseholderRank']
dp02_tableGrouped[newName] = dp02_tableGrouped[colsToRank].rank(axis=1, method='dense')

#Very Low/Low/Medium/High/Very High Bins: quintiles of each percent column across
#counties (a within-column distribution), not fixed cut-off values
household_bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
household_bin_sources = {
    'marriedBins': 'married_P',
    'cohabitingBins': 'cohabiting_P',
    'maleHouseholderBins': 'maleHouseholder_P',
    'femaleHouseholderBins': 'femaleHouseholder_P',
}
for bin_col, pct_col in household_bin_sources.items():
    dp02_tableGrouped[bin_col] = pd.qcut(dp02_tableGrouped[pct_col], q=5, labels=household_bin_labels)

dp02_tableGrouped

Unnamed: 0,County,County Name,DP02_0001E,DP02_0002E,DP02_0004E,DP02_0006E,DP02_0010E,married_P,cohabiting_P,maleHouseholder_P,femaleHouseholder_P,marriedRank,cohabitingRank,maleHouseholderRank,femaleHouseholderRank,marriedBins,cohabitingBins,maleHouseholderBins,femaleHouseholderBins
0,08001,Adams County,184964,92694,15600,34582,42088,50.11,8.43,18.70,22.75,4.0,1.0,2.0,3.0,Medium,Very High,Low,Medium
1,08003,Alamosa County,6570,2592,548,1641,1789,39.45,8.34,24.98,27.23,4.0,1.0,2.0,3.0,Very Low,High,Very High,High
2,08005,Arapahoe County,253890,123332,19151,47454,63953,48.58,7.54,18.69,25.19,4.0,1.0,2.0,3.0,Low,High,Low,High
3,08007,Archuleta County,5904,3753,170,708,1273,63.57,2.88,11.99,21.56,4.0,1.0,2.0,3.0,Very High,Very Low,Very Low,Low
4,08009,Baca County,1593,644,134,370,445,40.43,8.41,23.23,27.93,4.0,1.0,2.0,3.0,Very Low,High,High,Very High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,08117,Summit County,12347,5866,1036,3329,2116,47.51,8.39,26.96,17.14,4.0,1.0,3.0,2.0,Low,High,Very High,Very Low
60,08119,Teller County,11253,6392,546,2277,2038,56.80,4.85,20.23,18.11,4.0,1.0,3.0,2.0,Very High,Low,Medium,Very Low
61,08121,Washington County,2070,1124,61,472,413,54.30,2.95,22.80,19.95,4.0,1.0,3.0,2.0,High,Very Low,High,Low
62,08123,Weld County,120019,69325,8335,18357,24002,57.76,6.94,15.30,20.00,4.0,1.0,2.0,3.0,Very High,Medium,Very Low,Low


In [17]:
##### Household2
#           Never married   married      separated    widowed      divorced     Total
# Male      DP02_0026E      DP02_0027E   DP02_0028E   DP02_0029E   DP02_0030E   DP02_0025E
# Female    DP02_0032E      DP02_0033E   DP02_0034E   DP02_0035E   DP02_0036E   DP02_0031E

# Percents: male + female count of each status as a share of the male + female
# totals of that county (a within-row distribution), rounded to 2 decimals
total = dp02_table2Grouped['DP02_0025E'].astype(int) + dp02_table2Grouped['DP02_0031E'].astype(int)
marital_pct_sources = {
    'neverMarried_P': ('DP02_0026E', 'DP02_0032E'),
    'nowMarried_P': ('DP02_0027E', 'DP02_0033E'),
    'separated_P': ('DP02_0028E', 'DP02_0034E'),
    'widowed_P': ('DP02_0029E', 'DP02_0035E'),
    'divorced_P': ('DP02_0030E', 'DP02_0036E'),
}
for pct_col, (male_col, female_col) in marital_pct_sources.items():
    status_count = dp02_table2Grouped[male_col] + dp02_table2Grouped[female_col]
    dp02_table2Grouped[pct_col] = ((status_count / total) * 100).round(2)

# Binary: never-married COUNT (not percent) compared with its median across counties.
# NOTE: <= and >= are both inclusive, so a county exactly at the median gets 1 in both flags.
never_married = dp02_table2Grouped['DP02_0026E'] + dp02_table2Grouped['DP02_0032E']
dp02_medianNeverMarried = np.median(never_married)
dp02_table2Grouped['belowNeverMarriedMedian'] = (never_married <= dp02_medianNeverMarried).astype(int)
dp02_table2Grouped['aboveNeverMarriedMedian'] = (never_married >= dp02_medianNeverMarried).astype(int)
print(f'Median Never Married: {dp02_medianNeverMarried}')

## 5 bins (Very low, low, medium, high, very high): quintiles of each percent
## column across counties (a within-column distribution)
marital_bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
marital_bin_sources = {
    'neverMarriedBins': 'neverMarried_P',
    'nowMarriedBins': 'nowMarried_P',
    'separatedBins': 'separated_P',
    'widowedBins': 'widowed_P',
    'divorcedBins': 'divorced_P',
}
for bin_col, pct_col in marital_bin_sources.items():
    dp02_table2Grouped[bin_col] = pd.qcut(dp02_table2Grouped[pct_col], q=5, labels=marital_bin_labels)

dp02_table2Grouped

Median Never Married: 3376.5


Unnamed: 0,County,County Name,DP02_0025E,DP02_0031E,DP02_0026E,DP02_0027E,DP02_0028E,DP02_0029E,DP02_0030E,DP02_0032E,...,separated_P,widowed_P,divorced_P,belowNeverMarriedMedian,aboveNeverMarriedMedian,neverMarriedBins,nowMarriedBins,separatedBins,widowedBins,divorcedBins
0,08001,Adams County,210442,204713,79908,104179,3227,3528,19600,63499,...,1.78,3.54,10.72,0,1,Very High,Low,Very High,Very Low,Low
1,08003,Alamosa County,6572,6693,2971,2534,96,146,825,2214,...,0.89,4.89,14.18,0,1,Very High,Very Low,Low,Medium,Very High
2,08005,Arapahoe County,264373,268206,95482,136114,3731,4591,24455,80874,...,1.65,3.82,11.38,0,1,Very High,Low,High,Low,Medium
3,08007,Archuleta County,5942,5837,1158,4169,19,182,414,846,...,0.61,5.43,9.12,1,0,Very Low,Very High,Very Low,Medium,Very Low
4,08009,Baca County,1378,1426,461,685,1,73,158,249,...,1.28,12.91,11.48,1,0,Low,Low,Medium,Very High,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,08117,Summit County,14707,11956,6548,6300,152,146,1561,3628,...,0.94,1.80,11.20,0,1,Very High,Very Low,Low,Very Low,Medium
60,08119,Teller County,11013,10569,2488,6830,52,224,1419,1789,...,0.51,4.67,13.32,0,1,Very Low,Very High,Very Low,Low,High
61,08121,Washington County,2064,1888,606,1164,28,85,181,247,...,1.29,7.31,9.79,1,0,Low,High,Medium,Very High,Low
62,08123,Weld County,135706,132320,42042,76801,1414,3410,12039,34826,...,1.35,4.18,10.25,0,1,High,High,High,Low,Low


In [18]:
##### Household3
# Total population - DP02_0088E
# Native Born - DP02_0089E
# Foreign Born - DP02_0094E
population_total = dp02_table3Grouped['DP02_0088E']
native_born = dp02_table3Grouped['DP02_0089E']
foreign_born = dp02_table3Grouped['DP02_0094E']

# Percents: share of the county's total population, rounded to 2 decimals
dp02_table3Grouped['nativeBorn_P'] = ((native_born / population_total) * 100).round(2)
dp02_table3Grouped['foreignBorn_P'] = ((foreign_born / population_total) * 100).round(2)

# Binary: COUNT (not percent) compared with its median across counties.
# NOTE: <= and >= are both inclusive, so a county exactly at the median gets 1 in both flags.
dp02_nativeBornMedian = np.median(native_born)
dp02_table3Grouped['belowNativeBornMedian'] = (native_born <= dp02_nativeBornMedian).astype(int)
dp02_table3Grouped['aboveNativeBornMedian'] = (native_born >= dp02_nativeBornMedian).astype(int)
print(f'Median Native Born: {dp02_nativeBornMedian}')

dp02_foreignBornMedian = np.median(foreign_born)
dp02_table3Grouped['belowForeignBornMedian'] = (foreign_born <= dp02_foreignBornMedian).astype(int)
dp02_table3Grouped['aboveForeignBornMedian'] = (foreign_born >= dp02_foreignBornMedian).astype(int)
print(f'Median Foreign Born: {dp02_foreignBornMedian}')

## 5 bins (Very low, low, medium, high, very high): quintiles of each percent
## column across counties (a within-column distribution)
nativity_bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
for bin_col, pct_col in (('nativeBornBins', 'nativeBorn_P'), ('foreignBornBins', 'foreignBorn_P')):
    dp02_table3Grouped[bin_col] = pd.qcut(dp02_table3Grouped[pct_col], q=5, labels=nativity_bin_labels)

dp02_table3Grouped

Median Native Born: 14578.0
Median Foreign Born: 580.5


Unnamed: 0,County,County Name,DP02_0088E,DP02_0089E,DP02_0094E,nativeBorn_P,foreignBorn_P,belowNativeBornMedian,aboveNativeBornMedian,belowForeignBornMedian,aboveForeignBornMedian,nativeBornBins,foreignBornBins
0,08001,Adams County,524408,443177,81231,84.51,15.49,0,1,0,1,Very Low,Very High
1,08003,Alamosa County,16515,15819,696,95.79,4.21,0,1,0,1,Medium,Medium
2,08005,Arapahoe County,655709,550705,105004,83.99,16.01,0,1,0,1,Very Low,Very High
3,08007,Archuleta County,13730,13228,502,96.34,3.66,1,0,1,0,High,Low
4,08009,Baca County,3460,3317,143,95.87,4.13,1,0,1,0,Medium,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,08117,Summit County,30857,28110,2747,91.10,8.90,0,1,0,1,Low,High
60,08119,Teller County,24774,24246,528,97.87,2.13,0,1,1,0,Very High,Very Low
61,08121,Washington County,4839,4641,198,95.91,4.09,1,0,1,0,High,Low
62,08123,Weld County,340711,309744,30967,90.91,9.09,0,1,0,1,Low,High


In [19]:
##### Household4
# Civilian Population 18 and older - DP02_0069E
# Civilian Veterans - DP02_0070E
adult_civilians = dp02_table4Grouped['DP02_0069E']
civilian_veterans = dp02_table4Grouped['DP02_0070E']

# Percents: veterans as a share of the civilian population 18 and older, rounded to 2 decimals
dp02_table4Grouped['civilVet_P'] = ((civilian_veterans / adult_civilians) * 100).round(2)

# Binary: veteran COUNT (not percent) compared with its median across counties.
# NOTE: <= and >= are both inclusive, so a county exactly at the median gets 1 in both flags.
dp02_vetMedian = np.median(civilian_veterans)
dp02_table4Grouped['belowCivilVetMedian'] = (civilian_veterans <= dp02_vetMedian).astype(int)
dp02_table4Grouped['aboveCivilVetMedian'] = (civilian_veterans >= dp02_vetMedian).astype(int)
print(f'Median Civilian Veterans: {dp02_vetMedian}')

## 5 bins (Very low, low, medium, high, very high): quintiles of civilVet_P across counties
veteran_bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
dp02_table4Grouped['civilVetBins'] = pd.qcut(dp02_table4Grouped['civilVet_P'], q=5, labels=veteran_bin_labels)

dp02_table4Grouped

Median Civilian Veterans: 841.0


Unnamed: 0,County,County Name,DP02_0069E,DP02_0070E,civilVet_P,belowCivilVetMedian,aboveCivilVetMedian,civilVetBins
0,08001,Adams County,390659,24179,6.19,0,1,Low
1,08003,Alamosa County,12613,511,4.05,1,0,Very Low
2,08005,Arapahoe County,503417,35168,6.99,0,1,Medium
3,08007,Archuleta County,11344,1154,10.17,0,1,Very High
4,08009,Baca County,2688,181,6.73,1,0,Low
...,...,...,...,...,...,...,...,...
59,08117,Summit County,25793,1462,5.67,0,1,Very Low
60,08119,Teller County,20736,3564,17.19,0,1,Very High
61,08121,Washington County,3761,260,6.91,1,0,Low
62,08123,Weld County,252895,17505,6.92,0,1,Medium


In [20]:
##### Poverty
# B17001_001E - Total
# B17001_002E - Number of people whose income in the past 12 months was below the poverty level
b17001_total = b17001_tableGrouped['B17001_001E']
b17001_belowCount = b17001_tableGrouped['B17001_002E']

# Percents: computed per row (within each geo_id); the two columns are complements of each other
b17001_belowPct = b17001_belowCount.div(b17001_total).mul(100).round(2)
b17001_tableGrouped['belowPoverty_P'] = b17001_belowPct
b17001_tableGrouped['atOrAbovePoverty_P'] = 100 - b17001_belowPct

# Binary flags against the median of the raw below-poverty counts.
# Both comparisons are inclusive, so a county exactly on the median gets a 1 in both columns.
b17001_medianBelowPoverty = np.median(b17001_belowCount)
b17001_tableGrouped['belowPovertyMedian'] = b17001_belowCount.le(b17001_medianBelowPoverty).astype(int)
b17001_tableGrouped['abovePovertyMedian'] = b17001_belowCount.ge(b17001_medianBelowPoverty).astype(int)
print(f"Median Number of People Below Poverty: {b17001_medianBelowPoverty}")

# Bins: quintiles of each percent column (distribution across counties)
quintileLabels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
b17001_tableGrouped['belowPovertyBins'] = pd.qcut(b17001_tableGrouped['belowPoverty_P'], q=5, labels=quintileLabels)
b17001_tableGrouped['atOrAbovePovertyBins'] = pd.qcut(b17001_tableGrouped['atOrAbovePoverty_P'], q=5, labels=quintileLabels)

b17001_tableGrouped

Median Number of People Below Poverty: 1488.5


Unnamed: 0,County,County Name,B17001_001E,B17001_002E,belowPoverty_P,atOrAbovePoverty_P,belowPovertyMedian,abovePovertyMedian,belowPovertyBins,atOrAbovePovertyBins
0,08001,Adams County,520338,48994,9.42,90.58,0,1,Low,High
1,08003,Alamosa County,15598,2843,18.23,81.77,0,1,Very High,Very Low
2,08005,Arapahoe County,649885,55261,8.50,91.50,0,1,Low,High
3,08007,Archuleta County,13606,1134,8.33,91.67,1,0,Low,High
4,08009,Baca County,3364,810,24.08,75.92,1,0,Very High,Very Low
...,...,...,...,...,...,...,...,...,...,...
59,08117,Summit County,30670,2277,7.42,92.58,0,1,Very Low,Very High
60,08119,Teller County,24659,1871,7.59,92.41,0,1,Very Low,Very High
61,08121,Washington County,4645,353,7.60,92.40,1,0,Low,High
62,08123,Weld County,334453,30256,9.05,90.95,0,1,Low,High


In [21]:
##### Age
# S0101_C01_001E - Total population
# S0101_C01_002E - Under 5 years
# S0101_C01_003E - 5 to 9 years
# S0101_C01_004E - 10 to 14 years
# S0101_C01_005E - 15 to 19 years
# S0101_C01_006E - 20 to 24 years
# S0101_C01_007E - 25 to 29 years
# S0101_C01_008E - 30 to 34 years
# S0101_C01_009E - 35 to 39 years
# S0101_C01_010E - 40 to 44 years
# S0101_C01_011E - 45 to 49 years
# S0101_C01_012E - 50 to 54 years
# S0101_C01_013E - 55 to 59 years
# S0101_C01_014E - 60 to 64 years
# S0101_C01_015E - 65 to 69 years
# S0101_C01_016E - 70 to 74 years
# S0101_C01_017E - 75 to 79 years
# S0101_C01_018E - 80 to 84 years
# S0101_C01_019E - 85 years and over

# Name the columns that feed the senior-age calculations
s0101_totalPop = s0101_tableGrouped['S0101_C01_001E']
age65to69 = s0101_tableGrouped['S0101_C01_015E']
age70to74 = s0101_tableGrouped['S0101_C01_016E']
age75to79 = s0101_tableGrouped['S0101_C01_017E']
age80to84 = s0101_tableGrouped['S0101_C01_018E']
age85Plus = s0101_tableGrouped['S0101_C01_019E']

# Sums: head counts for the 65+ and 75+ groups
age65andOver = age65to69 + age70to74 + age75to79 + age80to84 + age85Plus
age75andOver = age75to79 + age80to84 + age85Plus
s0101_tableGrouped['65andOverSum'] = age65andOver

# Percents: each group as a share of the total population (row-wise)
s0101_tableGrouped['65andOver_P'] = (age65andOver / s0101_totalPop * 100).round(2)
s0101_tableGrouped['75andOver_P'] = (age75andOver / s0101_totalPop * 100).round(2)

# Bins: quintiles of each percent column (distribution across counties)
quintileLabels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
s0101_tableGrouped['65andOverBins'] = pd.qcut(s0101_tableGrouped['65andOver_P'], q=5, labels=quintileLabels)
s0101_tableGrouped['75andOverBins'] = pd.qcut(s0101_tableGrouped['75andOver_P'], q=5, labels=quintileLabels)

s0101_tableGrouped

Unnamed: 0,County,County Name,S0101_C01_001E,S0101_C01_002E,S0101_C01_003E,S0101_C01_004E,S0101_C01_005E,S0101_C01_006E,S0101_C01_007E,S0101_C01_008E,...,S0101_C01_015E,S0101_C01_016E,S0101_C01_017E,S0101_C01_018E,S0101_C01_019E,65andOverSum,65andOver_P,75andOver_P,65andOverBins,75andOverBins
0,08001,Adams County,524408,33921,34413,40919,36991,34216,40668,44274,...,20772,16113,9652,6477,4827,57841,11.03,4.00,Very Low,Very Low
1,08003,Alamosa County,16515,955,1115,1180,1731,1543,1058,1202,...,813,748,353,211,311,2436,14.75,5.30,Very Low,Low
2,08005,Arapahoe County,655709,38128,40465,44537,41248,38784,49739,52528,...,31709,25893,15091,9566,9812,92071,14.04,5.26,Very Low,Low
3,08007,Archuleta County,13730,520,789,642,617,290,658,726,...,1369,1207,676,248,301,3801,27.68,8.92,Very High,High
4,08009,Baca County,3460,179,258,219,147,165,193,120,...,310,156,154,106,140,866,25.03,11.56,High,Very High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,08117,Summit County,30857,1102,1388,1704,1379,1261,2957,3105,...,1809,1660,740,335,195,4739,15.36,4.12,Low,Very Low
60,08119,Teller County,24774,803,1216,1173,1062,927,1261,1309,...,2018,2226,976,354,415,5989,24.17,7.04,High,Medium
61,08121,Washington County,4839,238,355,294,261,279,261,279,...,352,259,154,133,162,1060,21.91,9.28,High,Very High
62,08123,Weld County,340711,23272,23664,25749,24392,20679,24041,27548,...,15892,11394,6814,4877,4252,43229,12.69,4.68,Very Low,Very Low


In [22]:
## Keep only the county name and its 65+ head count, then pickle it so another file can load it
merged65andOver = s0101_tableGrouped.loc[:, ['County Name', '65andOverSum']]
pd.to_pickle(merged65andOver, '../Data/merged65andOver.pkl')

### Check ACS Table Columns

In [23]:
## Check Columns: print the column index of every grouped ACS table, one blank line apart
acsGroupedTables = [
    s1903_tableGrouped,
    s1501_tableGrouped,
    dp02_tableGrouped,
    dp02_table2Grouped,
    dp02_table3Grouped,
    dp02_table4Grouped,
    b17001_tableGrouped,
    s0101_tableGrouped,
]
for groupedTable in acsGroupedTables:
    print(f'{groupedTable.columns}\n')

Index(['County', 'County Name', 'S1903_C03_001E', 'householdIncomeAboveMedian',
       'householdIncomeBins'],
      dtype='object')

Index(['County', 'County Name', '001E_006E', '002E_007E', '008E', '003E_009E',
       '004E_010E_011E', '005E_012E', '013E', 'lessThanHS_P', 'HSNoDiploma_P',
       'HSGrad_P', 'SomeCollegeNoDeg_P', 'BachOrHigher_P', 'GradOrProf_P',
       'noHSDegree_P', 'hasHSDegree_P', 'hsDegreeBins'],
      dtype='object')

Index(['County', 'County Name', 'DP02_0001E', 'DP02_0002E', 'DP02_0004E',
       'DP02_0006E', 'DP02_0010E', 'married_P', 'cohabiting_P',
       'maleHouseholder_P', 'femaleHouseholder_P', 'marriedRank',
       'cohabitingRank', 'maleHouseholderRank', 'femaleHouseholderRank',
       'marriedBins', 'cohabitingBins', 'maleHouseholderBins',
       'femaleHouseholderBins'],
      dtype='object')

Index(['County', 'County Name', 'DP02_0025E', 'DP02_0031E', 'DP02_0026E',
       'DP02_0027E', 'DP02_0028E', 'DP02_0029E', 'DP02_0030E', 'DP02_0032E',
      

In [24]:
## Check Length: print the row count of every grouped ACS table next to its label
groupedTablesByLabel = {
    's1903': s1903_tableGrouped,
    's1501': s1501_tableGrouped,
    'dp02 T1': dp02_tableGrouped,
    'dp02 T2': dp02_table2Grouped,
    'dp02 T3': dp02_table3Grouped,
    'dp02 T4': dp02_table4Grouped,
    'b17001': b17001_tableGrouped,
    's0101': s0101_tableGrouped,
}
for tableLabel, groupedTable in groupedTablesByLabel.items():
    print(f'{tableLabel} - {len(groupedTable)}')

s1903 - 64
s1501 - 64
dp02 T1 - 64
dp02 T2 - 64
dp02 T3 - 64
dp02 T4 - 64
b17001 - 64
s0101 - 64


### Create subtables with County info and bins

In [25]:
# Table Subsets - Bins
# Every subset carries the two county identifier columns plus that table's bin columns
countyKeys = ['County', 'County Name']

s1903_subset = s1903_tableGrouped[countyKeys + ['householdIncomeBins']]
s1501_subset = s1501_tableGrouped[countyKeys + ['hsDegreeBins']]
dp02_subset = dp02_tableGrouped[
    countyKeys + ['marriedBins', 'cohabitingBins', 'maleHouseholderBins', 'femaleHouseholderBins']
]
dp02_subset2 = dp02_table2Grouped[
    countyKeys + ['neverMarriedBins', 'nowMarriedBins', 'separatedBins', 'widowedBins', 'divorcedBins']
]
dp02_subset3 = dp02_table3Grouped[countyKeys + ['nativeBornBins', 'foreignBornBins']]
dp02_subset4 = dp02_table4Grouped[countyKeys + ['civilVetBins']]
b17001_subset = b17001_tableGrouped[countyKeys + ['belowPovertyBins', 'atOrAbovePovertyBins']]
s0101_subset = s0101_tableGrouped[countyKeys + ['65andOverBins', '75andOverBins']]

### Merge ACS Tables

In [26]:
# MERGE ACS TABLES
from functools import reduce


def _mergeOnCounty(left, right):
    """Inner-join two bin subtables on their shared County / County Name columns."""
    return pd.merge(left, right, on=["County", "County Name"], how='inner')


# Fold the subtables left to right into a single table with one row per county
tables = [
    s1903_subset,
    s1501_subset,
    dp02_subset,
    dp02_subset2,
    dp02_subset3,
    dp02_subset4,
    b17001_subset,
    s0101_subset,
]
acsTable = reduce(_mergeOnCounty, tables)

acsTable

Unnamed: 0,County,County Name,householdIncomeBins,hsDegreeBins,marriedBins,cohabitingBins,maleHouseholderBins,femaleHouseholderBins,neverMarriedBins,nowMarriedBins,separatedBins,widowedBins,divorcedBins,nativeBornBins,foreignBornBins,civilVetBins,belowPovertyBins,atOrAbovePovertyBins,65andOverBins,75andOverBins
0,08001,Adams County,Very High,Very Low,Medium,Very High,Low,Medium,Very High,Low,Very High,Very Low,Low,Very Low,Very High,Low,Low,High,Very Low,Very Low
1,08003,Alamosa County,Very Low,Low,Very Low,High,Very High,High,Very High,Very Low,Low,Medium,Very High,Medium,Medium,Very Low,Very High,Very Low,Very Low,Low
2,08005,Arapahoe County,Very High,Medium,Low,High,Low,High,Very High,Low,High,Low,Medium,Very Low,Very High,Medium,Low,High,Very Low,Low
3,08007,Archuleta County,Medium,High,Very High,Very Low,Very Low,Low,Very Low,Very High,Very Low,Medium,Very Low,High,Low,Very High,Low,High,Very High,High
4,08009,Baca County,Very Low,Very Low,Very Low,High,High,Very High,Low,Low,Medium,Very High,Medium,Medium,Medium,Low,Very High,Very Low,High,Very High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,08117,Summit County,Very High,High,Low,High,Very High,Very Low,Very High,Very Low,Low,Very Low,Medium,Low,High,Very Low,Very Low,Very High,Low,Very Low
60,08119,Teller County,High,High,Very High,Low,Medium,Very Low,Very Low,Very High,Very Low,Low,High,Very High,Very Low,Very High,Very Low,Very High,High,Medium
61,08121,Washington County,Low,Low,High,Very Low,High,Low,Low,High,Medium,Very High,Low,High,Low,Low,Low,High,High,Very High
62,08123,Weld County,High,Very Low,Very High,Medium,Very Low,Low,High,High,High,Low,Low,Low,High,Medium,Low,High,Very Low,Very Low


### ACS Table Columns

| Output Column Header   | ACS Columns Utilized |
| -------- | ------- |
| County   | County ID |
| County Name   | County Name |
| householdIncomeBins   | S1903_C03_001E |
| hsDegreeBins   | S1501_C01_001E (total 18-24) <br> S1501_C01_003E (High school graduate (18-24)) <br> S1501_C01_004E (Some college or associates degree (18-24)) <br> S1501_C01_005E (Bachelor's degree or higher (18-24)) <br> S1501_C01_006E (total 25+) <br> S1501_C01_008E (9th to 12th, no diploma (25+)) <br> S1501_C01_009E (High school graduate (25+)) <br> S1501_C01_010E (Some college, no degree (25+)) <br> S1501_C01_011E (Associates Degree (25+)) <br> S1501_C01_012E (Bachelor's degree (25+)) <br> S1501_C01_013E (graduate or professional degree (25+)) |
| marriedBins  | DP02_0001E (total) <br> DP02_0002E (married couple) |
| cohabitingBins   | DP02_0001E (total) <br> DP02_0004E (cohabiting couple)  |
| maleHouseholderBins   | DP02_0001E (total) <br> DP02_0006E (male householder no spouse)  |
| femaleHouseholderBins   | DP02_0001E (total) <br> DP02_0010E (female householder no spouse)  |
| neverMarriedBins | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0026E (Male Never Married) <br> DP02_0032E (Female Never Married) |
| nowMarriedBins   | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0027E (Male Now Married) <br> DP02_0033E (Female Now Married) |
| separatedBins   | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0028E (Male Separated) <br> DP02_0034E (Female Separated) |
| widowedBins   | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0029E (Male Widowed) <br> DP02_0035E (Female Widowed) |
| divorcedBins  | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0030E (Male Divorced) <br> DP02_0036E (Female Divorced) |
| nativeBornBins   | DP02_0088E (total) <br> DP02_0089E (Native Born)|
| foreignBornBins   | DP02_0088E (total) <br> DP02_0094E (Foreign Born) |
| civilVetBins   | DP02_0069E (Civilian Population 18+) <br> DP02_0070E (Civilian Veterans) |
| belowPovertyBins  | B17001_001E (total) <br> B17001_002E (Number of people whose income was below the poverty level in the last 12 months) |
| atOrAbovePovertyBins  | B17001_001E (total) <br> B17001_002E (Number of people whose income was below the poverty level in the last 12 months) |
| 65andOverBins  | S0101_C01_001E (total) <br> S0101_C01_015E (age 65-69) <br> S0101_C01_016E (age 70-74) <br> S0101_C01_017E (age 75-79) <br> S0101_C01_018E (age 80-84) <br> S0101_C01_019E (age 85 and over) |
| 75andOverBins  | S0101_C01_001E (total) <br> S0101_C01_017E (age 75-79) <br> S0101_C01_018E (age 80-84) <br> S0101_C01_019E (age 85 and over) |


### Pull In and Add NPI Hospice Provider Information

In [27]:
### Load the NPI provider table that another file wrote out as a pickle
npiPicklePath = '../Data/df_zip.pkl'
df = pd.read_pickle(npiPicklePath)
df

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5,clean_name,Matched_ID2
0,1760093470,247 HOME HEALTH CARE LTD,A,CO,"8055 E TUFTS AVE STE 250 , DENVER, CO 802372857","[(Hospice Care, Community Based, True)]",[],802372857,80237,247homehealthcare,
1,1740072065,A BETTER COLORADO HOSPICE LLC,A,CO,"126 W D ST STE 200 , PUEBLO, CO 810034430","[(Hospice Care, Community Based, True)]",[],810034430,81003,abettercoloradohospice,
2,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",A,CO,"2851 S PARKER RD STE 1130 , AURORA, CO 800142732","[(Hospice Care, Community Based, True)]",[],800142732,80014,apeacefuljourneyhospice,apeacefuljourneyhospice
3,1861097982,"ABODE HEALTHCARE COLORADO, INC",A,CO,"1050 EAGLERIDGE BLVD , PUEBLO, CO 810082130","[(Hospice Care, Community Based, True)]",[],810082130,81008,abodehealthcarecolorado,
4,1326459025,"ABODE HEALTHCARE COLORADO, INC",A,CO,"5465 MARK DABLING BLVD , COLORADO SPRINGS, CO ...","[(Hospice Care, Community Based, True)]",[],809183842,80918,abodehealthcarecolorado,
...,...,...,...,...,...,...,...,...,...,...,...
294,1518171834,WOMENS HEALTH CENTER INC,A,CO,"1600 N GRAND AVE STE 400 , PUEBLO, CO 810032760","[(Obstetrics & Gynecology, Hospice and Palliat...",[],810032760,81003,womenshealthcenter,
298,1013648583,YNA HOSPICE INC,A,CO,"3190 S VAUGHN WAY STE 550 OFF 520 , AURORA, CO...","[(Hospice Care, Community Based, True)]",[],80014,80014,ynahospice,
299,1285645382,YULIYA GOSTISHCHEVA,A,CO,"1240 S PARKER RD #106, DENVER, CO 802317558","[(Hospice Care, Community Based, True)]",[],802317558,80231,yuliyagostishcheva,
300,1942931415,ZA HOSPICE INC,A,CO,"102 S TEJON ST STE 1100 OFF 1111 , COLORADO SP...","[(Hospice Care, Community Based, True)]",[],80903,80903,zahospice,


In [28]:
### Import functions to convert postal to lat/long, and lat/long to Census Tract
from zipcodeToCensusTract import convertPostalToLatLong, convertLatLongToCensusTract

In [29]:
## Call function and add lat/long to df
def _zipToLatLong(zipCode):
    """Convert one ZIP5 value and wrap the result in a Series so apply() spreads it across columns."""
    # presumably convertPostalToLatLong returns a (lat, long) pair -- confirm against zipcodeToCensusTract
    return pd.Series(convertPostalToLatLong(zipCode))


df[['lat', 'long']] = df['ZIP5'].apply(_zipToLatLong)
df

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5,clean_name,Matched_ID2,lat,long
0,1760093470,247 HOME HEALTH CARE LTD,A,CO,"8055 E TUFTS AVE STE 250 , DENVER, CO 802372857","[(Hospice Care, Community Based, True)]",[],802372857,80237,247homehealthcare,,39.6431,-104.8987
1,1740072065,A BETTER COLORADO HOSPICE LLC,A,CO,"126 W D ST STE 200 , PUEBLO, CO 810034430","[(Hospice Care, Community Based, True)]",[],810034430,81003,abettercoloradohospice,,38.2843,-104.6234
2,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",A,CO,"2851 S PARKER RD STE 1130 , AURORA, CO 800142732","[(Hospice Care, Community Based, True)]",[],800142732,80014,apeacefuljourneyhospice,apeacefuljourneyhospice,39.6662,-104.8350
3,1861097982,"ABODE HEALTHCARE COLORADO, INC",A,CO,"1050 EAGLERIDGE BLVD , PUEBLO, CO 810082130","[(Hospice Care, Community Based, True)]",[],810082130,81008,abodehealthcarecolorado,,38.3133,-104.6284
4,1326459025,"ABODE HEALTHCARE COLORADO, INC",A,CO,"5465 MARK DABLING BLVD , COLORADO SPRINGS, CO ...","[(Hospice Care, Community Based, True)]",[],809183842,80918,abodehealthcarecolorado,,38.9129,-104.7734
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1518171834,WOMENS HEALTH CENTER INC,A,CO,"1600 N GRAND AVE STE 400 , PUEBLO, CO 810032760","[(Obstetrics & Gynecology, Hospice and Palliat...",[],810032760,81003,womenshealthcenter,,38.2843,-104.6234
298,1013648583,YNA HOSPICE INC,A,CO,"3190 S VAUGHN WAY STE 550 OFF 520 , AURORA, CO...","[(Hospice Care, Community Based, True)]",[],80014,80014,ynahospice,,39.6662,-104.8350
299,1285645382,YULIYA GOSTISHCHEVA,A,CO,"1240 S PARKER RD #106, DENVER, CO 802317558","[(Hospice Care, Community Based, True)]",[],802317558,80231,yuliyagostishcheva,,39.6793,-104.8843
300,1942931415,ZA HOSPICE INC,A,CO,"102 S TEJON ST STE 1100 OFF 1111 , COLORADO SP...","[(Hospice Care, Community Based, True)]",[],80903,80903,zahospice,,38.8388,-104.8145


In [30]:
## Check to ensure all zipcodes were converted to Lat and Long (no null)
## A notebook cell only displays its last expression, so both columns are tested in one mask;
## any row missing either coordinate is shown. An empty result means every ZIP was converted.
df[df['lat'].isnull() | df['long'].isnull()]

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5,clean_name,Matched_ID2,lat,long


In [31]:
## Call function and add censusTract to df
def _rowToCensusTract(row):
    """Pass one provider row's lat/long pair to the census tract converter."""
    return convertLatLongToCensusTract(row['lat'], row['long'])


df['censusTract'] = df.apply(_rowToCensusTract, axis=1)
df

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5,clean_name,Matched_ID2,lat,long,censusTract
0,1760093470,247 HOME HEALTH CARE LTD,A,CO,"8055 E TUFTS AVE STE 250 , DENVER, CO 802372857","[(Hospice Care, Community Based, True)]",[],802372857,80237,247homehealthcare,,39.6431,-104.8987,1400000US08031006816
1,1740072065,A BETTER COLORADO HOSPICE LLC,A,CO,"126 W D ST STE 200 , PUEBLO, CO 810034430","[(Hospice Care, Community Based, True)]",[],810034430,81003,abettercoloradohospice,,38.2843,-104.6234,1400000US08101000400
2,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",A,CO,"2851 S PARKER RD STE 1130 , AURORA, CO 800142732","[(Hospice Care, Community Based, True)]",[],800142732,80014,apeacefuljourneyhospice,apeacefuljourneyhospice,39.6662,-104.8350,1400000US08005080400
3,1861097982,"ABODE HEALTHCARE COLORADO, INC",A,CO,"1050 EAGLERIDGE BLVD , PUEBLO, CO 810082130","[(Hospice Care, Community Based, True)]",[],810082130,81008,abodehealthcarecolorado,,38.3133,-104.6284,1400000US08101002919
4,1326459025,"ABODE HEALTHCARE COLORADO, INC",A,CO,"5465 MARK DABLING BLVD , COLORADO SPRINGS, CO ...","[(Hospice Care, Community Based, True)]",[],809183842,80918,abodehealthcarecolorado,,38.9129,-104.7734,1400000US08041004800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1518171834,WOMENS HEALTH CENTER INC,A,CO,"1600 N GRAND AVE STE 400 , PUEBLO, CO 810032760","[(Obstetrics & Gynecology, Hospice and Palliat...",[],810032760,81003,womenshealthcenter,,38.2843,-104.6234,1400000US08101000400
298,1013648583,YNA HOSPICE INC,A,CO,"3190 S VAUGHN WAY STE 550 OFF 520 , AURORA, CO...","[(Hospice Care, Community Based, True)]",[],80014,80014,ynahospice,,39.6662,-104.8350,1400000US08005080400
299,1285645382,YULIYA GOSTISHCHEVA,A,CO,"1240 S PARKER RD #106, DENVER, CO 802317558","[(Hospice Care, Community Based, True)]",[],802317558,80231,yuliyagostishcheva,,39.6793,-104.8843,1400000US08005087200
300,1942931415,ZA HOSPICE INC,A,CO,"102 S TEJON ST STE 1100 OFF 1111 , COLORADO SP...","[(Hospice Care, Community Based, True)]",[],80903,80903,zahospice,,38.8388,-104.8145,1400000US08041002200


In [32]:
## Derive the County code from censusTract by taking characters 9 through 13 (e.g. '08031')
df['County'] = df['censusTract'].str.slice(9, 14)
df

Unnamed: 0,NPI,Name,Status,Location State,Primary Practice Address,Taxonomy Entries,Issuers,ZIP9,ZIP5,clean_name,Matched_ID2,lat,long,censusTract,County
0,1760093470,247 HOME HEALTH CARE LTD,A,CO,"8055 E TUFTS AVE STE 250 , DENVER, CO 802372857","[(Hospice Care, Community Based, True)]",[],802372857,80237,247homehealthcare,,39.6431,-104.8987,1400000US08031006816,08031
1,1740072065,A BETTER COLORADO HOSPICE LLC,A,CO,"126 W D ST STE 200 , PUEBLO, CO 810034430","[(Hospice Care, Community Based, True)]",[],810034430,81003,abettercoloradohospice,,38.2843,-104.6234,1400000US08101000400,08101
2,1003483330,"A PEACEFUL JOURNEY HOSPICE, LLC",A,CO,"2851 S PARKER RD STE 1130 , AURORA, CO 800142732","[(Hospice Care, Community Based, True)]",[],800142732,80014,apeacefuljourneyhospice,apeacefuljourneyhospice,39.6662,-104.8350,1400000US08005080400,08005
3,1861097982,"ABODE HEALTHCARE COLORADO, INC",A,CO,"1050 EAGLERIDGE BLVD , PUEBLO, CO 810082130","[(Hospice Care, Community Based, True)]",[],810082130,81008,abodehealthcarecolorado,,38.3133,-104.6284,1400000US08101002919,08101
4,1326459025,"ABODE HEALTHCARE COLORADO, INC",A,CO,"5465 MARK DABLING BLVD , COLORADO SPRINGS, CO ...","[(Hospice Care, Community Based, True)]",[],809183842,80918,abodehealthcarecolorado,,38.9129,-104.7734,1400000US08041004800,08041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1518171834,WOMENS HEALTH CENTER INC,A,CO,"1600 N GRAND AVE STE 400 , PUEBLO, CO 810032760","[(Obstetrics & Gynecology, Hospice and Palliat...",[],810032760,81003,womenshealthcenter,,38.2843,-104.6234,1400000US08101000400,08101
298,1013648583,YNA HOSPICE INC,A,CO,"3190 S VAUGHN WAY STE 550 OFF 520 , AURORA, CO...","[(Hospice Care, Community Based, True)]",[],80014,80014,ynahospice,,39.6662,-104.8350,1400000US08005080400,08005
299,1285645382,YULIYA GOSTISHCHEVA,A,CO,"1240 S PARKER RD #106, DENVER, CO 802317558","[(Hospice Care, Community Based, True)]",[],802317558,80231,yuliyagostishcheva,,39.6793,-104.8843,1400000US08005087200,08005
300,1942931415,ZA HOSPICE INC,A,CO,"102 S TEJON ST STE 1100 OFF 1111 , COLORADO SP...","[(Hospice Care, Community Based, True)]",[],80903,80903,zahospice,,38.8388,-104.8145,1400000US08041002200,08041


In [33]:
## Get count of NPI Providers per County
providerCounts = (
    df['County']
    .value_counts()
    .rename_axis('County')
    .reset_index(name='Provider Count')
)
providerCounts

Unnamed: 0,County,Provider Count
0,8005,65
1,8031,38
2,8041,34
3,8059,19
4,8101,17
5,8069,14
6,8001,11
7,8013,10
8,8077,7
9,8123,7


In [34]:
## Attach the per-County provider counts to the ACS data.
## The right join keeps every ACS county, including those with no providers.
mergedTable = providerCounts.merge(acsTable, on='County', how='right')
mergedTable

Unnamed: 0,County,Provider Count,County Name,householdIncomeBins,hsDegreeBins,marriedBins,cohabitingBins,maleHouseholderBins,femaleHouseholderBins,neverMarriedBins,...,separatedBins,widowedBins,divorcedBins,nativeBornBins,foreignBornBins,civilVetBins,belowPovertyBins,atOrAbovePovertyBins,65andOverBins,75andOverBins
0,08001,11.0,Adams County,Very High,Very Low,Medium,Very High,Low,Medium,Very High,...,Very High,Very Low,Low,Very Low,Very High,Low,Low,High,Very Low,Very Low
1,08003,1.0,Alamosa County,Very Low,Low,Very Low,High,Very High,High,Very High,...,Low,Medium,Very High,Medium,Medium,Very Low,Very High,Very Low,Very Low,Low
2,08005,65.0,Arapahoe County,Very High,Medium,Low,High,Low,High,Very High,...,High,Low,Medium,Very Low,Very High,Medium,Low,High,Very Low,Low
3,08007,,Archuleta County,Medium,High,Very High,Very Low,Very Low,Low,Very Low,...,Very Low,Medium,Very Low,High,Low,Very High,Low,High,Very High,High
4,08009,1.0,Baca County,Very Low,Very Low,Very Low,High,High,Very High,Low,...,Medium,Very High,Medium,Medium,Medium,Low,Very High,Very Low,High,Very High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,08117,1.0,Summit County,Very High,High,Low,High,Very High,Very Low,Very High,...,Low,Very Low,Medium,Low,High,Very Low,Very Low,Very High,Low,Very Low
60,08119,1.0,Teller County,High,High,Very High,Low,Medium,Very Low,Very Low,...,Very Low,Low,High,Very High,Very Low,Very High,Very Low,Very High,High,Medium
61,08121,,Washington County,Low,Low,High,Very Low,High,Low,Low,...,Medium,Very High,Low,High,Low,Low,Low,High,High,Very High
62,08123,7.0,Weld County,High,Very Low,Very High,Medium,Very Low,Low,High,...,High,Low,Low,Low,High,Medium,Low,High,Very Low,Very Low


In [35]:
## Check row count of mergedTable -- Should be 64 to correspond with 64 counties
mergedTable.shape[0]

64

In [36]:
## Counties with no matching providers come out of the merge as NaN; record them as 0 providers
missingProviderCount = mergedTable['Provider Count'].isna()
mergedTable.loc[missingProviderCount, 'Provider Count'] = 0

In [37]:
## Pickle the merged county table so it can be loaded from another file
pd.to_pickle(mergedTable, '../Data/mergedTableAllCounties.pkl')