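This notebook cleans and bins several ACS (American Community Survey) tables, merges them on GEO_ID, and joins the result with counts of NPI hospice providers per census tract.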

In [8]:
import pandas as pd
import numpy as np

### Import ACS Data

In [9]:
from ACSDataImport import getS1903, getS1501, getDP02, getB17001, getS0101

In [10]:
# Load each ACS table through the project-local ACSDataImport helpers.
# The subject of each table is taken from how its columns are used in the cells below.
s1903 = getS1903()    # median income (S1903_C03_001E)
s1501 = getS1501()    # educational attainment
dp02 = getDP02()    # households, marital status, place of birth, veterans
b17001 = getB17001()    # poverty status
s0101 = getS0101()    # age groups

### Clean ACS Data

In [11]:
##### Median Income
s1903_table = s1903.loc[:, ['GEO_ID', 'S1903_C03_001E']]
## Strip '+' and ',' and convert to numeric in one step.
## astype(str) keeps the .str accessor valid for every entry: on a mixed column,
## .str.replace would silently turn non-string entries into NaN, and on an already
## numeric column it would raise. Anything that still is not a number after the
## clean-up becomes NaN through errors='coerce'.
s1903_table['S1903_C03_001E'] = pd.to_numeric(
    s1903_table['S1903_C03_001E'].astype(str).str.replace(r'[\+,]', '', regex=True),
    errors='coerce',
)
# Sanity checks used during development:
#   s1903_table.describe()  -> confirm the maximum value comes back as 250000
#   s1903_table.dtypes      -> confirm column types

In [12]:
##### Educational Attainment
# Convert the raw S1501 estimate columns used below (001E-013E) to numeric once.
# errors='coerce' matches the other ACS cleaning cells: a value that cannot be parsed
# becomes NaN instead of aborting the cell with a ValueError.
s1501_cols = [f'S1501_C01_{i:03d}E' for i in range(1, 14)]
s1501_numeric = s1501.loc[:, s1501_cols].apply(pd.to_numeric, errors='coerce')

# Merge the 18-24 and 25+ estimates into combined groups.
# Row-wise sum() skips NaN, so a missing component counts as 0 inside a combined group.
# Building a fresh DataFrame (rather than slicing an existing one) lets later cells add
# columns to s1501_table without pandas treating it as a slice of another frame.
s1501_table = pd.DataFrame({
    'GEO_ID': s1501['GEO_ID'],

    # 'S1501_C01_001E' - Total (18-24)
    # 'S1501_C01_006E' - Total (25+)
    '001E_006E': s1501_numeric[['S1501_C01_001E', 'S1501_C01_006E']].sum(axis=1),

    # 'S1501_C01_002E' - Less than high school (18-24)
    # 'S1501_C01_007E' - Less than 9th (25+)
    '002E_007E': s1501_numeric[['S1501_C01_002E', 'S1501_C01_007E']].sum(axis=1),

    # 'S1501_C01_008E' - 9th to 12th, no diploma (25+)
    '008E': s1501_numeric['S1501_C01_008E'],

    # 'S1501_C01_003E' - High school graduate (18-24)
    # 'S1501_C01_009E' - High school graduate (25+)
    '003E_009E': s1501_numeric[['S1501_C01_003E', 'S1501_C01_009E']].sum(axis=1),

    # 'S1501_C01_004E' - Some college or associates degree (18-24)
    # 'S1501_C01_010E' - Some college, no degree (25+)
    # 'S1501_C01_011E' - Associates Degree (25+)
    '004E_010E_011E': s1501_numeric[['S1501_C01_004E', 'S1501_C01_010E', 'S1501_C01_011E']].sum(axis=1),

    # 'S1501_C01_005E' - Bachelor's degree or higher (18-24)
    # 'S1501_C01_012E' - Bachelor's degree (25+)
    '005E_012E': s1501_numeric[['S1501_C01_005E', 'S1501_C01_012E']].sum(axis=1),

    # 'S1501_C01_013E' - graduate or professional degree (25+)
    '013E': s1501_numeric['S1501_C01_013E'],
})
# Sanity checks used during development: s1501_table.describe(), s1501_table.dtypes


In [13]:
##### Household - Total, Married-couple, Cohabiting couple, Male householder no spouse, Female householder no spouse
dp02_householdCols = ['DP02_0001E', 'DP02_0002E', 'DP02_0004E', 'DP02_0006E', 'DP02_0010E']
dp02_table = dp02.loc[:, ['GEO_ID', *dp02_householdCols]]
## Convert the count columns to numeric; values that cannot be parsed become NaN
dp02_table[dp02_householdCols] = dp02_table[dp02_householdCols].apply(pd.to_numeric, errors='coerce')
# Sanity checks used during development: dp02_table.describe(), dp02_table.dtypes

In [14]:
##### Household2 - Male Total, Female Total, Never married, now married except separated, separated, widowed, divorced
dp02_maritalCols = ['DP02_0025E', 'DP02_0031E', 'DP02_0026E', 'DP02_0027E', 'DP02_0028E', 'DP02_0029E', 'DP02_0030E', 'DP02_0032E', 'DP02_0033E', 'DP02_0034E', 'DP02_0035E', 'DP02_0036E']
dp02_table2 = dp02.loc[:, ['GEO_ID', *dp02_maritalCols]]
## Convert every count column to numeric, including the male/female totals
## (DP02_0025E, DP02_0031E). The totals are used as the percentage base later on, so they
## need the same conversion as the marital-status counts. Unparseable values become NaN.
dp02_table2[dp02_maritalCols] = dp02_table2[dp02_maritalCols].apply(pd.to_numeric, errors='coerce')
# Sanity checks used during development: dp02_table2.describe(), dp02_table2.dtypes

In [15]:
##### Household3 - Total population, Native born, Foreign born
dp02_birthplaceCols = ['DP02_0088E', 'DP02_0089E', 'DP02_0094E']
dp02_table3 = dp02.loc[:, ['GEO_ID', *dp02_birthplaceCols]]
## Convert the count columns to numeric; values that cannot be parsed become NaN
dp02_table3[dp02_birthplaceCols] = dp02_table3[dp02_birthplaceCols].apply(pd.to_numeric, errors='coerce')
# Sanity checks used during development: dp02_table3.describe(), dp02_table3.dtypes

In [16]:
##### Household4 - Civilian population 18 and over, Civilian veterans
dp02_veteranCols = ['DP02_0069E', 'DP02_0070E']
dp02_table4 = dp02.loc[:, ['GEO_ID', *dp02_veteranCols]]
## Convert the count columns to numeric; values that cannot be parsed become NaN
dp02_table4[dp02_veteranCols] = dp02_table4[dp02_veteranCols].apply(pd.to_numeric, errors='coerce')
# Sanity checks used during development: dp02_table4.describe(), dp02_table4.dtypes

In [17]:
##### Poverty - total (B17001_001E), below poverty level (B17001_002E)
b17001_povertyCols = ['B17001_001E', 'B17001_002E']
b17001_table = b17001.loc[:, ['GEO_ID', *b17001_povertyCols]]
## Convert the count columns to numeric; values that cannot be parsed become NaN
b17001_table[b17001_povertyCols] = b17001_table[b17001_povertyCols].apply(pd.to_numeric, errors='coerce')
# Sanity checks used during development: b17001_table.describe(), b17001_table.dtypes

In [18]:
##### Age - Total population (001E) followed by the five-year age groups:
##### Under 5, 5 to 9, 10 to 14, 15 to 19, 20 to 24, 25 to 29, 30 to 34, 35 to 39, 40 to 44, 45 to 49,
##### 50 to 54, 55 to 59, 60 to 64, 65 to 69, 70 to 74, 75 to 79, 80 to 84, 85 years and over (002E-019E)
# Column names S0101_C01_001E ... S0101_C01_019E
s0101_ageCols = [f'S0101_C01_{i:03d}E' for i in range(1, 20)]
s0101_table = s0101.loc[:, ['GEO_ID', *s0101_ageCols]]
## Convert the count columns to numeric; values that cannot be parsed become NaN
s0101_table[s0101_ageCols] = s0101_table[s0101_ageCols].apply(pd.to_numeric, errors='coerce')
# Sanity checks used during development: s0101_table.describe(), s0101_table.dtypes

### Create Bins for ACS

In [19]:
##### Median Income
## Flag each tract against the median of the tract-level incomes (S1903_C03_001E):
## 1 = income >= median, 0 = income < median (a missing income also compares False, so it gets 0).
## Series.median() skips NaN. np.median would return NaN as soon as one income failed the
## numeric conversion, and comparing against NaN would set every flag to 0.
s1903_medIncome = s1903_table['S1903_C03_001E'].median()

# Binary
s1903_table['householdIncomeAboveMedian'] = (s1903_table['S1903_C03_001E'] >= s1903_medIncome).astype(int)
print(f"Median Household Income ($): {s1903_medIncome}")

# Very Low/Low/Medium/High/Very High bins (quintiles of the income column)
s1903_table['householdIncomeBins'] = pd.qcut(s1903_table['S1903_C03_001E'], q =5, labels = ['Very Low','Low', 'Medium', 'High', 'Very High'])
s1903_table.head()

Median Household Income ($): 90303.0


Unnamed: 0,GEO_ID,S1903_C03_001E,householdIncomeAboveMedian,householdIncomeBins
0,1400000US08001007801,44286,0,Very Low
1,1400000US08001007802,50646,0,Very Low
2,1400000US08001007900,67273,0,Low
3,1400000US08001008000,77679,0,Low
4,1400000US08001008100,50000,0,Very Low


In [20]:
##### Educational Attainment
# '001E_006E' - Total
# '002E_007E' - Less than high school
# '008E' - High school, no diploma (25+)
# '003E_009E' - High school graduate
# '004E_010E_011E' - Some college, no degree or Associates Degree
# '005E_012E' - Bachelor's degree or higher
# '013E' - Graduate or professional degree

# Work on a copy so the derived columns are added to an independent frame even if
# s1501_table is a slice of another DataFrame (avoids pandas' SettingWithCopyWarning).
s1501_table = s1501_table.copy()
eduTotal = s1501_table['001E_006E']

# Percents: each attainment group as a share of the tract's total, rounded to 2 decimals
eduPercentCols = {
    'lessThanHS_P': '002E_007E',
    'HSNoDiploma_P': '008E',
    'HSGrad_P': '003E_009E',
    'SomeCollegeNoDeg_P': '004E_010E_011E',
    'BachOrHigher_P': '005E_012E',
    'GradOrProf_P': '013E',
}
for pctCol, countCol in eduPercentCols.items():
    s1501_table[pctCol] = round((s1501_table[countCol] / eduTotal)*100, 2)

# No HS vs HS Grad Percent
s1501_table['noHSDegree_P'] = round(((s1501_table['002E_007E'] + s1501_table['008E']) / eduTotal)*100, 2)
s1501_table['hasHSDegree_P'] = round(((s1501_table['003E_009E'] + s1501_table['004E_010E_011E'] + s1501_table['005E_012E'] + s1501_table['013E']) / eduTotal)*100, 2)

# Very low/Low/Medium/High/Very High on hasHSDegree_P (quintiles across tracts)
s1501_table['hsDegreeBins'] = pd.qcut(s1501_table['hasHSDegree_P'], q =5, labels = ['Very Low','Low', 'Medium', 'High', 'Very High'])
# Only the last expression of a cell is displayed, so a single head() at the end is enough.
s1501_table.head()


Unnamed: 0,GEO_ID,001E_006E,002E_007E,008E,003E_009E,004E_010E_011E,005E_012E,013E,lessThanHS_P,HSNoDiploma_P,HSGrad_P,SomeCollegeNoDeg_P,BachOrHigher_P,GradOrProf_P,noHSDegree_P,hasHSDegree_P,hsDegreeBins
0,1400000US08001007801,2869,699,339,847,831,107,46,24.36,11.82,29.52,28.96,3.73,1.6,36.18,63.82,Very Low
1,1400000US08001007802,3011,840,126,1134,583,250,78,27.9,4.18,37.66,19.36,8.3,2.59,32.08,67.92,Very Low
2,1400000US08001007900,4320,731,431,1305,1076,522,255,16.92,9.98,30.21,24.91,12.08,5.9,26.9,73.1,Very Low
3,1400000US08001008000,4402,788,284,1293,854,681,502,17.9,6.45,29.37,19.4,15.47,11.4,24.35,75.65,Very Low
4,1400000US08001008100,1599,19,10,238,381,563,388,1.19,0.63,14.88,23.83,35.21,24.27,1.81,98.19,Very High


In [21]:
##### Household1
# 'DP02_0001E' - Total
# 'DP02_0002E' - Married-couple households
# 'DP02_0004E' - Cohabiting couple households
# 'DP02_0006E' - Male householder, no spouse
# 'DP02_0010E' - Female householder, no spouse
householdCols = {
    'married': 'DP02_0002E',
    'cohabiting': 'DP02_0004E',
    'maleHouseholder': 'DP02_0006E',
    'femaleHouseholder': 'DP02_0010E',
}
binLabels = ['Very Low','Low', 'Medium', 'High', 'Very High']

# Percents: share of each household type among all households of the same GEO_ID (row-wise),
# not the tract's share of that household type across tracts.
for householdType, countCol in householdCols.items():
    dp02_table[f'{householdType}_P'] = round((dp02_table[countCol] / dp02_table['DP02_0001E'])*100, 2)

# Rank of the four counts within each row (1 = lowest count). Ranking is dense, so tied
# counts share a rank and the highest rank in a row can be below 4.
colsToRank = list(householdCols.values())
newName = [f'{householdType}Rank' for householdType in householdCols]
dp02_table[newName] = dp02_table[colsToRank].rank(axis=1, method='dense')

# Very Low/Low/Medium/High/Very High bins: quintiles of each percent column across all
# tracts (column-wise), not fixed cut-off values.
for householdType in householdCols:
    dp02_table[f'{householdType}Bins'] = pd.qcut(dp02_table[f'{householdType}_P'], q=5, labels = binLabels)

dp02_table.head()

Unnamed: 0,GEO_ID,DP02_0001E,DP02_0002E,DP02_0004E,DP02_0006E,DP02_0010E,married_P,cohabiting_P,maleHouseholder_P,femaleHouseholder_P,marriedRank,cohabitingRank,maleHouseholderRank,femaleHouseholderRank,marriedBins,cohabitingBins,maleHouseholderBins,femaleHouseholderBins
0,1400000US08001007801,1332,276,153,391,512,20.72,11.49,29.35,38.44,2.0,1.0,3.0,4.0,Very Low,Very High,Very High,Very High
1,1400000US08001007802,1517,406,211,581,319,26.76,13.91,38.3,21.03,3.0,1.0,4.0,2.0,Very Low,Very High,Very High,Medium
2,1400000US08001007900,2264,764,347,595,558,33.75,15.33,26.28,24.65,4.0,1.0,3.0,2.0,Very Low,Very High,High,Medium
3,1400000US08001008000,2025,685,151,488,701,33.83,7.46,24.1,34.62,3.0,1.0,2.0,4.0,Low,Medium,High,Very High
4,1400000US08001008100,770,114,129,243,284,14.81,16.75,31.56,36.88,1.0,2.0,3.0,4.0,Very Low,Very High,Very High,Very High


In [22]:
##### Household2
#           Never married   married     separated   widowed     divorced    Total
# Male      DP02_0026E      DP02_0027E  DP02_0028E  DP02_0029E  DP02_0030E  DP02_0025E
# Female    DP02_0032E      DP02_0033E  DP02_0034E  DP02_0035E  DP02_0036E  DP02_0031E
maritalStatusCols = {
    'neverMarried': ('DP02_0026E', 'DP02_0032E'),
    'nowMarried': ('DP02_0027E', 'DP02_0033E'),
    'separated': ('DP02_0028E', 'DP02_0034E'),
    'widowed': ('DP02_0029E', 'DP02_0035E'),
    'divorced': ('DP02_0030E', 'DP02_0036E'),
}
binLabels = ['Very Low','Low', 'Medium', 'High', 'Very High']

# Percents (male + female combined, as a share of the combined total of the same GEO_ID)
# pd.to_numeric(errors='coerce') accepts totals that are still strings and, unlike
# astype(int), does not raise on missing or unparseable values (they become NaN).
total = pd.to_numeric(dp02_table2['DP02_0025E'], errors='coerce') + pd.to_numeric(dp02_table2['DP02_0031E'], errors='coerce')
for status, (maleCol, femaleCol) in maritalStatusCols.items():
    dp02_table2[f'{status}_P'] = round(((dp02_table2[maleCol] + dp02_table2[femaleCol]) / total)*100, 2)

# Binary flags against the median never-married count
# Series.median() skips NaN; np.median would return NaN if any count were NaN.
neverMarriedCount = dp02_table2['DP02_0026E'] + dp02_table2['DP02_0032E']
dp02_medianNeverMarried = neverMarriedCount.median()
# NOTE(review): both comparisons include equality, so a tract exactly at the median is
# flagged 1 in both columns - confirm this is intended.
dp02_table2['belowNeverMarriedMedian'] = (neverMarriedCount <= dp02_medianNeverMarried).astype(int)
dp02_table2['aboveNeverMarriedMedian'] = (neverMarriedCount >= dp02_medianNeverMarried).astype(int)
print(f'Median Never Married: {dp02_medianNeverMarried}')

## Five bins (Very Low ... Very High): quintiles of each percent column across all tracts
for status in maritalStatusCols:
    dp02_table2[f'{status}Bins'] = pd.qcut(dp02_table2[f'{status}_P'], q=5, labels = binLabels)

dp02_table2.head()

Median Never Married: 998.0


Unnamed: 0,GEO_ID,DP02_0025E,DP02_0031E,DP02_0026E,DP02_0027E,DP02_0028E,DP02_0029E,DP02_0030E,DP02_0032E,DP02_0033E,DP02_0034E,DP02_0035E,DP02_0036E,neverMarried_P,nowMarried_P,separated_P,widowed_P,divorced_P,belowNeverMarriedMedian,aboveNeverMarriedMedian,neverMarriedBins,nowMarriedBins,separatedBins,widowedBins,divorcedBins
0,1400000US08001007801,1508,1566,656,422,150,58,222,806,353,94,87,226,47.56,25.21,7.94,4.72,14.57,0,1,Very High,Very Low,Very High,High,High
1,1400000US08001007802,1744,1375,962,534,0,46,202,682,452,0,32,209,52.71,31.61,0.0,2.5,13.18,0,1,Very High,Very Low,Very Low,Low,High
2,1400000US08001007900,2464,2340,1213,879,87,107,178,949,828,27,106,430,45.0,35.53,2.37,4.43,12.66,0,1,Very High,Very Low,Very High,Medium,High
3,1400000US08001008000,2414,2280,1122,1000,131,18,143,825,908,33,136,378,41.48,40.65,3.49,3.28,11.1,0,1,High,Low,Very High,Medium,Medium
4,1400000US08001008100,858,741,485,189,7,24,153,443,166,0,73,59,58.04,22.2,0.44,6.07,13.26,1,0,Very High,Very Low,Low,High,High


In [23]:
##### Household3
# Total population - DP02_0088E
# Native Born - DP02_0089E
# Foreign Born - DP02_0094E
binLabels = ['Very Low','Low', 'Medium', 'High', 'Very High']

# Percents (share of the total population of the same GEO_ID)
dp02_table3['nativeBorn_P'] = round((dp02_table3['DP02_0089E'] / dp02_table3['DP02_0088E'])*100, 2)
dp02_table3['foreignBorn_P'] = round((dp02_table3['DP02_0094E'] / dp02_table3['DP02_0088E'])*100, 2)

# Binary flags against the median count across tracts
# Series.median() skips NaN; np.median would return NaN if any count had been coerced
# to NaN, which would set every flag below to 0.
# NOTE(review): both comparisons include equality, so a tract exactly at the median is
# flagged 1 in both the "below" and the "above" column - confirm this is intended.
dp02_nativeBornMedian = dp02_table3['DP02_0089E'].median()
dp02_table3['belowNativeBornMedian'] = (dp02_table3['DP02_0089E'] <= dp02_nativeBornMedian).astype(int)
dp02_table3['aboveNativeBornMedian'] = (dp02_table3['DP02_0089E'] >= dp02_nativeBornMedian).astype(int)
print(f'Median Native Born: {dp02_nativeBornMedian}')

dp02_foreignBornMedian = dp02_table3['DP02_0094E'].median()
dp02_table3['belowForeignBornMedian'] = (dp02_table3['DP02_0094E'] <= dp02_foreignBornMedian).astype(int)
dp02_table3['aboveForeignBornMedian'] = (dp02_table3['DP02_0094E'] >= dp02_foreignBornMedian).astype(int)
print(f'Median Foreign Born: {dp02_foreignBornMedian}')

## Five bins (Very Low ... Very High): quintiles of each percent column across all tracts
dp02_table3['nativeBornBins'] = pd.qcut(dp02_table3['nativeBorn_P'], q=5, labels = binLabels)
dp02_table3['foreignBornBins'] = pd.qcut(dp02_table3['foreignBorn_P'], q=5, labels = binLabels)

dp02_table3.head()

Median Native Born: 3418.0
Median Foreign Born: 253.0


Unnamed: 0,GEO_ID,DP02_0088E,DP02_0089E,DP02_0094E,nativeBorn_P,foreignBorn_P,belowNativeBornMedian,aboveNativeBornMedian,belowForeignBornMedian,aboveForeignBornMedian,nativeBornBins,foreignBornBins
0,1400000US08001007801,4145,2014,2131,48.59,51.41,1,0,0,1,Very Low,Very High
1,1400000US08001007802,4092,2393,1699,58.48,41.52,1,0,0,1,Very Low,Very High
2,1400000US08001007900,6173,4276,1897,69.27,30.73,0,1,0,1,Very Low,Very High
3,1400000US08001008000,5701,4287,1414,75.2,24.8,0,1,0,1,Very Low,Very High
4,1400000US08001008100,1615,1394,221,86.32,13.68,1,0,1,0,Very Low,Very High


In [24]:
##### Household4
# Civilian Population 18 and older - DP02_0069E
# Civilian Veterans - DP02_0070E

# Percents (veterans as a share of the civilian population 18+ of the same GEO_ID)
dp02_table4['civilVet_P'] =  round((dp02_table4['DP02_0070E'] / dp02_table4['DP02_0069E'])*100, 2)

# Binary flags against the median veteran count across tracts
# Series.median() skips NaN; np.median would return NaN if any count had been coerced
# to NaN, which would set every flag below to 0.
# NOTE(review): both comparisons include equality, so a tract exactly at the median is
# flagged 1 in both columns - confirm this is intended.
dp02_vetMedian = dp02_table4['DP02_0070E'].median()
dp02_table4['belowCivilVetMedian'] = (dp02_table4['DP02_0070E'] <= dp02_vetMedian).astype(int)
dp02_table4['aboveCivilVetMedian'] = (dp02_table4['DP02_0070E'] >= dp02_vetMedian).astype(int)
print(f'Median Civilian Veterans: {dp02_vetMedian}')

## Five bins (Very Low ... Very High): quintiles of civilVet_P across all tracts
dp02_table4['civilVetBins'] =  pd.qcut(dp02_table4['civilVet_P'], q=5, labels = ['Very Low','Low', 'Medium', 'High', 'Very High'])

dp02_table4.head()

Median Civilian Veterans: 190.0


Unnamed: 0,GEO_ID,DP02_0069E,DP02_0070E,civilVet_P,belowCivilVetMedian,aboveCivilVetMedian,civilVetBins
0,1400000US08001007801,2869,43,1.5,1,0,Very Low
1,1400000US08001007802,3011,237,7.87,0,1,Medium
2,1400000US08001007900,4308,175,4.06,1,0,Very Low
3,1400000US08001008000,4402,377,8.56,0,1,High
4,1400000US08001008100,1599,136,8.51,1,0,High


In [25]:
##### Poverty
# 'B17001_001E' - Total
# 'B17001_002E' - Number of people whose income in past 12 months below poverty level
binLabels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Percents (share within the same GEO_ID, row-wise)
b17001_table['belowPoverty_P'] = round((b17001_table['B17001_002E'] / b17001_table['B17001_001E'])*100, 2)
b17001_table['atOrAbovePoverty_P'] = 100 - b17001_table['belowPoverty_P']

# Binary flags against the median below-poverty count across tracts
# Series.median() skips NaN; np.median would return NaN if any count had been coerced
# to NaN, which would set every flag below to 0.
# NOTE(review): both comparisons include equality, so a tract exactly at the median is
# flagged 1 in both columns - confirm this is intended.
b17001_medianBelowPoverty = b17001_table['B17001_002E'].median()
b17001_table['belowPovertyMedian'] = (b17001_table['B17001_002E'] <= b17001_medianBelowPoverty).astype(int)
b17001_table['abovePovertyMedian'] = (b17001_table['B17001_002E'] >= b17001_medianBelowPoverty).astype(int)
print(f"Median Number of People Below Poverty: {b17001_medianBelowPoverty}")

# Bins: quintiles of each percent column across all tracts
b17001_table['belowPovertyBins'] = pd.qcut(b17001_table['belowPoverty_P'], q=5, labels = binLabels)
b17001_table['atOrAbovePovertyBins'] = pd.qcut(b17001_table['atOrAbovePoverty_P'], q=5, labels = binLabels)

b17001_table.head()

Median Number of People Below Poverty: 285.0


Unnamed: 0,GEO_ID,B17001_001E,B17001_002E,belowPoverty_P,atOrAbovePoverty_P,belowPovertyMedian,abovePovertyMedian,belowPovertyBins,atOrAbovePovertyBins
0,1400000US08001007801,4110,1208,29.39,70.61,0,1,Very High,Very Low
1,1400000US08001007802,4070,1208,29.68,70.32,0,1,Very High,Very Low
2,1400000US08001007900,6173,1112,18.01,81.99,0,1,Very High,Very Low
3,1400000US08001008000,5667,621,10.96,89.04,0,1,High,Low
4,1400000US08001008100,1404,542,38.6,61.4,0,1,Very High,Very Low


In [26]:
##### Age
# Reference list of the S0101 age columns held in s0101_table.
# S0101_C01_001E - Total population
# S0101_C01_002E - Under 5 years
# S0101_C01_003E - 5 to 9 years
# S0101_C01_004E - 10 to 14 years
# S0101_C01_005E - 15 to 19 years
# S0101_C01_006E - 20 to 24 years
# S0101_C01_007E - 25 to 29 years
# S0101_C01_008E - 30 to 34 years
# S0101_C01_009E - 35 to 39 years
# S0101_C01_010E - 40 to 44 years
# S0101_C01_011E - 45 to 49 years
# S0101_C01_012E - 50 to 54 years
# S0101_C01_013E - 55 to 59 years
# S0101_C01_014E - 60 to 64 years
# S0101_C01_015E - 65 to 69 years
# S0101_C01_016E - 70 to 74 years
# S0101_C01_017E - 75 to 79 years
# S0101_C01_018E - 80 to 84 years
# S0101_C01_019E - 85 years and over
# TODO: this cell contains no code yet - no percent or bin columns are derived from the
# age columns here.



### Check ACS Tables

In [27]:
## Check Columns: print the column index of each binned ACS table, each followed by a blank line
for acsFrame in (s1903_table, s1501_table, dp02_table, dp02_table2, dp02_table3, dp02_table4, b17001_table):
    print(f'{acsFrame.columns}\n')

Index(['GEO_ID', 'S1903_C03_001E', 'householdIncomeAboveMedian',
       'householdIncomeBins'],
      dtype='object')

Index(['GEO_ID', '001E_006E', '002E_007E', '008E', '003E_009E',
       '004E_010E_011E', '005E_012E', '013E', 'lessThanHS_P', 'HSNoDiploma_P',
       'HSGrad_P', 'SomeCollegeNoDeg_P', 'BachOrHigher_P', 'GradOrProf_P',
       'noHSDegree_P', 'hasHSDegree_P', 'hsDegreeBins'],
      dtype='object')

Index(['GEO_ID', 'DP02_0001E', 'DP02_0002E', 'DP02_0004E', 'DP02_0006E',
       'DP02_0010E', 'married_P', 'cohabiting_P', 'maleHouseholder_P',
       'femaleHouseholder_P', 'marriedRank', 'cohabitingRank',
       'maleHouseholderRank', 'femaleHouseholderRank', 'marriedBins',
       'cohabitingBins', 'maleHouseholderBins', 'femaleHouseholderBins'],
      dtype='object')

Index(['GEO_ID', 'DP02_0025E', 'DP02_0031E', 'DP02_0026E', 'DP02_0027E',
       'DP02_0028E', 'DP02_0029E', 'DP02_0030E', 'DP02_0032E', 'DP02_0033E',
       'DP02_0034E', 'DP02_0035E', 'DP02_0036E', 'neverMarr

In [28]:
## Check Length: print the row count of each binned ACS table, one per line
for acsFrame in (s1903_table, s1501_table, dp02_table, dp02_table2, dp02_table3, dp02_table4, b17001_table):
    print(f'{len(acsFrame)}')

1447
1447
1447
1447
1447
1447
1447


In [29]:
# Table Subsets - keep only GEO_ID and the bin columns of each table
def binsSubset(table, binCols):
    """Return the GEO_ID column of `table` followed by the columns named in `binCols`."""
    return table[['GEO_ID', *binCols]]

s1903_subset = binsSubset(s1903_table, ['householdIncomeBins'])
s1501_subset = binsSubset(s1501_table, ['hsDegreeBins'])
dp02_subset = binsSubset(dp02_table, ['marriedBins', 'cohabitingBins', 'maleHouseholderBins', 'femaleHouseholderBins'])
dp02_subset2 = binsSubset(dp02_table2, ['neverMarriedBins', 'nowMarriedBins', 'separatedBins', 'widowedBins', 'divorcedBins'])
dp02_subset3 = binsSubset(dp02_table3, ['nativeBornBins', 'foreignBornBins'])
dp02_subset4 = binsSubset(dp02_table4, ['civilVetBins'])
b17001_subset = binsSubset(b17001_table, ['belowPovertyBins', 'atOrAbovePovertyBins'])

### Merge ACS Tables

In [30]:
# MERGE ACS TABLES
from functools import reduce


def mergeOnGeoId(left, right):
    """Inner-join two tables on their shared GEO_ID column."""
    return pd.merge(left, right, on="GEO_ID", how='inner')


# Fold the subsets left to right into one table; with inner joins only GEO_IDs present
# in every subset are kept.
tables = [s1903_subset, s1501_subset, dp02_subset, dp02_subset2, dp02_subset3, dp02_subset4, b17001_subset]
acsTable = reduce(mergeOnGeoId, tables)

acsTable

Unnamed: 0,GEO_ID,householdIncomeBins,hsDegreeBins,marriedBins,cohabitingBins,maleHouseholderBins,femaleHouseholderBins,neverMarriedBins,nowMarriedBins,separatedBins,widowedBins,divorcedBins,nativeBornBins,foreignBornBins,civilVetBins,belowPovertyBins,atOrAbovePovertyBins
0,1400000US08001007801,Very Low,Very Low,Very Low,Very High,Very High,Very High,Very High,Very Low,Very High,High,High,Very Low,Very High,Very Low,Very High,Very Low
1,1400000US08001007802,Very Low,Very Low,Very Low,Very High,Very High,Medium,Very High,Very Low,Very Low,Low,High,Very Low,Very High,Medium,Very High,Very Low
2,1400000US08001007900,Low,Very Low,Very Low,Very High,High,Medium,Very High,Very Low,Very High,Medium,High,Very Low,Very High,Very Low,Very High,Very Low
3,1400000US08001008000,Low,Very Low,Low,Medium,High,Very High,High,Low,Very High,Medium,Medium,Very Low,Very High,High,High,Low
4,1400000US08001008100,Very Low,Very High,Very Low,Very High,Very High,Very High,Very High,Very Low,Low,High,High,Very Low,Very High,High,Very High,Very Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442,1400000US08123002300,Medium,Very Low,High,Very Low,Medium,Low,Very Low,High,Medium,High,Medium,Medium,Medium,Medium,Medium,Medium
1443,1400000US08123002501,Low,Medium,High,Low,Very Low,Medium,Low,High,Medium,Medium,Low,Medium,Medium,High,Very High,Very Low
1444,1400000US08123002502,Medium,Very Low,High,Low,Low,Low,Low,High,High,High,Low,Low,High,High,High,Low
1445,1400000US08125963100,Low,Very Low,High,Low,Low,High,Low,High,High,High,Low,Low,High,Medium,High,Low


### ACS Table Columns

| Output Column Header   | ACS Columns Utilized |
| -------- | ------- |
| GEO_ID   | GEO_ID |
| householdIncomeBins   | S1903_C03_001E |
| hsDegreeBins   | S1501_C01_001E (total 18-24) <br> S1501_C01_003E (High school graduate (18-24)) <br> S1501_C01_004E (Some college or associates degree (18-24)) <br> S1501_C01_005E (Bachelor's degree or higher (18-24)) <br> S1501_C01_006E (total 25+) <br> S1501_C01_009E (High school graduate (25+)) <br> S1501_C01_010E (Some college, no degree (25+)) <br> S1501_C01_011E (Associates Degree (25+)) <br> S1501_C01_012E (Bachelor's degree (25+)) <br> S1501_C01_013E (graduate or professional degree (25+)) |
| marriedBins  | DP02_0001E (total) <br> DP02_0002E (married couple) |
| cohabitingBins   | DP02_0001E (total) <br> DP02_0004E (cohabiting couple)  |
| maleHouseholderBins   | DP02_0001E (total) <br> DP02_0006E (male householder no spouse)  |
| femaleHouseholderBins   | DP02_0001E (total) <br> DP02_0010E (female householder no spouse)  |
| neverMarriedBins | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0026E (Male Never Married) <br> DP02_0032E (Female Never Married) |
| nowMarriedBins   | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0027E (Male Now Married) <br> DP02_0033E (Female Now Married) |
| separatedBins   | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0028E (Male Separated) <br> DP02_0034E (Female Separated) |
| widowedBins   | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0029E (Male Widowed) <br> DP02_0035E (Female Widowed) |
| divorcedBins  | DP02_0025E (Male Total) <br> DP02_0031E (Female Total) <br> DP02_0030E (Male Divorced) <br> DP02_0036E (Female Divorced) |
| nativeBornBins   | DP02_0088E (total) <br> DP02_0089E (Native Born)|
| foreignBornBins   | DP02_0088E (total) <br> DP02_0094E (Foreign Born) |
| civilVetBins   | DP02_0069E (Civilian Population 18+) <br> DP02_0070E (Civilian Veterans) |
| belowPovertyBins  | B17001_001E (total) <br> B17001_002E (Number of people whose income was below the poverty level in the last 12 months) |
| atOrAbovePovertyBins  | B17001_001E (total) <br> B17001_002E (Number of people whose income was below the poverty level in the last 12 months) |


### Add NPI Hospice Provider Information

In [38]:
################# TO BE REPLACED WITH DAGMAR'S WORK #################
## Placeholder provider records; could be replaced with raw code or a call into another .py file
## TEST DATA - one tuple per provider, fields in the order given by testColumns
testColumns = ['NPI', 'zipPlusFour', 'taxType', 'primaryTax', 'Issuer', 'status']
testRows = [
    ('1760093470', '80237-2857', 'NPI-2 Organization', 'Yes', 'MEDICAID', 'Active'),
    ('1861097982', '81008-2130', 'NPI-2 Organization', 'Yes', '', 'Active'),
    ('1447936901', '80111-2213', 'NPI-2 Organization', 'Yes', '', 'Active'),
    ('1043826795', '80111-7957', 'NPI-2 Organization', 'No', '', 'Active'),    ## This zipcode is not accurate - just used for testing
]
## Pivot the rows into a {column name: list of values} mapping
testData = {column: [row[position] for row in testRows] for position, column in enumerate(testColumns)}

## CREATE TEST DF
df = pd.DataFrame(testData)

####################################################################

In [39]:
from zipcodeToCensusTract import convertPostalToLatLong, convertLatLongToCensusTract

In [40]:
## Create zipcode column: the first five characters of the ZIP+4 string
df['zip'] = df['zipPlusFour'].str.slice(stop=5)


def zipToLatLong(zipCode):
    """Look up one zipcode and wrap the result in a Series so apply() expands it into columns."""
    # presumably convertPostalToLatLong returns a (lat, long) pair - confirm against zipcodeToCensusTract
    return pd.Series(convertPostalToLatLong(zipCode))


def rowToCensusTract(row):
    """Look up the census tract for the lat/long stored on one row of df."""
    return convertLatLongToCensusTract(row['lat'], row['long'])


## Call function and add lat/long to df
df[['lat', 'long']] = df['zip'].apply(zipToLatLong)

## Call function and add censusTract to df
df['censusTract'] = df.apply(rowToCensusTract, axis = 1)
df

Unnamed: 0,NPI,zipPlusFour,taxType,primaryTax,Issuer,status,zip,lat,long,censusTract
0,1760093470,80237-2857,NPI-2 Organization,Yes,MEDICAID,Active,80237,39.6431,-104.8987,1400000US08031006816
1,1861097982,81008-2130,NPI-2 Organization,Yes,,Active,81008,38.3133,-104.6284,1400000US08101002919
2,1447936901,80111-2213,NPI-2 Organization,Yes,,Active,80111,39.6123,-104.8799,1400000US08005006857
3,1043826795,80111-7957,NPI-2 Organization,No,,Active,80111,39.6123,-104.8799,1400000US08005006857


In [41]:
## Get count of NPI Providers per CensusTract
## value_counts() gives one row per tract with its number of providers; the two resulting
## columns are relabelled by position so the tract column matches the ACS key name GEO_ID.
tractCounts = df['censusTract'].value_counts()
providerCounts = tractCounts.reset_index().set_axis(['GEO_ID', 'Provider Count'], axis='columns')
providerCounts

Unnamed: 0,GEO_ID,Provider Count
0,1400000US08005006857,2
1,1400000US08031006816,1
2,1400000US08101002919,1


In [42]:
## Merge Count of Providers per CensusTract with ACS Data information
## Left join: every tract that has a provider is kept, with or without a matching ACS row.
mergedTable = providerCounts.merge(acsTable, on='GEO_ID', how='left')
mergedTable

Unnamed: 0,GEO_ID,Provider Count,householdIncomeBins,hsDegreeBins,marriedBins,cohabitingBins,maleHouseholderBins,femaleHouseholderBins,neverMarriedBins,nowMarriedBins,separatedBins,widowedBins,divorcedBins,nativeBornBins,foreignBornBins,civilVetBins,belowPovertyBins,atOrAbovePovertyBins
0,1400000US08005006857,2,Very High,Very High,Very High,Very Low,Medium,Very Low,Very Low,Very High,Medium,Very Low,Very Low,Very Low,Very High,Medium,Very Low,Very High
1,1400000US08031006816,1,Very Low,Medium,Low,High,Very High,Medium,High,Low,Low,High,Very High,Low,High,Very High,High,Low
2,1400000US08101002919,1,Low,Low,Low,High,Low,Very High,High,Low,Very High,High,Low,Medium,Medium,Medium,Low,High


In [43]:
## Check length of mergedTable (number of rows)
mergedTable.shape[0]

3