In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Read and merge Census (ACS) data

In [2]:
data_path = "../data/data_2016/"

# Load the three ACS exports (DP03, DP05, S1701) in one pass; cells holding
# the literal "(X)" are parsed as missing values.
# NOTE(review): the DP03 read used to carry a commented-out encoding='cp1252'
# option; it stays disabled here.
acs1, acs2, acs3 = (
    pd.read_csv(data_path + file_name, na_values=['(X)'])
    for file_name in (
        'ACSDP5Y2016.DP03_data_with_overlays_2022-03-17T201553.csv',
        'ACSDP5Y2016.DP05_data_with_overlays_2021-12-09T232536.csv',
        'ACSST5Y2016.S1701_data_with_overlays_2021-12-12T065723.csv',
    )
)

# DP03_0095E (health care coverage) is the one column that remove_cols keeps by its exact name.

def remove_cols(df):
    """Trim a raw ACS table to the columns of interest and tag rows by county.

    A column is kept when its name contains 'PE', contains 'DP03_0095E', or
    contains both 'C01_' and 'E'. The row labelled 0 is dropped (presumably
    the descriptive overlay row of the "data_with_overlays" exports -- TODO
    confirm), missing values in the kept columns are replaced with 0, and a
    'County' column holding the last five characters of 'GEO_ID' is appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with a 'GEO_ID' column and string column names. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the kept columns followed by 'County', covering the
        rows labelled 1 onward.
    """
    # County key: last five characters of GEO_ID, forced to str so that a
    # missing GEO_ID becomes the text 'nan' rather than a float NaN.
    county = df['GEO_ID'].str[-5:].astype(str)

    keep = [
        col for col in df.columns
        if 'PE' in col or 'DP03_0095E' in col or ('C01_' in col and 'E' in col)
    ]

    # Select rows and columns in a single .loc call and use the non-inplace
    # fillna: this yields an independent frame, so nothing below mutates a
    # slice of the caller's data (no SettingWithCopy hazard).
    trimmed = df.loc[1:, keep].fillna(0)

    # Added after fillna so the key column is never zero-filled; assignment
    # aligns on the index, so only rows 1.. receive a value.
    trimmed['County'] = county.loc[1:]

    return trimmed


# Trim each raw ACS table, keeping the DP03, DP05, S1701 order.
dfs = [remove_cols(raw_table) for raw_table in (acs1, acs2, acs3)]

# Sanity check: one (rows, columns) tuple per trimmed table.
for trimmed_table in dfs:
    print(trimmed_table.shape)


(3220, 139)
(3220, 85)
(3220, 62)


In [3]:
from functools import partial, reduce

# From here on acs1..acs3 refer to the trimmed tables, not the raw reads.
acs1, acs2, acs3 = dfs

# Pairwise outer join on the County key, folded left to right over the
# three tables so that every county present in any table is retained.
merge = partial(pd.merge, how='outer', on=['County'])
all_acs = reduce(merge, [acs1, acs2, acs3])
all_acs

Unnamed: 0,DP03_0001PE,DP03_0002PE,DP03_0003PE,DP03_0004PE,DP03_0005PE,DP03_0006PE,DP03_0007PE,DP03_0008PE,DP03_0009PE,DP03_0010PE,...,S1701_C01_052E,S1701_C01_053E,S1701_C01_054E,S1701_C01_055E,S1701_C01_056E,S1701_C01_057E,S1701_C01_058E,S1701_C01_059E,S1701_C01_060E,S1701_C01_061E
0,42712,60.9,60.2,56.8,3.4,0.7,39.1,25699,5.6,22074,...,952,860,1467,1336,1152,1091,5417,3374,1132,3316
1,160301,58.6,58.4,54.7,3.7,0.1,41.4,93640,6.3,82813,...,4493,3936,5446,6397,5225,5209,6288,12802,6793,14202
2,21476,48.0,48.0,41.9,6.2,0.0,52.0,10316,12.8,9938,...,651,364,606,655,733,674,5349,1274,621,1993
3,18496,48.7,48.6,45.2,3.5,0.0,51.3,8997,7.1,8514,...,294,308,561,323,484,380,7539,742,317,1504
4,46007,49.9,49.9,46.9,3.0,0.0,50.1,22960,6.0,23370,...,894,704,1072,1267,1257,1215,6610,2121,1007,3631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,45100,37.8,37.8,31.2,6.5,0.0,62.2,17037,17.3,23837,...,639,757,1138,1288,1379,1140,7833,1374,483,4669
3216,7229,46.2,46.2,40.6,5.7,0.0,53.8,3342,12.3,3622,...,41,230,273,319,374,232,6304,495,251,751
3217,19212,48.3,48.2,36.2,12.0,0.1,51.7,9256,24.8,9970,...,430,303,470,496,547,319,7023,489,531,1656
3218,28916,37.4,37.3,28.3,9.0,0.1,62.6,10778,24.2,15074,...,513,857,961,1249,1165,661,7824,937,418,4391


# Preprocess CSR

In [4]:
# Cells holding a lone '.' in the CSR file are parsed as missing values.
csr = pd.read_csv(data_path + 'csrzipcounty2016.csv', na_values=['.'])

# The row at position 2 carries the real column names; promote it to header.
csr.columns = csr.iloc[2]

# Keep the rows from position 4 up to (not including) the last row
# (presumably title/header lines above and a footer line below -- TODO
# confirm), then renumber from 0. reset_index returns a new frame, so its
# result has to be assigned back; calling it bare leaves the index unchanged.
csr = csr.iloc[4:-1].reset_index(drop=True)
csr

2,State,FIPS County Code,County Name,Total Number of Consumers,Number of Consumers with CSR (AV of 73%/87%/94%),Number of Consumers with CSR AV of 73%,Number of Consumers with CSR AV of 87%,Number of Consumers with CSR AV of 94%,Average Monthly Advanced CSR Payment for Consumers with 73%,Average Monthly Advanced CSR Payment for Consumers with 87%,Average Monthly Advanced CSR Payment for Consumers with 94%
4,AK,02013,ALEUTIANS EAST,38,16,,,,,,
5,AK,02016,ALEUTIANS WEST,38,16,,,,,,
6,AK,02020,ANCHORAGE,9484,3917,787,1640,1490,$26,$204,$290
7,AK,02050,BETHEL,73,18,,11,,,$234,
8,AK,02060,BRISTOL BAY BOROUGH,34,12,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2600,WY,56037,SWEETWATER,1168,665,109,222,334,$16,$119,$173
2601,WY,56039,TETON,2812,1411,428,484,499,$15,$115,$165
2602,WY,56041,UINTA,663,356,79,116,161,$16,$133,$181
2603,WY,56043,WASHAKIE,341,173,33,54,86,$15,$138,$192


In [5]:
# Work on a copy so the loaded CSR table stays as read.
csr_clean = csr.copy()

# Every column from position 3 onward is converted to float: missing cells
# become '0', then '$' and thousands separators are stripped before parsing.
for column in list(csr_clean.columns)[3:]:
    csr_clean[column] = (
        csr_clean[column]
        .fillna('0')
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

In [6]:
# Left join: every ACS county is kept; counties with no matching CSR row get
# missing values in the CSR columns.
merged_acs = all_acs.merge(
    csr_clean,
    left_on='County',
    right_on='FIPS County Code',
    how='left',
)
merged_acs

Unnamed: 0,DP03_0001PE,DP03_0002PE,DP03_0003PE,DP03_0004PE,DP03_0005PE,DP03_0006PE,DP03_0007PE,DP03_0008PE,DP03_0009PE,DP03_0010PE,...,FIPS County Code,County Name,Total Number of Consumers,Number of Consumers with CSR (AV of 73%/87%/94%),Number of Consumers with CSR AV of 73%,Number of Consumers with CSR AV of 87%,Number of Consumers with CSR AV of 94%,Average Monthly Advanced CSR Payment for Consumers with 73%,Average Monthly Advanced CSR Payment for Consumers with 87%,Average Monthly Advanced CSR Payment for Consumers with 94%
0,42712,60.9,60.2,56.8,3.4,0.7,39.1,25699,5.6,22074,...,01001,AUTAUGA,1896.0,1341.0,142.0,399.0,800.0,13.0,87.0,117.0
1,160301,58.6,58.4,54.7,3.7,0.1,41.4,93640,6.3,82813,...,01003,BALDWIN,11778.0,8342.0,1246.0,2426.0,4670.0,13.0,91.0,121.0
2,21476,48.0,48.0,41.9,6.2,0.0,52.0,10316,12.8,9938,...,01005,BARBOUR,801.0,584.0,62.0,155.0,367.0,13.0,98.0,132.0
3,18496,48.7,48.6,45.2,3.5,0.0,51.3,8997,7.1,8514,...,01007,BIBB,629.0,477.0,58.0,121.0,298.0,15.0,101.0,123.0
4,46007,49.9,49.9,46.9,3.0,0.0,50.1,22960,6.0,23370,...,01009,BLOUNT,2264.0,1663.0,197.0,437.0,1029.0,14.0,96.0,132.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,45100,37.8,37.8,31.2,6.5,0.0,62.2,17037,17.3,23837,...,,,,,,,,,,
3216,7229,46.2,46.2,40.6,5.7,0.0,53.8,3342,12.3,3622,...,,,,,,,,,,
3217,19212,48.3,48.2,36.2,12.0,0.1,51.7,9256,24.8,9970,...,,,,,,,,,,
3218,28916,37.4,37.3,28.3,9.0,0.1,62.6,10778,24.2,15074,...,,,,,,,,,,


In [7]:
# Count the missing cells in each column and show them as a single wide row.
null_counts = merged_acs.isnull().sum()
missing_value_df = pd.DataFrame(null_counts)
missing_value_df.T

Unnamed: 0,DP03_0001PE,DP03_0002PE,DP03_0003PE,DP03_0004PE,DP03_0005PE,DP03_0006PE,DP03_0007PE,DP03_0008PE,DP03_0009PE,DP03_0010PE,...,FIPS County Code,County Name,Total Number of Consumers,Number of Consumers with CSR (AV of 73%/87%/94%),Number of Consumers with CSR AV of 73%,Number of Consumers with CSR AV of 87%,Number of Consumers with CSR AV of 94%,Average Monthly Advanced CSR Payment for Consumers with 73%,Average Monthly Advanced CSR Payment for Consumers with 87%,Average Monthly Advanced CSR Payment for Consumers with 94%
0,0,0,0,0,0,0,0,0,0,0,...,622,622,622,622,622,622,622,622,622,622


In [8]:
# Replace every remaining missing value with 0, then write the final table.
# NOTE(review): this also puts 0 into the text columns that come from the CSR
# table for counties without a CSR match -- confirm that is intended.
merged_acs = merged_acs.fillna(0)
merged_acs.to_csv('../data/processed_data/county_characteristics.csv', index=False)