In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory holding the raw 2016 input files.
data_path = "../data/data_2016/"

# Load the enrollment report; '*' entries are read as missing values
# (presumably suppressed counts -- TODO confirm against the report's docs).
enroll_csv = data_path + '2016-Issuer-Enrollment-Disenrollment-Report.csv'
enroll_cols = ['HIOS ID', 'Policy County FIPS Code', 'Ever Enrolled Count']
enroll_df = pd.read_csv(enroll_csv, na_values=['*'])

# Keep only the issuer id, county code and enrollment count; every missing
# value in these three columns becomes the string '0'.
enroll_df = enroll_df[enroll_cols].fillna('0')


def _count_to_int(raw):
    """Strip thousands separators from a count string and parse it as an int."""
    return int(raw.replace(',', ''))


# The counts are handled as strings here (e.g. '1,234'), so parse them one by one.
enroll_df['Ever Enrolled Count'] = enroll_df['Ever Enrolled Count'].apply(_count_to_int)

# Enrollment with issuer characteristics

In [3]:
# Service-area file: only the county / service-area / issuer triple is needed,
# and each distinct triple is kept once.
service_cols = ['County', 'ServiceAreaId', 'IssuerId']
service_df = (
    pd.read_csv(data_path + 'ServiceArea_PUF_2016.csv', encoding='cp1252')[service_cols]
    .drop_duplicates()
)

# Issuer characteristics prepared earlier in the pipeline.
# NOTE(review): the service areas come from a 2016 file while this file is
# named *_2017 -- confirm the mix of years is intentional.
issuer_df = pd.read_csv('../data/processed_data/issuer_characteristics_2017.csv')

In [4]:
# Attach a county to every issuer / service-area row. The inner join drops
# issuer rows that have no matching service area.
issuer_service = pd.merge(issuer_df, service_df, how='inner', on=['ServiceAreaId', 'IssuerId'])
issuer_service = issuer_service[issuer_service['County'].notna()]

# Put the two keys first, followed by every merged column except the first two
# and the last one (the slice is positional).
# NOTE(review): this assumes the first two columns of the merged frame are the
# ones meant to be replaced by the keys and that 'County' is last -- confirm
# against the layout of issuer_characteristics_2017.csv.
merged_cols = list(issuer_service.columns)
col_order = ['IssuerId', 'County'] + merged_cols[2:-1]
issuer_service = issuer_service[col_order]

# Collapse to one row per issuer and county by averaging the remaining columns.
issuer_service = issuer_service.groupby(['IssuerId', 'County'], as_index=False).mean()
print(issuer_service.shape)

(9413, 72)


In [5]:
# Match each enrollment row to the averaged issuer characteristics for the
# same county and issuer.
enroll_issuer = pd.merge(
    enroll_df,
    issuer_service,
    how='left',
    left_on=['Policy County FIPS Code', 'HIOS ID'],
    right_on=['County', 'IssuerId'],
)

# Rows without a match have a missing IssuerId after the left join; drop them.
has_issuer = enroll_issuer['IssuerId'].notna()
enroll_issuer = enroll_issuer[has_issuer]
print(enroll_issuer.shape)

(5723, 75)


# Merge with county characteristics

In [6]:
# County-level characteristics, joined onto the enrollment/issuer rows by county.
county = pd.read_csv('../data/processed_data/county_characteristics_2017.csv')
result = pd.merge(enroll_issuer, county, how='left', left_on='County', right_on='County')

# Keep rows whose County key is present.
# NOTE(review): 'County' is the join key taken from the left frame, so this
# filter does not remove rows that failed to match a county record; those rows
# keep NaN in the county columns -- confirm this is intended.
result = result[result['County'].notna()]

In [7]:
def _to_float(value):
    """Convert a single cell to float, mapping placeholder markers to zero.

    Values that already parse as numbers are returned as-is, so a leading
    minus sign or an exponent such as ``1e-05`` is preserved. Only when direct
    parsing fails are the characters '-' and 'N' replaced by '0' before
    retrying, which turns placeholder cells such as '-' or 'N' into 0.0.

    Raises:
        ValueError: if the value cannot be parsed even after the substitution.
    """
    text = str(value)
    try:
        return float(text)
    except ValueError:
        # Replacing unconditionally would corrupt real numbers: '-5.2' would
        # become 5.2 and '1e-05' would become 100000.0.
        return float(text.replace('-', '0').replace('N', '0'))


# Identifier columns are passed through untouched; everything else is cleaned.
keys = ['HIOS ID','Policy County FIPS Code','IssuerId','County']
all_cols = [col for col in result.columns if col not in keys]

# Columns that survive the variance filter below.
all_cols2 = []

for col in all_cols:
    # Coerce every value to float, turning placeholder markers into 0.
    result[col] = result[col].apply(_to_float)

    # Keep only columns that actually vary; a constant column (or one whose
    # standard deviation is NaN) fails this comparison and is left out.
    if result[col].std() > 0:
        all_cols2.append(col)


# Write the identifier columns followed by the retained numeric columns.
result[keys + all_cols2].to_csv('../data/processed_data/merged_characteristics_2017.csv',index=False)

# Summary stats

In [8]:
# Number of distinct county FIPS codes in the merged frame.
result.loc[:, 'Policy County FIPS Code'].nunique()

2142

In [9]:
# Number of distinct issuers in the merged frame.
result.loc[:, 'IssuerId'].nunique()

185

In [10]:
# Column labels of the merged frame as a Series, for inspection or export.
col_names = pd.Series(data=result.columns)
# Optional export, left disabled:
#col_names.to_csv('col_names.csv')

In [11]:
# Missing-value count for every column, shown as a one-column table.
null_counts = result.isna().sum()
result1 = null_counts.to_frame()
result1

Unnamed: 0,0
HIOS ID,0
Policy County FIPS Code,0
Ever Enrolled Count,0
IssuerId,0
County,0
...,...
DP05_0085PE,5
DP05_0086PE,5
DP05_0087PE,5
DP05_0088PE,5


In [12]:
# Reload the file written above and compare its columns with those of the
# reference file merged_characteristics.csv.
merged_characteristics_2017 = pd.read_csv('../data/processed_data/merged_characteristics_2017.csv')
list2 = list(merged_characteristics_2017.columns)
merged_characteristics = pd.read_csv('../data/processed_data/merged_characteristics.csv')
list1 = list(merged_characteristics.columns)

# Columns present in the reference file (list1) but absent from the 2017 file
# (list2). The comparison walks list1 in order so the result is the same on
# every run; building it from a bare set difference gives an arbitrary order
# that can change between kernel sessions, and that order decides the column
# order of the CSV written later in the notebook.
cols_2017 = set(list2)
list_missing = [col for col in list1 if col not in cols_2017]
print("Missing values in first list:", list_missing)

Missing values in first list: ['Number of Consumers with CSR AV of 94%', 'Number of Consumers with CSR AV of 73%', 'Total Number of Consumers', 'DP05_0004PE', 'FIPS County Code', 'Average Monthly Advanced CSR Payment for Consumers with 94%', 'Number of Consumers with CSR (AV of 73%/87%/94%)', 'State', 'DP05_0028PE', 'Average Monthly Advanced CSR Payment for Consumers with 73%', 'DP05_0018PE', 'DP05_0032PE', 'County Name', 'Average Monthly Advanced CSR Payment for Consumers with 87%', 'Number of Consumers with CSR AV of 87%']


In [13]:
# Add every column named in list_missing to the 2017 frame as a zero-filled
# placeholder. Each column is created explicitly by name; the previous form
# passed a list nested inside another list as the column key, which is not a
# documented key form for column assignment.
# NOTE(review): if list_missing contains text columns, 0 is a numeric
# placeholder for them -- confirm that is acceptable downstream.
merged_characteristics_2017 = merged_characteristics_2017.assign(
    **{missing_col: 0 for missing_col in list_missing}
)
merged_characteristics_2017

Unnamed: 0,HIOS ID,Policy County FIPS Code,IssuerId,County,Ever Enrolled Count,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,CSRVariationTypeBinary,SourceName_HIOS,...,Average Monthly Advanced CSR Payment for Consumers with 94%,Number of Consumers with CSR (AV of 73%/87%/94%),State,DP05_0028PE,Average Monthly Advanced CSR Payment for Consumers with 73%,DP05_0018PE,DP05_0032PE,County Name,Average Monthly Advanced CSR Payment for Consumers with 87%,Number of Consumers with CSR AV of 87%
0,44580,1009,44580.0,1009.0,408.0,97.857143,0.0,0.285714,3.52381,1.0,...,0,0,0,0,0,0,0,0,0,0
1,44580,1073,44580.0,1073.0,7156.0,97.857143,0.0,0.285714,3.52381,1.0,...,0,0,0,0,0,0,0,0,0,0
2,44580,1083,44580.0,1083.0,1058.0,97.857143,0.0,0.285714,3.52381,1.0,...,0,0,0,0,0,0,0,0,0,0
3,44580,1089,44580.0,1089.0,6016.0,97.857143,0.0,0.285714,3.52381,1.0,...,0,0,0,0,0,0,0,0,0,0
4,44580,1103,44580.0,1103.0,1575.0,97.857143,0.0,0.285714,3.52381,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718,50328,54051,50328.0,54051.0,24.0,100.000000,0.0,0.000000,3.40000,0.0,...,0,0,0,0,0,0,0,0,0,0
5719,50328,54053,50328.0,54053.0,169.0,100.000000,0.0,0.000000,3.40000,0.0,...,0,0,0,0,0,0,0,0,0,0
5720,50328,54069,50328.0,54069.0,38.0,100.000000,0.0,0.000000,3.40000,0.0,...,0,0,0,0,0,0,0,0,0,0
5721,50328,54079,50328.0,54079.0,427.0,100.000000,0.0,0.000000,3.40000,0.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Overwrite the processed 2017 file with the frame held in
# merged_characteristics_2017, without writing the row index.
merged_characteristics_2017.to_csv(
    '../data/processed_data/merged_characteristics_2017.csv',
    index=False,
)