In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory holding the raw 2017 public-use files.
data_path = "../data/data_2017/"

# Load the enrollment/disenrollment file; '*' cells are read as missing.
enroll_source = data_path + '2017-Enrollment-Disenrollment-PUF.csv'
enroll_df = pd.read_csv(enroll_source, na_values=['*'], encoding='cp1252')

# Keep only the merge keys and the outcome column; missing cells become the string '0'.
enroll_keep = ['HIOS ID', 'Policy County FIPS Code', 'Ever Enrolled Count']
enroll_df = enroll_df[enroll_keep].fillna('0')

# Normalise the columns:
#   - FIPS code -> integer-valued string (drops any trailing ".0")
#   - enrolled count -> float, after removing thousands separators
enroll_df['Policy County FIPS Code'] = enroll_df['Policy County FIPS Code'].map(
    lambda fips: str(int(fips))
)
enroll_df['Ever Enrolled Count'] = enroll_df['Ever Enrolled Count'].map(
    lambda count: float(str(count).replace(',', ''))
)

# Enrollment with issuer characteristics

In [3]:
# Service-area file: one distinct row per (County, ServiceAreaId, IssuerId).
service_keep = ['County', 'ServiceAreaId', 'IssuerId']
service_df = (
    pd.read_csv(data_path + 'Service_Area_PUF_2017.csv', encoding='cp1252')[service_keep]
    .drop_duplicates()
)

# Pre-computed issuer characteristics.
issuer_df = pd.read_csv('../data/processed_data/issuer_characteristics_2017.csv')

# Normalise the county FIPS code to an integer-valued string.
# Rows with a missing County end up with the string '0' (they are not dropped here).
service_df['County'] = service_df['County'].fillna(0).map(lambda fips: str(int(fips)))

In [4]:
# Attach issuer characteristics to every county listed for the same
# (ServiceAreaId, IssuerId) pair in the service-area table.
issuer_service = issuer_df.merge(service_df, how='inner', on=['ServiceAreaId', 'IssuerId'])

# Guard against missing counties in the merged frame.
# NOTE(review): if County was already converted to strings upstream (missing -> '0'),
# this filter never removes anything — confirm whether '0' counties should be dropped.
issuer_service = issuer_service[ ~issuer_service['County'].isna() ]

#reorder the columns, drop service area
# NOTE(review): positional slice — assumes the first two merged columns are the
# id columns being replaced/dropped and that County is the last column; confirm
# against the layout of issuer_characteristics_2017.csv.
col_order= ['IssuerId','County'] + list(issuer_service.columns)[2:-1]
issuer_service = issuer_service[ col_order ]

#group by county
# Step 1: collapse rows to one per (issuer, county, plan, state) using the median.
# Step 2: average across plans to one row per (issuer, county, state).
# numeric_only=True is passed explicitly: pandas < 2.0 silently dropped
# non-numeric columns from these aggregations, while pandas >= 2.0 raises a
# TypeError on them (e.g. StandardComponentId in step 2, where it is no longer
# a grouping key — presumably a string plan id).
issuer_service = issuer_service.groupby(
    ['IssuerId','County','StandardComponentId','StateCode'], as_index=False
).median(numeric_only=True)
issuer_service = issuer_service.groupby(
    ['IssuerId','County','StateCode'], as_index=False
).mean(numeric_only=True)
issuer_service = issuer_service.rename(columns={'StateCode':'State'})
print(issuer_service.shape)

(4592, 76)


In [5]:
# Left-join enrollment rows to the issuer/county characteristics, pairing
# (Policy County FIPS Code, HIOS ID) with (County, IssuerId).
enroll_issuer = pd.merge(
    enroll_df,
    issuer_service,
    how='left',
    left_on=['Policy County FIPS Code', 'HIOS ID'],
    right_on=['County', 'IssuerId'],
)

# Keep only enrollment rows that found a matching issuer record.
enroll_issuer = enroll_issuer[enroll_issuer['IssuerId'].notna()]
print(enroll_issuer.shape)

(4155, 79)


# Merge with county characteristics

In [6]:
# County-level characteristics; the key is cast to str so it is comparable with
# the string 'County' key built earlier.
county = pd.read_csv('../data/processed_data/county_characteristics_2017.csv')
county['County'] = county['County'].map(str)

# Left join: every enrollment/issuer row is kept, with county columns attached
# where a match exists.
result = pd.merge(enroll_issuer, county, how='left', left_on='County', right_on='County')
result = result[result['County'].notna()]

print(result.shape)

(4155, 367)


In [7]:
# Identifier columns are written out untouched; every other column is coerced to
# float and kept only if it shows some variation.
keys = ['HIOS ID','Policy County FIPS Code','IssuerId','County','State']
all_cols = [name for name in result.columns if name not in keys]


def _census_to_float(value):
    """Convert one cell to float, mapping the placeholders '-' and 'N' to 0.

    Only a cell that consists solely of the placeholder is replaced. The earlier
    character-level ``str.replace`` also rewrote '-' and 'N' *inside* real
    numbers, which flipped the sign of negatives ('-5.0' -> '05.0') and corrupted
    scientific notation ('1e-05' -> '1e005').

    Raises ValueError for any other non-numeric text, as float() does.
    """
    text = str(value).strip()
    if text in ('-', 'N'):
        return 0.0
    return float(text)


#delete bad columns/clean up census data
all_cols2 = []

for col in all_cols:
    #fix cols from census data
    result[col] = result[col].apply(_census_to_float)

    #clean up cols with no variance
    # (a NaN standard deviation also fails this comparison, so such columns are dropped)
    if result[col].std() > 0:
        all_cols2.append(col)


result[keys + all_cols2].to_csv('../data/processed_data/merged_characteristics_2017.csv',index=False)

# Summary stats

In [8]:
# Number of distinct county FIPS codes in the merged table.
result.loc[:, 'Policy County FIPS Code'].nunique()

2381

In [9]:
# Number of distinct issuers in the merged table.
result.loc[:, 'IssuerId'].nunique()

142

In [10]:
# Column names of the merged table as a Series (handy for inspection/export).
col_names = pd.Series(result.columns.to_list())
#col_names.to_csv('col_names.csv')

In [11]:
# Count of missing values per column, shown as a one-column frame.
result1 = result.isna().sum().to_frame()
result1

Unnamed: 0,0
HIOS ID,0
Policy County FIPS Code,0
Ever Enrolled Count,0
IssuerId,0
County,0
...,...
S1701_C01_057E,6
S1701_C01_058E,6
S1701_C01_059E,6
S1701_C01_060E,6


In [12]:
# Compare the 2017 extract's columns with those of the reference merged file.
merged_characteristics_2017 = pd.read_csv('../data/processed_data/merged_characteristics_2017.csv')
list2 = list(merged_characteristics_2017.columns)
merged_characteristics = pd.read_csv('../data/processed_data/merged_characteristics.csv')
list1 = list(merged_characteristics.columns)

# Columns present in the reference file but absent from the 2017 extract.
# Built with an order-preserving filter instead of a set difference: set
# iteration order for strings varies between interpreter runs, which made the
# order of the columns appended later (and of the saved CSV) non-reproducible.
_cols_2017 = set(list2)
list_missing = [name for name in list1 if name not in _cols_2017]
print("Missing values in first list:", list_missing)

Missing values in first list: ['ChildOnlyOffering_Allows Child-Only', 'InpatientCopaymentMaximumDays_1', 'Average Monthly Advanced CSR Payment for Consumers with 94%', 'County Name', 'DP05_0028PE', 'Total Number of Consumers', 'DP05_0032PE', 'Number of Consumers with CSR AV of 94%', 'Number of Consumers with CSR AV of 87%', 'Number of Consumers with CSR (AV of 73%/87%/94%)', 'OutOfCountryCoverage_YES', 'DP05_0018PE', 'OutOfServiceAreaCoverage_YES', 'PREMIC', 'NationalNetwork_NO', 'PlanType_Indemnity', 'Average Monthly Advanced CSR Payment for Consumers with 87%', 'DP05_0004PE', 'InpatientCopaymentMaximumDays_2', 'Number of Consumers with CSR AV of 73%', 'FIPS County Code', 'Average Monthly Advanced CSR Payment for Consumers with 73%']


In [13]:
# Add every column that exists only in the reference file as a constant 0,
# except 'County Name', which is deliberately left out.
zero_fill_cols = [name for name in list_missing if name != 'County Name']
for name in zero_fill_cols:
    merged_characteristics_2017[name] = 0
merged_characteristics_2017

Unnamed: 0,HIOS ID,Policy County FIPS Code,IssuerId,County,State,Ever Enrolled Count,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,CSRVariationTypeBinary,...,OutOfServiceAreaCoverage_YES,PREMIC,NationalNetwork_NO,PlanType_Indemnity,Average Monthly Advanced CSR Payment for Consumers with 87%,DP05_0004PE,InpatientCopaymentMaximumDays_2,Number of Consumers with CSR AV of 73%,FIPS County Code,Average Monthly Advanced CSR Payment for Consumers with 73%
0,37903,5007,37903.0,5007,AR,0.0,100.0,0.0,0.0000,3.750000,...,0,0,0,0,0,0,0,0,0,0
1,37903,5009,37903.0,5009,AR,0.0,100.0,0.0,0.0000,3.750000,...,0,0,0,0,0,0,0,0,0,0
2,37903,5015,37903.0,5015,AR,0.0,100.0,0.0,0.0000,3.750000,...,0,0,0,0,0,0,0,0,0,0
3,37903,5035,37903.0,5035,AR,0.0,100.0,0.0,0.0000,3.750000,...,0,0,0,0,0,0,0,0,0,0
4,37903,5037,37903.0,5037,AR,0.0,100.0,0.0,0.0000,3.750000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4150,50328,54099,50328.0,54099,WV,206.0,100.0,0.0,0.3000,3.700000,...,0,0,0,0,0,0,0,0,0,0
4151,50328,54103,50328.0,54103,WV,67.0,100.0,0.0,0.3000,3.700000,...,0,0,0,0,0,0,0,0,0,0
4152,50328,54105,50328.0,54105,WV,54.0,100.0,0.0,0.3000,3.700000,...,0,0,0,0,0,0,0,0,0,0
4153,50328,54107,50328.0,54107,WV,644.0,100.0,0.0,0.3000,3.700000,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Overwrite the 2017 extract on disk with the current frame (no index column).
merged_characteristics_2017.to_csv(
    path_or_buf='../data/processed_data/merged_characteristics_2017.csv',
    index=False,
)