In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
data_path = "../data/data_2017/"

# Load the enrollment PUF ('*' entries are read as missing), keep the three
# columns used downstream, and replace missing entries with the string '0'.
# County FIPS codes are then normalised to integer-valued strings and the
# enrollment counts are parsed to floats after stripping thousands separators.
enroll_df = (
    pd.read_csv(
        f"{data_path}2017-Enrollment-Disenrollment-PUF.csv",
        na_values=['*'],
        encoding='cp1252',
    )[['HIOS ID', 'Policy County FIPS Code', 'Ever Enrolled Count']]
    .fillna('0')
    .assign(**{
        'Policy County FIPS Code': lambda frame: frame['Policy County FIPS Code'].map(
            lambda fips: str(int(fips))
        ),
        'Ever Enrolled Count': lambda frame: frame['Ever Enrolled Count'].map(
            lambda count: float(str(count).replace(',', ''))
        ),
    })
)

# Enrollment with issuer characteristics

In [3]:
# Service-area PUF reduced to the unique (County, ServiceAreaId, IssuerId) rows.
service_df = (
    pd.read_csv(f"{data_path}Service_Area_PUF_2017.csv", encoding='cp1252')
    .loc[:, ['County', 'ServiceAreaId', 'IssuerId']]
    .drop_duplicates()
)

# Pre-computed issuer characteristics.
issuer_df = pd.read_csv('../data/processed_data/issuer_characteristics_2017.csv')

# Normalise county FIPS codes to integer-valued strings; rows with no county
# end up with the code '0'.
service_df['County'] = service_df['County'].fillna(0).map(lambda fips: str(int(fips)))

In [4]:
# Attach a county to every issuer-characteristics row through its service area.
issuer_service = issuer_df.merge(service_df, how='inner', on=['ServiceAreaId', 'IssuerId'])
# NOTE(review): if 'County' was already converted to strings before this cell,
# no value is NaN here and this filter keeps every row - confirm the intent.
issuer_service = issuer_service[issuer_service['County'].notna()]

# Reorder the columns and drop the service area: keep IssuerId and County up
# front, followed by everything except the first two and the last merged column.
# NOTE(review): this is positional - it presumes the first two merged columns
# are IssuerId/ServiceAreaId and that County is the last one; verify.
col_order = ['IssuerId', 'County'] + list(issuer_service.columns)[2:-1]
issuer_service = issuer_service[col_order]

plan_keys = ['IssuerId', 'County', 'StandardComponentId', 'StateCode']
county_keys = ['IssuerId', 'County', 'StateCode']

# Step 1: collapse to one row per plan (StandardComponentId) within a county,
# using the median of each numeric characteristic.
# numeric_only=True is required on pandas >= 2.0, where aggregating a
# non-numeric column raises TypeError instead of silently dropping it.
issuer_service = issuer_service.groupby(plan_keys, as_index=False).median(numeric_only=True)

# Step 2: collapse to one row per issuer/county. Count the plans first, then
# average the numeric characteristics (StandardComponentId is a non-numeric
# column at this point and must be excluded from the mean).
by_county = issuer_service.groupby(county_keys, as_index=False)
issuer_service_count = by_county['StandardComponentId'].count()
issuer_service = by_county.mean(numeric_only=True)

# Both frames come from the same grouping, so their rows line up one-to-one.
issuer_service['Plan Counts'] = issuer_service_count['StandardComponentId']
issuer_service = issuer_service.rename(columns={'StateCode': 'State'})
print(issuer_service.shape)

(4592, 132)


In [5]:
# Left-join the enrollment rows onto the issuer/county characteristics so that
# every enrollment row is kept, matched or not.
enroll_issuer = pd.merge(
    enroll_df,
    issuer_service,
    how='left',
    left_on=['Policy County FIPS Code', 'HIOS ID'],
    right_on=['County', 'IssuerId'],
)

# Overwrite the right-hand key columns with the enrollment keys (they are
# empty for unmatched rows), then zero-fill everything that is still missing.
enroll_issuer = enroll_issuer.assign(
    County=enroll_issuer['Policy County FIPS Code'],
    IssuerId=enroll_issuer['HIOS ID'],
).fillna(0)
print(enroll_issuer.shape)

(5698, 135)


# Merge with county characteristics

In [6]:
# County-level characteristics; the key is converted to text so it can be
# compared with the string county codes on the enrollment side.
county = pd.read_csv('../data/processed_data/county_characteristics_2017.csv')
county['County'] = county['County'].map(str)

# Keep every enrollment/issuer row and attach county columns where available.
result = pd.merge(enroll_issuer, county, how='left', left_on='County', right_on='County')
result = result.loc[result['County'].notna()]

print(result.shape)

(5698, 423)


In [7]:
# Identifier columns are exported as-is; every other column is coerced to float.
all_cols = list(result.columns)
keys = ['HIOS ID', 'Policy County FIPS Code', 'IssuerId', 'County', 'State']
for key in keys:
    all_cols.remove(key)


def _census_to_float(value):
    """Convert one cell to float, mapping the placeholders '-' and 'N' to 0.0.

    Only a cell that consists solely of '-' or 'N' (ignoring surrounding
    whitespace) is treated as a placeholder. Replacing those characters
    anywhere in the text would corrupt legitimate numbers: '-1.5' would lose
    its sign and '1e-05' would turn into '1e005' (100000.0).

    Raises:
        ValueError: if the cell is neither a placeholder nor a valid number.
    """
    text = str(value).strip()
    if text in ('-', 'N'):
        return 0.0
    return float(text)


# Clean up the census data and delete bad columns.
all_cols2 = []

for col in all_cols:
    # fix cols from census data
    result[col] = result[col].map(_census_to_float)

    # keep only columns with some variance (a NaN std also fails this test)
    if result[col].std() > 0:
        all_cols2.append(col)


result[keys + all_cols2].to_csv('../data/processed_data/merged_characteristics_2017.csv', index=False)

# Summary stats

In [8]:
# Sum of 'Ever Enrolled Count' over all rows of the merged frame.
result['Ever Enrolled Count'].sum()

9167801.0

In [9]:
# Number of distinct county FIPS codes in the merged frame.
result['Policy County FIPS Code'].nunique()

2725

In [10]:
# Number of distinct issuer ids in the merged frame.
result['IssuerId'].nunique()

167

In [11]:
# Column names of the merged frame as a Series; the CSV export is disabled.
col_names = pd.Series(result.columns)
#col_names.to_csv('col_names.csv')

In [12]:
# Count of missing values per column of the merged frame, as a one-column table.
result1 = result.isna().sum().to_frame()
result1

Unnamed: 0,0
HIOS ID,0
Policy County FIPS Code,0
Ever Enrolled Count,0
IssuerId,0
County,0
...,...
S1701_C01_057E,11
S1701_C01_058E,11
S1701_C01_059E,11
S1701_C01_060E,11


In [13]:
# Compare the columns of the 2017 export with those of the reference
# merged_characteristics file.
merged_characteristics_2017 = pd.read_csv('../data/processed_data/merged_characteristics_2017.csv')
list2 = list(merged_characteristics_2017.columns)
merged_characteristics = pd.read_csv('../data/processed_data/merged_characteristics.csv')
list1 = list(merged_characteristics.columns)

# Columns present in the reference file but absent from the 2017 file.
# Built in reference-file order rather than from a set difference: iterating
# a set of strings follows hash order, which changes between interpreter
# sessions and would make the order of the appended columns irreproducible.
columns_2017 = set(list2)
list_missing = [name for name in list1 if name not in columns_2017]
print("Missing values in first list:", list_missing)

Missing values in first list: ['FIPS County Code', 'IssuerId84670', 'PREMIC', 'BusinessYear_2016', 'Number of Consumers with CSR (AV of 73%/87%/94%)', 'IssuerId68781', 'StateCodeSC', 'County Name', 'IssuerId56503', 'IssuerId14002', 'ChildOnlyOffering_Allows Child-Only', 'Average Monthly Advanced CSR Payment for Consumers with 87%', 'Number of Consumers with CSR AV of 87%', 'DP05_0032PE', 'IssuerId27357', 'Average Monthly Advanced CSR Payment for Consumers with 94%', 'InpatientCopaymentMaximumDays_2', 'Number of Consumers with CSR AV of 73%', 'Number of Consumers with CSR AV of 94%', 'OutOfCountryCoverage_YES', 'DP05_0028PE', 'DP05_0018PE', 'NationalNetwork_NO', 'PlanType_Indemnity', 'DP05_0004PE', 'InpatientCopaymentMaximumDays_1', 'Average Monthly Advanced CSR Payment for Consumers with 73%', 'Total Number of Consumers', 'StateCodeAL', 'OutOfServiceAreaCoverage_YES']


In [14]:
# Add each column that the 2017 file lacks as a constant-zero column;
# 'County Name' is the one name that is skipped.
for missing_col in (name for name in list_missing if name != 'County Name'):
    merged_characteristics_2017[missing_col] = 0
merged_characteristics_2017

Unnamed: 0,HIOS ID,Policy County FIPS Code,IssuerId,County,State,Ever Enrolled Count,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,CSRVariationTypeOther,...,DP05_0028PE,DP05_0018PE,NationalNetwork_NO,PlanType_Indemnity,DP05_0004PE,InpatientCopaymentMaximumDays_1,Average Monthly Advanced CSR Payment for Consumers with 73%,Total Number of Consumers,StateCodeAL,OutOfServiceAreaCoverage_YES
0,38344,2013,38344,2013,0,28.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,38344,2016,38344,2016,0,37.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,38344,2020,38344,2020,0,7601.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,38344,2050,38344,2050,0,43.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,38344,2060,38344,2060,0,27.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5693,11269,56037,11269,56037,0,1189.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5694,11269,56039,11269,56039,0,2906.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5695,11269,56041,11269,56041,0,669.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5696,11269,56043,11269,56043,0,369.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Write the frame back to the same path it was read from, replacing the
# earlier export, without the index column.
merged_characteristics_2017.to_csv('../data/processed_data/merged_characteristics_2017.csv',index=False)