In [1]:
import pandas as pd
import numpy as np
import warnings
# Install a global "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings
# emitted by the cells below — presumably done to keep cell output clean.
warnings.filterwarnings("ignore")

In [2]:
# Directory holding the raw 2016 input files used by this notebook.
data_path = "../data/data_2016/"

# Only these three columns of the enrollment report are kept.
enroll_cols = ['HIOS ID', 'Policy County FIPS Code', 'Ever Enrolled Count']


def _parse_count(raw):
    """Turn one count string into an int after removing ',' characters."""
    return int(raw.replace(',', ''))


# '*' cells are read as missing, then every missing cell in the kept columns
# is filled with the string '0'.
enroll_raw = pd.read_csv(
    data_path + '2016-Issuer-Enrollment-Disenrollment-Report.csv',
    na_values=['*'],
)
enroll_df = enroll_raw[enroll_cols].fillna('0')

# clean the column: count strings -> integers
enroll_df['Ever Enrolled Count'] = enroll_df['Ever Enrolled Count'].map(_parse_count)

# Enrollment with issuer characteristics

In [3]:
# Service-area file: keep the (County, ServiceAreaId, IssuerId) triples and
# collapse exact duplicate rows.
service_df = (
    pd.read_csv(data_path + 'ServiceArea_PUF_2016.csv', encoding='cp1252')
    .loc[:, ['County', 'ServiceAreaId', 'IssuerId']]
    .drop_duplicates()
)

# Issuer characteristics table read from the processed-data directory.
issuer_df = pd.read_csv('../data/processed_data/issuer_characteristics.csv')

In [4]:
# Attach the counties of each (ServiceAreaId, IssuerId) pair to the issuer
# characteristics, then keep only rows that carry a county.
issuer_service = issuer_df.merge(service_df, how='inner', on=['ServiceAreaId', 'IssuerId'])
issuer_service = issuer_service[issuer_service['County'].notna()]

# reorder the columns, drop service area
# NOTE(review): this slice is positional. It skips the first two columns and
# the last one ('County', appended by the merge) and re-adds 'IssuerId' and
# 'County' up front. It presumably relies on issuer_characteristics.csv
# starting with IssuerId and ServiceAreaId — confirm against that file.
col_order = ['IssuerId', 'County'] + list(issuer_service.columns)[2:-1]
issuer_service = issuer_service[col_order]

# group by county: median within each (issuer, county, StandardComponentId)
# group, then mean across the groups of each (issuer, county).
# numeric_only=True is passed explicitly: older pandas silently dropped
# non-numeric columns from these aggregations, while pandas >= 2.0 raises a
# TypeError instead. Being explicit yields the same result on both.
issuer_service = (
    issuer_service
    .groupby(['IssuerId', 'County', 'StandardComponentId'], as_index=False)
    .median(numeric_only=True)
)
issuer_service = (
    issuer_service
    .groupby(['IssuerId', 'County'], as_index=False)
    .mean(numeric_only=True)
)
print(issuer_service.shape)

(5904, 77)


In [5]:
# Left-join enrollment onto the issuer/county table, pairing
# (Policy County FIPS Code, HIOS ID) with (County, IssuerId).
enroll_issuer = pd.merge(
    enroll_df,
    issuer_service,
    how='left',
    left_on=['Policy County FIPS Code', 'HIOS ID'],
    right_on=['County', 'IssuerId'],
)

# Keep only the enrollment rows that found an issuer match.
matched_rows = enroll_issuer['IssuerId'].notna()
enroll_issuer = enroll_issuer[matched_rows]
print(enroll_issuer.shape)

(5638, 80)


# Merge with county characteristics

In [6]:
# County characteristics table read from the processed-data directory.
county = pd.read_csv('../data/processed_data/county_characteristics.csv')

# Left-join on 'County', then discard rows with no 'FIPS County Code',
# i.e. rows for which the county table supplied nothing.
result = pd.merge(enroll_issuer, county, how='left', left_on='County', right_on='County')
has_county_info = result['FIPS County Code'].notna()
result = result[has_county_info]

In [7]:
def _census_to_float(value):
    """Convert one raw cell to float, mapping the placeholders '-' and 'N' to 0.0.

    Only a cell that consists entirely of '-' or 'N' (ignoring surrounding
    whitespace) is treated as a placeholder. Replacing those characters
    anywhere in the text would corrupt real numbers: '-0.5' would become 0.5
    and '1e-05' would become 100000.0.
    """
    text = str(value).strip()
    if text in ('-', 'N'):
        return 0.0
    return float(text)


# Identifier columns are carried through unchanged; every other column is
# treated as a numeric feature.
all_cols = list(result.columns)
keys = ['HIOS ID', 'Policy County FIPS Code', 'IssuerId', 'County', 'State', 'FIPS County Code', 'County Name']
for key in keys:
    all_cols.remove(key)

# delete bad columns/clean up census data
all_cols2 = []

for col in all_cols:
    # fix cols from census data: placeholder cells become 0, the rest float
    result[col] = result[col].apply(_census_to_float)

    # clean up cols with no variance (an all-NaN column has NaN std and is
    # therefore dropped as well)
    if result[col].std() > 0:
        all_cols2.append(col)


# Persist identifiers plus the surviving feature columns.
result[keys + all_cols2].to_csv('../data/processed_data/merged_characteristics.csv', index=False)

# Summary stats

In [8]:
# Number of distinct non-null 'Policy County FIPS Code' values in the merged table.
result['Policy County FIPS Code'].nunique()

2132

In [9]:
# Number of distinct non-null 'IssuerId' values in the merged table.
result['IssuerId'].nunique()

185

In [10]:
# Column labels of the merged table, held as a Series for inspection.
col_names = pd.Series(result.columns)
# Optional export of the column list (disabled):
#col_names.to_csv('col_names.csv')

In [11]:
# Missing-value count for every column of the merged table, as a one-column frame.
result1 = result.isna().sum().to_frame()
result1

Unnamed: 0,0
HIOS ID,0
Policy County FIPS Code,0
Ever Enrolled Count,0
IssuerId,0
County,0
...,...
Number of Consumers with CSR AV of 87%,0
Number of Consumers with CSR AV of 94%,0
Average Monthly Advanced CSR Payment for Consumers with 73%,0
Average Monthly Advanced CSR Payment for Consumers with 87%,0
