In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the two 2019 public-use files from the shared data folder, plus the
# per-column summary table that sits next to this notebook.
data_path = "../data/"
service_df = pd.read_csv(
    data_path + 'Service_Area_PUF_2019.csv',
    encoding='cp1252',
)
plan_df = pd.read_csv(
    data_path + 'Plan_Attributes_PUF_2019.csv',
    encoding='cp1252',
)
plan_summary = pd.read_csv('plan_summary.csv', index_col=None)

In [3]:
# Select the column names that plan_summary reports with zero missing values
# and at most 34 distinct values, then append the two identifier columns that
# later cells merge and group on.
no_missing_values = plan_summary.loc[
    (plan_summary['Missing_Values'] == 0) & (plan_summary['Unique_Values'] <= 34),
    'Column_Name',
].to_list()
no_missing_values += ['IssuerId','ServiceAreaId']
no_missing_values

['BusinessYear',
 'SourceName',
 'MarketCoverage',
 'DentalOnlyPlan',
 'IsNewPlan',
 'PlanType',
 'MetalLevel',
 'DesignType',
 'QHPNonQHPTypeId',
 'CompositeRatingOffered',
 'ChildOnlyOffering',
 'PlanEffectiveDate',
 'OutOfCountryCoverage',
 'OutOfServiceAreaCoverage',
 'NationalNetwork',
 'CSRVariationType',
 'MultipleInNetworkTiers',
 'FirstTierUtilization',
 'InpatientCopaymentMaximumDays',
 'BeginPrimaryCareCostSharingAfterNumberOfVisits',
 'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays',
 'IssuerId',
 'ServiceAreaId']

In [4]:
# Restrict plan_df to the selected columns and treat three of them as continuous.
# .copy() makes cleaned_plan_df an independent frame, so the column assignments
# below (and in later cells) write to it directly instead of to a slice of plan_df.
cleaned_plan_df = plan_df[no_missing_values].copy()
continuous = ['FirstTierUtilization','BeginPrimaryCareCostSharingAfterNumberOfVisits','BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays']
# FirstTierUtilization is text containing a percent sign; remove it as a literal
# character (regex=False) so the value can be parsed as a number.
cleaned_plan_df['FirstTierUtilization'] = cleaned_plan_df['FirstTierUtilization'].str.replace('%', '', regex=False)
# Round-trip through str so every value is parsed by the same float conversion.
cleaned_plan_df[continuous] = cleaned_plan_df[continuous].astype(str).astype(float)

In [5]:
# Encode CSRVariationType as an integer code. Despite the "Binary" suffix the
# new column takes five values: the four most frequent categories get 1-4
# (value_counts sorts by descending frequency) and every other category gets 5.
categories = cleaned_plan_df['CSRVariationType'].value_counts()
four_largest = categories.head(4).index.to_list()
csr_codes = {label: code for code, label in enumerate(four_largest, start=1)}
cleaned_plan_df['CSRVariationTypeBinary'] = (
    cleaned_plan_df['CSRVariationType']
    .map(csr_codes)    # labels outside the top four become NaN
    .fillna(5)         # ...and are lumped together under code 5
    .astype('int64')
)
cleaned_plan_df

Unnamed: 0,BusinessYear,SourceName,MarketCoverage,DentalOnlyPlan,IsNewPlan,PlanType,MetalLevel,DesignType,QHPNonQHPTypeId,CompositeRatingOffered,...,NationalNetwork,CSRVariationType,MultipleInNetworkTiers,FirstTierUtilization,InpatientCopaymentMaximumDays,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,CSRVariationTypeBinary
0,2019,HIOS,Individual,No,New,PPO,Silver,Not Applicable,On the Exchange,No,...,No,Zero Cost Sharing Plan Variation,No,100.0,0,0.0,0.0,38344,AKS001,2
1,2019,HIOS,SHOP (Small Group),Yes,Existing,Indemnity,High,Not Applicable,Off the Exchange,No,...,Yes,Standard High Off Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,5
2,2019,HIOS,Individual,Yes,Existing,Indemnity,Low,Not Applicable,Both,No,...,Yes,Standard Low Off Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,5
3,2019,HIOS,Individual,Yes,Existing,Indemnity,Low,Not Applicable,Both,No,...,Yes,Standard Low On Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,5
4,2019,HIOS,SHOP (Small Group),Yes,Existing,Indemnity,High,Not Applicable,Off the Exchange,No,...,Yes,Standard High Off Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15685,2019,SERFF,Individual,Yes,Existing,PPO,Low,Not Applicable,Off the Exchange,No,...,Yes,Standard Low Off Exchange Plan,No,100.0,0,0.0,0.0,76526,WVS001,5
15686,2019,SERFF,Individual,Yes,Existing,PPO,High,Not Applicable,Off the Exchange,No,...,Yes,Standard High Off Exchange Plan,No,100.0,0,0.0,0.0,76526,WVS001,5
15687,2019,SERFF,Individual,Yes,Existing,PPO,Low,Not Applicable,On the Exchange,No,...,Yes,Standard Low On Exchange Plan,No,100.0,0,0.0,0.0,76526,WVS001,5
15688,2019,SERFF,Individual,Yes,Existing,PPO,High,Not Applicable,On the Exchange,No,...,Yes,Standard High On Exchange Plan,No,100.0,0,0.0,0.0,76526,WVS001,5


In [6]:
# One-hot encode the categorical plan attributes. get_dummies replaces each
# listed column with one indicator column per distinct value, so re-running
# this cell on its own fails: the source columns no longer exist afterwards.
dummy_cols = [
    'BusinessYear',
    'SourceName',
    'MarketCoverage',
    'DentalOnlyPlan',
    'IsNewPlan',
    'PlanType',
    'MetalLevel',
    'DesignType',
    'QHPNonQHPTypeId',
    'CompositeRatingOffered',
    'ChildOnlyOffering',
    'PlanEffectiveDate',
    'OutOfCountryCoverage',
    'OutOfServiceAreaCoverage',
    'NationalNetwork',
    'MultipleInNetworkTiers',
    'InpatientCopaymentMaximumDays',
]
# Pin the indicator dtype to uint8 (the historical pandas default). pandas >= 2.0
# defaults to bool, which would no longer yield 0/1 columns and would turn into
# object dtype once the later left merge introduces missing values.
cleaned_plan_df = pd.get_dummies(cleaned_plan_df, columns=dummy_cols, dtype='uint8')
cleaned_plan_df

Unnamed: 0,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,CSRVariationTypeBinary,BusinessYear_2019,SourceName_HIOS,SourceName_OPM,...,NationalNetwork_No,NationalNetwork_Yes,MultipleInNetworkTiers_No,MultipleInNetworkTiers_Yes,InpatientCopaymentMaximumDays_0,InpatientCopaymentMaximumDays_2,InpatientCopaymentMaximumDays_3,InpatientCopaymentMaximumDays_4,InpatientCopaymentMaximumDays_5,InpatientCopaymentMaximumDays_10
0,Zero Cost Sharing Plan Variation,100.0,0.0,0.0,38344,AKS001,2,1,1,0,...,1,0,1,0,1,0,0,0,0,0
1,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
2,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
3,Standard Low On Exchange Plan,100.0,0.0,0.0,21989,AKS001,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
4,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15685,Standard Low Off Exchange Plan,100.0,0.0,0.0,76526,WVS001,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0
15686,Standard High Off Exchange Plan,100.0,0.0,0.0,76526,WVS001,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0
15687,Standard Low On Exchange Plan,100.0,0.0,0.0,76526,WVS001,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0
15688,Standard High On Exchange Plan,100.0,0.0,0.0,76526,WVS001,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0


In [7]:
# Left-join the encoded plan attributes onto the service-area rows, matching on
# issuer and service area. Every row of service_df is kept, whether or not a
# plan matches it.
join_keys = ['IssuerId','ServiceAreaId']
service_df2 = service_df[join_keys]
merged = service_df2.merge(cleaned_plan_df, how='left', on=join_keys)
merged

Unnamed: 0,IssuerId,ServiceAreaId,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,CSRVariationTypeBinary,BusinessYear_2019,SourceName_HIOS,SourceName_OPM,...,NationalNetwork_No,NationalNetwork_Yes,MultipleInNetworkTiers_No,MultipleInNetworkTiers_Yes,InpatientCopaymentMaximumDays_0,InpatientCopaymentMaximumDays_2,InpatientCopaymentMaximumDays_3,InpatientCopaymentMaximumDays_4,InpatientCopaymentMaximumDays_5,InpatientCopaymentMaximumDays_10
0,21989,AKS001,Standard High Off Exchange Plan,100.0,0.0,0.0,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
1,21989,AKS001,Standard Low Off Exchange Plan,100.0,0.0,0.0,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
2,21989,AKS001,Standard Low On Exchange Plan,100.0,0.0,0.0,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
3,21989,AKS001,Standard High Off Exchange Plan,100.0,0.0,0.0,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
4,21989,AKS001,Standard High Off Exchange Plan,100.0,0.0,0.0,5,1,1,0,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213187,76526,WVS001,Standard Low Off Exchange Plan,100.0,0.0,0.0,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0
213188,76526,WVS001,Standard High Off Exchange Plan,100.0,0.0,0.0,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0
213189,76526,WVS001,Standard Low On Exchange Plan,100.0,0.0,0.0,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0
213190,76526,WVS001,Standard High On Exchange Plan,100.0,0.0,0.0,5,1,0,0,...,0,1,1,0,1,0,0,0,0,0


In [8]:
# Average every numeric column within each (IssuerId, ServiceAreaId) pair.
# numeric_only=True leaves out the remaining text column (CSRVariationType)
# explicitly. Older pandas dropped it silently; pandas >= 2.0 raises a
# TypeError when asked to average strings.
grouped = merged.groupby(['IssuerId', 'ServiceAreaId']).mean(numeric_only=True)
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,CSRVariationTypeBinary,BusinessYear_2019,SourceName_HIOS,SourceName_OPM,SourceName_SERFF,MarketCoverage_Individual,MarketCoverage_SHOP (Small Group),...,NationalNetwork_No,NationalNetwork_Yes,MultipleInNetworkTiers_No,MultipleInNetworkTiers_Yes,InpatientCopaymentMaximumDays_0,InpatientCopaymentMaximumDays_2,InpatientCopaymentMaximumDays_3,InpatientCopaymentMaximumDays_4,InpatientCopaymentMaximumDays_5,InpatientCopaymentMaximumDays_10
IssuerId,ServiceAreaId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10046,HIS001,100.0,0.000000,0.000000,5.000000,1.0,0.0,0.0,1.0,1.000000,0.000000,...,0.000000,1.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10091,ORS002,100.0,0.000000,0.000000,5.000000,1.0,0.0,0.0,1.0,1.000000,0.000000,...,0.000000,1.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10091,ORS003,100.0,0.285714,0.000000,3.523810,1.0,0.0,0.0,1.0,1.000000,0.000000,...,0.000000,1.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10091,ORS005,100.0,0.285714,0.000000,3.523810,1.0,0.0,0.0,1.0,1.000000,0.000000,...,0.000000,1.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10207,VAS001,100.0,0.000000,0.000000,3.947368,1.0,0.0,0.0,1.0,0.684211,0.315789,...,0.894737,0.105263,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99969,OHS004,100.0,0.000000,1.108696,3.521739,1.0,0.0,0.0,1.0,1.000000,0.000000,...,1.000000,0.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
99969,OHS005,100.0,0.000000,1.108696,3.521739,1.0,0.0,0.0,1.0,1.000000,0.000000,...,1.000000,0.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
99969,OHS007,100.0,0.000000,1.108696,3.521739,1.0,0.0,0.0,1.0,1.000000,0.000000,...,1.000000,0.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
99969,OHS009,100.0,0.000000,1.108696,3.521739,1.0,0.0,0.0,1.0,1.000000,0.000000,...,1.000000,0.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Read the table in issuer_numeric.csv and discard its leftover
# 'Unnamed: 0' column, which is an index saved by an earlier export.
issuer_numeric = pd.read_csv('issuer_numeric.csv', index_col=None).drop(columns=['Unnamed: 0'])
# Left-join those features onto the averaged plan attributes, remove rows that
# are exact duplicates, and fill every remaining missing value with 0.
grouped = (
    grouped
    .merge(issuer_numeric, how='left', on=['IssuerId', 'ServiceAreaId'])
    .drop_duplicates()
    .fillna(0)
)
grouped

Unnamed: 0,IssuerId,ServiceAreaId,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,CSRVariationTypeBinary,BusinessYear_2019,SourceName_HIOS,SourceName_OPM,SourceName_SERFF,...,TEHBCombInnOonFamilyPerPersonMOOP,TEHBCombInnOonFamilyPerGroupMOOP,TEHBInnTier1IndividualMOOP,SBCHavingDiabetesCopayment,TEHBCombInnOonIndividualMOOP,SBCHavingaBabyCopayment,TEHBOutOfNetFamilyPerGroupMOOP,TEHBOutOfNetIndividualMOOP,TEHBInnTier1FamilyPerPersonMOOP,TEHBOutOfNetFamilyPerPersonMOOP
0,10046,HIS001,100.0,0.0,0.000000,5.000000,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10091,ORS002,100.0,0.0,0.000000,5.000000,1.0,0.0,0.0,1.0,...,0.0,0.0,6.0,0.0,0.0,0.0,40000.0,20.0,6850.0,20000.0
10,10091,ORS002,100.0,0.0,0.000000,5.000000,1.0,0.0,0.0,1.0,...,0.0,0.0,6.0,60.0,0.0,20.0,40000.0,20.0,6350.0,20000.0
12,10091,ORS002,100.0,0.0,0.000000,5.000000,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,10091,ORS002,100.0,0.0,0.000000,5.000000,1.0,0.0,0.0,1.0,...,0.0,0.0,6.0,500.0,0.0,20.0,30000.0,15.0,6350.0,15000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12980,99969,OHS005,100.0,0.0,1.108696,3.521739,1.0,0.0,0.0,1.0,...,0.0,0.0,500.0,0.0,0.0,0.0,0.0,0.0,500.0,0.0
12988,99969,OHS005,100.0,0.0,1.108696,3.521739,1.0,0.0,0.0,1.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,6000.0,0.0
12996,99969,OHS007,100.0,0.0,1.108696,3.521739,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12997,99969,OHS009,100.0,0.0,1.108696,3.521739,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Write the combined table to disk without the row index.
grouped.to_csv(path_or_buf='issuer_characteristics.csv', index=False)