In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
plan_df = pd.read_csv('../data/data_2016/Plan_Attributes_PUF_2016.csv', encoding='cp1252')
plan_summary = pd.read_csv('../data/processed_data/plan_summary.csv', index_col=None)

print(plan_df.shape)

(26993, 151)


In [3]:
no_missing_values = plan_summary[(plan_summary['Missing_Values'] == 0) & 
                                 (plan_summary['Unique_Values'] <= 34)]['Column_Name'].to_list()
no_missing_values.extend(['IssuerId','ServiceAreaId','StandardComponentId','StateCode'])
no_missing_values.remove('PlanEffectiveDate')
no_missing_values.remove('DesignType')
print(no_missing_values)

['BusinessYear', 'SourceName', 'MarketCoverage', 'DentalOnlyPlan', 'IsNewPlan', 'PlanType', 'MetalLevel', 'QHPNonQHPTypeId', 'CompositeRatingOffered', 'ChildOnlyOffering', 'OutOfCountryCoverage', 'OutOfServiceAreaCoverage', 'NationalNetwork', 'CSRVariationType', 'MultipleInNetworkTiers', 'FirstTierUtilization', 'InpatientCopaymentMaximumDays', 'BeginPrimaryCareCostSharingAfterNumberOfVisits', 'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays', 'IssuerId', 'ServiceAreaId', 'StandardComponentId', 'StateCode']


In [4]:
# Treat columns as continous
cleaned_plan_df = plan_df[no_missing_values]
continuous = ['FirstTierUtilization','BeginPrimaryCareCostSharingAfterNumberOfVisits',
              'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays']
cleaned_plan_df['FirstTierUtilization'] = cleaned_plan_df['FirstTierUtilization'].str.replace('%','')
cleaned_plan_df[continuous] = cleaned_plan_df[continuous].astype(str).astype(float) 

In [5]:
# Get CSRVariationType binary
categories = cleaned_plan_df['CSRVariationType'].value_counts()
four_largest = categories.head(4).index.to_list()
rest = categories[4:].to_list()
m = 1 
cleaned_plan_df['CSRVariationTypeBinary'] = 0
for j in range(len(four_largest)):
    cleaned_plan_df.loc[cleaned_plan_df.CSRVariationType == four_largest[j], 'CSRVariationTypeBinary'] = m
    m += 1
cleaned_plan_df.loc[cleaned_plan_df.CSRVariationTypeBinary == 0, 'CSRVariationTypeBinary'] = 5
cleaned_plan_df

Unnamed: 0,BusinessYear,SourceName,MarketCoverage,DentalOnlyPlan,IsNewPlan,PlanType,MetalLevel,QHPNonQHPTypeId,CompositeRatingOffered,ChildOnlyOffering,...,MultipleInNetworkTiers,FirstTierUtilization,InpatientCopaymentMaximumDays,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,StandardComponentId,StateCode,CSRVariationTypeBinary
0,2016,HIOS,SHOP (Small Group),Yes,New,Indemnity,High,Off the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0080001,AK,5
1,2016,HIOS,Individual,Yes,Existing,Indemnity,Low,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5
2,2016,HIOS,Individual,Yes,Existing,Indemnity,Low,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5
3,2016,HIOS,Individual,Yes,New,PPO,High,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS002,21989AK0050001,AK,5
4,2016,HIOS,SHOP (Small Group),Yes,New,Indemnity,High,Off the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0080002,AK,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,2016,SERFF,SHOP (Small Group),Yes,New,PPO,Low,Off the Exchange,No,Allows Child-Only,...,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0110003,WV,5
26989,2016,SERFF,SHOP (Small Group),Yes,New,PPO,High,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0070003,WV,5
26990,2016,SERFF,SHOP (Small Group),Yes,New,PPO,High,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0070003,WV,5
26991,2016,SERFF,SHOP (Small Group),Yes,New,PPO,Low,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0090003,WV,5


In [6]:
dummy_cols = ['BusinessYear',
 'SourceName',
 'MarketCoverage',
 'DentalOnlyPlan',
 'IsNewPlan',
 'PlanType',
 'MetalLevel',
 'QHPNonQHPTypeId',
 'CompositeRatingOffered',
 'ChildOnlyOffering',
 'OutOfCountryCoverage',
 'OutOfServiceAreaCoverage',
 'NationalNetwork',
 'MultipleInNetworkTiers',
 'InpatientCopaymentMaximumDays']
cleaned_plan_df = pd.get_dummies(cleaned_plan_df, columns = dummy_cols)
cleaned_plan_df

Unnamed: 0,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,StandardComponentId,StateCode,CSRVariationTypeBinary,BusinessYear_2016,...,NationalNetwork_YEs,NationalNetwork_Yes,MultipleInNetworkTiers_No,MultipleInNetworkTiers_Yes,InpatientCopaymentMaximumDays_0,InpatientCopaymentMaximumDays_1,InpatientCopaymentMaximumDays_2,InpatientCopaymentMaximumDays_3,InpatientCopaymentMaximumDays_4,InpatientCopaymentMaximumDays_5
0,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0080001,AK,5,1,...,0,1,1,0,1,0,0,0,0,0
1,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5,1,...,0,1,1,0,1,0,0,0,0,0
2,Standard Low On Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5,1,...,0,1,1,0,1,0,0,0,0,0
3,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS002,21989AK0050001,AK,5,1,...,0,1,1,0,1,0,0,0,0,0
4,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0080002,AK,5,1,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0110003,WV,5,1,...,0,1,1,0,1,0,0,0,0,0
26989,Standard High Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0070003,WV,5,1,...,0,1,1,0,1,0,0,0,0,0
26990,Standard High On Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0070003,WV,5,1,...,0,1,1,0,1,0,0,0,0,0
26991,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0090003,WV,5,1,...,0,1,1,0,1,0,0,0,0,0


In [7]:
issuer_numeric = pd.read_csv('../data/processed_data/issuer_numeric.csv')
issuer_numeric = issuer_numeric.iloc[: , 1:]
merged_df = cleaned_plan_df.merge(issuer_numeric, left_index=True,right_index=True,suffixes=('', '_y'))

#clean up the columns 
relevant_cols = list(merged_df.columns)
for word in ['IssuerId','ServiceAreaId','IssuerId_y','ServiceAreaId_y']:
    relevant_cols.remove(word)
relevant_cols = ['IssuerId','ServiceAreaId','StandardComponentId','StateCode'] + relevant_cols 
merged_df = merged_df[relevant_cols]
merged_df

Unnamed: 0,IssuerId,ServiceAreaId,StandardComponentId,StateCode,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,StandardComponentId.1,StateCode.1,...,SBCHavingDiabetesCoinsurance,TEHBCombInnOonFamilyPerPersonMOOP,SBCHavingaBabyCopayment,TEHBOutOfNetFamilyPerGroupMOOP,TEHBCombInnOonIndividualMOOP,SBCHavingaBabyCoinsurance,act_value,TEHBInnTier1FamilyPerPersonMOOP,EHBPercentTotalPremium,OutOfServiceAreaCoverageDescription
0,21989,AKS001,21989AK0080001,AK,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0080001,AK,...,0,0,0,0,0,0,86.3,0,0.0,0
1,21989,AKS001,21989AK0030001,AK,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989AK0030001,AK,...,0,0,0,0,0,0,70.7,0,0.0,0
2,21989,AKS001,21989AK0030001,AK,Standard Low On Exchange Plan,100.0,0.0,0.0,21989AK0030001,AK,...,0,0,0,0,0,0,70.7,0,0.0,0
3,21989,AKS002,21989AK0050001,AK,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0050001,AK,...,0,0,0,0,0,0,83.4,0,0.0,0
4,21989,AKS001,21989AK0080002,AK,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0080002,AK,...,0,0,0,0,0,0,86.3,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,96480,WVS001,96480WV0110003,WV,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480WV0110003,WV,...,0,0,0,0,0,0,69.8,0,0.0,0
26989,96480,WVS001,96480WV0070003,WV,Standard High Off Exchange Plan,100.0,0.0,0.0,96480WV0070003,WV,...,0,0,0,0,0,0,83.3,0,0.0,0
26990,96480,WVS001,96480WV0070003,WV,Standard High On Exchange Plan,100.0,0.0,0.0,96480WV0070003,WV,...,0,0,0,0,0,0,83.3,0,0.0,0
26991,96480,WVS001,96480WV0090003,WV,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480WV0090003,WV,...,0,0,0,0,0,0,69.8,0,0.0,0


In [8]:
merged_df.to_csv('../data/processed_data/issuer_characteristics_2017.csv', index=False)