In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Existing issuer_characteristics.ipynb code

In [2]:
plan_df = pd.read_csv('../data/data_2016/Plan_Attributes_PUF_2016.csv', encoding='cp1252')
plan_summary = pd.read_csv('../data/processed_data/plan_summary.csv', index_col=None)

print(plan_df.shape)

(26993, 151)


In [3]:
no_missing_values = plan_summary[(plan_summary['Missing_Values'] == 0) & 
                                 (plan_summary['Unique_Values'] <= 34)]['Column_Name'].to_list()
no_missing_values.extend(['IssuerId','ServiceAreaId','StandardComponentId'])
no_missing_values.remove('PlanEffectiveDate')
no_missing_values.remove('DesignType')
print(no_missing_values)

['BusinessYear', 'SourceName', 'MarketCoverage', 'DentalOnlyPlan', 'IsNewPlan', 'PlanType', 'MetalLevel', 'QHPNonQHPTypeId', 'CompositeRatingOffered', 'ChildOnlyOffering', 'OutOfCountryCoverage', 'OutOfServiceAreaCoverage', 'NationalNetwork', 'CSRVariationType', 'MultipleInNetworkTiers', 'FirstTierUtilization', 'InpatientCopaymentMaximumDays', 'BeginPrimaryCareCostSharingAfterNumberOfVisits', 'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays', 'IssuerId', 'ServiceAreaId', 'StandardComponentId']


In [4]:
# Treat columns as continous
cleaned_plan_df = plan_df[no_missing_values]
continuous = ['FirstTierUtilization','BeginPrimaryCareCostSharingAfterNumberOfVisits',
              'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays']
cleaned_plan_df['FirstTierUtilization'] = cleaned_plan_df['FirstTierUtilization'].str.replace('%','')
cleaned_plan_df[continuous] = cleaned_plan_df[continuous].astype(str).astype(float) 

In [5]:
# Get CSRVariationType binary
categories = cleaned_plan_df['CSRVariationType'].value_counts()
four_largest = categories.head(4).index.to_list()
rest = categories[4:].to_list()
m = 1 
cleaned_plan_df['CSRVariationTypeBinary'] = 0
for j in range(len(four_largest)):
    cleaned_plan_df.loc[cleaned_plan_df.CSRVariationType == four_largest[j], 'CSRVariationTypeBinary'] = m
    m += 1
cleaned_plan_df.loc[cleaned_plan_df.CSRVariationTypeBinary == 0, 'CSRVariationTypeBinary'] = 5
cleaned_plan_df

Unnamed: 0,BusinessYear,SourceName,MarketCoverage,DentalOnlyPlan,IsNewPlan,PlanType,MetalLevel,QHPNonQHPTypeId,CompositeRatingOffered,ChildOnlyOffering,...,CSRVariationType,MultipleInNetworkTiers,FirstTierUtilization,InpatientCopaymentMaximumDays,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,StandardComponentId,CSRVariationTypeBinary
0,2016,HIOS,SHOP (Small Group),Yes,New,Indemnity,High,Off the Exchange,No,Allows Adult and Child-Only,...,Standard High Off Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0080001,5
1,2016,HIOS,Individual,Yes,Existing,Indemnity,Low,Both,No,Allows Adult and Child-Only,...,Standard Low Off Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0030001,5
2,2016,HIOS,Individual,Yes,Existing,Indemnity,Low,Both,No,Allows Adult and Child-Only,...,Standard Low On Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0030001,5
3,2016,HIOS,Individual,Yes,New,PPO,High,Both,No,Allows Adult and Child-Only,...,Standard High Off Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS002,21989AK0050001,5
4,2016,HIOS,SHOP (Small Group),Yes,New,Indemnity,High,Off the Exchange,No,Allows Adult and Child-Only,...,Standard High Off Exchange Plan,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0080002,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,2016,SERFF,SHOP (Small Group),Yes,New,PPO,Low,Off the Exchange,No,Allows Child-Only,...,Standard Low Off Exchange Plan,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0110003,5
26989,2016,SERFF,SHOP (Small Group),Yes,New,PPO,High,Both,No,Allows Adult and Child-Only,...,Standard High Off Exchange Plan,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0070003,5
26990,2016,SERFF,SHOP (Small Group),Yes,New,PPO,High,Both,No,Allows Adult and Child-Only,...,Standard High On Exchange Plan,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0070003,5
26991,2016,SERFF,SHOP (Small Group),Yes,New,PPO,Low,Both,No,Allows Adult and Child-Only,...,Standard Low Off Exchange Plan,No,100.0,0,0.0,0.0,96480,WVS001,96480WV0090003,5


In [6]:
dummy_cols = ['BusinessYear',
 'SourceName',
 'MarketCoverage',
 'DentalOnlyPlan',
 'IsNewPlan',
 'PlanType',
 'MetalLevel',
 'QHPNonQHPTypeId',
 'CompositeRatingOffered',
 'ChildOnlyOffering',
 'OutOfCountryCoverage',
 'OutOfServiceAreaCoverage',
 'NationalNetwork',
 'MultipleInNetworkTiers',
 'InpatientCopaymentMaximumDays']
cleaned_plan_df = pd.get_dummies(cleaned_plan_df, columns = dummy_cols)
cleaned_plan_df

Unnamed: 0,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,StandardComponentId,CSRVariationTypeBinary,BusinessYear_2016,SourceName_HIOS,...,NationalNetwork_YEs,NationalNetwork_Yes,MultipleInNetworkTiers_No,MultipleInNetworkTiers_Yes,InpatientCopaymentMaximumDays_0,InpatientCopaymentMaximumDays_1,InpatientCopaymentMaximumDays_2,InpatientCopaymentMaximumDays_3,InpatientCopaymentMaximumDays_4,InpatientCopaymentMaximumDays_5
0,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0080001,5,1,1,...,0,1,1,0,1,0,0,0,0,0
1,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,5,1,1,...,0,1,1,0,1,0,0,0,0,0
2,Standard Low On Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,5,1,1,...,0,1,1,0,1,0,0,0,0,0
3,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS002,21989AK0050001,5,1,1,...,0,1,1,0,1,0,0,0,0,0
4,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0080002,5,1,1,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0110003,5,1,0,...,0,1,1,0,1,0,0,0,0,0
26989,Standard High Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0070003,5,1,0,...,0,1,1,0,1,0,0,0,0,0
26990,Standard High On Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0070003,5,1,0,...,0,1,1,0,1,0,0,0,0,0
26991,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0090003,5,1,0,...,0,1,1,0,1,0,0,0,0,0


In [7]:
issuer_numeric = pd.read_csv('../data/processed_data/issuer_numeric.csv')
issuer_numeric = issuer_numeric.iloc[: , 1:]
merged_df = cleaned_plan_df.merge(issuer_numeric, left_index=True,right_index=True,suffixes=('', '_y'))

#clean up the columns 
relevant_cols = list(merged_df.columns)
for word in ['IssuerId','ServiceAreaId','IssuerId_y','ServiceAreaId_y']:
    relevant_cols.remove(word)
relevant_cols = ['IssuerId','ServiceAreaId','StandardComponentId'] + relevant_cols 
merged_df = merged_df[relevant_cols]
merged_df

Unnamed: 0,IssuerId,ServiceAreaId,StandardComponentId,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,StandardComponentId.1,CSRVariationTypeBinary,BusinessYear_2016,...,SBCHavingDiabetesCoinsurance,TEHBCombInnOonFamilyPerPersonMOOP,SBCHavingaBabyCopayment,TEHBOutOfNetFamilyPerGroupMOOP,TEHBCombInnOonIndividualMOOP,SBCHavingaBabyCoinsurance,act_value,TEHBInnTier1FamilyPerPersonMOOP,EHBPercentTotalPremium,OutOfServiceAreaCoverageDescription
0,21989,AKS001,21989AK0080001,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0080001,5,1,...,0,0,0,0,0,0,86.3,0,0.0,0
1,21989,AKS001,21989AK0030001,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989AK0030001,5,1,...,0,0,0,0,0,0,70.7,0,0.0,0
2,21989,AKS001,21989AK0030001,Standard Low On Exchange Plan,100.0,0.0,0.0,21989AK0030001,5,1,...,0,0,0,0,0,0,70.7,0,0.0,0
3,21989,AKS002,21989AK0050001,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0050001,5,1,...,0,0,0,0,0,0,83.4,0,0.0,0
4,21989,AKS001,21989AK0080002,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0080002,5,1,...,0,0,0,0,0,0,86.3,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,96480,WVS001,96480WV0110003,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480WV0110003,5,1,...,0,0,0,0,0,0,69.8,0,0.0,0
26989,96480,WVS001,96480WV0070003,Standard High Off Exchange Plan,100.0,0.0,0.0,96480WV0070003,5,1,...,0,0,0,0,0,0,83.3,0,0.0,0
26990,96480,WVS001,96480WV0070003,Standard High On Exchange Plan,100.0,0.0,0.0,96480WV0070003,5,1,...,0,0,0,0,0,0,83.3,0,0.0,0
26991,96480,WVS001,96480WV0090003,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480WV0090003,5,1,...,0,0,0,0,0,0,69.8,0,0.0,0


##  Merge new data files 

In [14]:
new_plan_df = pd.read_csv('../data/plans_data/plans_2016.csv')
new_plan_df = new_plan_df[['PLANID','AREA','PREMIC', 'PREMI27', 'PREMI50', 'PREMI2C30', 'PREMC2C30']]
new_plan_df = new_plan_df.fillna(0)
new_plan_df

Unnamed: 0,PLANID,AREA,PREMIC,PREMI27,PREMI50,PREMI2C30,PREMC2C30
0,38344AK0540003,AK02,0.0,908.00,1547.00,2083.00,3066.00
1,38344AK0540003,AK03,0.0,886.00,1509.00,2033.00,2992.00
2,38344AK0540003,AK01,0.0,864.00,1472.00,1981.00,2916.00
3,38344AK0540006,AK02,0.0,779.00,1328.00,1788.00,2632.00
4,38344AK0540006,AK01,0.0,741.00,1263.00,1701.00,2504.00
...,...,...,...,...,...,...,...
50557,11269WY0180015,WY03,0.0,429.64,732.19,985.97,1451.28
50558,11269WY0180015,WY02,0.0,391.55,667.27,898.55,1322.60
50559,47823WY0200001,WY01,0.0,535.34,912.33,1228.53,1808.32
50560,47823WY0200001,WY02,0.0,535.34,912.33,1228.53,1808.32


In [22]:
merged_df['ServiceAreaId'].nunique()

350

In [23]:
new_plan_df['AREA'].nunique()

499

In [17]:
new_plan_df['AREA'].unique()
lst = []
for i in new_plan_df['AREA']:
    new_str = i[:2]+'S0'+i[2:]
    lst.append(str(new_str))
AREA2 = pd.Series(lst)
new_plan_df['AREA2'] = AREA2
from functools import reduce
len(reduce(np.intersect1d, [merged_df.ServiceAreaId, new_plan_df.AREA2]))

189

In [12]:
# new_merged_df = merged_df.merge(new_plan_df, how='left', left_on=['StandardComponentId', 'ServiceAreaId'], right_on=['PLAN', 'AREA2'])

In [13]:
#new_merged_df.to_csv('../data/processed_data/issuer_characteristics_plans_2016.csv', index=False)