In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
plan_df = pd.read_csv('../data/data_2017/Plan_Attributes_PUF_2017.csv', encoding='cp1252')
plan_summary = pd.read_csv('../data/processed_data/plan_summary.csv', index_col=None)

print(plan_df.shape)

(21365, 152)


In [3]:
no_missing_values = plan_summary[(plan_summary['Missing_Values'] == 0) & 
                                 (plan_summary['Unique_Values'] <= 34)]['Column_Name'].to_list()
no_missing_values.extend(['IssuerId','ServiceAreaId','StandardComponentId','StateCode'])
no_missing_values.remove('PlanEffectiveDate')
no_missing_values.remove('DesignType')
print(no_missing_values)

['BusinessYear', 'SourceName', 'MarketCoverage', 'DentalOnlyPlan', 'IsNewPlan', 'PlanType', 'MetalLevel', 'QHPNonQHPTypeId', 'CompositeRatingOffered', 'ChildOnlyOffering', 'OutOfCountryCoverage', 'OutOfServiceAreaCoverage', 'NationalNetwork', 'CSRVariationType', 'MultipleInNetworkTiers', 'FirstTierUtilization', 'InpatientCopaymentMaximumDays', 'BeginPrimaryCareCostSharingAfterNumberOfVisits', 'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays', 'IssuerId', 'ServiceAreaId', 'StandardComponentId', 'StateCode']


In [4]:
# Treat columns as continous
cleaned_plan_df = plan_df[no_missing_values]
continuous = ['FirstTierUtilization','BeginPrimaryCareCostSharingAfterNumberOfVisits',
              'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays']
cleaned_plan_df['FirstTierUtilization'] = cleaned_plan_df['FirstTierUtilization'].str.replace('%','')
cleaned_plan_df[continuous] = cleaned_plan_df[continuous].astype(str).astype(float) 

In [5]:
# Get CSRVariationType binary
categories = cleaned_plan_df['CSRVariationType'].value_counts()
four_largest = categories.head(4).index.to_list()
rest = categories[4:].to_list()
m = 1 
cleaned_plan_df['CSRVariationTypeBinary'] = 0
for j in range(len(four_largest)):
    cleaned_plan_df.loc[cleaned_plan_df.CSRVariationType == four_largest[j], 'CSRVariationTypeBinary'] = m
    m += 1
cleaned_plan_df.loc[cleaned_plan_df.CSRVariationTypeBinary == 0, 'CSRVariationTypeBinary'] = 5
cleaned_plan_df

Unnamed: 0,BusinessYear,SourceName,MarketCoverage,DentalOnlyPlan,IsNewPlan,PlanType,MetalLevel,QHPNonQHPTypeId,CompositeRatingOffered,ChildOnlyOffering,...,MultipleInNetworkTiers,FirstTierUtilization,InpatientCopaymentMaximumDays,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,StandardComponentId,StateCode,CSRVariationTypeBinary
0,2017,HIOS,Individual,Yes,Existing,Indemnity,Low,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5
1,2017,HIOS,SHOP (Small Group),Yes,New,Indemnity,High,Off the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0130001,AK,5
2,2017,HIOS,SHOP (Small Group),Yes,New,Indemnity,High,Off the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0130002,AK,5
3,2017,HIOS,Individual,Yes,Existing,Indemnity,Low,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5
4,2017,HIOS,Individual,Yes,Existing,PPO,High,Both,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,21989,AKS002,21989AK0050001,AK,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21360,2017,SERFF,Individual,Yes,Existing,PPO,Low,On the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,76526,WVS001,76526WV0010006,WV,5
21361,2017,SERFF,SHOP (Small Group),Yes,Existing,PPO,Low,On the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,76526,WVS001,76526WV0020006,WV,5
21362,2017,SERFF,SHOP (Small Group),Yes,Existing,PPO,High,On the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,76526,WVS001,76526WV0020004,WV,5
21363,2017,SERFF,Individual,Yes,Existing,PPO,High,On the Exchange,No,Allows Adult and Child-Only,...,No,100.0,0,0.0,0.0,76526,WVS001,76526WV0010004,WV,5


In [6]:
dummy_cols = ['BusinessYear',
 'SourceName',
 'MarketCoverage',
 'DentalOnlyPlan',
 'IsNewPlan',
 'PlanType',
 'MetalLevel',
 'QHPNonQHPTypeId',
 'CompositeRatingOffered',
 'ChildOnlyOffering',
 'OutOfCountryCoverage',
 'OutOfServiceAreaCoverage',
 'NationalNetwork',
 'MultipleInNetworkTiers',
 'InpatientCopaymentMaximumDays']
cleaned_plan_df = pd.get_dummies(cleaned_plan_df, columns = dummy_cols)
cleaned_plan_df

Unnamed: 0,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,StandardComponentId,StateCode,CSRVariationTypeBinary,BusinessYear_2017,...,OutOfServiceAreaCoverage_yes,NationalNetwork_No,NationalNetwork_Yes,NationalNetwork_yes,MultipleInNetworkTiers_No,MultipleInNetworkTiers_Yes,InpatientCopaymentMaximumDays_0,InpatientCopaymentMaximumDays_3,InpatientCopaymentMaximumDays_4,InpatientCopaymentMaximumDays_5
0,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5,1,...,0,0,1,0,1,0,1,0,0,0
1,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0130001,AK,5,1,...,0,0,1,0,1,0,1,0,0,0
2,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0130002,AK,5,1,...,0,0,1,0,1,0,1,0,0,0
3,Standard Low On Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,AK,5,1,...,0,0,1,0,1,0,1,0,0,0
4,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS002,21989AK0050001,AK,5,1,...,0,0,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21360,Standard Low On Exchange Plan,100.0,0.0,0.0,76526,WVS001,76526WV0010006,WV,5,1,...,0,0,1,0,1,0,1,0,0,0
21361,Standard Low On Exchange Plan,100.0,0.0,0.0,76526,WVS001,76526WV0020006,WV,5,1,...,0,0,1,0,1,0,1,0,0,0
21362,Standard High On Exchange Plan,100.0,0.0,0.0,76526,WVS001,76526WV0020004,WV,5,1,...,0,0,1,0,1,0,1,0,0,0
21363,Standard High On Exchange Plan,100.0,0.0,0.0,76526,WVS001,76526WV0010004,WV,5,1,...,0,0,1,0,1,0,1,0,0,0


In [7]:
issuer_numeric = pd.read_csv('../data/processed_data/issuer_numeric_2017.csv')
issuer_numeric = issuer_numeric.iloc[: , 1:]
merged_df = cleaned_plan_df.merge(issuer_numeric, left_index=True,right_index=True,suffixes=('', '_y'))

#clean up the columns 
relevant_cols = list(merged_df.columns)
for word in ['IssuerId','ServiceAreaId','IssuerId_y','ServiceAreaId_y']:
    relevant_cols.remove(word)
relevant_cols = ['IssuerId','ServiceAreaId','StandardComponentId','StateCode'] + relevant_cols 
merged_df = merged_df[relevant_cols]
merged_df

Unnamed: 0,IssuerId,ServiceAreaId,StandardComponentId,StateCode,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,StandardComponentId.1,StateCode.1,...,SBCHavingaBabyCopayment,TEHBInnTier1FamilyPerPersonMOOP,TEHBOutOfNetFamilyPerGroupMOOP,act_value,SBCHavingaBabyCoinsurance,TEHBCombInnOonIndividualMOOP,EHBPercentTotalPremium,TEHBInnTier1FamilyPerGroupMOOP,TEHBOutOfNetIndividualMOOP,TEHBCombInnOonFamilyPerGroupMOOP
0,21989,AKS001,21989AK0030001,AK,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989AK0030001,AK,...,0,0,0,71.4,0,0,0.0,0,0,0
1,21989,AKS001,21989AK0130001,AK,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0130001,AK,...,0,0,0,85.0,0,0,0.0,0,0,0
2,21989,AKS001,21989AK0130002,AK,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0130002,AK,...,0,0,0,85.0,0,0,0.0,0,0,0
3,21989,AKS001,21989AK0030001,AK,Standard Low On Exchange Plan,100.0,0.0,0.0,21989AK0030001,AK,...,0,0,0,71.4,0,0,0.0,0,0,0
4,21989,AKS002,21989AK0050001,AK,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0050001,AK,...,0,0,0,85.2,0,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21360,76526,WVS001,76526WV0010006,WV,Standard Low On Exchange Plan,100.0,0.0,0.0,76526WV0010006,WV,...,0,0,0,72.0,0,0,0.0,0,0,0
21361,76526,WVS001,76526WV0020006,WV,Standard Low On Exchange Plan,100.0,0.0,0.0,76526WV0020006,WV,...,0,0,0,72.0,0,0,0.0,0,0,0
21362,76526,WVS001,76526WV0020004,WV,Standard High On Exchange Plan,100.0,0.0,0.0,76526WV0020004,WV,...,0,0,0,85.0,0,0,0.0,0,0,0
21363,76526,WVS001,76526WV0010004,WV,Standard High On Exchange Plan,100.0,0.0,0.0,76526WV0010004,WV,...,0,0,0,85.0,0,0,0.0,0,0,0


In [8]:
#drop the dental plan data?
print(merged_df.shape)
merged_df  = merged_df[plan_df['DentalOnlyPlan']=='No']
print(merged_df.shape)

(21365, 75)
(18556, 75)


In [9]:
merged_df.to_csv('../data/processed_data/issuer_characteristics_2017_v1.csv', index=False)

In [10]:
#add in the hix data
cms_data = pd.read_csv('../data/processed_data/issuer_characteristics_2017_v1.csv')
#clean up planids in the HIX data...
hix_data = pd.read_csv('../data/data_2017/plans_2017.csv')
hix_data['PLANID2'] = hix_data['PLANID'] #+ new_plan_df['AREA'].apply(lambda x : '-' + x[-2:])
hix_data['PLANID2'] = hix_data['PLANID2'].apply(lambda x : x[:-3])
no_dash = hix_data['PLANID'].apply(lambda x : x.find('-') == -1 )
hix_data['PLANID2'][no_dash] = hix_data['PLANID'][no_dash]

hix_data = hix_data[['PLANID2','PREMIC', 'PREMI27', 'PREMI50', 'PREMI2C30', 'PREMC2C30']]
hix_data = hix_data.fillna(0)

#take the median for a plan... unclear what area means...
hix_data = hix_data.groupby('PLANID2',as_index=False).median()

print(hix_data['PLANID2'].nunique())
print(hix_data['PREMI50'].nunique())

#first merge on ones where we know the service area?
merged_df = cms_data.merge(hix_data,how='left', left_on=['StandardComponentId'], 
                              right_on=['PLANID2'])
merged_df = merged_df.fillna(0)

print(merged_df['PLANID2'].nunique())
print(merged_df['PREMI50'].nunique())

merged_df = merged_df.drop(labels=['PLANID2'],axis=1)
merged_df.to_csv('../data/processed_data/issuer_characteristics_2017.csv', index=False)

7269
6305
3126
3008
