In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
plan_df = pd.read_csv('../data/data_2016/Plan_Attributes_PUF_2016.csv',encoding='cp1252')
plan_summary = pd.read_csv('../data/processed_data/plan_summary.csv', index_col=None)

print(plan_df.shape)

(26993, 151)


In [3]:
no_missing_values = plan_summary[(plan_summary['Missing_Values'] == 0) & 
                                 (plan_summary['Unique_Values'] <= 34)]['Column_Name'].to_list()
no_missing_values.extend(['IssuerId','ServiceAreaId','StandardComponentId','StateCode'])
no_missing_values.remove('PlanEffectiveDate')
no_missing_values.remove('DesignType')
print(no_missing_values)

['BusinessYear', 'SourceName', 'MarketCoverage', 'DentalOnlyPlan', 'IsNewPlan', 'PlanType', 'MetalLevel', 'QHPNonQHPTypeId', 'CompositeRatingOffered', 'ChildOnlyOffering', 'OutOfCountryCoverage', 'OutOfServiceAreaCoverage', 'NationalNetwork', 'CSRVariationType', 'MultipleInNetworkTiers', 'FirstTierUtilization', 'InpatientCopaymentMaximumDays', 'BeginPrimaryCareCostSharingAfterNumberOfVisits', 'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays', 'IssuerId', 'ServiceAreaId', 'StandardComponentId', 'StateCode']


In [4]:
# Treat columns as continous
cleaned_plan_df = plan_df[no_missing_values]
continuous = ['FirstTierUtilization','BeginPrimaryCareCostSharingAfterNumberOfVisits',
              'BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays']
cleaned_plan_df['FirstTierUtilization'] = cleaned_plan_df['FirstTierUtilization'].str.replace('%','')
cleaned_plan_df[continuous] = cleaned_plan_df[continuous].astype(str).astype(float) 

In [5]:
# Get CSRVariationType binary
def largest_dummies(cleaned_plan_df, col_name,number_dummies):
    categories = cleaned_plan_df[col_name].value_counts()
    largest = categories.head(number_dummies).index.to_list()
    rest = categories[number_dummies:].to_list()
    cleaned_plan_df[col_name + 'Other'] = 1
    for j in range(len(largest)):
        category_name = largest[j]
        cleaned_plan_df.loc[ cleaned_plan_df[col_name] == largest[j], col_name+str(category_name)] = 1
        cleaned_plan_df.loc[ cleaned_plan_df[col_name] != largest[j], col_name+str(category_name)] = 0
        cleaned_plan_df.loc[ cleaned_plan_df[col_name] == largest[j], col_name + 'Other'] = 0
    return cleaned_plan_df

cleaned_plan_df = largest_dummies(cleaned_plan_df, 'CSRVariationType',4)
cleaned_plan_df = largest_dummies(cleaned_plan_df, 'IssuerId',10)
cleaned_plan_df = largest_dummies(cleaned_plan_df, 'StateCode',50)

In [6]:
dummy_cols = ['BusinessYear',
 'SourceName',
 'MarketCoverage',
 'DentalOnlyPlan',
 'IsNewPlan',
 'PlanType',
 'MetalLevel',
 'QHPNonQHPTypeId',
 'CompositeRatingOffered',
 'ChildOnlyOffering',
 'OutOfCountryCoverage',
 'OutOfServiceAreaCoverage',
 'NationalNetwork',
 'MultipleInNetworkTiers',
 'InpatientCopaymentMaximumDays']
cleaned_plan_df = pd.get_dummies(cleaned_plan_df, columns = dummy_cols)
cleaned_plan_df

Unnamed: 0,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,IssuerId,ServiceAreaId,StandardComponentId,StateCode,CSRVariationTypeOther,CSRVariationTypeLimited Cost Sharing Plan Variation,...,NationalNetwork_YEs,NationalNetwork_Yes,MultipleInNetworkTiers_No,MultipleInNetworkTiers_Yes,InpatientCopaymentMaximumDays_0,InpatientCopaymentMaximumDays_1,InpatientCopaymentMaximumDays_2,InpatientCopaymentMaximumDays_3,InpatientCopaymentMaximumDays_4,InpatientCopaymentMaximumDays_5
0,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0080001,AK,1,0.0,...,0,1,1,0,1,0,0,0,0,0
1,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,AK,1,0.0,...,0,1,1,0,1,0,0,0,0,0
2,Standard Low On Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0030001,AK,1,0.0,...,0,1,1,0,1,0,0,0,0,0
3,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS002,21989AK0050001,AK,1,0.0,...,0,1,1,0,1,0,0,0,0,0
4,Standard High Off Exchange Plan,100.0,0.0,0.0,21989,AKS001,21989AK0080002,AK,1,0.0,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0110003,WV,1,0.0,...,0,1,1,0,1,0,0,0,0,0
26989,Standard High Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0070003,WV,1,0.0,...,0,1,1,0,1,0,0,0,0,0
26990,Standard High On Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0070003,WV,1,0.0,...,0,1,1,0,1,0,0,0,0,0
26991,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480,WVS001,96480WV0090003,WV,1,0.0,...,0,1,1,0,1,0,0,0,0,0


In [7]:
issuer_numeric = pd.read_csv('../data/processed_data/issuer_numeric.csv')
issuer_numeric = issuer_numeric.iloc[: , 1:]
merged_df = cleaned_plan_df.merge(issuer_numeric, left_index=True,right_index=True,suffixes=('', '_y'))

#clean up the columns 
relevant_cols = list(merged_df.columns)
for word in ['IssuerId','ServiceAreaId','IssuerId_y','ServiceAreaId_y']:
    relevant_cols.remove(word)
relevant_cols = ['IssuerId','ServiceAreaId','StandardComponentId'] + relevant_cols 
merged_df = merged_df[relevant_cols]
merged_df

Unnamed: 0,IssuerId,ServiceAreaId,StandardComponentId,CSRVariationType,FirstTierUtilization,BeginPrimaryCareCostSharingAfterNumberOfVisits,BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays,StandardComponentId.1,StateCode,CSRVariationTypeOther,...,TEHBInnTier1FamilyPerGroupMOOP,EHBPercentTotalPremium,SBCHavingDiabetesDeductible,TEHBInnTier1FamilyPerPersonMOOP,SBCHavingDiabetesCoinsurance,SBCHavingaBabyCoinsurance,SBCHavingaBabyCopayment,TEHBCombInnOonFamilyPerGroupMOOP,TEHBOutOfNetFamilyPerGroupMOOP,SBCHavingDiabetesCopayment
0,21989,AKS001,21989AK0080001,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0080001,AK,1,...,0,0.0,0,0,0,0,0,0,0,0
1,21989,AKS001,21989AK0030001,Standard Low Off Exchange Plan,100.0,0.0,0.0,21989AK0030001,AK,1,...,0,0.0,0,0,0,0,0,0,0,0
2,21989,AKS001,21989AK0030001,Standard Low On Exchange Plan,100.0,0.0,0.0,21989AK0030001,AK,1,...,0,0.0,0,0,0,0,0,0,0,0
3,21989,AKS002,21989AK0050001,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0050001,AK,1,...,0,0.0,0,0,0,0,0,0,0,0
4,21989,AKS001,21989AK0080002,Standard High Off Exchange Plan,100.0,0.0,0.0,21989AK0080002,AK,1,...,0,0.0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26988,96480,WVS001,96480WV0110003,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480WV0110003,WV,1,...,0,0.0,0,0,0,0,0,0,0,0
26989,96480,WVS001,96480WV0070003,Standard High Off Exchange Plan,100.0,0.0,0.0,96480WV0070003,WV,1,...,0,0.0,0,0,0,0,0,0,0,0
26990,96480,WVS001,96480WV0070003,Standard High On Exchange Plan,100.0,0.0,0.0,96480WV0070003,WV,1,...,0,0.0,0,0,0,0,0,0,0,0
26991,96480,WVS001,96480WV0090003,Standard Low Off Exchange Plan,100.0,0.0,0.0,96480WV0090003,WV,1,...,0,0.0,0,0,0,0,0,0,0,0


In [8]:
#drop the dental plan data?
print(merged_df.shape)
merged_df  = merged_df[plan_df['DentalOnlyPlan']=='No']
print(merged_df.shape)

(26993, 130)
(23138, 130)


In [9]:
merged_df.to_csv('../data/processed_data/issuer_characteristics_v1.csv', index=False)

# adding the HIX data

In [10]:
#add in the hix data
cms_data = pd.read_csv('../data/processed_data/issuer_characteristics_v1.csv')
#clean up planids in the HIX data...
hix_data = pd.read_csv('../data/data_2016/plans_2016.csv')
hix_data['PLANID2'] = hix_data['PLANID'] #+ new_plan_df['AREA'].apply(lambda x : '-' + x[-2:])
hix_data['PLANID2'] = hix_data['PLANID2'].apply(lambda x : x[:-3])
no_dash = hix_data['PLANID'].apply(lambda x : x.find('-') == -1 )
hix_data['PLANID2'][no_dash] = hix_data['PLANID'][no_dash]

hix_data = hix_data[['PLANID2','PREMIC', 'PREMI27', 'PREMI50', 'PREMI2C30', 'PREMC2C30']]
hix_data = hix_data.fillna(0)

#take the median for a plan... unclear what area means...
hix_data = hix_data.groupby('PLANID2',as_index=False).median()

print(hix_data['PLANID2'].nunique())
print(hix_data['PREMI50'].nunique())

#first merge on ones where we know the service area?
merged_df = cms_data.merge(hix_data,how='left', left_on=['StandardComponentId'], 
                              right_on=['PLANID2'])
merged_df = merged_df.fillna(0)

print(merged_df['PLANID2'].nunique())
print(merged_df['PREMI50'].nunique())

merged_df = merged_df.drop(labels=['PLANID2'],axis=1)
merged_df.to_csv('../data/processed_data/issuer_characteristics.csv', index=False)

9065
7646
3995
3748
