In [1]:
import numpy as np
import pandas as pd

In [2]:
#Reading the tab-delimited Part D prescriber file into a DataFrame
provider = pd.read_csv(
    'PartD_Prescriber_PUF_NPI_17.txt',
    sep='\t',
)

In [3]:
#Previewing the first rows to confirm the dataset read in correctly
provider.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,...,beneficiary_male_count,beneficiary_race_white_count,beneficiary_race_black_count,beneficiary_race_asian_pi_count,beneficiary_race_hispanic_count,beneficiary_race_nat_ind_count,beneficiary_race_other_count,beneficiary_nondual_count,beneficiary_dual_count,beneficiary_average_risk_score
0,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,,CUMBERLAND,...,92.0,220.0,14.0,0.0,0.0,0.0,0.0,143.0,91.0,2.1685
1,1003000142,KHALIL,RASHID,,M.D.,M,I,4126 N HOLLAND SYLVANIA RD,SUITE 220,TOLEDO,...,92.0,195.0,58.0,,,0.0,,143.0,133.0,1.8029
2,1003000167,ESCOBAR,JULIO,E,DDS,M,I,5 PINE CONE RD,,DAYTON,...,17.0,,0.0,0.0,,0.0,0.0,,,1.0598
3,1003000175,REYES-VASQUEZ,BELINDA,,D.D.S.,F,I,322 N AZUSA AVE STE 202,,LA PUENTE,...,,,,,,,,,,
4,1003000282,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,3881 STEWARTS LANE,NASHVILLE,...,,,,0.0,,0.0,0.0,,,4.5148


In [4]:
#Listing every column name to identify columns that are not needed
provider.columns

Index(['npi', 'nppes_provider_last_org_name', 'nppes_provider_first_name',
       'nppes_provider_mi', 'nppes_credentials', 'nppes_provider_gender',
       'nppes_entity_code', 'nppes_provider_street1', 'nppes_provider_street2',
       'nppes_provider_city', 'nppes_provider_zip5', 'nppes_provider_zip4',
       'nppes_provider_state', 'nppes_provider_country',
       'specialty_description', 'description_flag',
       'medicare_prvdr_enroll_status', 'total_claim_count',
       'total_30_day_fill_count', 'total_drug_cost', 'total_day_supply',
       'bene_count', 'ge65_suppress_flag', 'total_claim_count_ge65',
       'total_30_day_fill_count_ge65', 'total_drug_cost_ge65',
       'total_day_supply_ge65', 'bene_count_ge65_suppress_flag',
       'bene_count_ge65', 'brand_suppress_flag', 'brand_claim_count',
       'brand_drug_cost', 'generic_suppress_flag', 'generic_claim_count',
       'generic_drug_cost', 'other_suppress_flag', 'other_claim_count',
       'other_drug_cost', 'mapd_suppress

In [5]:
#Removing two columns we don't need: the dataset is large and these don't connect to our other exclusions data
provider_2017 = provider.drop(
    labels=['nppes_provider_street2', 'nppes_provider_zip4'],
    axis='columns',
)

In [6]:
#Confirming the two columns were dropped (shape is (rows, columns))
provider_2017.shape

(1162898, 82)

In [7]:
#Looking at data types (bare expression so the notebook displays it, like the other inspection cells)
provider_2017.dtypes

npi                                   int64
nppes_provider_last_org_name         object
nppes_provider_first_name            object
nppes_provider_mi                    object
nppes_credentials                    object
nppes_provider_gender                object
nppes_entity_code                    object
nppes_provider_street1               object
nppes_provider_city                  object
nppes_provider_zip5                 float64
nppes_provider_state                 object
nppes_provider_country               object
specialty_description                object
description_flag                     object
medicare_prvdr_enroll_status         object
total_claim_count                     int64
total_30_day_fill_count             float64
total_drug_cost                     float64
total_day_supply                      int64
bene_count                          float64
ge65_suppress_flag                   object
total_claim_count_ge65              float64
total_30_day_fill_count_ge65    

In [8]:
#Counting missing values per column to see where we may need to impute or drop values
provider_2017.isna().sum()

npi                                       0
nppes_provider_last_org_name             34
nppes_provider_first_name                22
nppes_provider_mi                    361527
nppes_credentials                     70738
nppes_provider_gender                     3
nppes_entity_code                         0
nppes_provider_street1                    1
nppes_provider_city                       0
nppes_provider_zip5                      52
nppes_provider_state                      0
nppes_provider_country                    0
specialty_description                     0
description_flag                          0
medicare_prvdr_enroll_status              0
total_claim_count                         0
total_30_day_fill_count                   0
total_drug_cost                           0
total_day_supply                          0
bene_count                           131840
ge65_suppress_flag                   910442
total_claim_count_ge65               252456
total_30_day_fill_count_ge65    

In [9]:
#The beneficiary age and race columns have a high number of null values, so removing them
provider_2017_clean = provider_2017.drop(
    labels=[
        'beneficiary_age_less_65_count',
        'beneficiary_age_65_74_count',
        'beneficiary_age_75_84_count',
        'beneficiary_age_greater_84_count',
        'beneficiary_race_white_count',
        'beneficiary_race_black_count',
        'beneficiary_race_asian_pi_count',
        'beneficiary_race_hispanic_count',
        'beneficiary_race_nat_ind_count',
        'beneficiary_race_other_count',
    ],
    axis='columns',
)

In [10]:
#Checking columns dropped: 10 of the 82 columns were removed, so 72 columns should remain
provider_2017_clean.shape

(1162898, 72)

According to the documentation for the dataset, values below 11 in the following columns are suppressed, resulting in null values: bene_count, total_claim_count, opioid_claim_count, la_opioid_claim_count, antibiotic_claim_count, opioid_bene_count, and la_opioid_bene_count. Therefore, to use these columns as features in our model, we can either impute the null values with 5, as this is approximately the average value these counts could take when they are less than 11, or classify the counts into value ranges. After combining the data, we can look further into the options for handling these values.

In [13]:
#Saving as new file
#Not assigning the result: to_csv returns None when given a path, which would overwrite provider_2017 with None
provider_2017_clean.to_csv('Provider_2017_72Columns.txt', sep='\t', index=False)