In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

In [2]:
df = pd.read_csv('Data/Providers_updated1.txt', sep='\t', low_memory=False)

# Creating higher level groups for Specialty and Credentials

The credentials and specialty description columns have a lot of categories. For the credentials column, this seems to be due to a lack of uniformity in how the values are entered into this column (e.g. M.D. vs MD) while specialty has a wide range of options. To see if we can use these as possible features, will work on cleaning these two columns and group them into a smaller number of groups. 

## Specialty

In [3]:
df['specialty_description'].value_counts()

Nurse Practitioner                                                153964
Dentist                                                           130646
Internal Medicine                                                 130302
Family Practice                                                   110290
Physician Assistant                                                92495
Student in an Organized Health Care Education/Training Program     53815
Emergency Medicine                                                 48475
Obstetrics & Gynecology                                            35496
Optometry                                                          29460
Psychiatry                                                         25413
General Surgery                                                    22822
Orthopedic Surgery                                                 21323
Cardiology                                                         20265
Ophthalmology                                      

In [4]:
#Creating a list of all descriptions that include surgery to create one grouping
surgery_check = df['specialty_description'].str.contains('Surgery|Anesthe')
surgery_true = df[surgery_check]
surgery = surgery_true['specialty_description'].unique()

In [5]:
#Checking all terms fit in this category
print(surgery)

['Anesthesiology' 'General Surgery' 'Cardiac Surgery' 'Orthopedic Surgery'
 'Orthopaedic Surgery' 'Plastic and Reconstructive Surgery'
 'Oral Surgery (Dentist only)' 'Thoracic Surgery' 'Hand Surgery' 'Surgery'
 'Vascular Surgery' 'Colorectal Surgery (Proctology)'
 'Maxillofacial Surgery' 'Oral & Maxillofacial Surgery'
 'Certified Registered Nurse Anesthetist (CRNA)' 'Neurological Surgery'
 'Plastic Surgery' 'Colon & Rectal Surgery'
 'Thoracic Surgery (Cardiothoracic Vascular Surgery)'
 'Anesthesiology Assistant']


In [6]:
#Creating surgical grouping
df['specialty_description'] = df['specialty_description'].replace(surgery, 'Surgery')

In [7]:
#Creating a list of all descriptions that include oncology
oncology_check = df['specialty_description'].str.contains('oncology', case=False)
oncology_true = df[oncology_check]
oncology = oncology_true['specialty_description'].unique()

In [8]:
#Checking all terms fit in this category
print(oncology)

['Radiation Oncology' 'Hematology-Oncology' 'Medical Oncology'
 'Surgical Oncology' 'Gynecological Oncology']


In [9]:
#Creating oncology grouping
df['specialty_description'] = df['specialty_description'].replace(oncology, 'Oncology')

In [10]:
#Creating a list of all descriptions that include neurology and psychiatry
n_and_p_check = df['specialty_description'].str.contains('neuro|psychiatr|psycho', case=False)
n_and_p_true = df[n_and_p_check]
neuro = n_and_p_true['specialty_description'].unique()

In [11]:
#Checking all terms fit in this category
print(neuro)

['Psychiatry' 'Neurology' 'Psychiatry & Neurology' 'Neurosurgery'
 'Neuropsychiatry' 'Psychologist, Clinical' 'Psychoanalyst'
 'Neuromusculoskeletal Medicine, Sports Medicine' 'Psychologist'
 'Geriatric Psychiatry' 'Clinical Neuropsychologist'
 'Licensed Psychiatric Technician'
 'Psychiatric Residential Treatment Facility']


In [12]:
#Creating psychiatry and neurology grouping
df['specialty_description'] = df['specialty_description'].replace(neuro, 'Psychiatry/Psychology & Neurology')

In [13]:
#Creating a list of all descriptions that include emergency
emergency_check = df['specialty_description'].str.contains('emergency', case=False)
emergency_true = df[emergency_check]
emergency = emergency_true['specialty_description'].unique()

In [14]:
#Checking all terms fit in this category
print(emergency)

['Emergency Medicine' 'Personal Emergency Response Attendant'
 'Emergency Medical Technician, Basic'
 'Non-emergency Medical Transport (VAN)'
 'Emergency Medical Technician, Paramedic'
 'Emergency Medical Technician, Intermediate']


In [15]:
#Creating emergency medicine grouping but removed the non-emergency description
df['specialty_description'] = df['specialty_description'].replace(['Emergency Medicine', 'Personal Emergency Response Attendant', 'Emergency Medical Technician, Basic', 'Emergency Medical Technician, Paramedic', 'Emergency Medical Technician, Intermediate'], 'Emergency Medicine')

In [16]:
#Creating a list of all descriptions that include dentist/dental
dentist_check = df['specialty_description'].str.contains('dentist|dental', case=False)
dentist_true = df[dentist_check]
dental = dentist_true['specialty_description'].unique()

In [17]:
#Checking all terms fit in this category
print(dental)

['Dentist' 'Dental Therapist' 'Advanced Practice Dental Therapist'
 'Dental Laboratory Technician' 'Dental Assistant' 'Dental Hygienist']


In [18]:
#Creating emergency medicine grouping but removed the non-emergency description
df['specialty_description'] = df['specialty_description'].replace(dental, 'Dentist')

In [19]:
#Creating a list of all descriptions that include cardio
cardio_check = df['specialty_description'].str.contains('cardio|cardiac|vascular', case=False)
cardio_true = df[cardio_check]
cardio = cardio_true['specialty_description'].unique()

In [20]:
#Checking all terms fit in this category
print(cardio)

['Cardiology' 'Interventional Cardiology'
 'Advanced Heart Failure and Transplant Cardiology'
 'Clinical Cardiac Electrophysiology' 'Peripheral Vascular Disease'
 'Technician, Cardiology' 'Specialist/Technologist Cardiovascular']


In [21]:
#Creating cardiology group
df['specialty_description'] = df['specialty_description'].replace(cardio, 'Cardiologist')

In [22]:
#Creating a list of all descriptions that include general nurse
nurse_check = df['specialty_description'].str.contains('nurs', case=False)
nurse_true = df[nurse_check]
nurse = nurse_true['specialty_description'].unique()

In [23]:
print(nurse)

['Nurse Practitioner' 'Certified Nurse Midwife'
 'Certified Clinical Nurse Specialist' 'Registered Nurse'
 'Licensed Practical Nurse' 'Skilled Nursing Facility'
 'Licensed Vocational Nurse' "Nurse's Aide"
 'Religious Nonmedical Nursing Personnel' 'Other Nursing Facility']


In [24]:
#Creating nurse grouping
df['specialty_description'] = df['specialty_description'].replace(nurse, 'Nurse')

In [25]:
#Creating a list of all descriptions that include general diet or nutritional specialties 
peds_check = df['specialty_description'].str.contains('pedia', case=False)
peds_true = df[peds_check]
peds = peds_true['specialty_description'].unique()

In [26]:
print(peds)

['Pediatric Medicine' 'Pediatrics']


In [27]:
df['specialty_description'] = df['specialty_description'].replace(peds, 'Pediatrics')

In [28]:
#Creating a list of all descriptions that include physical, podiatry or ortho related fields 
physical_check = df['specialty_description'].str.contains('physical|ortho|pod|sports', case=False)
physical_true = df[physical_check]
physical = physical_true['specialty_description'].unique()

In [29]:
print(physical)

['Physical Medicine and Rehabilitation' 'Podiatry' 'Sports Medicine'
 'Physical Therapist in Private Practice'
 'Physical Medicine & Rehabilitation' 'Physical Therapy Assistant'
 'Orthotic Fitter' 'Residential Treatment Facility, Physical Disabilities'
 'Assistant, Podiatric']


In [30]:
df['specialty_description'] = df['specialty_description'].replace(physical, 'Physical Medicine')

In [31]:
#Combining similar fields
df['specialty_description']= df['specialty_description'].replace(['Family Practice', 'General Practice', 'Family Medicine'], 'General/Family Practice')

In [32]:
#Creating a list of all descriptions that include radiology 
radio_check = df['specialty_description'].str.contains('radio|radia', case=False)
radio_true = df[radio_check]
radio = radio_true['specialty_description'].unique()

In [33]:
print(radio)

['Diagnostic Radiology' 'Interventional Radiology' 'Radiology'
 'Radiology Practitioner Assistant' 'Radiologic Technologist'
 'Radiation Therapy Center']


In [34]:
df['specialty_description'] = df['specialty_description'].replace(radio, 'Radiology')

In [35]:
#Creating a list of all optometry/eye descriptions 
eye_check = df['specialty_description'].str.contains('eye|opt|ophtha', case=False)
eye_true = df[eye_check]
eye = eye_true['specialty_description'].unique()

In [36]:
print(eye)

['Optometry' 'Ophthalmology' 'Optician']


In [37]:
df['specialty_description'] = df['specialty_description'].replace(eye, 'Optometry/Ophthalmology')

In [38]:
#Combining similar fields from visual check
df['specialty_description']= df['specialty_description'].replace(['Nephrology', 'Endocrinology', 'Hematology', 'Gastroenterology'], 'Internal Medicine')

In [39]:
#Creating a list of all immunology grouping 
immune_check = df['specialty_description'].str.contains('rheu|aller|immun', case=False)
immune_true = df[immune_check]
immune = immune_true['specialty_description'].unique()

In [40]:
print(immune)

['Rheumatology' 'Allergy/ Immunology' 'Mass Immunizer Roster Biller']


In [41]:
df['specialty_description']= df['specialty_description'].replace(immune, 'Allergy/Immunology')

In [42]:
#Creating a pain category
pain_check = df['specialty_description'].str.contains('pain|hospice', case=False)
pain_true = df[pain_check]
pain = pain_true['specialty_description'].unique()

In [43]:
print(pain)

['Pain Management' 'Hospice and Palliative Care'
 'Interventional Pain Management']


In [44]:
df['specialty_description']= df['specialty_description'].replace(pain, 'Pain/Palliative Care')

In [45]:
#Creating a pathology
path_check = df['specialty_description'].str.contains('pathol', case=False)
path_true = df[path_check]
path = path_true['specialty_description'].unique()

In [46]:
print(path)

['Pathology' 'Speech Language Pathologist'
 'Specialist/Technologist, Pathology' 'Technician, Pathology']


In [47]:
df['specialty_description']= df['specialty_description'].replace(path, 'Pathology')

In [48]:
pulm_check = df['specialty_description'].str.contains('pulm|respir', case=False)
pulm_true = df[pulm_check]
pulmonary = pulm_true['specialty_description'].unique()

In [49]:
print(pulmonary)

['Pulmonary Disease' 'Respiratory Therapist, Registered'
 'Respiratory Therapist, Certified']


In [50]:
df['specialty_description']= df['specialty_description'].replace(pulmonary, 'Respiratory/Pulmonary')

In [51]:
pharma_check = df['specialty_description'].str.contains('pharma', case=False)
pharma_true = df[pharma_check]
pharma = pharma_true['specialty_description'].unique()

In [52]:
print(pharma)

['Pharmacist' 'Clinical Pharmacology' 'Pharmacy Technician']


In [53]:
df['specialty_description']= df['specialty_description'].replace(pharma, 'Pharmacist/Pharmacology')

In [54]:
#Classifying all values that appear less than 675 times as 'other'
unique = df.groupby('specialty_description').filter(lambda x:len(x) < 675)
few = unique['specialty_description'].unique()

In [55]:
df['specialty_description']= df['specialty_description'].replace(few, 'Other/Undefined')

In [56]:
#Combining other fields
df['specialty_description']= df['specialty_description'].replace(['Hospitalist', 'Specialist', 'Preventive Medicine', 'Critical Care (Intensivists)', 'Naturopath', 'Other'], 'Other/Undefined')

In [57]:
#cleaning up the student field
df['specialty_description'] = df['specialty_description'].str.replace('Student in an Organized Health Care Education/Training Program', 'Student')

In [58]:
#Looking at final classifications
df['specialty_description'].value_counts()

Nurse                                160125
Internal Medicine                    159155
Dentist                              130663
General/Family Practice              120913
Physician Assistant                   92495
Surgery                               77271
Psychiatry/Psychology & Neurology     60684
Student                               53815
Optometry/Ophthalmology               49253
Emergency Medicine                    48598
Obstetrics & Gynecology               35496
Cardiologist                          25540
Physical Medicine                     25017
Oncology                              18066
Other/Undefined                       17140
Dermatology                           13297
Pediatrics                            10965
Urology                               10707
Otolaryngology                        10461
Respiratory/Pulmonary                  9469
Allergy/Immunology                     8743
Pharmacist/Pharmacology                6691
Infectious Disease              

## Transforming credentials

In [59]:
df['nppes_credentials'].value_counts()

M.D.                    292988
MD                      277584
DDS                      50865
PA-C                     45393
D.O.                     39715
D.D.S.                   31981
DO                       25768
NP                       25175
DMD                      21202
PA                       20146
O.D.                     18786
FNP                      16461
D.M.D.                   12379
ARNP                     11846
M.D                      11637
DPM                      10070
APRN                      9337
OD                        8523
CRNP                      7589
FNP-C                     6480
P.A.                      6293
NP-C                      6047
N.P.                      4888
CNP                       4687
FNP-BC                    4531
D.D.S                     4362
D.P.M.                    4123
APN                       3902
PAC                       3762
D.M.D                     2738
                         ...  
D.D.S. LTD.                  1
ARPN, FN

In [60]:
#Dropping the periods to harmonize entries
df['nppes_credentials'] = df['nppes_credentials'].str.replace(r'.', '')

In [61]:
#Dropping the Certified (-C) credential to combine the same credentials together, certified or not
df['nppes_credentials'] = df['nppes_credentials'].str.replace(r'-C', '')

In [62]:
#Dropping out the Board Certified (-BC) credentials to combine the same credentials together
#Dropping periods to merge the same abbreviation but with different punctuation
df['nppes_credentials'] = df['nppes_credentials'].str.replace(r'-BC', '')

In [63]:
#Standardizing the commas vs spaces between various credentials
df['nppes_credentials'] = df['nppes_credentials'].str.replace(r' ', '')
df['nppes_credentials'] = df['nppes_credentials'].str.replace(r',', ', ')

In [64]:
#Dropping dashed to merge the same abbreviation but with different punctuation
df['nppes_credentials']= df['nppes_credentials'].str.replace(r'-', '')

In [65]:
#Splitting columns by comma
df1 = pd.concat([df, df['nppes_credentials'].str.split(', ', expand=True)], axis=1)

In [66]:
df1.columns

Index([                             'npi',     'nppes_provider_last_org_name',
              'nppes_provider_first_name',                'nppes_provider_mi',
                      'nppes_credentials',            'nppes_provider_gender',
                      'nppes_entity_code',           'nppes_provider_street1',
                    'nppes_provider_city',              'nppes_provider_zip5',
                  'specialty_description',                 'description_flag',
           'medicare_prvdr_enroll_status',                'total_claim_count',
                'total_30_day_fill_count',                  'total_drug_cost',
                       'total_day_supply',                       'bene_count',
                     'ge65_suppress_flag',           'total_claim_count_ge65',
           'total_30_day_fill_count_ge65',             'total_drug_cost_ge65',
                  'total_day_supply_ge65',    'bene_count_ge65_suppress_flag',
                        'bene_count_ge65',          

In [67]:
#For classification purposes, will only look at the first credential listed so removing columns 1, 2, 3, 4, 5 as well as the orginal column
df2 = df1.drop(columns = ['nppes_credentials', 1, 2, 3, 4, 5])

In [68]:
#Looking at values with just one credential
df2[0].value_counts()

MD                     598881
DDS                     91172
PA                      75791
DO                      69258
DMD                     37966
NP                      37010
FNP                     29541
OD                      28331
DPM                     14962
ARNP                    13488
APRN                    12473
CRNP                     8135
CNP                      5099
PAC                      5070
APN                      4884
RN                       4752
MSN                      3774
PHARMD                   3608
ANP                      3206
CNM                      2177
DNP                      2018
MBBS                     1791
RPA                      1779
PMHNP                    1683
NURSEPRACTITIONER        1616
ACNP                     1445
APNP                     1345
RPH                      1304
MS                       1203
PHYSICIANASSISTANT       1177
                        ...  
MDFACSRPVIRVTRDM            1
MDMPU                       1
RNACNS    

In [70]:
#Inserting blank for null values
df2[0].fillna('Unknown', inplace = True)

In [71]:
#Lumping credentials together starting with dentist
dentist_check = df2[0].str.contains('dentist|DMD|DDS', case=False)
dentist_true = df2[dentist_check]
dentist = dentist_true[0].unique()

In [72]:
print(dentist)

['DDS' 'DMDPA' 'DMD' 'DDSPHD' 'DENTIST' 'DDSMAGD' 'PHARMD;DDS' 'DDSMBAPA'
 'DDSMSFAGD' 'DDSMS' 'DDSPC' 'DDSMD' 'DDSDENTIST' 'DMDMDS' 'DDSMSPHD'
 'GENERALDENTIST' 'BSDDS' 'DMDPSC' 'DMDDRMEDDENT' 'DMDMAGD' 'DMDMD'
 'DDSMSD' 'PHDDMD' 'DMDPC' 'DDSMSCDDSCDSCD' 'DMDLLC' 'MDDDS' 'ODMD'
 'DDSMSCDPC' 'MDDMD' 'PHDMD' 'DENTISTDDS' 'DDSPA' 'DDSMSC' 'DMDMSD'
 'DDSLTD' 'DDSMPH' 'DDSINC' 'DMDMSED' 'DDSPS' 'DDSMSPA' 'DDSRPH' 'DMDMPH'
 'DDSENDODONTICS' 'DMDMS' 'DDSSC' 'DMDPHD' 'DDSPROSTHODONTIST' 'DDSMDS'
 'GENERALDENTISTDDS' 'BSDMDMS' 'DMD;MSD' 'DMD>' 'DDSGENERAL' 'DMDMMSC'
 '(DDS)' 'MSDMDMD' 'DCHDDMDMSCD' 'DMDD' 'DRDMD' 'DDSQ' 'DDS>' 'MSDDS'
 'DMDDENTIST' 'DDSMSPC' 'DDSMDSPHD' 'DMDMSC' 'DDS;PC' 'TERENCEPEGELDDS'
 'DMDFAGD' 'DDSDMD' 'DMDMSDPA' 'ODPHDDSC' 'DDSPLC' 'DDSFAGD'
 'BSDDSFAAMPFACP' 'DDSPSC' 'DDSOWNER' 'DMDPROSTHODONTIST'
 'ANDREALISOWSKIDMD' 'DRDDS' 'ABDDS' 'MSDMD' 'DDS;MD' 'DMDMDMPH' 'MDMD'
 'DDSPLLC' 'DDSBSBIOLOGY' 'MSDDSMAGD' 'DDSFICOI' 'DDSOMS' 'DRDENTIST'
 'DDSRPHPA' 'DDSCAGS' 'DDS/' 'DM

In [73]:
#One value in the above (PHDMD) shouldn't be included so converting that to catch with the MD code
df2[0] = df2[0].str.replace('PHDMD', 'MDPHD')

In [74]:
#Rerunning now that the PHDMD will not be caught
dentist_check = df2[0].str.contains('dentist|DMD|DDS', case=False)
dentist_true = df2[dentist_check]
dentist = dentist_true[0].unique()

In [75]:
#Converting all entries to dentist
df2[0] = df2[0].replace(dentist, 'Dentist/DDS')

In [76]:
#Classifying pharmacists
pharma_check = df2[0].str.contains('PHARM|RPH', case=False)
pharma_true = df2[pharma_check]
pharma = pharma_true[0].unique()

In [77]:
print(pharma)

['PHARMD' 'RPH' 'RPHD' 'PHARMACIST' 'MSPHARM' 'PHARMD/RPH' 'MPHARM'
 'BSPHARM' 'RPHBCACP' 'BSPHARMACY' 'BPHARM' 'REGISTEREDPHARMACIS'
 'DOCTOROFPHARMACY' 'MD/PHARMD' 'PHARMD;RPH' 'PHARM' 'PHARMDPHC'
 'RPHCALIFORNIA' 'DO;PHARMD' 'DRPH' 'BSCPHARM' 'BSCINPHARMACY'
 'PHARMD(PHARMACIST)' 'RPHMBA' 'CLINICALPHARMACIST' 'PHARMACISTMANAGER'
 'PHARMDCACP' 'PHARMDMBA' 'PHARMDRPH' 'RPARPH' 'PHARMD>' 'DORPH'
 'MDPHARMD' 'DOANDPHARMD' 'MDRPH' 'PHARMDBCPP' 'PHARMD(RPH)'
 'MBBCHIRPHDMHS' 'PHARMDBCPS' 'PHDRPH' 'MASTEROFPHARMACY'
 'LICENSEDPHARMACIST' 'RPHLICENSE' 'PHARMDR' 'RPHCDM' 'PHARMACYINTERN'
 'BSCPHARMACYRPH' 'BSOFPHARMACY' 'PHARMDCANDIDATE' 'BSCPHARMACY'
 'INTERNPHARMACIST' 'BSRPH' 'BSPHARMAXCY' 'PHARMD/MBA' 'RPHPHARMACIST'
 'REGPHARMACIST' 'MDDRPH' 'DOPHARMD' 'PHARMACISTLICENSE'
 'PHARMACYDOCTORATE' 'RPH(PHARMACIST)' 'MD(PHARMD' 'PHARMACIST(PHARMD)'
 'DOCTORATEOFPHARM']


In [78]:
df2[0] = df2[0].replace(pharma, 'Pharmacist')

In [79]:
#Classifying everything with MD
MD_check = df2[0].str.contains('MD|MEDICINE|MEDICAL', case=False)
MD_true = df2[MD_check]
MD = MD_true[0].unique()

In [80]:
print(MD)

['MD' 'MEDICINEDOCTORMD' 'MDPHD' 'MDGENERALSURGEON' 'MDFACS' 'NMDPMHNP'
 'MD/PHD' 'MD/MPH' 'MEDICALDOCTOR' 'NMD' 'MDMPH' 'MDFACC'
 'MDDIPLOMATEABFP' 'MDFACEP' 'MDLLC' 'MDFACP' 'MDFACOG' 'MDMBBCH'
 'MDANDINTERNIST' 'MDAMEDICALCORPOR' 'MDPA' 'MDINC' 'MDFRCS(C)FACSFICS'
 'MDCM' 'MDFAAFP' 'MD/MBBS' 'MDFAAP' 'MDMS' 'MDOPHTHALMOLOGIST' 'MD&DO'
 'MDMPHFACOEM' 'DOMD' 'MBBS(MD)' 'MDMBA' 'MEDICALSTUDENT' 'MD/PA'
 'MEDICALPHYSICIAN' 'MD;FACOG' 'MDMSC' 'MDMSCR' 'MDMACCDFACECDE' 'MDFCCP'
 'MD(MBBS)' 'MDMPHMASTERSINPU' 'MDPC' 'MD(ENDOCRINOLOGIST)' 'PHDIMD'
 'MDMACAE' 'MDBS' 'MDPHDMSCI' 'MD/MBA' 'OPHTHALMOLGISTMD' 'MDLAC' 'MDMPU'
 'MEDICALDIRECTOR' 'MDFRCPC' 'MD(PCINCORPORAT' 'MEDICINEDOCTOR' 'PRARMD'
 'MDPHDMRCP(UK)' 'MBBS(MDEQUIV)' 'OPMDPM' 'MDFACCFSCAI' 'MDMEPHDMA' 'MDMA'
 'MDBSC' 'MD:' 'HOWARDESSNERMD' 'MD&PHD' 'MDFRCS' 'MBBSMD' 'MD>'
 'MDMPHMBA' 'MD(H)' 'MDCANDIDATE' 'MD/DO' 'MDMRCPFRCR' 'MDFRCSC'
 'MDFACCFACP' 'MDJD' 'MDDABPMFIPPMBA' 'MDINST' 'MDM' 'MDDO' 'PSYCHIATRYMD'
 'MD;FACRRM;DCN;' 'FATIM

In [81]:
df2[0] = df2[0].replace(MD, 'Medical Doctor')

In [82]:
#Classifying everything with possible nursing abbreviations
nurse_check = df2[0].str.contains('NP|LPN|CNM|MSN|RN|NURS|APN|BSN', case=False)
nurse_true = df2[nurse_check]
nurse = nurse_true[0].unique()

In [83]:
print(nurse)

['FNP' 'DNP' 'APRN' ... 'MSNRNNPC' 'CFNPCDE' 'RNMSNHSAFNP']


In [84]:
df2[0] = df2[0].replace(nurse, 'Nurse')

In [85]:
#Classifying everything with PA
PA_check = df2[0].str.contains('PA|PHYSCIAN|PHYSICIAN', case=False)
PA_true = df2[PA_check]
PA = PA_true[0].unique()

In [86]:
print(PA)

['PA' 'RPA' 'PA/MPH' 'PHYSICIANASSISTANT' 'RPAC' 'MSPAS' 'PAC' 'MPAS'
 'PAS' 'MSPA' 'PHYSICIANASSISTAT' 'MPA' 'DPMPA' 'PAMPAS'
 'PHYSICIANSASSISTANT' 'MSPAC' 'EDDMBAPA' 'PHYSICIANSASST' 'PHYSCIAN'
 'PHYSICIANASSISTENT' 'GPA' 'MTPA' 'OPA' 'MPAP' 'PHYSICIANASST'
 'PA(PHYSICIANASSIST' 'MSPASPA' 'PACS' 'PHYSICIANSASSISTANC' 'MMSPA'
 'MPASPA' 'ODPA' 'PACCDE' 'PAPHYSICIANASST' 'PHYSICIAN' 'CPA' 'MSHSPA'
 'PAPHD' 'DOPA' 'MPAC' 'PAPHYSICIANASSISTA' 'PAMIDLEVELPRACTIT'
 'NDNATUROPATHICDO' 'PHYSICIANASSITANT' 'PA/C' 'DOCTOROFOSTEOPATHY'
 'PAMPH' 'MHSPA' 'PHYSICIANEXTENDER' 'PHYSICIANASSTCERTI' 'JOEPALMQUIST'
 'PACMS' 'PHYSICIANSASSTPA' 'PHYSICIANASSISANT' 'PAL' 'MPASPAC' 'PACMHSA'
 'PACERTIFIEDPAC' 'MPHPAC' 'DPMPAC' 'PACATC' 'PHYSICIANASSTC' 'PAERTIFIED'
 'PA/SA' 'PACMHS' 'APAC' 'PACPHYSICIANASSIST' 'RPACPHYSICIANASSIS' 'CRPA'
 'RPA(PHYSICIANAS' 'PHYSICIANASSIATANT' 'PHDPA' 'LPAC' 'PA_C'
 'FAMILYPHYSICIAN' 'MPAS/ATC/PAC' 'PAPHYSICIANASSIS' 'REGISTEREDPA'
 'PACMPAS' 'MS;PA' 'PHYSICIANASSTCERT' 'P

In [87]:
df2[0] = df2[0].replace(PA, 'Physican Assistant')

In [88]:
#Classifying everything with DPM
dpm_check = df2[0].str.contains('DPM|PODIA', case=False)
dpm_true = df2[dpm_check]
dpm = dpm_true[0].unique()

In [89]:
print(dpm)

['DPM' 'DPM/PHD' 'DPMFACFO' 'DPMCWS' 'DPMMPH' 'DPMPODIATRIST' 'DPMPC'
 'DPMINC' 'DPM/MPH' 'DPMPHD' 'DPMPT' 'DPMCWSFACFAOM' 'DPMMS' 'DPMPS'
 'DPMFACFAS' 'DPMFACFASCPED' 'DPMLLC' 'DPM/CWP' 'DPMABPS' 'DPMPLLC'
 'PODIATRISTDPM' 'DPMND' 'PODIATRIST' 'PHDPMHCNS' 'DPMMA' 'DCDPM'
 'DPMFACFS' 'DPMCWSP' 'DPMFACFASSC' 'DPMMHALLC' 'DPMMSME' 'DPMPODIATRY'
 'PODIATRY' 'DOCTOROFPODIATRY' 'DPMFACFAOM']


In [90]:
df2[0] = df2[0].replace(dpm, 'Podiatrist')

In [91]:
#Classifying everything with PA
eye_check = df2[0].str.contains('OD|OPTO', case=False)
eye_true = df2[eye_check]
eye = eye_true[0].unique()

In [92]:
print(eye)

['OD' 'Podiatrist' 'OPTOMETRYDOCTOR' 'DOCTOROFOPTOMETRY' 'OPTOMETRIST'
 'ODPC' 'ODFAAO' 'OD>' 'ODFCOVD' 'PERIODONTIST' 'ODLLC' 'ODOPTOMETRIST'
 'ODPLC' 'ODCHARTERED' 'OPTOMETIST' 'ODMBA' 'ODDROFOPTOMETRY' 'OD/MS'
 'ODMS' 'KENTUCKYPERIODONTAL' 'PHDOD' 'OPTOMERISTS' 'MOD' 'OPTOMETRIS'
 'ODFCOVDPC' 'OD&ASSOCPC' 'ODPSC' 'ODMSC' 'KENGOODELL' 'PROSTHODONTIST'
 'ODINC' 'KEVINRECKLEYOD' 'ANOPTOMETRICCORP' 'FRANKBONIFACIODO'
 'OPTOMETRY' 'MAXILLOPROSTHODONTIS' 'OD(OPTOMETRIST)' 'ODPHD' 'ODS'
 'ODMED' 'OPTOMOTRIST' 'OPTOMETRICDOCTOR' 'ODMPH' 'ODMBAFAAO'
 'ODOPTOMETRY' 'ODLTD' 'OD/MBA' 'ODAPC' 'MSOD' 'ODPS' 'OOD'
 'OPTOMETRISTOD' 'ENDODONTIST' 'ODPLLC' 'MELODYLYND' 'OPTOMETRYSTUDENT'
 'RODNEYHELM' 'PEARKARLSGODT' 'THERAPEUTICOPTOMETR']


In [93]:
df2[0] = df2[0].replace(eye, 'Optometrist')

In [94]:
#Classifying all values that appear less than 300 times as 'other'
fewer = df2.groupby(0).filter(lambda x:len(x) < 300)
few = fewer[0].unique()

In [95]:
df2[0] = df2[0].replace(few, 'Other')

In [96]:
#Looking at final groupings 
df2[0].value_counts()

Medical Doctor        603127
Nurse                 144359
Dentist/DDS           131158
Physican Assistant     85522
unknown                70738
DO                     69258
Optometrist            43618
Pharmacist              5141
Other                   4218
MBBS                    1791
MS                      1203
ND                       831
PHD                      784
CNS                      529
BDS                      319
DC                       302
Name: 0, dtype: int64

In [97]:
#Renaming this cleaned credential field to the orginal name for merging with other data
df3 = df2.rename(columns = {0: 'nppes_credentials'})

In [98]:
#Checking column name
df3.columns

Index(['npi', 'nppes_provider_last_org_name', 'nppes_provider_first_name',
       'nppes_provider_mi', 'nppes_provider_gender', 'nppes_entity_code',
       'nppes_provider_street1', 'nppes_provider_city', 'nppes_provider_zip5',
       'specialty_description', 'description_flag',
       'medicare_prvdr_enroll_status', 'total_claim_count',
       'total_30_day_fill_count', 'total_drug_cost', 'total_day_supply',
       'bene_count', 'ge65_suppress_flag', 'total_claim_count_ge65',
       'total_30_day_fill_count_ge65', 'total_drug_cost_ge65',
       'total_day_supply_ge65', 'bene_count_ge65_suppress_flag',
       'bene_count_ge65', 'brand_suppress_flag', 'brand_claim_count',
       'brand_drug_cost', 'generic_suppress_flag', 'generic_claim_count',
       'generic_drug_cost', 'other_suppress_flag', 'other_claim_count',
       'other_drug_cost', 'mapd_suppress_flag', 'mapd_claim_count',
       'mapd_drug_cost', 'pdp_suppress_flag', 'pdp_claim_count',
       'pdp_drug_cost', 'lis_suppress_fla

In [99]:
#Checking shape to confirm all rows still there
df3.shape

(1162898, 76)

In [100]:
#Looking at data before saving as new file
df3.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_city,nppes_provider_zip5,specialty_description,...,beneficiary_nondual_count,beneficiary_dual_count,beneficiary_average_risk_score,EXCLYear,REINYear,excl_type,exclusion_flag,country,state,nppes_credentials
0,1003000126,ENKESHAFI,ARDALAN,,M,I,900 SETON DR,CUMBERLAND,21502.0,Internal Medicine,...,143.0,91.0,2.1685,,,,0,US,MD,Medical Doctor
1,1003000142,KHALIL,RASHID,,M,I,4126 N HOLLAND SYLVANIA RD,TOLEDO,43623.0,Surgery,...,143.0,133.0,1.8029,,,,0,US,OH,Medical Doctor
2,1003000167,ESCOBAR,JULIO,E,M,I,5 PINE CONE RD,DAYTON,89403.0,Dentist,...,,,1.0598,,,,0,US,NV,Dentist/DDS
3,1003000175,REYES-VASQUEZ,BELINDA,,F,I,322 N AZUSA AVE STE 202,LA PUENTE,91744.0,Dentist,...,,,,,,,0,US,CA,Dentist/DDS
4,1003000282,BLAKEMORE,ROSIE,K,F,I,TENNESSEE PRISON FOR WOMEN,NASHVILLE,37243.0,Nurse,...,,,4.5148,,,,0,US,TN,Nurse


In [101]:
df3.to_csv('Data/Providers_Updated_430.txt', sep='\t', index = False)