# import packages

In [1]:
import pandas as pd
import numpy as np
import csv
from urllib import request
from io import StringIO

# important functions

In [2]:
def if_null_value(value,if_nan_value):
    """Return ``if_nan_value`` when ``value`` is missing, otherwise ``value``.

    Args:
        value: Scalar to inspect.
        if_nan_value: Replacement returned when ``value`` is NaN or None.

    Returns:
        ``if_nan_value`` if ``value`` is a missing marker, else ``value`` unchanged.
    """
    # pd.isna, unlike np.isnan, accepts None and non-numeric scalars such as
    # strings instead of raising TypeError
    return if_nan_value if pd.isna(value) else value

def combine_strings(*strings):
    """Join the string arguments with single spaces between them.

    Arguments that are not ``str`` instances (for example NaN for a missing
    first or last name) are skipped. Returns '' when no argument is a string.
    """
    words = [piece for piece in strings if isinstance(piece, str)]
    return ' '.join(words)

# Create dataframe from file
The data file is not checked into GitHub; it can be downloaded from https://data.cms.gov/Medicare-Claims/Medicare-Part-D-Opioid-Prescriber-Summary-File-201/yb2j-f3fp.

In [3]:
filename = '2013_data/Medicare_Part_D_Opioid_Prescriber_Summary_File_2013.csv'

# (source CSV column, short name used in this notebook), in the column order
# the rest of the notebook expects
opioid_column_pairs = [('NPI', 'doctor id'),
                       ('NPPES Provider First Name', 'first name'),
                       ('NPPES Provider Last Name', 'last name'),
                       ('NPPES Provider ZIP Code', 'zip code'),
                       ('NPPES Provider State', 'state'),
                       ('Specialty Description', 'specialty description'),
                       ('Total Claim Count', 'total claims'),
                       ('Opioid Claim Count', 'opioid claims'),
                       ('Opioid Prescribing Rate', 'percent opioid claims')]
opioid_source_columns = [source for source, _ in opioid_column_pairs]

# usecols keeps read_csv from loading columns that are never used. It returns
# columns in file order, so re-select to get the listed order back.
# Renaming through an explicit mapping (rather than assigning .columns
# positionally) cannot silently mislabel a column.
# NOTE(review): ZIP codes are parsed with the default dtype inference; if they
# are read as integers, leading zeros are lost -- confirm whether that matters.
opioid_df = (pd.read_csv(filename, usecols=opioid_source_columns)[opioid_source_columns]
             .rename(columns=dict(opioid_column_pairs))
             .set_index('doctor id'))  # doctor id becomes the index

#combine first and last name into a single column
opioid_df['doctor name'] = opioid_df[['first name', 'last name']].apply(
    lambda row: combine_strings(*row), axis=1)
opioid_df = opioid_df.drop(labels=['first name', 'last name'], axis=1)

#setting specialty description index
# drop_duplicates keeps first-appearance order, so every specialty gets the
# same index on each run; the iteration order of a set of strings can change
# between interpreter runs.
unique_specs = opioid_df['specialty description'].drop_duplicates().tolist()
doc_spec = set(unique_specs)
spec_compreh = ((spec, idx) for idx, spec in enumerate(unique_specs))
spec_df = pd.DataFrame(spec_compreh, columns=['specialty description', 'specialty index'])
# Merging on a column ignores the left frame's index and returns a fresh
# integer index, which would discard 'doctor id'. Carry it through the merge
# as an ordinary column and restore it afterwards.
opioid_df = (opioid_df.reset_index()
             .merge(spec_df, how='left', on='specialty description')
             .set_index('doctor id'))

#Dropping doctors with NaN or zero opioid claims
# fillna(0) is the vectorized form of replacing each NaN with 0; it avoids a
# Python-level function call per row.
opioid_df['opioid claims'] = opioid_df['opioid claims'].fillna(0)
opioid_df['percent opioid claims'] = opioid_df['percent opioid claims'].fillna(0)
# keep only providers with at least one opioid claim
opioid_reduced = opioid_df[opioid_df['opioid claims'] != 0.0]

# Quick analysis of the data and outlier detection

In [29]:
# Row counts are taken on the 'doctor name' column of each frame.
provider_total = opioid_df['doctor name'].count()
prescriber_total = opioid_reduced['doctor name'].count()

# (message template, value) pairs, printed in order
summary_lines = [
    ('# health providers: {}', provider_total),
    ('# health providers who prescribe opioid: {}', prescriber_total),
    ('% health providers that prescribe opioids: {:.2f}%',
     prescriber_total/provider_total*100),
    ('# specialities: {}', spec_df['specialty index'].count()),
    ('# specialities that prescribe opioid: {}',
     opioid_reduced['specialty index'].nunique()),
    ('Max % prescriptions that are opioids: {}%',
     opioid_reduced['percent opioid claims'].max()),
    ('Min % prescriptions that are opioids(other than zero): {}%',
     opioid_reduced['percent opioid claims'].min()),
]
for template, value in summary_lines:
    print(template.format(value))

# health providers: 1049326
# health providers who prescribe opioid: 496744
% health providers that prescribe opioids: 47.34%
# specialities: 246
# specialities that prescribe opioid: 169
Max % prescriptions that are opioids: 100.0%
Min % prescriptions that are opioids(other than zero): 0.03%


# Create dataframe for API
Currently this uses only a small portion of the data. Unfortunately, the URL only provides 1000 entries.

In [17]:
#source: https://data.cms.gov/Medicare-Claims/Medicare-Part-D-Opioid-Prescriber-Summary-File-201/yb2j-f3fp
#Download the CSV from the URL and load it into a pandas dataframe; note that
#this is only a small portion of the data
# NOTE(review): the 1000-row cap may just be the API's default page size --
# check whether a query parameter such as `$limit` lifts it.
URL = 'https://data.cms.gov/resource/aksg-4qws.csv'
# the context manager closes the HTTP connection even if the read fails
with request.urlopen(URL) as response:
    content = response.read()
# keep only the columns used below, in the order the renaming step expects
opioid_api_df = pd.read_csv(StringIO(content.decode('utf-8')))[['npi',
                                                            'nppes_provider_first_name',
                                                            'nppes_provider_last_name',
                                                            'nppes_provider_state',
                                                            'specialty_description',
                                                            'nppes_provider_zip_code',
                                                            'opioid_claim_count',
                                                            'total_claim_count',
                                                            'percent_opioid_claims']]


# Rename the API columns to the short names used in this notebook and make
# 'doctor id' the index.
# NOTE(review): the CSV-based frame calls the total column 'total claims',
# while this one uses 'total claim count' -- confirm the difference is intended.
opioid_api_df = opioid_api_df.rename(columns={
    'npi': 'doctor id',
    'nppes_provider_first_name': 'first name',
    'nppes_provider_last_name': 'last name',
    'nppes_provider_state': 'state',
    'specialty_description': 'specialty description',
    'nppes_provider_zip_code': 'zip code',
    'opioid_claim_count': 'opioid claims',
    'total_claim_count': 'total claim count',
    'percent_opioid_claims': 'percent opioid claims',
}).set_index('doctor id')

# Build a single 'doctor name' column from the two name parts, then drop the parts.
opioid_api_df['doctor name'] = opioid_api_df[['first name', 'last name']].apply(
    lambda row: combine_strings(*row), axis=1)
opioid_api_df = opioid_api_df.drop(labels=['first name', 'last name'], axis=1)

#Dropping doctors with NaN or zero opioid claims
# fillna(0) is the vectorized form of replacing each NaN with 0; it avoids a
# Python-level function call per row.
opioid_api_df['opioid claims'] = opioid_api_df['opioid claims'].fillna(0)
opioid_api_df['percent opioid claims'] = opioid_api_df['percent opioid claims'].fillna(0)
# keep only providers with at least one opioid claim
opioid_api_reduced = opioid_api_df[opioid_api_df['opioid claims'] != 0.0]
# non-null counts per column, before and after the filter
print(opioid_api_df.count())
print(opioid_api_reduced.count())
print('Max % prescriptions that are opioids {}'
      .format(opioid_api_reduced['percent opioid claims'].max()))
print('Min % prescriptions that are opioids {} (Other than zero)'
      .format(opioid_api_reduced['percent opioid claims'].min()))


state                    1000
specialty description    1000
zip code                 1000
opioid claims            1000
total claim count        1000
percent opioid claims    1000
doctor name              1000
dtype: int64
state                    14
specialty description    14
zip code                 14
opioid claims            14
total claim count        14
percent opioid claims    14
doctor name              14
dtype: int64
Max % prescriptions that are opioids 0.5
Min % prescriptions that are opioids 0.055999999999999994 (Other than zero)
