# import packages and important functions

In [13]:
#used to stop program under certain conditions
from sys import exit

#dataframes and numerical processes
import pandas as pd
import numpy as np

#importing database from file or online
import csv
from urllib import request
from sodapy import Socrata

# visualization
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns

# if value is null (NaN/None), return if_nan_value; otherwise return value
def if_null_value(value,if_nan_value):
    """Return a fallback for null values.

    ``value`` is handed back unchanged unless ``pd.isnull`` reports it as
    null (e.g. ``None`` or NaN), in which case ``if_nan_value`` is returned.
    """
    if pd.isnull(value):
        return if_nan_value
    return value

# If the state is listed as 'ZZ', 'XX' or 'AP', then the zip code is null.
# Fill it with 1 if 'ZZ' (unknown location),
# 2 if 'XX' (not in the US),
# or 3 if 'AP' (Pacific armed forces).
def fill_zip_code(state,zipcode):
    """Return ``zipcode``, or a placeholder code when it is null.

    A non-null ``zipcode`` is returned unchanged. Otherwise the placeholder
    depends on ``state``: 1 for 'ZZ', 2 for 'XX', 3 for 'AP'. Any other
    state with a null zip code prints an error message and stops the
    program through ``exit()``.
    """
    # A zip code that is present never needs a placeholder.
    if not pd.isnull(zipcode):
        return zipcode

    # (state label, placeholder code) pairs, checked in order.
    placeholders = (('ZZ', 1), ('XX', 2), ('AP', 3))
    for label, placeholder in placeholders:
        if state == label:
            return placeholder

    # Null zip code with a state that has no placeholder: abort the run.
    print("ERROR: state: '{}' not recognized".format(state))
    exit()


#combines strings, used to combine first and last names with spaces between.
def combine_strings(*strings):
    """Join the ``str`` arguments with single spaces.

    Arguments that are not ``str`` instances (for example NaN or None) are
    skipped. With no string arguments the result is the empty string.
    """
    text_parts = [piece for piece in strings if isinstance(piece, str)]
    return ' '.join(text_parts)
    

# Create dataframe of population by zipcode
The population of a zip code is replaced with the sum of the populations of all zip codes in the same city.
Because these zip codes are geographically close, the city population is a better estimate of the population
of a health provider's clientele. Each zip code is linked to a city and state using a second database (see below). This also fills in many zip codes whose own population entry is empty.

Source: https://simplemaps.com/data/us-zips 
accessed: November 7, 2018

In [28]:
def create_pop_dataframe():
    """Load the zip-code table and replace each population with its city total.

    Reads 'Zip_code_to_address/uszipsv1.4.csv' (path relative to the current
    working directory), keeps the zip, city, state_id, lat, lng and
    population columns, and renames 'zip' to 'zipcode'. The population of
    every zip code is then replaced by the summed population of all zip
    codes sharing the same (state_id, city). When that sum is zero or
    unavailable, the zip code keeps its own population value.

    Returns
    -------
    tuple of (population_df, location_pop)
        population_df : one row per zip code with columns zipcode, city,
            state_id, lat, lng, population.
        location_pop : one row per (state_id, city) with the summed
            population.
    """
    # population from the 2010 census
    filename = 'Zip_code_to_address/uszipsv1.4.csv'
    kept_columns = ['zip', 'city', 'state_id', 'lat', 'lng', 'population']
    population_df = pd.read_csv(filename)[kept_columns]
    population_df = population_df.rename(columns={'zip': 'zipcode'})

    # Sum the population over all zip codes of the same city. Only the
    # 'population' column is aggregated: the grouping keys come back through
    # reset_index(), and a single label avoids tuple-style column selection,
    # which recent pandas versions reject.
    location_pop = (population_df
                    .groupby(['state_id', 'city'])['population']
                    .sum()
                    .reset_index())

    population_df = population_df.merge(location_pop, how='left',
                                        on=['state_id', 'city'])
    population_df = population_df.rename(
        columns={'population_x': 'population',
                 'population_y': 'population sum'})

    # A sum of zero means every zip code of that city had a null population,
    # and a missing sum means the row matched no (state_id, city) group; in
    # both cases keep the zip code's own value instead of the city total.
    city_total = population_df['population sum']
    use_city_total = city_total.notna() & (city_total != 0)
    population_df['population'] = city_total.where(use_city_total,
                                                   population_df['population'])

    population_df = population_df.drop(columns='population sum')
    return population_df, location_pop
    
# Script entry: only when this file is run directly, build the zip-code
# population table and the per-city population sums as module-level names.
if __name__ == '__main__':
    population_df,location_pop=create_pop_dataframe()

# Create dataframe for API

Here a dataframe is created for the percentage and number of opioid claims prescribed under Medicare Part D by health care providers. The data is cleaned and, using the previous dataset, the population of the area around each health care provider is set.

Source: https://data.cms.gov/Medicare-Claims/Medicare-Part-D-Opioid-Prescriber-Summary-File-201/yb2j-f3fp 
first access: Oct 30, 2018

In [30]:
def create_opioid_dataframe(population_df):
    """Download and clean the Medicare Part D opioid prescriber summary.

    The dataset is fetched from data.cms.gov through the Socrata API. The
    columns are renamed, first and last name are merged into 'doctor name',
    null zip codes are replaced with the placeholder codes produced by
    ``fill_zip_code``, every specialty description receives an integer
    'specialty index', and the columns of ``population_df`` are attached by
    matching 'zip code' against 'zipcode'. A short summary of the data is
    printed before rows without a population are dropped.

    Parameters
    ----------
    population_df : pandas.DataFrame
        Zip-code table with a 'zipcode' column convertible to int and a
        'population' column. The caller's frame is not modified.

    Returns
    -------
    tuple of (opioid_df, opioid_reduced)
        opioid_df : all providers whose zip code has a known population.
        opioid_reduced : the subset of those providers with a non-zero
            number of opioid claims.
    """
    WEBSITE = 'data.cms.gov'
    DATASET_ID = 'aksg-4qws'

    source_columns = ['npi',
                      'nppes_provider_first_name',
                      'nppes_provider_last_name',
                      'nppes_provider_zip_code',
                      'nppes_provider_state',
                      'specialty_description',
                      'total_claim_count',
                      'opioid_claim_count',
                      'percent_opioid_claims']
    # new names, in the same order as source_columns
    renamed_columns = ['doctor id',
                       'first name',
                       'last name',
                       'zip code',
                       'state',
                       'specialty description',
                       'total claims',
                       'opioid claims',
                       'percent opioid claims']

    client = Socrata(WEBSITE, None)
    try:
        results = client.get(DATASET_ID, content_type='json', limit=1500000)
    finally:
        # release the underlying HTTP session even if the download fails
        client.close()

    opioid_df = pd.DataFrame.from_records(results)[source_columns]
    opioid_df.columns = renamed_columns

    # set doctor id to the index
    # NOTE(review): the column-on-column merges below return a fresh default
    # index, so the doctor id is not present in the returned frames --
    # confirm whether it should be preserved.
    opioid_df = opioid_df.set_index('doctor id')

    # combine first and last name into a single column
    opioid_df['doctor name'] = opioid_df[['first name', 'last name']].apply(
        lambda row: combine_strings(*row), axis=1)
    opioid_df = opioid_df.drop(labels=['first name', 'last name'], axis=1)

    # Null zip codes get a placeholder chosen by fill_zip_code from the
    # state. Rows are read by label, not by position.
    opioid_df['zip code'] = opioid_df[['state', 'zip code']].apply(
        lambda row: fill_zip_code(row['state'], row['zip code']), axis=1)

    # Setting the specialty description index. unique() keeps first-appearance
    # order, so the same data always yields the same index (iterating a set
    # of strings does not guarantee a stable order between runs).
    specialties = opioid_df['specialty description'].unique()
    spec_df = pd.DataFrame({'specialty description': specialties,
                            'specialty index': list(range(len(specialties)))})
    opioid_df = opioid_df.merge(spec_df, how='left', on='specialty description')

    # Merging with population data. assign() builds a new frame so the
    # caller's population_df is left untouched.
    population_df = population_df.assign(
        zipcode=population_df['zipcode'].astype(int))
    opioid_df['zip code'] = opioid_df['zip code'].astype(int)
    opioid_df = opioid_df.merge(population_df, how='left',
                                left_on='zip code', right_on='zipcode')
    opioid_df = opioid_df.drop(['zipcode', 'state'], axis=1)

    # Null opioid claim counts and percentages are treated as zero, then the
    # claim columns are converted to numeric types.
    opioid_df['opioid claims'] = (opioid_df['opioid claims']
                                  .apply(lambda x: if_null_value(x, 0))
                                  .astype(int))
    opioid_df['percent opioid claims'] = (opioid_df['percent opioid claims']
                                          .apply(lambda x: if_null_value(x, 0))
                                          .astype(float))
    opioid_df['total claims'] = opioid_df['total claims'].astype(int)

    # Providers with at least one opioid claim.
    opioid_reduced = opioid_df[opioid_df['opioid claims'] != 0]

    # Quick analysis of the data and outlier detection
    n_providers = opioid_df['doctor name'].count()
    n_prescribers = opioid_reduced['doctor name'].count()
    n_missing_pop = opioid_df.loc[opioid_df['population'].isnull(),
                                  'zip code'].count()

    print('# health providers: {}'
          .format(n_providers))
    print('# health providers who prescribe opioid: {}'
          .format(n_prescribers))
    print('% health providers that prescribe opioids: {:.2f}%'
          .format(n_prescribers / n_providers * 100))
    print('# specialities: {}'.format(spec_df['specialty index'].count()))
    print('# specialities that prescribe opioid: {}'
          .format(opioid_reduced['specialty index'].nunique()))
    print('Max % prescriptions that are opioids: {:.2f}%'
          .format(opioid_reduced['percent opioid claims'].max()))
    print('Min % prescriptions that are opioids(other than zero): {:.2f}%'
          .format(opioid_reduced['percent opioid claims'].min()))
    print('Missing population for {} ({:.2f}%) entries'
          .format(n_missing_pop,
                  n_missing_pop / opioid_df['zip code'].count() * 100))

    # Get rid of the rows whose zip code has no known population
    opioid_df = opioid_df[opioid_df['population'].notnull()]
    opioid_reduced = opioid_reduced[opioid_reduced['population'].notnull()]
    return opioid_df, opioid_reduced
# Script entry: only when this file is run directly, download and clean the
# opioid prescriber data, using the module-level population_df for the
# zip-code populations.
if __name__ == '__main__':
    opioid_df,opioid_reduced = create_opioid_dataframe(population_df)



# health providers: 1049326
# health providers who prescribe opioid: 496744
% health providers that prescribe opioids: 47.34%
# specialities: 246
# specialities that prescribe opioid: 169
Max % prescriptions that are opioids: 100.00%
Min % prescriptions that are opioids(other than zero): 0.03%
Missing population for 1820 (0.17%) entries
