In [1]:
from copy import deepcopy
import pandas as pd
import numpy as np
import re

In [2]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
nadac_df = pd.read_csv('/Users/charlieyaris/NADAC__National_Average_Drug_Acquisition_Cost_.csv')

FileNotFoundError: File b'/Users/charlieyaris/NADAC__National_Average_Drug_Acquisition_Cost_.csv' does not exist

In [5]:
nadac_df.columns

Index(['NDC Description', 'NDC', 'NADAC_Per_Unit', 'Effective_Date',
       'Pricing_Unit', 'Pharmacy_Type_Indicator', 'OTC', 'Explanation_Code',
       'Classification_for_Rate_Setting',
       'Corresponding_Generic_Drug_NADAC_Per_Unit',
       'Corresponding_Generic_Drug_Effective_Date', 'As of Date'],
      dtype='object')

In [6]:
nadac_df.head()

Unnamed: 0,NDC Description,NDC,NADAC_Per_Unit,Effective_Date,Pricing_Unit,Pharmacy_Type_Indicator,OTC,Explanation_Code,Classification_for_Rate_Setting,Corresponding_Generic_Drug_NADAC_Per_Unit,Corresponding_Generic_Drug_Effective_Date,As of Date
0,SIMVASTATIN 80 MG TABLET,16714068502,0.08,03/23/2016,EA,C/I,N,1,G,,,04/13/2016
1,CARVEDILOL 12.5 MG TABLET,68382009405,0.033,03/23/2016,EA,C/I,N,1,G,,,04/13/2016
2,CLONAZEPAM 1 MG TABLET,93083310,0.026,03/23/2016,EA,C/I,N,1,G,,,04/13/2016
3,NAPHCON-A EYE DROPS,65008542,0.567,12/23/2015,ML,C/I,Y,"4, 5, 6",B,,,04/13/2016
4,OFLOXACIN 0.3% EAR DROPS,24208041005,17.279,02/17/2016,ML,C/I,N,"4, 5",G,,,04/13/2016


In [7]:
# created new time columns in datetime format for groupby dataframes and visualizations.
nadac_df['Effective_Date'] = pd.to_datetime(nadac_df['Effective_Date'])
nadac_df['month_year'] = nadac_df['Effective_Date'].dt.strftime('%Y-%m')
nadac_df['month'] = nadac_df['Effective_Date'].dt.month
nadac_df['year'] = nadac_df['Effective_Date'].dt.year

In [8]:
nadac_df.to_pickle('nadac_df_date.pkl')

In [9]:
nadac_df = pd.read_pickle('nadac_df_date.pkl')

In [10]:
# created nadac_year_df for visualizations in visualization notebook.
aggregations = {
    'NADAC_Per_Unit': 'sum',
    }

nadac_year_df = nadac_df.groupby(['year', 'OTC']).agg(aggregations).reset_index()

In [11]:
nadac_year_df.to_pickle('../Pickles/nadac_year_df.pkl')

In [12]:
# sorted drugs by effective_date to enable future removing of duplicates.
nadac_df = deepcopy(nadac_df.sort_values(by = 'Effective_Date', ascending = False))

In [13]:
# removed duplicate rows of drugs
# so that only the least expensive version of each drug was kept in the dataframe.
duplicate_list = ['NDC Description', 'NADAC_Per_Unit',
                  'Pricing_Unit', 'Pharmacy_Type_Indicator', 'OTC', 'Explanation_Code',
                  'Classification_for_Rate_Setting',
                  'Corresponding_Generic_Drug_NADAC_Per_Unit']

nadac_df.drop_duplicates(subset = duplicate_list, keep = 'first', inplace = True)

In [14]:
# continued removing duplicate rows of drugs
# so that only the least expensive versions of each drug were kept in the dataframe.
nadac_df.drop_duplicates(subset = 'NDC Description', keep = 'first', inplace = True)

In [15]:
nadac_df = deepcopy(nadac_df.reset_index())

In [16]:
# fixed formatting in ndc_drug_name column.
nadac_df['ndc_drug_name'] = deepcopy([re.split('\ (\d+)', s)[0] for s in nadac_df['NDC Description']])

In [17]:
# fixed formatting in ndc_drug_dose column.
nadac_df['ndc_drug_dose'] = deepcopy([re.split('\ (\d+)', s)[1:] for s in nadac_df['NDC Description']])
nadac_df['ndc_drug_dose'] = deepcopy(([s.strip("[") for s in nadac_df['ndc_drug_dose'].astype(str)]))
nadac_df['ndc_drug_dose'] = deepcopy(([s.strip("]") for s in nadac_df['ndc_drug_dose'].astype(str)]))
for i, s in enumerate(nadac_df['ndc_drug_dose']):
    nadac_df['ndc_drug_dose'][i] = deepcopy(''.join(s.split("', '")))
nadac_df['ndc_drug_dose'] = deepcopy(([s.strip("'") for s in nadac_df['ndc_drug_dose'].astype(str)]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
# dropped columns from the dataframe that will no longer be needed for identification or analysis.
nadac_df.drop(['index', 'NDC Description', 'Pharmacy_Type_Indicator', 'As of Date',
               'Classification_for_Rate_Setting', 'Corresponding_Generic_Drug_NADAC_Per_Unit',
               'Corresponding_Generic_Drug_Effective_Date'], axis = 1, inplace = True)

In [19]:
# renamed nadac_df columns for clarity in analysis following future join.
nadac_df.rename({'NDC': 'ndc_code', 'NADAC_Per_Unit': 'nadac_cost_per_unit',
                 'Effective_Date': 'nadac_effective_date', 'Pricing_Unit': 'nadac_pricing_unit',
                 'OTC': 'over_the_counter', 'Explanation_Code': 'data_source_code'},
                axis = 1, inplace = True)

In [20]:
nadac_df.head()

Unnamed: 0,ndc_code,nadac_cost_per_unit,nadac_effective_date,nadac_pricing_unit,over_the_counter,data_source_code,month_year,month,year,ndc_drug_name,ndc_drug_dose
0,143122710,0.166,2018-10-03,EA,N,1,2018-10,10,2018,DICYCLOMINE,20 MG TABLET
1,31722059730,5.432,2018-10-03,EA,N,1,2018-10,10,2018,RITONAVIR,100 MG TABLET
2,51991070505,0.023,2018-10-03,EA,N,1,2018-10,10,2018,ALPRAZOLAM,0.5 MG TABLET
3,42192033001,0.38,2018-10-03,EA,N,1,2018-10,10,2018,NP THYROID,60 MG TABLET
4,64980032802,3.067,2018-10-03,GM,N,"3, 5",2018-10,10,2018,ERYTHROMYCIN-BENZOYL GEL,


In [21]:
# replaced nadac_df with new version, grouping by ndc_drug_name and over_the_counter
# for future visualizations.
aggregations = {
    'nadac_cost_per_unit': 'mean',
    }

nadac_df = deepcopy(nadac_df.groupby(['ndc_drug_name', 'over_the_counter']).agg(aggregations).reset_index())

In [22]:
nadac_df['nadac_cost_per_unit'] = nadac_df2['nadac_cost_per_unit'].astype(float)

In [23]:
nadac_df.to_pickle('../Pickles/nadac_df.pkl')

In [3]:
nadac_df.info()