In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the CDC table of 12 month-ending provisional drug overdose death counts (and percent change).
cdc_data = pd.read_csv('https://data.cdc.gov/api/views/xkb8-kh2a/rows.csv?accessType=DOWNLOAD&bom=true&format=true')
# Before 'Period' is discarded, report how many rows carry any other (non-null) period label.
other_periods = cdc_data.loc[cdc_data['Period'] != '12 month-ending', 'Period']
print(f"Number of periods not '12 month-ending': {other_periods.count()}")
# Remove the columns that the cells below do not use.
cdc_data = cdc_data.drop(columns=['Period', 'Footnote', 'Footnote Symbol', 'Predicted Value', 'State Name'])

### Exploratory Data Analysis

In [None]:
# Structural overview of the loaded frame: column names first, then the summary from DataFrame.info().
print(cdc_data.columns)
cdc_data.info()
# Optional per-column frequency checks, left disabled.
# NOTE: 'State Name' is dropped in the loading cell, so that check raises a KeyError if re-enabled as-is.
#print(cdc_data.loc[:,'State'].value_counts(dropna=False))
#print(cdc_data.loc[:,'State Name'].value_counts(dropna=False))
#print(cdc_data.loc[:,'Year'].value_counts(dropna=False))
#print(cdc_data.loc[:,'Month'].value_counts(dropna=False))

In [None]:
# Frequency tables (NaN included) for two of the columns, followed by the first rows of the frame.
print(cdc_data['Indicator'].value_counts(dropna=False))
print(cdc_data['Percent Complete'].value_counts(dropna=False))
#print(cdc_data['Percent Pending Investigation'].value_counts(dropna=False))
print(cdc_data.head(n=5))

**Explore the numeric data, grouped by indicators - do they add up?**

## Visual Exploratory Data Analysis

In [None]:
# Index the frame by date: the 'Year' and 'Month' columns are concatenated and parsed with '%Y%B',
# giving the time axis for the stacked plot of deaths by state.
year_text = cdc_data['Year'].astype('str')
cdc_data.index = pd.to_datetime(year_text + cdc_data['Month'], format='%Y%B')
cdc_data = cdc_data.drop(columns=['Year', 'Month'])

# Keep only the 'Number of Drug Overdose Deaths' rows, turn the comma-formatted strings in
# 'Data Value' into floats, name the index 'Date' and order the rows by date, then state.
is_overdose_total = cdc_data['Indicator'] == 'Number of Drug Overdose Deaths'
deaths_by_state = (
    cdc_data.loc[is_overdose_total, ['State', 'Data Value']]
    .assign(**{'Data Value': lambda frame: frame['Data Value'].str.replace(',', '').astype(float)})
    .rename_axis('Date')
    .sort_values(by=['Date', 'State'])
)

In [None]:
# Pivot on State: one row per date, one column per 'State' value.
deaths_by_state2 = deaths_by_state.pivot(columns='State')
# pivot() leaves a two-level column index ('Data Value', <state>); keep only the state level.
deaths_by_state2.columns = deaths_by_state2.columns.droplevel(0)
# Check whether the 'US' column matches the total of all the other columns.
# The row sum includes 'US' itself, so the ratio is exactly 2.0 when 'US' equals the sum of the rest.
df_check = deaths_by_state2.sum(axis=1)/deaths_by_state2['US']
# .all() requires every row to satisfy the check. The previous .count() comparison only counted
# entries of the boolean Series, so it passed regardless of the ratio values.
if (df_check == 2.0).all():
    # 'US' is the total of the rest: drop it so the stacked plot does not double count.
    deaths_by_state2.drop(columns=['US'], inplace=True)
deaths_by_state2.head()

In [None]:
# Swap the DatetimeIndex for short "Mon-YYYY" text labels (index named 'Month_Year') so the
# bar chart gets tidy tick labels.
month_year_labels = deaths_by_state2.index.strftime("%b-%Y").rename('Month_Year')
deaths_by_state2 = deaths_by_state2.set_index(month_year_labels)
deaths_by_state2.head()

In [None]:
# Stacked bar chart: one bar per month, one segment per column of deaths_by_state2.
ax = deaths_by_state2.plot.bar(stacked=True, figsize=(16,8), legend=False)
ax.set_ylabel('Number of Drug Overdose Deaths')
ax.set_title('CDC data')
#ax.legend(loc='upper right', labels=deaths_by_state2.columns)
plt.show()

Explore the percent complete fields

In [None]:
# Count the rows whose 'Percent Complete' string is anything other than '100'.
pct_complete_raw = cdc_data['Percent Complete']
print(f"Number of not complete: {pct_complete_raw[pct_complete_raw != '100'].count()}")
print(pct_complete_raw.unique())
# Replace the literal '99.5+' with '99.5' so the whole column can be cast to float.
cdc_data['Percent Complete'] = (
    pct_complete_raw
    .str.replace('99.5+', '99.5', regex=False)
    .astype('float')
)
print(cdc_data['Percent Complete'].unique())
# Completed plus pending percentages; list the rows where the total is more than 1.0 away from 100.
cdc_data['Total_Percent'] = cdc_data['Percent Complete'] + cdc_data['Percent Pending Investigation']
off_by_more_than_one = (cdc_data['Total_Percent'] - 100.0).abs() > 1.0
incomplete_data = cdc_data.loc[off_by_more_than_one, ['State', 'Total_Percent']]
print(incomplete_data)

Map the rather verbose indicator descriptions into something more concise

In [None]:
#print(cdc_data['Indicator'].unique())
# Short display labels for the verbose CDC indicator descriptions.
# NOTE(review): several labels spell "opiods"; they are left untouched because later cells may
# filter on these exact strings - confirm before correcting the spelling.
indicator_map = {'Opioids (T40.0-T40.4,T40.6)' : 'Opiods',
                 'Natural & semi-synthetic opioids, incl. methadone (T40.2, T40.3)' : 'Natural and semi-synthetic opiods',
                 'Cocaine (T40.5)' : 'Cocaine',
                 'Percent with drugs specified' : '% drug specified',
                 'Psychostimulants with abuse potential (T43.6)' : 'Psychostimulants',
                 'Heroin (T40.1)' : 'Heroin',
                 'Number of Deaths' : 'Number of Deaths',
                 'Natural, semi-synthetic, & synthetic opioids, incl. methadone (T40.2-T40.4)' : 'All opioids',
                 'Natural & semi-synthetic opioids (T40.2)' : 'Natural and semi-synthetic opiods (2)',
                 'Number of Drug Overdose Deaths' : 'Number of Drug Overdose Deaths',
                 'Methadone (T40.3)' : 'Methadone',
                 'Synthetic opioids, excl. methadone (T40.4)' : 'Synthetic opiods'}
# Series.map(dict) yields NaN for any value that is not a key. Falling back to the existing label
# keeps indicators that are missing from the map and makes re-running this cell harmless
# (labels that were already shortened are no longer replaced by NaN).
indicator_labels = cdc_data['Indicator']
cdc_data['Indicator'] = indicator_labels.map(indicator_map).fillna(indicator_labels)
#print(cdc_data.head())

### Determine the annual drug overdose deaths

In [None]:
# Rows of deaths_by_state for the 'US' entry only.
overdose_deaths_us = deaths_by_state.loc[deaths_by_state['State'] == 'US']
# TODO is the arbitrary choice of October for annual numbers the best (correct) choice?
october_rows = overdose_deaths_us.loc[overdose_deaths_us.index.month == 10]
# Re-index the October rows by calendar year and keep a single, descriptively named value column.
annual_deaths = (
    october_rows
    .assign(Year=october_rows.index.year)
    .set_index('Year')
    .drop(columns=['State'])
    .rename(columns={'Data Value': 'Overdose deaths'})
)
print(annual_deaths)

In [None]:
# Bar chart of the annual figures, one bar per year.
ax = annual_deaths.plot.bar(legend=False)
ax.set_ylabel('Number of overdose deaths in US')
plt.show()

**Hypothesis to test**: The annual number of deaths due to drug overdoses in the US has fallen from 2017 to 2018.

The CDC numbers indicate that there were 68,500 drug overdose deaths in the US in 2018, down from 72,000 the previous year.
The CDC data show that overdose deaths from fentanyl, synthetic opioids, cocaine and methamphetamines are still increasing.

*What is the set of observations that we can bootstrap?*

*What is the test statistic?*