## Importing all packages

In [7]:
from io import StringIO

import numpy as np
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

## Opioid Dispensing Rate
### - Scraping one table to get code working

In [8]:
# Fetch the CDC state-level opioid dispensing-rate page for 2019.
website_url= 'https://www.cdc.gov/drugoverdose/maps/rxstate2019.html'
# Without a timeout, requests waits indefinitely on an unresponsive server and stalls the kernel.
response = requests.get(website_url, timeout=30)
# Shown as the cell output so the HTTP status can be checked by eye.
response.status_code

200

In [9]:
# Parse the downloaded page with the lxml parser and print its <title> as a sanity check.
page_bytes = response.content
soup = BeautifulSoup(page_bytes, features='lxml')
print(soup.title)

<title>U.S. State Opioid Dispensing Rates, 2019  | Drug Overdose | CDC Injury Center </title>


In [10]:
# Narrow the search to <table> elements matched by the class string 'table table-striped'.
tables = soup.find_all('table', class_='table table-striped')

In [11]:
type(tables)

bs4.element.ResultSet

In [12]:
len(tables)

1

In [13]:
# pd.read_html returns a list of DataFrames parsed from the markup it is given.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
result_list = pd.read_html(StringIO(str(tables[0])))
len(result_list)

1

In [14]:
type(result_list)

list

In [15]:
# result_list is a list of DataFrames; pull out the first one and preview its top rows.
first_table_position = 0
world_soups = result_list[first_table_position]
world_soups.head()

Unnamed: 0,State,Abbreviation,Opioid Dispensing Rate per 100
0,United States,US,46.7
1,Alaska,AK,39.1
2,Alabama,AL,85.8
3,Arkansas,AR,80.9
4,Arizona,AZ,44.1


In [16]:
# Tag every row of the scraped table with the year it came from.
world_soups = world_soups.assign(Year=2019)

In [17]:
world_soups.head()

Unnamed: 0,State,Abbreviation,Opioid Dispensing Rate per 100,Year
0,United States,US,46.7,2019
1,Alaska,AK,39.1,2019
2,Alabama,AL,85.8,2019
3,Arkansas,AR,80.9,2019
4,Arizona,AZ,44.1,2019


### - Scraping all years 

In [18]:
# Scrape the dispensing-rate table for every year from 2006 through 2019 and stack them.
# Per-year frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop copies it on every pass.
yearly_frames = []
for year in range(2006, 2020):
    cdc_url = f'https://www.cdc.gov/drugoverdose/maps/rxstate{year}.html'
    # A timeout keeps one unresponsive page from hanging the whole run;
    # raise_for_status stops on an HTTP error instead of parsing an error page.
    response = requests.get(cdc_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    tables = soup.find_all('table', attrs = {'class': 'table table-striped'})
    if not tables:
        # Fail with the offending URL rather than a bare IndexError on tables[0].
        raise ValueError(f'No dispensing-rate table found at {cdc_url}')
    # StringIO wrapper: literal HTML strings are deprecated as read_html input.
    result_list = pd.read_html(StringIO(str(tables[0])))
    world_soups = result_list[0]
    world_soups['Year']=year

    yearly_frames.append(world_soups)

# concat keeps each frame's own row index and takes the union of columns,
# leaving NaN where a given year's table does not have a column.
all_years = pd.concat(yearly_frames)

In [19]:
all_years.head()

Unnamed: 0,State,State Abbreviation,Opioid Dispensing Rate per 100,Year,Abbreviation
0,Alabama,AL,115.6,2006,
1,Alaska,AK,63.4,2006,
2,Arizona,AZ,74.3,2006,
3,Arkansas,AR,98.3,2006,
4,California,CA,51.0,2006,


In [20]:
len(all_years)

717

In [21]:
## Collapse the two abbreviation columns into one: take "State Abbreviation"
## and fall back to "Abbreviation" for the rows where it is missing.
primary_abbr = all_years['State Abbreviation']
fallback_abbr = all_years.Abbreviation
all_years['my_state'] = primary_abbr.fillna(fallback_abbr)


In [22]:
all_years

Unnamed: 0,State,State Abbreviation,Opioid Dispensing Rate per 100,Year,Abbreviation,my_state
0,Alabama,AL,115.6,2006,,AL
1,Alaska,AK,63.4,2006,,AK
2,Arizona,AZ,74.3,2006,,AZ
3,Arkansas,AR,98.3,2006,,AR
4,California,CA,51.0,2006,,CA
...,...,...,...,...,...,...
47,Vermont,,36.9,2019,VT,VT
48,Washington,,42.7,2019,WA,WA
49,Wisconsin,,42.5,2019,WI,WI
50,West Virginia,,59.4,2019,WV,WV


In [23]:
# Keep only these four columns; the two source abbreviation columns are dropped.
kept_columns = ['State', 'Opioid Dispensing Rate per 100', 'Year', 'my_state']
all_years = all_years.loc[:, kept_columns]

In [24]:
all_years

Unnamed: 0,State,Opioid Dispensing Rate per 100,Year,my_state
0,Alabama,115.6,2006,AL
1,Alaska,63.4,2006,AK
2,Arizona,74.3,2006,AZ
3,Arkansas,98.3,2006,AR
4,California,51.0,2006,CA
...,...,...,...,...
47,Vermont,36.9,2019,VT
48,Washington,42.7,2019,WA
49,Wisconsin,42.5,2019,WI
50,West Virginia,59.4,2019,WV


In [25]:
# Write the combined dispensing-rate table to a CSV file (row index included).
disp_rate_csv = "DISP_RATE.csv"
all_years.to_csv(disp_rate_csv)

## Bringing in Cause of Death by state data
### - Imported openpyxl above to load in an xlsx file
### - Data is for dates 1999-2019

In [26]:
# Load the first worksheet of the grouped ICD-10 cause-of-death workbook.
cod_workbook = '../data/icd10_grouped.xlsx'
all_years_cod_df = pd.read_excel(cod_workbook, sheet_name=0)

In [27]:
# reading in data, excluding notes at bottom (after rows 11831)
#all_years_cod_df = pd.read_excel('../data/Indiviual_years_Overdose.xlsx',nrows=11831, sheet_name = 0)

In [28]:
# Convert Year to pandas' nullable integer dtype so it is no longer a float;
# values that cannot be parsed as numbers become <NA>.
year_numeric = pd.to_numeric(all_years_cod_df['Year'], errors='coerce')
all_years_cod_df['Year'] = year_numeric.astype('Int64')

In [29]:
# Replace every suppressed death count with 5.
# Suppressed counts stand for 0-9 deaths; a mathematician suggested using either 0
# or half the value of 10.
# The masked column is assigned back instead of calling mask(..., inplace=True) on
# all_years_cod_df['Deaths']: that form mutates a temporary selection, which pandas
# warns about and which leaves the frame unchanged once Copy-on-Write is enabled.
# The replacement stays the string '5'; a later cell converts Deaths with pd.to_numeric.
suppressed = all_years_cod_df['Deaths'] == 'Suppressed'
all_years_cod_df['Deaths'] = all_years_cod_df['Deaths'].mask(suppressed, '5')
all_years_cod_df

Unnamed: 0,State,Year,Multiple Cause of death,Multiple Cause of death Code,Deaths,Population,Crude Rate
0,Alabama,1999,Heroin,T40.1,5,4430141,Suppressed
1,Alabama,1999,Other opioids,T40.2,12,4430141,Unreliable
2,Alabama,1999,Methadone,T40.3,16,4430141,Unreliable
3,Alabama,1999,Other synthetic narcotics,T40.4,11,4430141,Unreliable
4,Alabama,1999,Cocaine,T40.5,26,4430141,0.6
...,...,...,...,...,...,...,...
6421,Wyoming,2019,Other opioids,T40.2,31,578759,5.4
6422,Wyoming,2019,Methadone,T40.3,5,578759,Suppressed
6423,Wyoming,2019,Other synthetic narcotics,T40.4,18,578759,Unreliable
6424,Wyoming,2019,Cocaine,T40.5,5,578759,Suppressed


In [30]:
# Drop the ICD code column and the source crude-rate column.
unused_columns = ['Multiple Cause of death Code', 'Crude Rate']
all_years_cod_df = all_years_cod_df.drop(columns=unused_columns)

In [31]:
# Relabel "Other and unspecified narcotics" entries as "Other narcotics".
old_label = "Other and unspecified narcotics"
new_label = "Other narcotics"
all_years_cod_df = all_years_cod_df.replace(old_label, new_label)

In [32]:
# Relabel "Other synthetic narcotics" entries as "Other narcotics".
old_label = "Other synthetic narcotics"
new_label = "Other narcotics"
all_years_cod_df = all_years_cod_df.replace(old_label, new_label)

In [33]:
# Convert Deaths to pandas' nullable integer dtype; unparseable values become <NA>.
deaths_numeric = pd.to_numeric(all_years_cod_df['Deaths'], errors='coerce')
all_years_cod_df['Deaths'] = deaths_numeric.astype('Int64')

In [34]:
# Sum the remaining columns per state, year, cause and population, so rows that
# share a cause label collapse into a single row.
# Population is part of the key, so it is carried through as a column rather than summed.
# The aggregation is one chained expression: binding all_years_cod_df to the
# intermediate GroupBy object left the name pointing at a non-DataFrame between
# statements, and passing np.sum to aggregate() is deprecated in recent pandas
# in favour of the built-in sum.
group_keys = ['State', 'Year', 'Multiple Cause of death', 'Population']
all_years_cod_df = all_years_cod_df.groupby(group_keys, as_index=False).sum()
all_years_cod_df

Unnamed: 0,State,Year,Multiple Cause of death,Population,Deaths
0,Alabama,1999,Cocaine,4430141,26
1,Alabama,1999,Heroin,4430141,5
2,Alabama,1999,Methadone,4430141,16
3,Alabama,1999,Other narcotics,4430141,16
4,Alabama,1999,Other opioids,4430141,12
...,...,...,...,...,...
5350,Wyoming,2019,Cocaine,578759,5
5351,Wyoming,2019,Heroin,578759,12
5352,Wyoming,2019,Methadone,578759,5
5353,Wyoming,2019,Other narcotics,578759,23


In [35]:
# Crude rate: deaths divided by the population expressed in units of 100,000.
population_per_100k = all_years_cod_df['Population'] / 100000
all_years_cod_df['Crude Rate'] = all_years_cod_df['Deaths'] / population_per_100k
all_years_cod_df

Unnamed: 0,State,Year,Multiple Cause of death,Population,Deaths,Crude Rate
0,Alabama,1999,Cocaine,4430141,26,0.586889
1,Alabama,1999,Heroin,4430141,5,0.112863
2,Alabama,1999,Methadone,4430141,16,0.361162
3,Alabama,1999,Other narcotics,4430141,16,0.361162
4,Alabama,1999,Other opioids,4430141,12,0.270872
...,...,...,...,...,...,...
5350,Wyoming,2019,Cocaine,578759,5,0.863917
5351,Wyoming,2019,Heroin,578759,12,2.073402
5352,Wyoming,2019,Methadone,578759,5,0.863917
5353,Wyoming,2019,Other narcotics,578759,23,3.97402


In [36]:
# Store Crude Rate as a plain float column.
rate_as_float = all_years_cod_df['Crude Rate'].astype('float')
all_years_cod_df = all_years_cod_df.assign(**{'Crude Rate': rate_as_float})

In [37]:
all_years_cod_df

Unnamed: 0,State,Year,Multiple Cause of death,Population,Deaths,Crude Rate
0,Alabama,1999,Cocaine,4430141,26,0.586889
1,Alabama,1999,Heroin,4430141,5,0.112863
2,Alabama,1999,Methadone,4430141,16,0.361162
3,Alabama,1999,Other narcotics,4430141,16,0.361162
4,Alabama,1999,Other opioids,4430141,12,0.270872
...,...,...,...,...,...,...
5350,Wyoming,2019,Cocaine,578759,5,0.863917
5351,Wyoming,2019,Heroin,578759,12,2.073402
5352,Wyoming,2019,Methadone,578759,5,0.863917
5353,Wyoming,2019,Other narcotics,578759,23,3.974020


In [38]:
# Round the crude rate to two decimal places.
rate_decimals = {'Crude Rate': 2}
all_years_cod_df = all_years_cod_df.round(decimals=rate_decimals)

In [39]:
all_years_cod_df

Unnamed: 0,State,Year,Multiple Cause of death,Population,Deaths,Crude Rate
0,Alabama,1999,Cocaine,4430141,26,0.59
1,Alabama,1999,Heroin,4430141,5,0.11
2,Alabama,1999,Methadone,4430141,16,0.36
3,Alabama,1999,Other narcotics,4430141,16,0.36
4,Alabama,1999,Other opioids,4430141,12,0.27
...,...,...,...,...,...,...
5350,Wyoming,2019,Cocaine,578759,5,0.86
5351,Wyoming,2019,Heroin,578759,12,2.07
5352,Wyoming,2019,Methadone,578759,5,0.86
5353,Wyoming,2019,Other narcotics,578759,23,3.97


In [40]:
# Write the aggregated cause-of-death table to a CSV file (row index included).
cause_of_death_csv = "CAUSE_OF_DEATH1.csv"
all_years_cod_df.to_csv(cause_of_death_csv)

### Bringing in illicit vs rx COD

In [43]:
# This is age-adjusted data; sheet 0 is loaded as rx_cod and sheet 1 as ill_cod.
rx_ill_workbook = '../data/illicit_rx_sep_files.xlsx'
rx_cod, ill_cod = (
    pd.read_excel(rx_ill_workbook, sheet_name=sheet_position)
    for sheet_position in (0, 1)
)

In [44]:
# Mark every row of the illicit frame with the source label 'IL'.
ill_cod = ill_cod.assign(rx_ill='IL')

In [46]:
ill_cod

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Age Adjusted Rate,rx_ill
0,Alabama,1999,27,4430141,0.6,0.6,IL
1,Alabama,2000,18,4447100,Unreliable,Unreliable,IL
2,Alabama,2001,34,4467634,0.8,0.8,IL
3,Alabama,2002,33,4480089,0.7,0.7,IL
4,Alabama,2003,27,4503491,0.6,0.6,IL
...,...,...,...,...,...,...,...
1066,Wyoming,2015,Suppressed,586107,Suppressed,Suppressed,IL
1067,Wyoming,2016,10,585501,Unreliable,Unreliable,IL
1068,Wyoming,2017,Suppressed,579315,Suppressed,Suppressed,IL
1069,Wyoming,2018,Suppressed,577737,Suppressed,Suppressed,IL


In [45]:
# Mark every row of the prescription frame with the source label 'RX', then display it.
rx_cod = rx_cod.assign(rx_ill='RX')
rx_cod

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Age Adjusted Rate,rx_ill
0,Alabama,1999,37,4430141,0.8,0.8,RX
1,Alabama,2000,46,4447100,1.0,1.1,RX
2,Alabama,2001,66,4467634,1.5,1.5,RX
3,Alabama,2002,74,4480089,1.7,1.7,RX
4,Alabama,2003,53,4503491,1.2,1.2,RX
...,...,...,...,...,...,...,...
1066,Wyoming,2015,45,586107,7.7,7.7,RX
1067,Wyoming,2016,44,585501,7.5,7.7,RX
1068,Wyoming,2017,48,579315,8.3,8.7,RX
1069,Wyoming,2018,38,577737,6.6,6.3,RX


### merging illicit and rx cod files

In [None]:
# Stack the prescription and illicit frames, then order the rows by state and year.
# The sorted frame is assigned back: sort_values returns a new frame, so calling it
# without assignment only displayed a sorted view and left rx_ill_cod (and the CSV
# written from it later) in concatenation order.
rx_ill_cod = pd.concat([rx_cod, ill_cod]).sort_values(['State', 'Year'])
rx_ill_cod

In [None]:
# Replace every suppressed death count with 5.
# Suppressed counts stand for 0-9 deaths; a mathematician suggested using either 0
# or half the value of 10.
# The masked column is assigned back instead of calling mask(..., inplace=True) on
# rx_ill_cod['Deaths']: that form mutates a temporary selection, which pandas warns
# about and which leaves the frame unchanged once Copy-on-Write is enabled.
# The replacement stays the string '5'; the next cell casts Deaths to int64.
suppressed = rx_ill_cod['Deaths'] == 'Suppressed'
rx_ill_cod['Deaths'] = rx_ill_cod['Deaths'].mask(suppressed, '5')
rx_ill_cod

In [None]:
# Cast the Deaths column to 64-bit integers.
deaths_dtype = {'Deaths': 'int64'}
rx_ill_cod = rx_ill_cod.astype(dtype=deaths_dtype)

In [None]:
# Overwrite Crude Rate with deaths divided by the population in units of 100,000.
population_per_100k = rx_ill_cod['Population'] / 100000
rx_ill_cod['Crude Rate'] = rx_ill_cod['Deaths'] / population_per_100k
rx_ill_cod

In [None]:
# Round the crude rate to two decimal places; other columns are left as they are.
rate_decimals = {'Crude Rate': 2}
rx_ill_cod = rx_ill_cod.round(decimals=rate_decimals)

In [None]:
rx_ill_cod

In [None]:
# Write the combined prescription/illicit table to a CSV file (row index included).
rx_ill_csv = "RX_ILL_COD.csv"
rx_ill_cod.to_csv(rx_ill_csv)

### Creating a dataframe of when each state's PDMP started

In [None]:
# Load only the jurisdiction column and the answer to question 1.4 from the PDMP file.
pdmp_columns = [
    'Jurisdiction',
    '1.4. When did the PDMP start receiving prescription data electronically?',
]
pdmp = pd.read_csv('../data/PDMP_info.csv', usecols=pdmp_columns)

In [None]:
pdmp

In [None]:
# Give the two columns the short names State and Year.
short_names = {
    'Jurisdiction': 'State',
    '1.4. When did the PDMP start receiving prescription data electronically?': 'Year',
}
pdmp = pdmp.rename(columns=short_names)

In [None]:
# Parse the Year column into datetime values.
parsed_dates = pd.to_datetime(pdmp['Year'])
pdmp = pdmp.assign(Year=parsed_dates)

In [None]:
# Reduce each datetime in Year to its calendar year.
year_only = pdmp['Year'].dt.year
pdmp = pdmp.assign(Year=year_only)

In [None]:
# Write the PDMP table to a CSV file (row index included).
pdmp_csv = "PDMP.csv"
pdmp.to_csv(pdmp_csv)

### Creating a dataframe for laws

In [None]:
laws={2000:''}