In [66]:
from io import StringIO

import numpy as np
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

### Scraping a single table (2019) to get the code working

In [20]:
# Fetch the CDC page with the 2019 state-level opioid dispensing rates.
website_url = 'https://www.cdc.gov/drugoverdose/maps/rxstate2019.html'
# A timeout keeps the kernel from hanging indefinitely on a stalled connection.
response = requests.get(website_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page further down.
response.raise_for_status()
response.status_code

200

In [21]:
# Parse the downloaded page with the lxml backend, then print its <title>
# as a quick check that the expected page came back.
soup = BeautifulSoup(response.content, features='lxml')
print(soup.title)

<title>U.S. State Opioid Dispensing Rates, 2019  | Drug Overdose | CDC Injury Center </title>


In [64]:
# Narrow the search to <table> elements whose class attribute is 'table table-striped'.
tables = soup.find_all('table', class_='table table-striped')

In [23]:
# Check what kind of container find_all returned.
type(tables)

bs4.element.ResultSet

In [24]:
# Count the matching tables; the parsing step below reads tables[0].
len(tables)

1

In [25]:
# pd.read_html returns a list of DataFrames, one per <table> found in the markup.
# The markup is wrapped in StringIO because passing a literal HTML string
# directly to read_html is deprecated in recent pandas releases.
result_list = pd.read_html(StringIO(str(tables[0])))
len(result_list)

1

In [26]:
# Check the container type returned by pd.read_html.
type(result_list)

list

In [27]:
# `world_soups` holds the dispensing-rate table parsed from the 2019 page.
# NOTE(review): the name looks like a leftover from another exercise; later
# cells reference it, so it is kept as-is here.
world_soups = result_list[0]  # get the first df from the list
world_soups.head()

Unnamed: 0,State,Abbreviation,Opioid Dispensing Rate per 100
0,United States,US,46.7
1,Alaska,AK,39.1
2,Alabama,AL,85.8
3,Arkansas,AR,80.9
4,Arizona,AZ,44.1


In [28]:
# Tag every row with the year of the page it came from (the URL above is the 2019 page).
world_soups['Year']=2019

In [29]:
# Preview the table again now that the Year column has been added.
world_soups.head()

Unnamed: 0,State,Abbreviation,Opioid Dispensing Rate per 100,Year
0,United States,US,46.7,2019
1,Alaska,AK,39.1,2019
2,Alabama,AL,85.8,2019
3,Arkansas,AR,80.9,2019
4,Arizona,AZ,44.1,2019


### Scraping all years 

In [52]:
def _scrape_dispensing_table(year, timeout=30):
    """Download and parse the CDC state opioid dispensing table for one year.

    Parameters
    ----------
    year : int
        Year substituted into the CDC page URL (``rxstate{year}.html``).
    timeout : float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        The first 'table table-striped' table on the page, with an added
        'Year' column set to ``year``.

    Raises
    ------
    requests.HTTPError
        If the page responds with a 4xx/5xx status.
    ValueError
        If the page contains no matching table.
    """
    cdc_url = f'https://www.cdc.gov/drugoverdose/maps/rxstate{year}.html'
    response = requests.get(cdc_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    tables = soup.find_all('table', attrs={'class': 'table table-striped'})
    if not tables:
        # Give a clear message instead of an opaque IndexError on tables[0].
        raise ValueError(f'No dispensing-rate table found at {cdc_url}')

    # Wrap the markup in StringIO: literal HTML strings are deprecated in read_html.
    year_df = pd.read_html(StringIO(str(tables[0])))[0]
    year_df['Year'] = year
    return year_df


# Collect one frame per year and concatenate once at the end. DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop copies it on
# every iteration.
yearly_frames = []
for year in range(2006, 2020):
    yearly_frames.append(_scrape_dispensing_table(year))

# The per-year row index is kept (no ignore_index), matching what the append-based
# loop produced. Column names are not normalized here because the pages do not
# all use the same header for the abbreviation column.
all_years = pd.concat(yearly_frames)

In [53]:
# Preview the combined multi-year table. The 2006 rows carry the code under
# 'State Abbreviation' while the 2019 rows use 'Abbreviation', so each of those
# columns is NaN for part of the frame.
all_years.head()

Unnamed: 0,State,State Abbreviation,Opioid Dispensing Rate per 100,Year,Abbreviation
0,Alabama,AL,115.6,2006,
1,Alaska,AK,63.4,2006,
2,Arizona,AZ,74.3,2006,
3,Arkansas,AR,98.3,2006,
4,California,CA,51.0,2006,
...,...,...,...,...,...
47,Vermont,,36.9,2019,VT
48,Washington,,42.7,2019,WA
49,Wisconsin,,42.5,2019,WI
50,West Virginia,,59.4,2019,WV


In [54]:
# Total row count across all scraped years.
len(all_years)

717

In [56]:
# The scraped pages name the abbreviation column inconsistently: some rows carry
# the code under 'State Abbreviation' and others under 'Abbreviation'. Take the
# first where present and fall back to the second, so every row gets a single
# 'my_state' code. This replaces an all-NaN placeholder column that was then
# filled twice.
# fillna aligns on the index; both columns come from the same frame, so rows line up.
all_years['my_state'] = all_years['State Abbreviation'].fillna(all_years['Abbreviation'])


In [60]:
# Inspect the frame to check that my_state is populated for both early and late years.
all_years

Unnamed: 0,State,State Abbreviation,Opioid Dispensing Rate per 100,Year,Abbreviation,my_state
0,Alabama,AL,115.6,2006,,AL
1,Alaska,AK,63.4,2006,,AK
2,Arizona,AZ,74.3,2006,,AZ
3,Arkansas,AR,98.3,2006,,AR
4,California,CA,51.0,2006,,CA
...,...,...,...,...,...,...
47,Vermont,,36.9,2019,VT,VT
48,Washington,,42.7,2019,WA,WA
49,Wisconsin,,42.5,2019,WI,WI
50,West Virginia,,59.4,2019,WV,WV


In [62]:
# Keep only the columns used from here on; the two source abbreviation columns
# are dropped because my_state now holds the state code.
all_years = all_years.loc[
    :, ['State', 'Opioid Dispensing Rate per 100', 'Year', 'my_state']
]

In [63]:
# Display the trimmed frame.
all_years

Unnamed: 0,State,Opioid Dispensing Rate per 100,Year,my_state
0,Alabama,115.6,2006,AL
1,Alaska,63.4,2006,AK
2,Arizona,74.3,2006,AZ
3,Arkansas,98.3,2006,AR
4,California,51.0,2006,CA
...,...,...,...,...
47,Vermont,36.9,2019,VT
48,Washington,42.7,2019,WA
49,Wisconsin,42.5,2019,WI
50,West Virginia,59.4,2019,WV


### Bringing in cause-of-death data by state
#### - Imported openpyxl above to load in an xlsx file
#### - Data is for 1999-2019

In [80]:
# Read sheets 1-7 of the cause-of-death workbook in a single pass. With a list
# for sheet_name, read_excel returns a dict keyed by the requested sheet
# positions, so the file is opened and parsed once rather than seven times.
cod_workbook_path = 'data/Cause_of_death.xlsx'
cod_sheets = pd.read_excel(cod_workbook_path, sheet_name=[1, 2, 3, 4, 5, 6, 7])

# Sheet position -> drug category, as assigned in the original one-call-per-sheet version.
all_drug_cod_df = cod_sheets[1]
heroin_cod_df = cod_sheets[2]
opioids_cod_df = cod_sheets[3]
methadone_cod_df = cod_sheets[4]
synthetic_cod_df = cod_sheets[5]
cocaine_cod_df = cod_sheets[6]
other_narcs_cod_df = cod_sheets[7]

#### - Dropping columns

In [82]:
# Trim every cause-of-death frame to the same three columns.
cod_columns = ['State', 'Deaths', 'Population']
(
    all_drug_cod_df,
    heroin_cod_df,
    opioids_cod_df,
    methadone_cod_df,
    synthetic_cod_df,
    cocaine_cod_df,
    other_narcs_cod_df,
) = [
    frame[cod_columns]
    for frame in (
        all_drug_cod_df,
        heroin_cod_df,
        opioids_cod_df,
        methadone_cod_df,
        synthetic_cod_df,
        cocaine_cod_df,
        other_narcs_cod_df,
    )
]

### Loading Cause of death data by date and drug

In [94]:
# Load the first sheet of the heroin 2006 workbook, skipping the first row of the file.
# NOTE(review): the Deaths column contains the text 'Suppressed' for some states,
# so it will presumably need cleaning before numeric use - confirm downstream.
heroin_2006_cod_df = pd.read_excel(
    "data/heroin_2006_cod.xlsx",
    sheet_name=0,
    skiprows=1,
)

In [95]:
# Display the loaded heroin 2006 sheet.
heroin_2006_cod_df

Unnamed: 0,State,State Code,Deaths,Population
0,Alabama,1,Suppressed,4628981
1,Alaska,2,Suppressed,675302
2,Arizona,4,52,6029141
3,Arkansas,5,Suppressed,2821761
4,California,6,289,36021202
5,Colorado,8,39,4720423
6,Connecticut,9,93,3517460
7,Delaware,10,Suppressed,859268
8,District of Columbia,11,Suppressed,570681
9,Florida,12,90,18166990
