# Analysis of Drug Poisoning Mortality in the U.S. (2011 - 2018)

In [2]:
#Python Libraries
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [89]:
drug_mort = pd.read_csv("Drug_Mortality_byState.csv")
drug_mort.sample(50)

Unnamed: 0,State,Year,Sex,Age Group,Race and Hispanic Origin,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,Age-adjusted Rate,Standard Error Age-adjusted Rate,Lower Confidence Limit for Age-adjusted Rate,Upper Confidence for Age-adjusted Rate,State Crude Rate in Range,US Crude Rate,US Age-adjusted Rate,Unit
2631,United States,2010,Male,65–74,Hispanic,33,734664,4.4918,0.78193,3.092,6.3082,,,,,,12.4144,12.2966,"per 100,000 population"
746,Oregon,2002,Both Sexes,All Ages,All Races-All Origins,318,3513424,9.051,0.50755,8.0562,10.0458,8.9269,0.50296,7.9411,9.9127,7.5–10.6,8.1766,8.1957,"per 100,000 population"
2240,United States,1999,Female,55–64,Non-Hispanic White,380,9721783,3.9087,0.20051,3.5157,4.3018,,,,,,6.0382,6.057,"per 100,000 population"
68,Arkansas,2006,Both Sexes,All Ages,All Races-All Origins,285,2821761,10.1001,0.59828,8.9275,11.2727,10.4539,0.62379,9.2313,11.6765,7.5–10.6,11.5373,11.4883,"per 100,000 population"
416,Maryland,2012,Both Sexes,All Ages,All Races-All Origins,831,5884563,14.1217,0.48988,13.1615,15.0819,13.6917,0.48202,12.7469,14.6364,13.1–17.5,13.2208,13.1422,"per 100,000 population"
2066,United States,2005,Female,55–64,Non-Hispanic Black,109,1674866,6.508,0.62335,5.2862,7.7298,,,,,,10.0884,10.0699,"per 100,000 population"
99,California,2017,Both Sexes,All Ages,All Races-All Origins,4868,39536653,12.3126,0.17647,11.9667,12.6585,11.6511,0.16981,11.3183,11.9839,10.7–13,21.5637,21.7048,"per 100,000 population"
2980,United States,1999,Male,65–74,Non-Hispanic White,173,6918262,2.5006,0.19012,2.128,2.8733,,,,,,6.0382,6.057,"per 100,000 population"
876,Texas,2012,Both Sexes,All Ages,All Races-All Origins,2447,26059203,9.3902,0.18983,9.0181,9.7622,9.4049,0.19166,9.0292,9.7805,7.5–10.6,13.2208,13.1422,"per 100,000 population"
2721,United States,2000,Male,25–34,Non-Hispanic Black,260,2461099,10.5644,0.65518,9.2802,11.8485,,,,,,6.1882,6.1749,"per 100,000 population"


# Data Cleaning:

### Checking data types

In [10]:
drug_mort.dtypes

State                                            object
Year                                              int64
Sex                                              object
Age Group                                        object
Race and Hispanic Origin                         object
Deaths                                            int64
Population                                        int64
Crude Death Rate                                float64
Standard Error for Crude Rate                   float64
Low Confidence Limit for Crude Rate             float64
Upper Confidence Limit for Crude Rate           float64
Age-adjusted Rate                               float64
Standard Error Age-adjusted Rate                float64
Lower Confidence Limit for Age-adjusted Rate    float64
Upper Confidence for Age-adjusted Rate          float64
State Crude Rate in Range                        object
US Crude Rate                                   float64
US Age-adjusted Rate                            

All datatypes are valid for each column so no need for conversion

### Checking for Null values

In [9]:
drug_mort.isnull().sum()

State                                              0
Year                                               0
Sex                                                0
Age Group                                          0
Race and Hispanic Origin                           0
Deaths                                             0
Population                                         0
Crude Death Rate                                   0
Standard Error for Crude Rate                      0
Low Confidence Limit for Crude Rate                0
Upper Confidence Limit for Crude Rate              0
Age-adjusted Rate                               1920
Standard Error Age-adjusted Rate                1920
Lower Confidence Limit for Age-adjusted Rate    1920
Upper Confidence for Age-adjusted Rate          1920
State Crude Rate in Range                       1920
US Crude Rate                                      0
US Age-adjusted Rate                               0
Unit                                          

These Nan values are under the same 1920 rows which are all listed under rows with Age Group categories (0-14, 15-24, 25-34, etc.) so there is no Age-adjusted rate being calculated for those rows. In this case Nan may imply that those values are missing so we will fill them with the same values for Crude Death Rate indicating no age adjustment.

In [93]:
drug_mort['Age-adjusted Rate'] = drug_mort['Age-adjusted Rate'].fillna(drug_mort['Crude Death Rate'])
drug_mort['Standard Error Age-adjusted Rate'] = drug_mort['Standard Error Age-adjusted Rate'].fillna(drug_mort['Standard Error for Crude Rate'])
drug_mort['Lower Confidence Limit for Age-adjusted Rate'] = drug_mort['Lower Confidence Limit for Age-adjusted Rate'].fillna(drug_mort['Low Confidence Limit for Crude Rate'])
drug_mort['Upper Confidence for Age-adjusted Rate'] = drug_mort['Upper Confidence for Age-adjusted Rate'].fillna(drug_mort['Upper Confidence Limit for Crude Rate'])

#fill in with US Crude Rate since Nan values are under rows with "United States" as State
drug_mort['State Crude Rate in Range'] = drug_mort['State Crude Rate in Range'].fillna(drug_mort['US Crude Rate'])

drug_mort.isnull().sum()

State                                           0
Year                                            0
Sex                                             0
Age Group                                       0
Race and Hispanic Origin                        0
Deaths                                          0
Population                                      0
Crude Death Rate                                0
Standard Error for Crude Rate                   0
Low Confidence Limit for Crude Rate             0
Upper Confidence Limit for Crude Rate           0
Age-adjusted Rate                               0
Standard Error Age-adjusted Rate                0
Lower Confidence Limit for Age-adjusted Rate    0
Upper Confidence for Age-adjusted Rate          0
State Crude Rate in Range                       0
US Crude Rate                                   0
US Age-adjusted Rate                            0
Unit                                            0
dtype: int64

### The column title Race and Hispanic Origin and its values are slightly confusing so renamed to simplify messiness

In [95]:
drug_mort.rename(columns = {'Race and Hispanic Origin':'Race Origin'}, inplace = True)
#removing messy strings from matching rows
drug_mort['Race Origin'] = drug_mort['Race Origin'].str.replace('-All Origins', '') 
drug_mort['Race Origin'] = drug_mort['Race Origin'].str.replace('Non-Hispanic', '') 
#drug_mort.sample(15)

## Creating dataframe grouped by the different age groups (0-14, 15-24, 25-34, 35-45, 45-64, 65-74, & 75+) and another dataframe for "All Ages" for each state and entire U.S.

In [32]:
#use to see the different age groups in column
#drug_mort['Age Group'].sample(50)

#### Age Grouped for entire U.S. 

Rows with specific age groups are only for the "United States" not individual states so no additional subsetting is needed

In [56]:
#Age Grouped for United States
age_grouped_state = drug_mort[drug_mort["Age Group"]!="All Ages"] 
#age_grouped_state.head()

#### All Ages for entire U.S. and each state. 
Rows with "All Ages" are separated by each state and also for the United States so two subset dataframes are needed

In [59]:
#All Ages for United States
all_ages_US = drug_mort[drug_mort["Age Group"]=="All Ages"]
all_ages_US = all_ages_US[all_ages_US["State"]=="United States"]
#all_ages_US.head()

#All Ages by State
all_ages_state = drug_mort[drug_mort["Age Group"]=="All Ages"]
all_ages_state = all_ages_state[all_ages_state["State"]!="United States"]
#all_ages_state.head()

## Creating dataframe grouped by different Race Origins for each state and the entire U.S.

In [72]:
#use to see the different Race Origins in column
drug_mort['Race Origin'].sample(10)

2325    All Races-All Origins
2082       Non-Hispanic Black
1491       Non-Hispanic White
1319       Non-Hispanic Black
2449    All Races-All Origins
1785                 Hispanic
846     All Races-All Origins
2380    All Races-All Origins
1425       Non-Hispanic White
2357    All Races-All Origins
Name: Race Origin, dtype: object

#### All Race Origins for Entire U.S. and each state
There is data for each state and entire U.S. so two subset dataframes required

In [67]:
#All races for entire U.S.
all_races_US = drug_mort[drug_mort['Race Origin']=="All Races-All Origins"]
all_races_US = all_races_US[all_races_US['State']=="United States"]
#all_races_US.head()

#All races for each state
all_races_state = drug_mort[drug_mort['Race Origin']=="All Races-All Origins"]
all_races_state = all_races_state[all_races_state['State']!="United States"]
#all_races_state.head()

#### Other Race Origins (Hispanic, Non-Hispanic Black, and Non-Hispanic White) for Entire U.S.
There is only data for entire U.S. under these categories so only one subset dataframe required

In [76]:
#Other Race Origins for entire U.S.
other_races_US = drug_mort[drug_mort['Race Origin']!="All Races-All Origins"]
#other_races_US.head()