# Team One: The Opioid Crisis in the United States - Data Cleanup

###### Development team notes and caveats:
* Question 1: Data was extracted from https://wonder.cdc.gov/. Rather than extracting the data in one large chunk and disassembling it with code, I downloaded it into individual tab-separated text files. This was because I noticed some anomalies when extracting data from Wonder in large commingled batches, most likely due to the grouping functionality used by the Wonder tool. If I can find and access the raw data behind the Wonder site, I will modify this notebook.
* Question 2:
* Question 3:
* Question 4:

#### Dependencies

In [1]:
import os                                                                    # os library

In [2]:
import numpy as np                                                           # numpy library

In [3]:
import pandas as pd                                                          # pandas library

In [4]:
import json                                                                  # json library

#### Research Question 1: clean up raw data and export to csv or JSON for use by main notebook

In [5]:
# 1st raw data file: tab-separated export, read from ./Data Files/Question_1
filename = 'Q1DS1_Year.txt'
rawdata_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds1_df = pd.read_csv(rawdata_file, delimiter='\t')

In [6]:
# Preview of the raw yearly data (first five rows)
q1ds1_df.head(n=5)

Unnamed: 0,Notes,Year,Year Code,Deaths,Population,Crude Rate
0,,1999.0,1999.0,5594.0,279040168.0,2.0
1,,2000.0,2000.0,6011.0,281421906.0,2.1
2,,2001.0,2001.0,7088.0,284968955.0,2.5
3,,2002.0,2002.0,9318.0,287625193.0,3.2
4,,2003.0,2003.0,10389.0,290107933.0,3.6


In [7]:
# Cleans the yearly dataset in one pass:
#   1. removes the Notes column, then every row that still contains a NaN
#      (Crude Rate is still present at that point, so its NaNs count too);
#   2. casts Year / Deaths / Population to int and Year Code to str
#      (via int first, so the text reads "1999" rather than "1999.0");
#   3. derives the death rate per 100,000 of population:
#      (Deaths / Population) * 100,000;
#   4. removes the source Crude Rate column and indexes the frame by Year.
q1ds1_df = (
    q1ds1_df
    .drop(columns='Notes')
    .dropna(axis=0, how='any')
    .astype({'Year': int, 'Deaths': int, 'Population': int})
    .assign(**{
        'Year Code': lambda df: df['Year Code'].astype(int).astype(str),
        'Death Rate': lambda df: df['Deaths'] / df['Population'] * 100000,
    })
    .drop(columns='Crude Rate')
    .set_index('Year')
)

In [8]:
# Preview of the cleaned yearly data (first five rows)
q1ds1_df.head(n=5)

Unnamed: 0_level_0,Year Code,Deaths,Population,Death Rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999,1999,5594,279040168,2.004729
2000,2000,6011,281421906,2.135939
2001,2001,7088,284968955,2.487288
2002,2002,9318,287625193,3.239633
2003,2003,10389,290107933,3.581081


In [9]:
# 1st cleaned data file; the Year index is written as the first CSV column
filename = 'Q1DS1.csv'
csv_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds1_df.to_csv(path_or_buf=csv_file)

##### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# 2nd raw data file: tab-separated export, read from ./Data Files/Question_1
filename = 'Q1DS2_Sanity.txt'
rawdata_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds2_df = pd.read_csv(rawdata_file, delimiter='\t')

In [11]:
# Preview of the raw sanity-check data (first five rows)
q1ds2_df.head(n=5)

Unnamed: 0,Notes,Year,Year Code,Deaths,Population,Crude Rate
0,,1999.0,1999.0,101.0,279040168.0,0.0
1,,2000.0,2000.0,92.0,281421906.0,0.0
2,,2001.0,2001.0,98.0,284968955.0,0.0
3,,2002.0,2002.0,164.0,287625193.0,0.1
4,,2003.0,2003.0,121.0,290107933.0,0.0


In [12]:
# Cleans the second yearly dataset with the same steps as the first:
#   1. removes the Notes column, then every row that still contains a NaN
#      (Crude Rate is still present at that point, so its NaNs count too);
#   2. casts Year / Deaths / Population to int and Year Code to str
#      (via int first, so the text reads "1999" rather than "1999.0");
#   3. derives the death rate per 100,000 of population:
#      (Deaths / Population) * 100,000;
#   4. removes the source Crude Rate column and indexes the frame by Year.
q1ds2_df = (
    q1ds2_df
    .drop(columns='Notes')
    .dropna(axis=0, how='any')
    .astype({'Year': int, 'Deaths': int, 'Population': int})
    .assign(**{
        'Year Code': lambda df: df['Year Code'].astype(int).astype(str),
        'Death Rate': lambda df: df['Deaths'] / df['Population'] * 100000,
    })
    .drop(columns='Crude Rate')
    .set_index('Year')
)

In [13]:
# Preview of the cleaned sanity-check data (first five rows)
q1ds2_df.head(n=5)

Unnamed: 0_level_0,Year Code,Deaths,Population,Death Rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999,1999,101,279040168,0.036196
2000,2000,92,281421906,0.032691
2001,2001,98,284968955,0.03439
2002,2002,164,287625193,0.057019
2003,2003,121,290107933,0.041709


In [14]:
# 2nd cleaned data file; the Year index is written as the first CSV column
filename = 'Q1DS2.csv'
csv_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds2_df.to_csv(path_or_buf=csv_file)

##### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
# 3rd raw data file: tab-separated export, read from ./Data Files/Question_1
filename = 'Q1DS3_Gender.txt'
rawdata_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds3_df = pd.read_csv(rawdata_file, delimiter='\t')

In [16]:
# Preview of the raw gender data (first five rows)
q1ds3_df.head(n=5)

Unnamed: 0,Notes,Gender,Gender Code,Deaths,Population,Crude Rate,Age Adjusted Rate
0,,Female,F,109681.0,2763844000.0,4.0,4.0
1,,Male,M,204081.0,2671903000.0,7.6,7.6
2,Total,,,313762.0,5435746000.0,5.8,5.8
3,---,,,,,,
4,"Dataset: Multiple Cause of Death, 1999-2016",,,,,,


In [17]:
# Cleans the gender dataset:
#   1. removes the Notes column, then every row that still contains a NaN
#      (this is what discards the Total row and the footer rows, whose
#      Gender / Gender Code cells are empty);
#   2. casts Deaths to int (Population is left as read from the file);
#   3. derives the death rate per 100,000 of population:
#      (Deaths / Population) * 100,000;
#   4. removes the source Crude Rate and Age Adjusted Rate columns.
q1ds3_df = (
    q1ds3_df
    .drop(columns='Notes')
    .dropna(axis=0, how='any')
    .astype({'Deaths': int})
    .assign(**{'Death Rate': lambda df: df['Deaths'] / df['Population'] * 100000})
    .drop(columns=['Crude Rate', 'Age Adjusted Rate'])
)

In [18]:
# Preview of the cleaned gender data (first five rows)
q1ds3_df.head(n=5)

Unnamed: 0,Gender,Gender Code,Deaths,Population,Death Rate
0,Female,F,109681,2763844000.0,3.968422
1,Male,M,204081,2671903000.0,7.638041


In [19]:
# 3rd cleaned data file; to_csv's default also writes the row index as the first column
filename = 'Q1DS3.csv'
csv_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds3_df.to_csv(path_or_buf=csv_file)

##### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# 4th raw data file: tab-separated export, read from ./Data Files/Question_1
filename = 'Q1DS4_Age.txt'
rawdata_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds4_df = pd.read_csv(rawdata_file, delimiter='\t')

In [21]:
# Preview of the raw age-group data (first five rows)
q1ds4_df.head(n=5)

Unnamed: 0,Notes,Ten-Year Age Groups,Ten-Year Age Groups Code,Deaths,Population,Crude Rate
0,,< 1 year,1,158.0,71608191,0.2
1,,1-4 years,1-4,425.0,285248834,0.1
2,,5-14 years,5-14,436.0,737079143,0.1
3,,15-24 years,15-24,32545.0,764330433,4.3
4,,25-34 years,25-34,71071.0,737039056,9.6


In [22]:
# Cleans the age-group dataset: removes the Notes column and incomplete rows,
# discards the "age not stated" row, fixes dtypes and derives the death rate.
q1ds4_df = q1ds4_df.drop(columns='Notes').dropna(axis=0, how='any')

# The "age not stated" row used to be removed by position (last row), which
# silently deletes a real age group whenever that row is absent or not last.
# It is now selected by content instead: keep only rows whose Population is
# numeric.
# NOTE(review): assumes the "age not stated" row is the one carrying a
# non-numeric Population placeholder (Population is read as text in this file,
# unlike the other datasets) - confirm against Q1DS4_Age.txt.
numeric_population = pd.to_numeric(q1ds4_df['Population'], errors='coerce')
q1ds4_df = q1ds4_df[numeric_population.notna()].copy()   # copy: columns are reassigned below

q1ds4_df['Deaths'] = q1ds4_df['Deaths'].astype(int)                          # casts datatypes for columns
q1ds4_df['Population'] = q1ds4_df['Population'].astype(int)

# Calculates death rate per 100,000 of population (Deaths / Population) * 100,000
q1ds4_df['Death Rate'] = q1ds4_df['Deaths'] / q1ds4_df['Population'] * 100000
q1ds4_df = q1ds4_df.drop(columns='Crude Rate')                               # drops Crude Rate column

In [23]:
# Preview of the cleaned age-group data (first five rows)
q1ds4_df.head(n=5)

Unnamed: 0,Ten-Year Age Groups,Ten-Year Age Groups Code,Deaths,Population,Death Rate
0,< 1 year,1,158,71608191,0.220645
1,1-4 years,1-4,425,285248834,0.148993
2,5-14 years,5-14,436,737079143,0.059152
3,15-24 years,15-24,32545,764330433,4.257975
4,25-34 years,25-34,71071,737039056,9.642773


In [24]:
# 4th cleaned data file; to_csv's default also writes the row index as the first column
filename = 'Q1DS4.csv'
csv_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds4_df.to_csv(path_or_buf=csv_file)

##### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
# 5th raw data file: tab-separated export, read from ./Data Files/Question_1
filename = 'Q1DS5_Race.txt'
rawdata_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds5_df = pd.read_csv(rawdata_file, delimiter='\t')

In [26]:
# Preview of the raw race data (first five rows)
q1ds5_df.head(n=5)

Unnamed: 0,Notes,Race,Race Code,Deaths,Population,Crude Rate,Age Adjusted Rate
0,,American Indian or Alaska Native,1002-5,3578.0,69249570.0,5.2,5.4
1,,Asian or Pacific Islander,A-PI,1986.0,284873900.0,0.7,0.6
2,,Black or African American,2054-5,24111.0,733107600.0,3.3,3.4
3,,White,2106-3,284087.0,4348515000.0,6.5,6.6
4,Total,,,313762.0,5435746000.0,5.8,5.8


In [27]:
# Cleans the race dataset:
#   1. removes the Notes column, then every row that still contains a NaN
#      (this is what discards the Total row, whose Race / Race Code cells
#      are empty);
#   2. casts Deaths to int (Population is left as read from the file);
#   3. derives the death rate per 100,000 of population:
#      (Deaths / Population) * 100,000;
#   4. removes the source Crude Rate and Age Adjusted Rate columns.
q1ds5_df = (
    q1ds5_df
    .drop(columns='Notes')
    .dropna(axis=0, how='any')
    .astype({'Deaths': int})
    .assign(**{'Death Rate': lambda df: df['Deaths'] / df['Population'] * 100000})
    .drop(columns=['Crude Rate', 'Age Adjusted Rate'])
)

In [28]:
# Preview of the cleaned race data (first five rows)
q1ds5_df.head(n=5)

Unnamed: 0,Race,Race Code,Deaths,Population,Death Rate
0,American Indian or Alaska Native,1002-5,3578,69249570.0,5.166819
1,Asian or Pacific Islander,A-PI,1986,284873900.0,0.697151
2,Black or African American,2054-5,24111,733107600.0,3.288876
3,White,2106-3,284087,4348515000.0,6.532965


In [29]:
# 5th cleaned data file; to_csv's default also writes the row index as the first column
filename = 'Q1DS5.csv'
csv_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds5_df.to_csv(path_or_buf=csv_file)

##### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
# 6th raw data file: tab-separated export, read from ./Data Files/Question_1
filename = 'Q1DS6_County.txt'
rawdata_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds6_df = pd.read_csv(rawdata_file, delimiter='\t')

In [31]:
# Preview of the raw county data (first five rows)
q1ds6_df.head(n=5)

Unnamed: 0,Notes,County,County Code,Deaths,Population,Crude Rate
0,,"Baldwin County, AL",1003.0,185.0,3104213.0,6.0
1,,"Bibb County, AL",1007.0,12.0,397480.0,Unreliable
2,,"Blount County, AL",1009.0,44.0,997776.0,4.4
3,,"Calhoun County, AL",1015.0,12.0,2074478.0,Unreliable
4,,"Cherokee County, AL",1019.0,13.0,455087.0,Unreliable


In [32]:
# Cleans the county dataset:
#   1. removes the Notes column, then every row that still contains a NaN
#      (rows whose Crude Rate is the text "Unreliable" are not NaN, so they
#      are kept and receive a computed rate below);
#   2. casts Deaths / Population to int and County Code to str via int
#      (so the text reads "1003" rather than "1003.0");
#   3. derives the death rate per 100,000 of population:
#      (Deaths / Population) * 100,000;
#   4. removes the source Crude Rate column.
# NOTE(review): the int -> str cast cannot keep a leading zero in County Code
# (e.g. "1003", never "01003") - confirm that consumers of the exported
# CSV / JSON expect the unpadded form.
q1ds6_df = (
    q1ds6_df
    .drop(columns='Notes')
    .dropna(axis=0, how='any')
    .astype({'Deaths': int, 'Population': int})
    .assign(**{
        'County Code': lambda df: df['County Code'].astype(int).astype(str),
        'Death Rate': lambda df: df['Deaths'] / df['Population'] * 100000,
    })
    .drop(columns='Crude Rate')
)

In [33]:
# Preview of the cleaned county data (first five rows)
q1ds6_df.head(n=5)

Unnamed: 0,County,County Code,Deaths,Population,Death Rate
0,"Baldwin County, AL",1003,185,3104213,5.959643
1,"Bibb County, AL",1007,12,397480,3.01902
2,"Blount County, AL",1009,44,997776,4.409807
3,"Calhoun County, AL",1015,12,2074478,0.578459
4,"Cherokee County, AL",1019,13,455087,2.856597


In [34]:
# 6th cleaned data file; to_csv's default also writes the row index as the first column
filename = 'Q1DS6.csv'
csv_file = os.path.join(".", "Data Files", "Question_1", filename)
q1ds6_df.to_csv(path_or_buf=csv_file)

In [35]:
# Reduces the county frame to what the JSON export needs: Death Rate keyed by
# County Code. This runs after the CSV export, which keeps all columns.
q1ds6_df = (
    q1ds6_df
    .drop(columns=['County', 'Deaths', 'Population'])
    .set_index('County Code')
)

In [36]:
# Preview of the data that will be exported as JSON (first five rows)
q1ds6_df.head(n=5)

Unnamed: 0_level_0,Death Rate
County Code,Unnamed: 1_level_1
1003,5.959643
1007,3.01902
1009,4.409807
1015,0.578459
1019,2.856597


In [37]:
# Builds the {County Code: Death Rate} mapping that is exported as JSON.
# When a County Code appears more than once, only its first row is kept,
# matching the previous "skip if already present" loop.
# The rate is selected by column label rather than by position (`row[0]`):
# integer keys on a label-indexed Series are deprecated as positional lookups
# in recent pandas, and the row-by-row loop is not needed.
first_occurrence = ~q1ds6_df.index.duplicated(keep='first')
json_dict = q1ds6_df.loc[first_occurrence, 'Death Rate'].to_dict()

In [38]:
# 6th cleaned data file in JSON form: the County Code -> Death Rate mapping
filename = 'Q1DS6.json'
json_file = os.path.join(".", "Data Files", "Question_1", filename)

# Serialises straight to the open handle; the file is closed on leaving the block
with open(json_file, "w") as f:
    json.dump(json_dict, f)

#### Research Question 2: clean up raw data and export

#### Research Question 3: clean up raw data and export

#### Research Question 4: clean up raw data and export