In [2]:
import pandas as pd

In [8]:
def read_all_files():
    data_list = []
    for year in range(2003, 2016):
        df_temp = pd.read_csv(f"../00_source_data/US_VitalStatistics/Underlying Cause of Death, {year}.txt", delimiter="\t")
        data_list.append(df_temp)
    return pd.concat(data_list)

In [9]:
df = read_all_files()
df.sample(10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
4419,,"Fremont County, WY",56013.0,2010.0,2010.0,All other alcohol-induced causes,A9,12.0
365,,"Orange County, CA",6059.0,2015.0,2015.0,"Alcohol poisonings (overdose) (X45, X65, Y15)",A1,24.0
271,,"Del Norte County, CA",6015.0,2013.0,2013.0,All other non-drug and non-alcohol causes,O9,277.0
4372,,"Mineral County, WV",54057.0,2012.0,2012.0,All other non-drug and non-alcohol causes,O9,306.0
1281,,"Osage County, KS",20139.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,180.0
2370,,"Roosevelt County, MT",30085.0,2013.0,2013.0,All other alcohol-induced causes,A9,15.0
1445,,"Fayette County, KY",21067.0,2007.0,2007.0,All other alcohol-induced causes,A9,31.0
3034,,"Wilson County, NC",37195.0,2015.0,2015.0,All other alcohol-induced causes,A9,13.0
1791,,"Marquette County, MI",26103.0,2005.0,2005.0,All other non-drug and non-alcohol causes,O9,626.0
92,,"Aleutians West Census Area, AK",2016.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,17.0


In [10]:
# Remove abnormal values from data scripting
df = df.drop(df[df["County"].isnull()].index)

In [53]:
# Basic Data Validity Check
assert not df["County"].isnull().any()
assert not df["Year"].isnull().any()
assert not df["Drug/Alcohol Induced Cause"].isnull().any()
assert not df["Deaths"].isnull().any()
assert not (df["Deaths"] == 0).any()

In [39]:
'''
Perform the first subset, only include:
    1. Three target states: Texas(TX), Florida(FL), and Washington(WA)
    2. Three reference states: North Carolina(NC), South Carolina(SC), and Pennsylvania(PA)
'''
df_sub = df[
    (df.County.str.contains("TX")) | 
    (df.County.str.contains("FL")) | 
    (df.County.str.contains("WA")) | 
    (df.County.str.contains("NC")) | 
    (df.County.str.contains("SC")) | 
    (df.County.str.contains("PA"))
].copy()
df_sub.sample(5)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
531,,"Collier County, FL",12021.0,2004.0,2004.0,Drug poisonings (overdose) Unintentional (X40-...,D1,29.0
3648,,"Dimmit County, TX",48127.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,79.0
3401,,"Aiken County, SC",45003.0,2011.0,2011.0,Drug poisonings (overdose) Unintentional (X40-...,D1,22.0
2857,,"Scotland County, NC",37165.0,2011.0,2011.0,All other non-drug and non-alcohol causes,O9,355.0
4123,,"Oldham County, TX",48359.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,18.0


In [54]:
# Data validity check 1: abnormal value check for Causes, looks normal
df_sub["Drug/Alcohol Induced Cause"].value_counts()

All other non-drug and non-alcohol causes             7212
Drug poisonings (overdose) Unintentional (X40-X44)    2077
All other alcohol-induced causes                      1659
Drug poisonings (overdose) Suicide (X60-X64)           424
All other drug-induced causes                          113
Drug poisonings (overdose) Undetermined (Y10-Y14)       74
Alcohol poisonings (overdose) (X45, X65, Y15)           46
Name: Drug/Alcohol Induced Cause, dtype: int64

In [43]:
# Data validity check 2: County Check

# Since TX has 254 counties, the County column should have all 254 unique values
df_TX = df_sub[df_sub.County.str.contains("TX")].copy()

# Assertion failed, there are only 251 counties, commented out
# assert df_TX["County"].nunique() == 254

AssertionError: 

In [44]:
# Find missing counties
real_counties = []
with open ("../00_source_data/texas_counties") as file:
    for line in file:
        real_counties.append(line.replace("\n", ""))

dataset_counties = []
for county in df_TX["County"].unique():
    dataset_counties.append(county.split(",")[0])

list(set(real_counties) - set(dataset_counties))

['King County', 'Loving County', 'Kenedy County']

In [50]:
assert df[(df.County.str.contains("King County")) & df.County.str.contains("TX")].empty
assert df[(df.County.str.contains("Loving County")) & df.County.str.contains("TX")].empty
assert df[(df.County.str.contains("Kenedy County")) & df.County.str.contains("TX")].empty

# It seems that these three counties had no overdose deaths. So we do not need to perform county check on all states anymore.

In [17]:
df_sub.to_csv("../20_intermediate_files/Underlying Cause of Death, 2003-2015.csv")
df_sub.to_parquet("../20_intermediate_files/Underlying Cause of Death, 2003-2015.gzip", compression="gzip")