In [55]:
import pandas as pd

In [56]:
def read_all_files():
    data_list = []
    for year in range(2003, 2016):
        df_temp = pd.read_csv(f"../00_source_data/US_VitalStatistics/Underlying Cause of Death, {year}.txt", delimiter="\t")
        data_list.append(df_temp)
    return pd.concat(data_list)

In [57]:
df = read_all_files()
df.sample(10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
1520,,"Jefferson Parish, LA",22051.0,2004.0,2004.0,Drug poisonings (overdose) Unintentional (X40-...,D1,88.0
2356,,"Meagher County, MT",30059.0,2013.0,2013.0,All other non-drug and non-alcohol causes,O9,30.0
3538,,"Limestone County, TX",48293.0,2004.0,2004.0,All other non-drug and non-alcohol causes,O9,289.0
3327,,"Oklahoma County, OK",40109.0,2015.0,2015.0,All other alcohol-induced causes,A9,108.0
4117,,"Clark County, WA",53011.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,2733.0
4203,,"Caroline County, VA",51033.0,2014.0,2014.0,All other non-drug and non-alcohol causes,O9,240.0
2366,,"Garden County, NE",31069.0,2010.0,2010.0,All other non-drug and non-alcohol causes,O9,28.0
2586,,"Grafton County, NH",33009.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,778.0
4025,,"Carroll County, VA",51035.0,2010.0,2010.0,All other non-drug and non-alcohol causes,O9,357.0
1176,,"Monroe County, IN",18105.0,2010.0,2010.0,Drug poisonings (overdose) Unintentional (X40-...,D1,16.0


In [58]:
# Remove abnormal values from data scripting
df = df.drop(df[df["County"].isnull()].index)

In [59]:
# Basic Data Validity Check
assert not df["County"].isnull().any()
assert not df["Year"].isnull().any()
assert not df["Drug/Alcohol Induced Cause"].isnull().any()
assert not df["Deaths"].isnull().any()
assert not (df["Deaths"] == 0).any()

In [60]:
'''
Perform the first subset, only include:
    1. Three target states: Texas(TX), Florida(FL), and Washington(WA)
    2. Three reference states: North Carolina(NC), South Carolina(SC), and Pennsylvania(PA)
'''
df_sub = df[
    (df.County.str.contains("TX")) | 
    (df.County.str.contains("FL")) | 
    (df.County.str.contains("WA")) | 
    (df.County.str.contains("NC")) | 
    (df.County.str.contains("SC")) | 
    (df.County.str.contains("PA"))
].copy()
df_sub.sample(5)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
4198,,"Walker County, TX",48471.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,516.0
621,,"Clay County, FL",12019.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,1648.0
3829,,"Donley County, TX",48129.0,2013.0,2013.0,All other non-drug and non-alcohol causes,O9,56.0
586,,"Charlotte County, FL",12015.0,2014.0,2014.0,All other drug-induced causes,D9,10.0
4036,,"Travis County, TX",48453.0,2013.0,2013.0,All other alcohol-induced causes,A9,86.0


In [62]:
# Data validity check: County Check

# Since TX has 254 counties, the County column should have all 254 unique values
df_TX = df_sub[df_sub.County.str.contains("TX")].copy()

# Assertion failed, there are only 251 counties, commented out
# assert df_TX["County"].nunique() == 254

In [63]:
# Find missing counties
real_counties = []
with open ("../00_source_data/texas_counties") as file:
    for line in file:
        real_counties.append(line.replace("\n", ""))

dataset_counties = []
for county in df_TX["County"].unique():
    dataset_counties.append(county.split(",")[0])

list(set(real_counties) - set(dataset_counties))

['King County', 'Loving County', 'Kenedy County']

In [64]:
assert df[(df.County.str.contains("King County")) & df.County.str.contains("TX")].empty
assert df[(df.County.str.contains("Loving County")) & df.County.str.contains("TX")].empty
assert df[(df.County.str.contains("Kenedy County")) & df.County.str.contains("TX")].empty

# It seems that these three counties had no overdose deaths. So we do not need to perform county check on all states anymore.

In [65]:
df_sub.to_csv("../20_intermediate_files/Underlying Cause of Death, 2003-2015.csv")
df_sub.to_parquet("../20_intermediate_files/Underlying Cause of Death, 2003-2015.gzip", compression="gzip")