In [1]:
import pandas as pd

In [29]:
def read_all_files():
    data_list = []
    for year in range(2003, 2016):
        df_temp = pd.read_csv(f"../00_source_data/US_VitalStatistics/Underlying Cause of Death, {year}.txt", delimiter="\t")
        data_list.append(df_temp)
    return pd.concat(data_list)

In [30]:
df = read_all_files()
df.sample(10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
990,,"Madison County, ID",16065.0,2014.0,2014.0,All other non-drug and non-alcohol causes,O9,123.0
4464,,"Hardy County, WV",54031.0,2014.0,2014.0,All other non-drug and non-alcohol causes,O9,156.0
1389,,"Palo Alto County, IA",19147.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,126.0
1363,,"Sedgwick County, KS",20173.0,2006.0,2006.0,All other alcohol-induced causes,A9,38.0
791,,"Monroe County, GA",13207.0,2005.0,2005.0,All other non-drug and non-alcohol causes,O9,214.0
4043,,"Real County, TX",48385.0,2014.0,2014.0,All other non-drug and non-alcohol causes,O9,54.0
2650,,"Nassau County, NY",36059.0,2011.0,2011.0,"Alcohol poisonings (overdose) (X45, X65, Y15)",A1,15.0
1568,,"Hancock County, KY",21091.0,2014.0,2014.0,All other non-drug and non-alcohol causes,O9,74.0
2461,,"Douglas County, NV",32005.0,2012.0,2012.0,All other non-drug and non-alcohol causes,O9,391.0
2856,,"Pierce County, ND",38069.0,2009.0,2009.0,All other non-drug and non-alcohol causes,O9,59.0


In [31]:
# Remove abnormal values from data scripting
df = df.drop(df[df["County"].isnull()].index)

In [5]:
# Basic Data Validity Check
assert not df["County"].isnull().any()
assert not df["Year"].isnull().any()
assert not df["Drug/Alcohol Induced Cause"].isnull().any()
assert not df["Deaths"].isnull().any()
assert not (df["Deaths"] == 0).any()

In [32]:
def check_state(state_str):
    target_list = ["TX", "FL"]
    state = state_str.split(", ")[1]
    return 1 if state in target_list else 0
     

In [35]:
df["target_state"] = df.apply(lambda row: check_state(row.County), axis=1)
df_sub = df[df["target_state"] == 1]
df_sub.sample(10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,target_state
3558,,"Midland County, TX",48329.0,2004.0,2004.0,All other alcohol-induced causes,A9,11.0,1
3534,,"Brown County, TX",48049.0,2007.0,2007.0,All other non-drug and non-alcohol causes,O9,464.0,1
3468,,"Jefferson County, TX",48245.0,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,16.0,1
4032,,"Parker County, TX",48367.0,2014.0,2014.0,All other non-drug and non-alcohol causes,O9,1017.0,1
3681,,"Armstrong County, TX",48011.0,2011.0,2011.0,All other non-drug and non-alcohol causes,O9,25.0,1
519,,"Broward County, FL",12011.0,2004.0,2004.0,All other drug-induced causes,D9,15.0,1
4151,,"San Augustine County, TX",48405.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,128.0,1
3837,,"Tarrant County, TX",48439.0,2009.0,2009.0,"Alcohol poisonings (overdose) (X45, X65, Y15)",A1,10.0,1
3503,,"Hood County, TX",48221.0,2004.0,2004.0,All other non-drug and non-alcohol causes,O9,517.0,1
678,,"Sumter County, FL",12119.0,2008.0,2008.0,All other non-drug and non-alcohol causes,O9,999.0,1


In [62]:
# Data validity check: County Check

# Since TX has 254 counties, the County column should have all 254 unique values
df_TX = df_sub[df_sub.County.str.contains("TX")].copy()

# Assertion failed, there are only 251 counties, commented out
# assert df_TX["County"].nunique() == 254

In [63]:
# Find missing counties
real_counties = []
with open ("../00_source_data/texas_counties") as file:
    for line in file:
        real_counties.append(line.replace("\n", ""))

dataset_counties = []
for county in df_TX["County"].unique():
    dataset_counties.append(county.split(",")[0])

list(set(real_counties) - set(dataset_counties))

['King County', 'Loving County', 'Kenedy County']

In [64]:
assert df[(df.County.str.contains("King County")) & df.County.str.contains("TX")].empty
assert df[(df.County.str.contains("Loving County")) & df.County.str.contains("TX")].empty
assert df[(df.County.str.contains("Kenedy County")) & df.County.str.contains("TX")].empty

# It seems that these three counties had no overdose deaths. So we do not need to perform county check on all states anymore.

In [65]:
df_sub.to_csv("../20_intermediate_files/Underlying Cause of Death, 2003-2015.csv")
df_sub.to_parquet("../20_intermediate_files/Underlying Cause of Death, 2003-2015.gzip", compression="gzip")