**Importing files and looking at structures**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the population table from the project's intermediate files.
# NOTE(review): the path is hard-coded relative to the user's home directory.
overall_pop = pd.read_csv(
    filepath_or_buffer='~/720/pds2021-opioids-team-8-1/20_intermediate_files/Population_2000-2019.csv'
)

In [None]:
# Load the cause-of-death table from the project's intermediate files.
# NOTE(review): the path is hard-coded relative to the user's home directory.
overall_cause_of_death = pd.read_csv(
    filepath_or_buffer='~/720/pds2021-opioids-team-8-1/20_intermediate_files/Underlying Cause of Death, 2003-2015.csv'
)

In [None]:
# Show 15 randomly chosen rows of the raw cause-of-death data (unseeded, so the rows vary per run).
overall_cause_of_death.sample(n=15)

In [None]:
# Show 15 randomly chosen rows of the raw population data (unseeded, so the rows vary per run).
overall_pop.sample(n=15)

In [None]:
# List the column labels of the population table.
overall_pop.columns

**Now we need to get both datasets into a form where we can merge on state-county-year**

**It looks like the cause of death data has *state* included in the *county* column, but the population data has separate *county* and *state* columns. So we need to separate the county-state information in the cause of death dataset.**

In [None]:
# extract state information from county column in the cause of death data
#
# The last two characters of each County value are taken as the state code.
# The vectorised .str accessor does not depend on the frame having a default
# 0..n-1 index (the previous County[i] lookup did) and propagates missing
# values instead of raising.
overall_cause_of_death['State'] = overall_cause_of_death['County'].str[-2:]

In [None]:
# Spot-check 10 random rows now that the State column exists.
overall_cause_of_death.sample(n=10)

In [None]:
# extract county information from county column in the cause of death data
#
# Everything except the last four characters of each County value is kept as
# the county name. The vectorised .str accessor does not depend on the frame
# having a default 0..n-1 index (the previous County[i] lookup did) and
# propagates missing values instead of raising.
overall_cause_of_death['temp_County'] = overall_cause_of_death['County'].str[:-4]

In [None]:
# Spot-check 10 random rows now that the temp_County column exists.
overall_cause_of_death.sample(n=10)

**We now have separate *state* and *county* columns in the cause of death dataset, so we drop the original County column (and other columns that are unnecessary for merging) and rename the temp_County column to County**

In [None]:
# Remove the original combined County column together with the other listed columns.
# The frame is modified in place, so re-running this cell raises a KeyError.
overall_cause_of_death.drop(
    columns=['County', 'Unnamed: 0', 'Notes', 'Year Code'],
    inplace=True,
)

In [None]:
# Give the extracted county-name column the name County (in place).
overall_cause_of_death.rename(columns={'temp_County': 'County'}, inplace=True)

In [None]:
# Spot-check 10 random rows after dropping and renaming columns.
overall_cause_of_death.sample(n=10)

**Now we must melt the year column headers of the population dataset into a single column of observations to match the cause of death dataset**

In [None]:
# "Unnamed: 0" and "_merge" are not needed, so remove them first (in place)
overall_pop.drop(columns=['Unnamed: 0', '_merge'], inplace=True)

In [None]:
# Preview the first five rows of the population table after the drop.
overall_pop.head(n=5)

In [None]:
# Reshape from wide to long: every column whose label contains '20' is
# stacked into a (Year, Population) pair, keyed by County and State.
overall_pop_melted = overall_pop.melt(
    id_vars=['County', 'State'],
    value_vars=list(overall_pop.filter(like='20', axis=1).columns),
    var_name='Year',
    value_name='Population',
)

In [None]:
# Preview the first five rows of the long-format population table.
overall_pop_melted.head(n=5)

In [None]:
# Preview the first ten rows of the cause-of-death table for comparison.
overall_cause_of_death.head(n=10)

In [None]:
# Print the column dtypes of both tables, one after the other, to compare the merge keys.
print(overall_cause_of_death.dtypes, overall_pop_melted.dtypes, sep='\n')

**Change types of columns we want to merge so they match**

In [None]:
# Convert the melted Year labels to float64.
overall_pop_melted['Year'] = overall_pop_melted['Year'].astype('float64')

In [None]:
# Confirm the column dtypes after the Year conversion.
overall_pop_melted.dtypes

**Cause of death data is now in a format to merge successfully with the population dataset**

In [None]:
merged = overall_cause_of_death.merge(overall_pop_melted, on=['Year', 'State', 'County'], validate='m:1', indicator=True)

**Validity checks**

In [None]:
# Every row of the merged table must be flagged as present in both inputs.
# NOTE(review): this can only fail if the merge keeps unmatched rows
# (how='left' or 'outer') - verify the `how` used when building `merged`.
assert (merged['_merge'] == 'both').all()

In [None]:
# Total number of missing cells across the whole merged table.
merged.isna().to_numpy().sum()

In [None]:
# Check column by column that the merged table contains no missing values.
for col in merged:
    assert merged[col].notna().all()

In [32]:
# Preview the first five rows of the merged result.
merged.head(n=5)

Unnamed: 0,County Code,Year,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,target_state,State,County,Population,_merge
0,4001.0,2003.0,All other alcohol-induced causes,A9,22.0,1,AZ,Apache County,68072.0,both
1,4001.0,2003.0,All other non-drug and non-alcohol causes,O9,464.0,1,AZ,Apache County,68072.0,both
2,4003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,11.0,1,AZ,Cochise County,120638.0,both
3,4003.0,2003.0,All other alcohol-induced causes,A9,14.0,1,AZ,Cochise County,120638.0,both
4,4003.0,2003.0,All other non-drug and non-alcohol causes,O9,1109.0,1,AZ,Cochise County,120638.0,both
