Yearly opioid overdoses per county were gathered from https://wonder.cdc.gov/mcd-icd10.html.
To replace suppressed values with an estimate, yearly_death_totals was compiled from the same source.
The cell below pivots the data so that each year gets its own column (rather than leaving 'Year' as a single column)
and replaces suppressed values (counts less than 10) with the estimate
(yearly_death_totals - unsuppressed_vals) / num_suppressed_entries for each year.


In [None]:
import pandas as pd
import numpy as np

def fill_suppressed_deaths(deaths, yearly_total, marker='Suppressed'):
    """Return ``deaths`` as numbers, with suppressed entries replaced by an estimate.

    Every entry equal to ``marker`` receives the same value:
    ``(yearly_total - sum of the numeric entries) / number of suppressed entries``.

    Args:
        deaths: Series of death counts for one year; may mix numbers with the
            ``marker`` string.
        yearly_total: Total deaths for that year across all rows.
        marker: Text that flags a suppressed count.

    Returns:
        A new numeric Series aligned with ``deaths``. Entries that are neither
        numeric nor ``marker`` come back as NaN. ``deaths`` is not modified.
    """
    suppressed = deaths == marker
    # Coerce once: the marker (and any other text) becomes NaN, which ``sum``
    # skips, so the total below covers only the reported counts.
    counts = pd.to_numeric(deaths, errors='coerce')
    if not suppressed.any():
        return counts

    counts = counts.astype(float)
    estimate = (yearly_total - counts.sum()) / suppressed.sum()
    counts[suppressed] = estimate
    return counts


# Jupyter and `python <file>` both execute this with __name__ == "__main__";
# the guard only keeps the helper above importable without touching any files.
if __name__ == "__main__":
    # Read the data from the files
    data = pd.read_excel("all_deaths.xlsx")
    yearly_death_totals_data = pd.read_excel("yearly_death_totals.xlsx")

    # Pivot so each county is one row with a Deaths/Population column per year
    pivoted_data = data.pivot_table(index=['County', 'County Code'],
                                    columns='Year',
                                    values=['Deaths', 'Population'],
                                    aggfunc='first').reset_index()

    # Flatten ('Deaths', 2013) -> 'Deaths_2013'; the index columns carry an
    # empty second level and keep their plain name.
    pivoted_data.columns = [f'{col[0]}_{col[1]}' if col[1] else col[0] for col in pivoted_data.columns]

    # Replace 'Suppressed' values with the per-year estimate
    for year in range(2013, 2021):
        deaths_column = f'Deaths_{year}'
        yearly_death_totals = yearly_death_totals_data[f'yearly_death_totals{year}'].iloc[0]
        pivoted_data[deaths_column] = fill_suppressed_deaths(pivoted_data[deaths_column], yearly_death_totals)

    # Export the updated pivoted data to a new Excel file
    pivoted_data.to_excel("yearly_deaths.xlsx", index=False)


We found that the death rate is correlated with the white population, so we will use the white proportion of county populations as a feature.

In [5]:
import pandas as pd

# Load the white population table
white_population_data = pd.read_excel("white_population.xlsx")

# Reshape to wide form: one row per (County, FIPS), one column per value of 'Year',
# taking the first 'Population' entry for each combination
pivoted_white_population = pd.pivot_table(
    white_population_data,
    index=['County', 'FIPS'],
    columns='Year',
    values='Population',
    aggfunc='first',
).reset_index()

# Clear the column-axis name, then join any tuple labels with an underscore;
# non-tuple labels are kept exactly as they are
pivoted_white_population.columns.name = None
flat_labels = []
for label in pivoted_white_population.columns:
    if isinstance(label, tuple):
        label = f'{label[0]}_{label[1]}'
    flat_labels.append(label)
pivoted_white_population.columns = flat_labels

# Write the wide table out for the next step
pivoted_white_population.to_excel("yearly_white_population.xlsx", index=False)


In [16]:
import numbers


def label_white_population_columns(columns):
    """Rename integer year headers to ``Population_<year>_white``.

    Any label that is not an integer (Python or NumPy) is returned unchanged.
    """
    return [
        f"Population_{col}_white" if isinstance(col, numbers.Integral) else col
        for col in columns
    ]


def merge_white_proportions(yearly_deaths, yearly_white_population, years=range(2013, 2021)):
    """Merge deaths with white population and add ``Proportion_White_<year>`` columns.

    Args:
        yearly_deaths: Frame with 'County', a county-code column ('FIPS' or
            'County Code') and a ``Population_<year>`` column for each year.
        yearly_white_population: Frame with 'County', 'FIPS' and a
            ``Population_<year>_white`` column for each year.
        years: Years to compute the proportion for.

    Returns:
        The merged frame (pandas' default inner join on ['County', 'FIPS'])
        with one ``Proportion_White_<year>`` column per requested year.
    """
    # The pivot that writes yearly_deaths.xlsx indexes on 'County Code', while
    # the white population table uses 'FIPS'; align the key name before merging.
    if 'FIPS' not in yearly_deaths.columns and 'County Code' in yearly_deaths.columns:
        yearly_deaths = yearly_deaths.rename(columns={'County Code': 'FIPS'})

    merged = yearly_deaths.merge(yearly_white_population, on=['County', 'FIPS'])
    for year in years:
        merged[f'Proportion_White_{year}'] = merged[f'Population_{year}_white'] / merged[f'Population_{year}']
    return merged


# Jupyter and `python <file>` both execute this with __name__ == "__main__";
# the guard only keeps the helpers above importable without touching any files.
if __name__ == "__main__":
    # Read the yearly_deaths and yearly_white_population data from the files
    yearly_deaths = pd.read_excel("yearly_deaths.xlsx")
    yearly_white_population = pd.read_excel("yearly_white_population.xlsx")

    # Rename the year columns in the yearly_white_population DataFrame
    yearly_white_population.columns = label_white_population_columns(yearly_white_population.columns)

    # Merge and calculate the proportion of the white population for each year
    merged_data = merge_white_proportions(yearly_deaths, yearly_white_population)

    # Display the updated merged_data DataFrame
    print(merged_data.head())

    # Export the updated merged_data DataFrame
    merged_data.to_excel("merged.xlsx", index=False)


                 County   FIPS  Deaths_2013  Deaths_2014  Deaths_2015  \
0  Abbeville County, SC  45001      1.82925     1.824377     1.919068   
1     Acadia Parish, LA  22001      1.82925     1.824377     1.919068   
2   Accomack County, VA  51001      1.82925     1.824377     1.919068   
3        Ada County, ID  16001     38.00000    36.000000    22.000000   
4      Adair County, IA  19001      1.82925     1.824377     1.919068   

   Deaths_2016  Deaths_2017  Deaths_2018  Deaths_2019  Deaths_2020  ...  \
0     1.925623     2.031753     1.967213     2.012048     2.229486  ...   
1     1.925623     2.031753     1.967213     2.012048     2.229486  ...   
2     1.925623     2.031753     1.967213     2.012048     2.229486  ...   
3    30.000000    35.000000    49.000000    46.000000    52.000000  ...   
4     1.925623     2.031753     1.967213     2.012048     2.229486  ...   

   Population_2019_white  Population_2020_white  Proportion_White_2013  \
0                  17269            

Next, we will merge the data with the employment data.

In [20]:
import pandas as pd

# Read the two inputs: the employment table and the previously merged table
employment_data = pd.read_excel('employment.xlsx')
merged_data = pd.read_excel('merged.xlsx')

# Join the two tables on their shared 'FIPS' column
merged_data = pd.merge(merged_data, employment_data, on='FIPS')

# Write the combined table out
merged_data.to_excel("data.xlsx", index=False)


Now we clean some problematic values from Proportion_White_2013 and normalize the data.

In [26]:
import pandas as pd
import numpy as np

def clean_proportions(frame, column='Proportion_White_2013'):
    """Return a cleaned copy of ``frame``.

    Infinite values anywhere in the frame become NaN. In ``column`` only,
    NaN is then filled with 1 and values above 1 are capped at 1.

    Args:
        frame: DataFrame containing ``column``.
        column: Name of the proportion column to fill and clip.

    Returns:
        A new DataFrame; ``frame`` itself is left unchanged.
    """
    cleaned = frame.replace([np.inf, -np.inf], np.nan)
    # Assign the result back explicitly: calling fillna(..., inplace=True) on
    # a selected column is a chained operation that pandas does not guarantee
    # to propagate to the parent frame (it never does under Copy-on-Write).
    cleaned[column] = cleaned[column].fillna(1).clip(upper=1)
    return cleaned


# Jupyter and `python <file>` both execute this with __name__ == "__main__";
# the guard only keeps the helper above importable without touching any files.
if __name__ == "__main__":
    # Load the data and clean it
    data = clean_proportions(pd.read_excel('data.xlsx'))

    # Save the cleaned data
    data.to_excel("cleaned_data.xlsx", index=False)


In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the cleaned table
data = pd.read_excel('cleaned_data.xlsx')

# Every column other than the 'County' and 'FIPS' identifiers gets scaled
identifier_columns = {'County', 'FIPS'}
columns_to_normalize = [name for name in data.columns if name not in identifier_columns]

# Fit the scaler on the selected columns and transform them in one step
scaler = StandardScaler()
feature_frame = data[columns_to_normalize]
normalized_values = scaler.fit_transform(feature_frame)

# Work on a copy so `data` keeps the unscaled values
normalized_data = data.copy()
normalized_data[columns_to_normalize] = normalized_values

# Write the scaled table to an Excel file
normalized_data.to_excel("normalized_data.xlsx", index=False)
