In [2]:
import pandas as pd

# Read the dataset
data = pd.read_excel('total_deaths.xlsx', engine='openpyxl')

# Remove the rows containing total values
data_without_totals = data[data['County'].notna()]

# Melt the dataset to have a Year column
data_without_totals = pd.melt(data_without_totals, id_vars=['Notes', 'County', 'County Code'], var_name='Year', value_name='Deaths')

# Count suppressed values
suppressed_counts = data_without_totals.groupby('Year').apply(lambda x: (x['Deaths'] == 'Suppressed').sum())

# Replace suppressed values with NaN
data_without_totals.replace('Suppressed', pd.NA, inplace=True)

# Convert Deaths column to numeric data type
data_without_totals['Deaths'] = data_without_totals['Deaths'].apply(pd.to_numeric)

# Group by year and sum non-suppressed values for each age bracket
non_suppressed_totals = data_without_totals.groupby('Year')['Deaths'].sum(skipna=True)

# Extract the rows containing total values
total_values = data_without_totals[data_without_totals['County'].isna()].set_index('Year').drop(columns=['County Code'])
total_values.replace('Suppressed', 0, inplace=True)

suppressed_totals = total_values - non_suppressed_totals

# Replace suppressed values with suppressed_totals divided by the number of suppressed values
replacement_values = suppressed_totals / suppressed_counts
replacement_values = replacement_values.reset_index()

def replace_suppressed(row, replacement_df):
    year = row['Year']
    if pd.isna(row['Deaths']):
        replacement_value = replacement_df.loc[(replacement_df['Year'] == year), 'Deaths'].iloc[0]
        row['Deaths'] = replacement_value
    return row

data_without_totals['Deaths'] = data_without_totals.apply(lambda x: replace_suppressed(x, replacement_values), axis=1)

# Pivot the dataset back to the original format
data_without_totals = data_without_totals.pivot_table(values='Deaths', index=['Notes', 'County', 'County Code'], columns='Year', aggfunc='sum').reset_index()

# Save the resulting dataset to a new xlsx file
with pd.ExcelWriter('death_data_replaced.xlsx', engine='openpyxl') as writer:
    data_without_totals.to_excel(writer, index=False)


ValueError: Unable to parse string "Missing" at position 0

In [None]:

import pandas as pd

# Read the data from the files
death_data_replaced_df = pd.read_excel('death_data_replaced.xlsx')
death_data_white_df = pd.read_excel('death_data_white.xlsx')
unemployment_med_inc_df = pd.read_excel('unemployment-med-hh-inc.xlsx')

# Remove rows where the 'Area_name' field doesn't contain a comma
unemployment_med_inc_df = unemployment_med_inc_df[unemployment_med_inc_df['Area_name'].str.contains(',')]

# Create separate dataframes for each year and merge them later
years = range(2013, 2021)
merged_dfs = []

for year in years:
    # Extract relevant columns for the current year
    cols = ['FIPS', f'Civilian_labor_force_{year}', f'Employed_{year}', f'Unemployed_{year}', f'Unemployment_rate_{year}']
    if year == 2020:
        cols.extend(['Median_Household_Income_2020', 'Med_HH_Income_Percent_of_State_Total_2020'])
    unemployment_med_inc_year_df = unemployment_med_inc_df[cols]
    unemployment_med_inc_year_df['Year'] = year
    
    # Merge the datasets for the current year
    merged_year_df = pd.merge(disp_rates_df[disp_rates_df['Year'] == year], race_pop_df[race_pop_df['Year'] == year], on=['FIPS', 'Year'])
    merged_year_df = pd.merge(merged_year_df, unemployment_med_inc_year_df, on=['FIPS', 'Year'])
    
    # Append the merged dataframe for the current year to the list
    merged_dfs.append(merged_year_df)

# Concatenate the dataframes for all years
merged_df = pd.concat(merged_dfs, ignore_index=True)

# Write the result to an xlsx file
merged_df.to_excel('data.xlsx', index=False)
