Preprocess death data by estimating suppressed values

In [10]:
import pandas as pd

# Read the dataset.
# NOTE(review): the code below relies on the last row holding the per-year totals and on
# every column other than the identifier columns being a year column — confirm against
# the layout of total_deaths.xlsx.
data = pd.read_excel('total_deaths.xlsx', engine='openpyxl')

# Identifier columns; every other column is melted into (Year, Deaths) pairs.
id_columns = ['Notes', 'County', 'County Code']

# Extract the row containing total values
total_values = data.iloc[-1]

# Drop the totals row and melt to long format: one row per (county, year)
data_without_totals = pd.melt(data.iloc[:-1], id_vars=id_columns, var_name='Year', value_name='Deaths')

# Count suppressed values per year (vectorized; result is indexed by Year)
is_suppressed = data_without_totals['Deaths'].eq('Suppressed')
suppressed_counts = is_suppressed.groupby(data_without_totals['Year']).sum()

# Blank out the suppressed cells of the Deaths column only (identifier columns are left
# untouched) and convert the column to a numeric dtype in one vectorized call.
# Any other non-numeric text still raises here instead of being silently dropped.
data_without_totals['Deaths'] = pd.to_numeric(data_without_totals['Deaths'].mask(is_suppressed))

# Group by year and sum non-suppressed values for each year (NaN cells are skipped by sum)
non_suppressed_totals = data_without_totals.groupby('Year')['Deaths'].sum()

# Calculate the suppressed totals: reported total minus what is visible in the data.
# The year totals are selected by dropping the identifier labels rather than by position,
# so they always line up with the columns that were melted above.
year_totals = pd.to_numeric(total_values.drop(labels=id_columns))
suppressed_totals = year_totals - non_suppressed_totals

# Spread each year's suppressed total evenly over that year's suppressed cells.
# A year with no suppressed cells has nothing to fill, so its divisor is masked to NaN
# instead of dividing by zero.
per_cell_estimate = suppressed_totals / suppressed_counts.where(suppressed_counts > 0)

# Lookup table consumed by replace_suppressed: the year sits in column 'index' and the
# estimate in column 0 (the same names reset_index() gives an unnamed Series).
replacement_values = pd.DataFrame({'index': per_cell_estimate.index, 0: per_cell_estimate.to_numpy()})

def replace_suppressed(row, replacement_df):
    """Fill a missing 'Deaths' value in one row with the estimate for that row's year.

    Parameters:
        row: a row (Series) with at least the labels 'Year' and 'Deaths'.
        replacement_df: lookup table whose 'index' column holds the year and whose
            column 0 holds the value to substitute.

    Returns:
        The same row object. It is modified in place when 'Deaths' was missing and
        returned untouched otherwise.

    Raises:
        IndexError: if 'Deaths' is missing and no row of replacement_df matches the year.
    """
    year = row['Year']
    if not pd.isna(row['Deaths']):
        # Nothing to estimate for this row
        return row

    matches_year = replacement_df['index'] == year
    # First (expected: only) estimate recorded for this year
    row['Deaths'] = replacement_df.loc[matches_year, 0].iloc[0]
    return row

# Fill each missing Deaths cell, row by row, with the estimate for that row's year
data_without_suppressed = data_without_totals.apply(
    replace_suppressed, axis=1, args=(replacement_values,)
)
data_without_totals['Deaths'] = data_without_suppressed['Deaths']

# Reshape from long back to the original wide layout: one column per year
data_without_totals = data_without_totals.pivot(
    index=['Notes', 'County', 'County Code'],
    columns='Year',
    values='Deaths',
).reset_index()

# Write the wide table to a new xlsx file, leaving out the positional row index
with pd.ExcelWriter('total_deaths_preprocessed.xlsx', engine='openpyxl') as writer:
    data_without_totals.to_excel(writer, index=False)


Remove rows for US total and for states from unemployment data

In [25]:
import pandas as pd

# Load the xlsx file into a pandas dataframe; the raw frame is kept under its own name
# so this cell can be re-run without reloading a half-filtered frame.
unemployment_raw = pd.read_excel('unemployment-med-hh-inc.xlsx')

# Keep only the rows whose Area_name contains a comma; rows without a comma are dropped.
# NOTE(review): presumably county rows are written "<county>, <state>" while the US total
# and the state rows have no comma — verify against the file.
# regex=False matches the comma literally; na=False treats a missing Area_name as
# "no comma" so the boolean mask never contains NaN (which boolean indexing rejects).
has_comma = unemployment_raw['Area_name'].str.contains(',', regex=False, na=False)
df = unemployment_raw[has_comma]

# Generate a filename for the output file
output_filename = 'employment.xlsx'

# Save the filtered dataframe to a new xlsx file (without the positional row index)
df.to_excel(output_filename, index=False)

print('Filtered data saved to {filename}'.format(filename=output_filename))


Filtered data saved to employment.xlsx
