In [None]:
import pandas as pd
import os

In [None]:
# Move from the notebooks directory to the project root, but only when needed.
# An unconditional os.chdir('..') climbs one more level every time this cell is
# re-run, which breaks every path built from cwd. The data/ folder that the
# later cells read from and write to is used as the marker for the project root.
if not os.path.isdir(os.path.join(os.getcwd(), 'data')):
    os.chdir('..')
cwd = os.getcwd()

### CDC PLACES: Local Data for Better Health (2023 release)

##### Load data

In [None]:
# Load the raw PLACES county-level export.
# The path is assembled with os.path.join so the cell works on any OS;
# hard-coded backslash separators only resolve on Windows.
raw_places_path = os.path.join(
    cwd,
    'data',
    'raw',
    'PLACES__Local_Data_for_Better_Health__County_Data_2023_release_20240811.csv',
)
cdc_df = pd.read_csv(raw_places_path)

cdc_df.head()

##### Filter to only 2021 (most recent)

In [None]:
# Keep only the records whose Year is 2021.
# .copy() makes the result an independent frame, so columns can be added to it
# in later cells without pandas raising SettingWithCopyWarning.
cdc_df_2021 = cdc_df.loc[cdc_df['Year'] == 2021].copy()

##### Standardize geo information

In [None]:
# Standardize geographic information in the CDC dataset.
# .assign returns a new frame instead of writing columns into what may be a
# filtered slice of cdc_df (the cause of SettingWithCopyWarning), and the
# comparison is vectorized rather than applied row by row.
is_national = cdc_df_2021['StateDesc'].eq('United States')
cdc_df_2021 = cdc_df_2021.assign(
    # 'National' where StateDesc is 'United States', 'State' for every other row
    GEO_TYPE=is_national.map({True: 'National', False: 'State'}),
    # GEO_VALUE is StateDesc unchanged ('United States' maps to itself)
    GEO_VALUE=cdc_df_2021['StateDesc'],
)

##### Aggregate county-level percentages into a population-weighted state-level percentage

In [None]:
# Select the rows tagged GEO_TYPE == 'State'; these feed the state-level
# weighted average computed in the next cell. The explicit copy keeps
# cdc_state_df independent of cdc_df_2021.
cdc_state_df = cdc_df_2021.loc[cdc_df_2021['GEO_TYPE'].eq('State')].copy()

In [None]:
# Population-weighted average of Data_Value for each (StateDesc, Measure).
#
# Computed with vectorized group sums rather than groupby().apply(lambda ...):
# apply calls Python once per group and operates on the grouping columns,
# which recent pandas versions deprecate.
#
# Rows with a missing Data_Value are left out of the denominator as well as the
# numerator; otherwise their population would count as weight with no value
# behind it and pull the average towards zero. A group with no usable value at
# all yields NaN rather than 0.
#
# NOTE(review): if the raw file carries more than one row per county and
# measure (e.g. separate crude and age-adjusted estimates), this average mixes
# them and Total_Population counts each county more than once - confirm against
# the data and filter upstream if so.
population = cdc_state_df['TotalPopulation']
has_value = cdc_state_df['Data_Value'].notna()
state_sums = (
    cdc_state_df
    .assign(
        _weighted_value=cdc_state_df['Data_Value'] * population,
        # population only counts as weight where there is a value to weight
        _weight=population.where(has_value),
    )
    .groupby(['StateDesc', 'Measure'])[['_weighted_value', '_weight', 'TotalPopulation']]
    .sum()
)
cdc_state_aggregated = pd.DataFrame({
    'Weighted_Average': state_sums['_weighted_value'] / state_sums['_weight'],
    # Sum over every grouped row; kept as float for backward compatibility
    # with the processed CSV written from this frame
    'Total_Population': state_sums['TotalPopulation'].astype(float),
}).reset_index()

# Geo columns to carry over to the aggregated frame. The lookup is
# de-duplicated *before* the merge so the join stays small instead of first
# expanding to one row per county and being collapsed afterwards.
columns_to_keep = ['StateDesc', 'Measure', 'GEO_TYPE', 'GEO_VALUE']
cdc_state_df_cleaned = cdc_state_df[columns_to_keep].drop_duplicates()

# Attach the geo columns to the aggregated results
cdc_state_aggregated = pd.merge(
    cdc_state_aggregated,
    cdc_state_df_cleaned,
    on=['StateDesc', 'Measure'],
    how='left',
).drop_duplicates()

# StateDesc was only needed as a join key; drop it after merging
cdc_state_aggregated = cdc_state_aggregated.drop(columns=['StateDesc'])


##### Filter to Measures of interest

In [None]:
# Measures of interest: full measure name as it appears in the 'Measure'
# column -> shorthand label used in the processed output.
measure_shorthand_mapping = {
    'Diagnosed diabetes among adults aged >=18 years': 'Diabetes',
    'High blood pressure among adults aged >=18 years': 'Hypertension',
    'High cholesterol among adults aged >=18 years who have been screened in the past 5 years': 'High_Cholesterol',
    'Chronic obstructive pulmonary disease among adults aged >=18 years': 'COPD',
    'Coronary heart disease among adults aged >=18 years': 'Heart_Disease',
    'Arthritis among adults aged >=18 years': 'Arthritis',
    'Current asthma among adults aged >=18 years': 'Asthma',
    'Depression among adults aged >=18 years': 'Depression',
    'Chronic kidney disease among adults aged >=18 years': 'CKD'
}

# Keep only the relevant measures and attach the shorthand column in one step.
# .assign builds a new frame, so nothing is written into a filtered slice of
# cdc_state_aggregated (which would raise SettingWithCopyWarning).
is_relevant_measure = cdc_state_aggregated['Measure'].isin(list(measure_shorthand_mapping))
cdc_filtered_df = (
    cdc_state_aggregated
    .loc[is_relevant_measure]
    .assign(Measure_Short=lambda df: df['Measure'].map(measure_shorthand_mapping))
)

##### Reorder columns into their final order

In [None]:
# Column layout of the processed output, in the order it is written out
final_columns = [
    'GEO_TYPE',
    'GEO_VALUE',
    'Measure',
    'Measure_Short',
    'Weighted_Average',
    'Total_Population',
]
cdc_filtered_df = cdc_filtered_df.loc[:, final_columns]

##### Save to CSV

# Save the processed CDC data to a CSV file.
# The output folder is created when missing so a fresh checkout does not fail
# on the write, and the path is built with os.path.join rather than string
# concatenation.
processed_dir = os.path.join(cwd, 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)
cdc_filtered_df.to_csv(
    os.path.join(processed_dir, 'processed_CDC_PLACES_data.csv'),
    index=False,
)