# Create Divisional & Regional Behavior Datasets
This notebook creates a dataframe for each census division and region that contains behavioral data for the female population. The data includes the percentages of the female population who are obese, who are overweight, and who report no physical activity, as well as the overall female population size.

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns
import re
import pickle

### Load Existing Datasets

In [102]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle can execute arbitrary code while loading; only point this at trusted files
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# dataframe giving the census division and region of each state
census_divisions_df = pd.read_csv("data/census_divisions.csv")

# dictionary of dataframes holding the behavioral data for each state
state_behavior_data = _load_pickle('data/state_behavior_data.pkl')

# single dataframe holding the behavioral data for every state
all_states_behavior_data = _load_pickle('data/all_states_behavior_data.pkl')

# dictionary of dataframes holding female population numbers for each state between 2010 - 2023
population_by_state = _load_pickle('data/population_by_state.pkl')

# dictionary of dataframes holding ACS population data for each year between 2010 - 2023
population_dfs = _load_pickle('data/population_dfs.pkl')

### Functions

In [113]:
def compute_weighted_average(data, category1, category1_name, category2, category2_name, weight_column, target_column):
    """Compute the weighted average of a percentage column over a subset of the data.

    The subset consists of the rows where ``data[category1_name] == category1``
    and ``data[category2_name] == category2``. Any selected row with a missing
    value in *any* column is dropped before averaging.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to select rows from.
    category1, category2
        Indicator values identifying the subset of the data to select.
    category1_name, category2_name : str
        Names of the columns compared against ``category1`` / ``category2``.
    weight_column : str
        Name of the column used as weights for the average.
    target_column : str
        Name of the column to average (assumed to consist of percentages).

    Returns
    -------
    tuple
        ``(average, total)`` where ``average`` is the weighted average of
        ``target_column`` (as a percentage) and ``total`` is the sum of the
        weights in the subset. ``average`` is NaN when the subset is empty or
        its weights sum to zero.
    """
    # get the relevant data subset and drop any rows that contain empty values
    selected_rows = (data[category1_name] == category1) & (data[category2_name] == category2)
    subset_df = data[selected_rows].dropna(axis=0, how='any')

    # overall total of the weights; this is the denominator of the weighted average
    weights = subset_df[weight_column]
    total = weights.sum()

    # nothing to average: report NaN explicitly instead of dividing zero by zero
    if total == 0:
        return float('nan'), total

    # numerator: each percentage converted to a fraction and scaled by its weight
    numerator = (weights * subset_df[target_column] / 100).sum()

    # convert the weighted fraction back into a percentage
    average = numerator / total * 100
    return average, total
        


### File-Specific Data Structures

In [109]:
# each census division paired with the census region it belongs to
census_divisions_dict = {
    "Division 1: New England": "Region 1: Northeast",
    "Division 2: Middle Atlantic": "Region 1: Northeast",
    "Division 3: East North Central": "Region 2: Midwest",
    "Division 4: West North Central": "Region 2: Midwest",
    "Division 5: South Atlantic": "Region 3: South",
    "Division 6: East South Central": "Region 3: South",
    "Division 7: West South Central": "Region 3: South",
    "Division 8: Mountain": "Region 4: West",
    "Division 9: Pacific": "Region 4: West",
}

# division labels, in the order they appear in the mapping
census_divisions = list(census_divisions_dict)

# region labels, de-duplicated while keeping their order of first appearance
census_regions = list(dict.fromkeys(census_divisions_dict.values()))

# behavioral measures reported for every division and region
column_names = [
    'Percent_Overweight',
    'Percent_Obese',
    'Percent_Overweight_or_Obese',
    'Percent_No_Activity',
]

# distinct years present in the state-level data, in order of first appearance
years = list(dict.fromkeys(all_states_behavior_data['Year']))

### Initialize Target Data Structures

In [124]:
n_years = len(years)

# one placeholder dataframe per census division: a row for every year, with the
# population and behavioral columns left empty (None) until they are filled in
division_behavior_data = {
    division_name: pd.DataFrame({
        'Year': years,
        'Census Division': division_name,
        'Region': census_divisions_dict[division_name],
        **{empty_column: [None] * n_years for empty_column in ['Female_Population', *column_names]},
    })
    for division_name in census_divisions
}

# same layout for each census region, without the division column
region_behavior_data = {
    region_name: pd.DataFrame({
        'Year': years,
        'Region': region_name,
        **{empty_column: [None] * n_years for empty_column in ['Female_Population', *column_names]},
    })
    for region_name in census_regions
}

### Fill in Data

In [135]:
def _fill_behavior_frames(frames, labels, group_column):
    """Fill the dataframe of every label with weighted averages and population totals.

    ``frames`` maps each label to its dataframe and ``group_column`` names the
    column of the state-level data that holds those labels.
    """
    for label in labels:
        frame = frames[label]
        for year in years:
            # rows of this frame belonging to the current year
            year_rows = frame['Year'] == year
            for column in column_names:
                average, total_population = compute_weighted_average(
                    all_states_behavior_data, year, 'Year', label, group_column, 'Female_Population', column)
                frame.loc[year_rows, column] = average
            # the population total comes from the last column processed above
            frame.loc[year_rows, 'Female_Population'] = total_population


# divisional data (in dictionary of dataframes format)
_fill_behavior_frames(division_behavior_data, census_divisions, 'Census Division')

# single dataframe stacking every divisional dataframe
all_divisions_behavior_data = pd.concat(division_behavior_data.values(), ignore_index = True)

# regional data (in dictionary of dataframes format)
_fill_behavior_frames(region_behavior_data, census_regions, 'Region')

# single dataframe stacking every regional dataframe
all_regions_behavior_data = pd.concat(region_behavior_data.values(), ignore_index = True)

### Export Data Structures

In [148]:
# store each division's data as its own csv file; the file name is built from the
# division label with its first 8 characters removed, lower-cased, and with
# spaces and colons stripped
for division in division_behavior_data:
    filepath = "data/divisions/" + re.sub(r'[ :]', '', division[8:].lower()) + "_behavioral_data.csv"
    division_behavior_data[division].to_csv(filepath, index = False)

# store the combined dataframe covering every division as a single csv file
all_divisions_behavior_data.to_csv("data/divisions/all_divisions_behavior_data.csv", index = False)

# store the division behavioral dictionary of division dataframes
# (written to its own file so the state-level pickle is left untouched)
with open('data/division_behavior_data.pkl', 'wb') as file:
    pickle.dump(division_behavior_data, file)

# store the dataframe containing all behavioral information for all divisions
with open('data/all_divisions_behavior_data.pkl', 'wb') as file:
    pickle.dump(all_divisions_behavior_data, file)

# write one csv file per region; the file name is built from the region label with
# its first 6 characters removed, lower-cased, and with spaces and colons stripped
for region_label, region_frame in region_behavior_data.items():
    region_slug = re.sub(r'[ :]', '', region_label[6:].lower())
    region_frame.to_csv("data/regions/" + region_slug + "_behavioral_data.csv", index = False)

# write the combined dataframe covering every region as a single csv file
all_regions_behavior_data.to_csv("data/regions/all_regions_behavior_data.csv", index = False)

# pickle the dictionary of regional dataframes
with open('data/region_behavior_data.pkl', 'wb') as region_pickle:
    pickle.dump(region_behavior_data, region_pickle)

# pickle the dataframe containing all behavioral information for all regions
with open('data/all_regions_behavior_data.pkl', 'wb') as all_regions_pickle:
    pickle.dump(all_regions_behavior_data, all_regions_pickle)