# Create Behavioral State Datasets
This notebook creates one CSV file per state containing, for each year, the percentage of the female population that is overweight, that is obese, and that reports no leisure-time physical activity.

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns
import re
import pickle

### Load Existing Datasets

In [28]:
# Raw inputs: the behavioral survey records and the state -> census division lookup table.
# NOTE(review): confirm the provenance of the behavioral CSV (the Datasource column reads
# "BRFSS" in the previewed rows).
all_behavior_data = pd.read_csv("data/all_behavioral_data.csv")
census_divisions_df = pd.read_csv("data/census_divisions.csv")


def _read_pickle(path):
    """Unpickle and return the object stored at ``path``.

    pickle.load can execute arbitrary code embedded in the file, so only use
    this on pickle files produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# dictionary of dataframes holding female population numbers for each state (2010 - 2023)
population_by_state = _read_pickle('data/population_by_state.pkl')

# dictionary of dataframes holding ACS population data for each year (2010 - 2023)
population_dfs = _read_pickle('data/population_dfs.pkl')

In [29]:
# Preview the first rows of the raw behavioral data.
all_behavior_data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2011,2011,AK,Alaska,BRFSS,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,2011.0,Value,...,"(64.845079957001, -147.722059036)",OWS,OWS1,Q036,VALUE,2,Race/Ethnicity,2 or more races,RACE,RACE2PLUS
1,2011,2011,AK,Alaska,BRFSS,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,2011.0,Value,...,"(64.845079957001, -147.722059036)",OWS,OWS1,Q036,VALUE,2,Race/Ethnicity,Other,RACE,RACEOTH
2,2011,2011,AK,Alaska,BRFSS,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 150 min...,2011.0,Value,...,"(64.845079957001, -147.722059036)",PA,PA1,Q044,VALUE,2,Sex,Female,SEX,FEMALE
3,2011,2011,AK,Alaska,BRFSS,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,2011.0,Value,...,"(64.845079957001, -147.722059036)",OWS,OWS1,Q036,VALUE,2,Age (years),35 - 44,AGEYR,AGEYR3544
4,2011,2011,AK,Alaska,BRFSS,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,2011.0,Value,...,"(64.845079957001, -147.722059036)",OWS,OWS1,Q037,VALUE,2,Income,"$15,000 - $24,999",INC,INC1525


In [30]:
# Summarize column names, non-null counts, and dtypes of the raw behavioral data.
all_behavior_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104272 entries, 0 to 104271
Data columns (total 33 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   YearStart                   104272 non-null  int64  
 1   YearEnd                     104272 non-null  int64  
 2   LocationAbbr                104272 non-null  object 
 3   LocationDesc                104272 non-null  object 
 4   Datasource                  104272 non-null  object 
 5   Class                       104272 non-null  object 
 6   Topic                       104272 non-null  object 
 7   Question                    104272 non-null  object 
 8   Data_Value_Unit             88872 non-null   float64
 9   Data_Value_Type             104272 non-null  object 
 10  Data_Value                  93505 non-null   float64
 11  Data_Value_Alt              93505 non-null   float64
 12  Data_Value_Footnote_Symbol  10767 non-null   object 
 13  Data_Value_Foo

In [31]:
# Preview the state -> census division / region lookup table.
census_divisions_df.head()

Unnamed: 0,State,Census Division,Census Division Code,Region
0,Connecticut,Division 1: New England,CENS-D1,Region 1: Northeast
1,Maine,Division 1: New England,CENS-D1,Region 1: Northeast
2,Massachusetts,Division 1: New England,CENS-D1,Region 1: Northeast
3,New Hampshire,Division 1: New England,CENS-D1,Region 1: Northeast
4,Rhode Island,Division 1: New England,CENS-D1,Region 1: Northeast


### Functions

In [32]:
# These functions extract the percent of females that are overweight, obese, overweight or obese,
# and do not get physical activity, and store the information in a dataframe with one row per year.
def _values_for_years(frame, year_column, value_column, years):
    """Return ``value_column`` from ``frame`` ordered to match ``years``.

    Rows are looked up by their ``year_column`` label rather than by position,
    so the row order of ``frame`` does not matter. A year with no matching row
    yields NaN; if a year appears more than once, its first row is used.
    """
    by_year = frame.drop_duplicates(subset=year_column).set_index(year_column)[value_column]
    return by_year.reindex(years).to_numpy()


def create_dataframe(data, state, years, census, population):
    """Build the yearly behavioral table for one state.

    Parameters
    ----------
    data : pandas.DataFrame
        Behavioral records; the columns read are 'LocationDesc', 'Sex',
        'Question', 'YearStart' and 'Data_Value'.
    state : str
        State name, matched against ``data['LocationDesc']`` and used as the
        key into ``census`` and ``population``.
    years : list
        Years to include, one output row per entry, in the given order.
    census : dict
        Maps state name to a list of three single-column dataframes
        (census division, census division code, region); the first cell of
        each is used.
    population : dict
        Maps state name to a dataframe with 'Year' and 'Female Population'
        columns.

    Returns
    -------
    pandas.DataFrame
        Columns: Year, State, Census Division, Census Division Code, Region,
        Female_Population, Percent_Overweight, Percent_Obese,
        Percent_Overweight_or_Obese, Percent_No_Activity. Values for a year
        that is missing from the inputs are NaN.
    """
    df = pd.DataFrame(years, columns = ['Year'])
    df['State'] = state
    df['Census Division'] = census[state][0].iloc[0,0]
    df['Census Division Code'] = census[state][1].iloc[0,0]
    df['Region'] = census[state][2].iloc[0,0]

    # Use the `population` argument (not a notebook-level global) and align on the Year label.
    df['Female_Population'] = _values_for_years(population[state], 'Year', 'Female Population', years)

    # Restrict once to this state's female rows; each question is then selected from this subset.
    female_rows = data[(data['LocationDesc'] == state) & (data['Sex'] == "Female")]

    def percent_for(question):
        # Align on YearStart so each value lands on its own year regardless of row order in `data`.
        matching = female_rows[female_rows['Question'] == question]
        return _values_for_years(matching, 'YearStart', 'Data_Value', years)

    df['Percent_Overweight'] = percent_for("Percent of adults aged 18 years and older who have an overweight classification")
    df['Percent_Obese'] = percent_for("Percent of adults aged 18 years and older who have obesity")
    df['Percent_Overweight_or_Obese'] = df['Percent_Overweight'] + df['Percent_Obese']
    df['Percent_No_Activity'] = percent_for("Percent of adults who engage in no leisure-time physical activity")

    return df

### Initialize Target Data Structures

In [33]:
# Distinct locations and survey years, in order of first appearance in the data.
# Series.unique() preserves that order, so the lists match a manual
# "append if not already seen" scan without re-searching a list on every row.
states = all_behavior_data['LocationDesc'].unique().tolist()
years = all_behavior_data['YearStart'].unique().tolist()

# For each location, store three single-column dataframes: census division,
# census division code, and region. Locations with no row in census_divisions_df
# get three empty dataframes.
census_divisions = {}
for state in states:
    state_rows = census_divisions_df.loc[census_divisions_df["State"] == state]
    census_divisions[state] = [state_rows[[column]]
                               for column in ("Census Division", "Census Division Code", "Region")]
    


### Fill in Data & Export

In [34]:
# Build one dataframe per state, write each to its own CSV, and keep them all
# in a dictionary keyed by state name.

# Locations present in the behavioral data that are skipped here.
excluded_locations = ('National', 'Guam', 'Puerto Rico', 'Virgin Islands', 'District of Columbia')

state_behavior_data = {}

for state in states:
    if state in excluded_locations:
        continue
    state_behavior_data[state] = create_dataframe(all_behavior_data, state, years, census_divisions, population_by_state)
    # File name: lowercased state name with each run of whitespace replaced by an underscore.
    state_slug = re.sub(r'\s+', '_', state.lower())
    filepath = "data/states/" + state_slug + "_behavioral_data.csv"
    state_behavior_data[state].to_csv(filepath, index = False)

# Stack every state dataframe into a single dataframe with a fresh 0..n-1 index.
all_states_behavior_data = pd.concat(state_behavior_data.values(), ignore_index = True)
# NOTE(review): no index=False here, so this CSV also gets the row index as its
# first column, unlike the per-state files - confirm that is intended.
all_states_behavior_data.to_csv("data/states/all_states_behavior_data.csv")


In [35]:
# Persist both results as pickles: the dictionary of per-state dataframes and
# the single dataframe that combines every state.
for pickle_path, payload in (
    ('data/state_behavior_data.pkl', state_behavior_data),
    ('data/all_states_behavior_data.pkl', all_states_behavior_data),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

In [36]:
# Spot-check one state's table (up to the first 20 rows).
state_behavior_data["Alabama"].head(20)

Unnamed: 0,Year,State,Census Division,Census Division Code,Region,Female_Population,Percent_Overweight,Percent_Obese,Percent_Overweight_or_Obese,Percent_No_Activity
0,2011,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2478327,30.5,31.8,62.3,35.3
1,2012,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2485732,30.1,34.1,64.2,30.4
2,2013,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2485732,30.9,34.0,64.9,35.6
3,2014,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2501408,29.8,32.9,62.7,30.8
4,2015,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2506169,28.4,36.4,64.8,33.7
5,2016,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2507248,29.5,35.2,64.7,32.9
6,2017,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2514851,30.2,37.4,67.6,35.8
7,2018,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2518600,29.6,37.8,67.4,33.8
8,2019,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2533574,30.1,36.2,66.3,34.3
9,2020,Alabama,Division 6: East South Central,CENS-D6,Region 3: South,2527452,29.8,40.3,70.1,32.1
