# Create Population Dataset

This notebook reads the American Community Survey (ACS) DP05 tables for 2010–2023 and extracts the female population of each state for every year. The results are stored in a dictionary that maps each state name to a DataFrame with one row per year.

In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns
import re
import pickle

### Load Existing Datasets

In [130]:
# Load one ACS DP05 table per survey year, keyed by year.
# Each CSV is expected under data/population/ and is read with pandas defaults.
population_dfs = {
    2010: pd.read_csv("data/population/ACSDP1Y2010DP05.csv"),
    2011: pd.read_csv("data/population/ACSDP1Y2011DP05.csv"),
    2012: pd.read_csv("data/population/ACSDP1Y2012DP05.csv"),
    # Fixed: this entry previously re-read the 2012 file, so 2013 silently
    # duplicated the 2012 figures.
    2013: pd.read_csv("data/population/ACSDP1Y2013DP05.csv"),
    2014: pd.read_csv("data/population/ACSDP1Y2014DP05.csv"),
    2015: pd.read_csv("data/population/ACSDP1Y2015DP05.csv"),
    2016: pd.read_csv("data/population/ACSDP1Y2016DP05.csv"),
    2017: pd.read_csv("data/population/ACSDP1Y2017DP05.csv"),
    2018: pd.read_csv("data/population/ACSDP1Y2018DP05.csv"),
    2019: pd.read_csv("data/population/ACSDP1Y2019DP05.csv"),
    # NOTE(review): 2020 is the only year loaded from a 5-year file (ACSDP5Y)
    # rather than a 1-year file (ACSDP1Y). Presumably intentional because no
    # standard 1-year 2020 table is available -- confirm, since 5-year
    # estimates are not directly comparable with the 1-year values.
    2020: pd.read_csv("data/population/ACSDP5Y2020DP05.csv"),
    2021: pd.read_csv("data/population/ACSDP1Y2021DP05.csv"),
    2022: pd.read_csv("data/population/ACSDP1Y2022DP05.csv"),
    2023: pd.read_csv("data/population/ACSDP1Y2023DP05.csv"),
}

### File-Specific Data Structures

In [125]:
# Survey years covered by this dataset: 2010 through 2023 inclusive.
years = list(range(2010, 2024))

# The 50 state names used as lookup keys (the order appears to follow the
# two-letter postal abbreviations: AK, AL, AR, AZ, ...).
states = [
    'Alaska', 'Alabama', 'Arkansas', 'Arizona', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Iowa', 'Idaho', 'Illinois', 'Indiana',
    'Kansas', 'Kentucky', 'Louisiana', 'Massachusetts', 'Maryland',
    'Maine', 'Michigan', 'Minnesota', 'Missouri', 'Mississippi',
    'Montana', 'North Carolina', 'North Dakota', 'Nebraska', 'New Hampshire',
    'New Jersey', 'New Mexico', 'Nevada', 'New York', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Virginia',
    'Vermont', 'Washington', 'Wisconsin', 'West Virginia', 'Wyoming',
]

### Initialize Target Data Structures

In [127]:
# One table per state: the 'Year' column lists every survey year, and
# 'Female Population' starts as None placeholders that are filled in later.
population_by_state = {
    state_name: pd.DataFrame({'Year': years,
                              'Female Population': [None] * len(years)})
    for state_name in states
}

### Fill in Data

In [128]:
# Copy each state's female population for every survey year into its table.
for state in states:
    # Column in the yearly tables that carries this state's estimates.
    estimate_column = state + "!!Estimate"
    state_table = population_by_state[state]

    for year in years:
        # The value is read from positional row 3 (zero-based), which is taken
        # to be the overall female population.
        # TODO confirm this row position is the same in every year's file.
        raw_count = population_dfs[year].iloc[3][estimate_column]

        # The figure arrives as text with thousands separators; strip the
        # commas before converting to an integer.
        female_count = int(re.sub(r',', '', raw_count))

        # Write the value into the row of this state's table for this year.
        year_mask = state_table['Year'] == year
        state_table.loc[year_mask, 'Female Population'] = female_count

### Export

In [129]:
# Persist both results with pickle: first the per-state tables, then the
# yearly source tables they were derived from.
pickle_targets = (
    ('data/population_by_state.pkl', population_by_state),
    ('data/population_dfs.pkl', population_dfs),
)

for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as out_handle:
        pickle.dump(payload, out_handle)