In [21]:
import pandas as pd

In [22]:
pop = pd.read_excel("https://www2.census.gov/programs-surveys/popest/tables/2020-2023/state/totals/NST-EST2023-POP.xlsx",
                    # skip the first 3 rows
                    skiprows=3,
                    # skip the final 8 rows
                    skipfooter=8,
                    )

# rename columns
pop.columns = ["name", "apr_2020", "2020", "2021", "2022", "2023"]

# drop apr_2020 column
pop.drop(
    "apr_2020",
    axis=1,
    inplace=True
    )

# drop the first 5 rows
pop = pop.iloc[5:]

# get rid of the leading . in the name column
pop["name"] = pop["name"].str.replace(".", "")

pop

  pop["name"] = pop["name"].str.replace(".", "")


Unnamed: 0,name,2020,2021,2022,2023
5,Alabama,5031864,5050380,5073903,5108468
6,Alaska,732964,734923,733276,733406
7,Arizona,7186683,7272487,7365684,7431344
8,Arkansas,3014348,3028443,3046404,3067732
9,California,39503200,39145060,39040616,38965193
10,Colorado,5785219,5811596,5841039,5877610
11,Connecticut,3577586,3603691,3608706,3617176
12,Delaware,991862,1004881,1019459,1031890
13,District of Columbia,670839,669037,670949,678972
14,Florida,21591299,21830708,22245521,22610726


In [23]:
# add a % change from 2020 to 2023 column
pop["change"] = (pop["2023"] - pop["2020"]) / pop["2020"]

# add a total change column
pop["total_change"] = pop["2023"] - pop["2020"]

In [24]:
pop

Unnamed: 0,name,2020,2021,2022,2023,change,total_change
5,Alabama,5031864,5050380,5073903,5108468,0.015224,76604
6,Alaska,732964,734923,733276,733406,0.000603,442
7,Arizona,7186683,7272487,7365684,7431344,0.034044,244661
8,Arkansas,3014348,3028443,3046404,3067732,0.01771,53384
9,California,39503200,39145060,39040616,38965193,-0.013619,-538007
10,Colorado,5785219,5811596,5841039,5877610,0.01597,92391
11,Connecticut,3577586,3603691,3608706,3617176,0.011066,39590
12,Delaware,991862,1004881,1019459,1031890,0.040356,40028
13,District of Columbia,670839,669037,670949,678972,0.012124,8133
14,Florida,21591299,21830708,22245521,22610726,0.047215,1019427


In [25]:
import us

# make a separate df with state fips codes for each state fips name
state_fips = pd.DataFrame(
    [
        {"name": state.name, "fips": state.fips} for state in us.states.STATES
    ]
)
state_fips

# merge
pop = pd.merge(pop, state_fips, on="name")

# make a separate df with state abbreviations for each state fips name
state_abbr = pd.DataFrame(
    [
        {"name": state.name, "abbr": state.abbr} for state in us.states.STATES
    ]
)

# merge
pop = pd.merge(pop, state_abbr, on="name")

# make fips an integer
pop["fips"] = pop["fips"].astype(int)

# rename fips to id
pop.rename(columns={"fips": "id"}, inplace=True)

pop

Unnamed: 0,name,2020,2021,2022,2023,change,total_change,id,abbr
0,Alabama,5031864,5050380,5073903,5108468,0.015224,76604,1,AL
1,Alaska,732964,734923,733276,733406,0.000603,442,2,AK
2,Arizona,7186683,7272487,7365684,7431344,0.034044,244661,4,AZ
3,Arkansas,3014348,3028443,3046404,3067732,0.01771,53384,5,AR
4,California,39503200,39145060,39040616,38965193,-0.013619,-538007,6,CA
5,Colorado,5785219,5811596,5841039,5877610,0.01597,92391,8,CO
6,Connecticut,3577586,3603691,3608706,3617176,0.011066,39590,9,CT
7,Delaware,991862,1004881,1019459,1031890,0.040356,40028,10,DE
8,Florida,21591299,21830708,22245521,22610726,0.047215,1019427,12,FL
9,Georgia,10732390,10790385,10913150,11029227,0.027658,296837,13,GA


In [26]:
# save to csv
pop.to_csv("data/state_pop_change.csv", index=False)