In [1]:
import json
import pandas as pd
import os
import numpy as np
import requests
from dotenv import load_dotenv
import pathlib as path

In [2]:
# Load variables from a local .env file into the process environment, then
# read the Census API key from there so it is never hardcoded in the notebook.
load_dotenv()
api_key = os.getenv("CENSUS_API_KEY")

# Stop here with a clear message when the key is missing; otherwise the
# request URL built later would contain the literal text "key=None".
if not api_key:
    raise RuntimeError(
        "CENSUS_API_KEY is not set. Add it to your .env file or environment."
    )

In [3]:
# Every Census API request starts with https://api.census.gov/data
# Next comes the dataset path (one of over 1,200), e.g. 2018/pep/charagegroups
# Then "?get=" followed by the list of variables and geographies we want data for, e.g.
# POP,GEONAME,DATE_DESC&DATE_CODE=11&RACE=10
# Finally, set the geography, e.g. "&for=county:*&in=state:24" (the * means all available values)

#https://api.census.gov/data/2018/pep/charagegroups?get=POP,GEONAME,DATE_DESC&DATE_CODE=11&RACE=10&for=county:*&in=state:24



In [4]:
# Build the request for the 2021 population-estimates dataset at state level.
# The fields after "get=" are the columns that come back; the API key loaded
# earlier is appended as the "key" parameter.
query_url = f'https://api.census.gov/data/2021/pep/population?get=DENSITY_2020,DENSITY_2021,POP_2020,POP_2021,PPOPCHG_2021,NAME,STATE,REGION&for=state&key={api_key}'

# A timeout keeps the kernel from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of letting them
# show up as a confusing JSON decoding failure.
# NOTE: error messages from requests include the full URL (with the key),
# so clear the cell output before sharing a notebook that failed here.
response = requests.get(query_url, timeout=30)
response.raise_for_status()

# The payload is a list of rows: the first row holds the column names and
# each following row holds the values for one state.
query = response.json()
print(json.dumps(query, indent=4))

[
    [
        "DENSITY_2020",
        "DENSITY_2021",
        "POP_2020",
        "POP_2021",
        "PPOPCHG_2021",
        "NAME",
        "STATE",
        "REGION",
        "state"
    ],
    [
        "57.7584243640",
        "58.1171593930",
        "3962031",
        "3986639",
        "0.6210955947",
        "Oklahoma",
        "40",
        null,
        "40"
    ],
    [
        "25.5338435350",
        "25.5629643700",
        "1961455",
        "1963692",
        "0.1140479899",
        "Nebraska",
        "31",
        null,
        "31"
    ],
    [
        "226.0689240300",
        "224.4561379100",
        "1451911",
        "1441553",
        "-0.7134046100",
        "Hawaii",
        "15",
        null,
        "15"
    ],
    [
        "11.7016675950",
        "11.8108489860",
        "887099",
        "895376",
        "0.9330412953",
        "South Dakota",
        "46",
        null,
        "46"
    ],
    [
        "167.8316023700",
        "169.1679021400",
 

In [5]:
# Load the raw response rows into a DataFrame. The column names are still
# sitting in the first row at this point, so the columns are simply numbered.
query_df = pd.DataFrame(data=query)
query_df.head(n=5)
# Optional quick summary: query_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,DENSITY_2020,DENSITY_2021,POP_2020,POP_2021,PPOPCHG_2021,NAME,STATE,REGION,state
1,57.7584243640,58.1171593930,3962031,3986639,0.6210955947,Oklahoma,40,,40
2,25.5338435350,25.5629643700,1961455,1963692,0.1140479899,Nebraska,31,,31
3,226.0689240300,224.4561379100,1451911,1441553,-0.7134046100,Hawaii,15,,15
4,11.7016675950,11.8108489860,887099,895376,0.9330412953,South Dakota,46,,46


In [6]:
# Use the first row of the response as the column headers.
#
# The frame is rebuilt straight from `query` instead of rewriting `query_df`
# in terms of itself, so re-running this cell always gives the same result
# (the old approach promoted another data row to the header on every re-run).

header, *rows = query
query_df = pd.DataFrame(rows, columns=header)
query_df.head()

Unnamed: 0,DENSITY_2020,DENSITY_2021,POP_2020,POP_2021,PPOPCHG_2021,NAME,STATE,REGION,state
0,57.758424364,58.117159393,3962031,3986639,0.6210955947,Oklahoma,40,,40
1,25.533843535,25.56296437,1961455,1963692,0.1140479899,Nebraska,31,,31
2,226.06892403,224.45613791,1451911,1441553,-0.71340461,Hawaii,15,,15
3,11.701667595,11.810848986,887099,895376,0.9330412953,South Dakota,46,,46
4,167.83160237,169.16790214,6920119,6975218,0.7962146316,Tennessee,47,,47


In [7]:
# Give the Census variables readable labels, discard the columns that are not
# needed, and convert the string values from the API into numeric dtypes.

# Census variable name -> display label.
column_labels = {
    "DENSITY_2020": "Pop.Density 2020",
    "DENSITY_2021": "Pop.Density 2021",
    "POP_2020": "Total Pop.2020",
    "POP_2021": "Total Pop.2021",
    "PPOPCHG_2021": "Pop.Change '20-'21",
    "NAME": "State",
    "STATE": "State Code",
    "REGION": "Region",
    "state": "state",
}

# Display label -> target dtype for every numeric column.
column_dtypes = {
    "Pop.Density 2020": float,
    "Pop.Density 2021": float,
    "Total Pop.2020": int,
    "Total Pop.2021": int,
    "Pop.Change '20-'21": float,
    "State Code": int,
}

renamed_query_df = (
    query_df
    .rename(columns=column_labels)
    # "Region" is empty in this response and "state" repeats the state code.
    .drop(columns=["state", "Region"])
    .astype(column_dtypes)
)

renamed_query_df.head()


Unnamed: 0,Pop.Density 2020,Pop.Density 2021,Total Pop.2020,Total Pop.2021,Pop.Change '20-'21,State,State Code
0,57.758424,58.117159,3962031,3986639,0.621096,Oklahoma,40
1,25.533844,25.562964,1961455,1963692,0.114048,Nebraska,31
2,226.068924,224.456138,1451911,1441553,-0.713405,Hawaii,15
3,11.701668,11.810849,887099,895376,0.933041,South Dakota,46
4,167.831602,169.167902,6920119,6975218,0.796215,Tennessee,47


In [8]:
# Put the identifying columns first, followed by the density, population
# and population-change columns.

new_col_order = [
    "State",
    "State Code",
    "Pop.Density 2020",
    "Pop.Density 2021",
    "Total Pop.2020",
    "Total Pop.2021",
    "Pop.Change '20-'21",
]
new_col_query_df = renamed_query_df.loc[:, new_col_order]
new_col_query_df.head()

Unnamed: 0,State,State Code,Pop.Density 2020,Pop.Density 2021,Total Pop.2020,Total Pop.2021,Pop.Change '20-'21
0,Oklahoma,40,57.758424,58.117159,3962031,3986639,0.621096
1,Nebraska,31,25.533844,25.562964,1961455,1963692,0.114048
2,Hawaii,15,226.068924,224.456138,1451911,1441553,-0.713405
3,South Dakota,46,11.701668,11.810849,887099,895376,0.933041
4,Tennessee,47,167.831602,169.167902,6920119,6975218,0.796215


In [9]:
# Rank the states from the highest to the lowest 2020-21 population change.
change_column = "Pop.Change '20-'21"
final_query_df = new_col_query_df.sort_values(change_column, ascending=False)
final_query_df

Unnamed: 0,State,State Code,Pop.Density 2020,Pop.Density 2021,Total Pop.2020,Total Pop.2021,Pop.Change '20-'21
34,Idaho,16,22.3579,23.001023,1847772,1900923,2.876491
46,Utah,49,39.83746,40.520796,3281684,3337975,1.715308
36,Montana,30,7.462661,7.586865,1086193,1104271,1.664345
45,Arizona,4,63.156938,64.022114,7177986,7276316,1.369883
17,South Carolina,45,170.658525,172.653449,5130729,5190705,1.168957
23,Delaware,10,509.037824,514.93862,991886,1003384,1.159206
10,Texas,48,111.830574,113.018201,29217653,29527941,1.061988
40,Florida,12,402.043917,405.980418,21569932,21781128,0.979122
5,Nevada,32,28.345716,28.618062,3114071,3143991,0.9608
3,South Dakota,46,11.701668,11.810849,887099,895376,0.933041
