In [38]:
import pandas as pd
import requests
import textwrap
from time import sleep
import json
import xlsxwriter
import os
from openpyxl import workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import numpy as np

In [39]:
# Census API key, sent as the `key` query parameter on each request.
# Read it from the CENSUS_API_KEY environment variable when that is set (and
# non-empty) so the credential does not have to live in the notebook. The
# literal is kept only as a fallback so existing runs behave as before.
# NOTE(review): a key committed to a notebook should be treated as exposed —
# consider rotating it and dropping the fallback.
key = os.environ.get('CENSUS_API_KEY') or '593ff23df30236471e2bd1165031f208ce43d9f4'

### Variables for API Calls

Assign the list of variables for each call to a Python variable so that it can be
inserted into the API call as a string.

In [40]:
def _estimate_percent_fields(first, last):
    """Return 'NAME' followed by the E and PE codes for DP03 variables first..last.

    Parameters
    ----------
    first, last : int
        Inclusive bounds of the DP03 variable numbers to include.

    Returns
    -------
    str
        Comma-separated field list, e.g. 'NAME,DP03_0052E,DP03_0052PE,...'.
    """
    codes = [
        f'DP03_{number:04d}{suffix}'
        for number in range(first, last + 1)
        for suffix in ('E', 'PE')
    ]
    return ','.join(['NAME', *codes])


# Each value below is a comma-separated field list for one API call.
# Every list starts with NAME; the rest are DP03 variable codes.

# Household-level fields.
total_households = 'NAME,DP03_0051E'
th_range = _estimate_percent_fields(52, 61)
th_w_earn = 'NAME,DP03_0064E'
th_w_ss = 'NAME,DP03_0066E,DP03_0066PE'
th_w_ri = 'NAME,DP03_0068E,DP03_0068PE'
th_w_ssi = 'NAME,DP03_0070E,DP03_0070PE'
th_w_cpa = 'NAME,DP03_0072E,DP03_0072PE'
th_w_snap = 'NAME,DP03_0074E,DP03_0074PE'

# Family-level fields.
families = 'NAME,DP03_0075E'
fm_range = _estimate_percent_fields(76, 85)

# Remaining single-variable fields.
per_capita = 'NAME,DP03_0088E'
non_family = 'NAME,DP03_0089E'
worker_earn = 'NAME,DP03_0092E'
worker_ft_earn_male = 'NAME,DP03_0093E'
worker_ft_earn_female = 'NAME,DP03_0094E'



In [41]:
# One (data frame name, field list) pair per API call. Keeping the two side
# by side makes it impossible for the derived lists to fall out of step.
_call_plan = [
    ('DF_total_households', total_households),
    ('DF_th_range', th_range),
    ('DF_th_w_earn', th_w_earn),
    ('DF_th_w_ss', th_w_ss),
    ('DF_th_w_ri', th_w_ri),
    ('DF_th_w_ssi', th_w_ssi),
    ('DF_th_w_cpa', th_w_cpa),
    ('DF_th_w_snap', th_w_snap),
    ('DF_families', families),
    ('DF_fm_range', fm_range),
    ('DF_per_capita', per_capita),
    ('DF_non_family', non_family),
    ('DF_worker_earn', worker_earn),
    ('DF_worker_ft_earn_male', worker_ft_earn_male),
    ('DF_worker_ft_earn_female', worker_ft_earn_female),
]

# Field lists, in call order.
get_vars = [field_list for _frame_name, field_list in _call_plan]

# Matching data frame names, in the same order.
df_vars = [frame_name for frame_name, _field_list in _call_plan]

### API Calls

Complete 15 calls to the API and store each result as a named DataFrame in a dict.

In [None]:
# API call parameters
url = 'https://api.census.gov/data/2020/acs/acs5/profile'
how = 'state:*'
where = 'county:*'

# Seconds to wait on the API before giving up, so that a stalled connection
# raises instead of hanging the notebook indefinitely.
REQUEST_TIMEOUT = 120

# Columns that identify a row; they are left as strings.
ID_COLUMNS = ['NAME', 'state', 'county', 'FIPS']


def _rows_to_frame(rows):
    """Build a typed DataFrame from the JSON rows returned by one API call.

    Parameters
    ----------
    rows : list of list
        Parsed JSON response; the first inner list holds the column names
        and the remaining lists hold the data. Must include 'state' and
        'county' columns.

    Returns
    -------
    pandas.DataFrame
        The data with an added 'FIPS' column ('state' + 'county') and every
        column outside ID_COLUMNS converted to a numeric dtype.

    Raises
    ------
    ValueError
        If a column outside ID_COLUMNS holds text that is not a number.
    """
    frame = pd.DataFrame(rows[1:], columns=rows[0])

    # Join 'state' and 'county' columns into a new column 'FIPS'
    frame['FIPS'] = frame['state'] + frame['county']

    # pd.to_numeric yields integers when every value is integral and floats
    # otherwise. Unlike astype(int) guarded by `except ValueError`, it also
    # copes with null cells (they become NaN) rather than raising TypeError.
    for col in frame.columns.difference(ID_COLUMNS):
        frame[col] = pd.to_numeric(frame[col])

    return frame


# Create an empty dictionary to store the data frames
dfs = {}

# Iterate through get_vars and df_vars simultaneously
for var, name in zip(get_vars, df_vars):
    # A timeout or connection failure raises a requests exception here.
    r = requests.get(
        url,
        params={"get": var, "for": where, "in": how, "key": key},
        timeout=REQUEST_TIMEOUT,
    )

    # A failed call is reported and skipped; the remaining calls still run.
    if r.status_code != 200:
        print(f"API call for '{var}' failed. Status code:", r.status_code)
        continue

    dfs[name] = _rows_to_frame(r.json())

    # Display the created data frame
    print(f"Data frame '{name}':")
    print(dfs[name])

### Verify Pulls from API

I should have 15 Dataframes total.

- There should be 8 dataframes of:
    - Shape: 3221 rows, 5 columns
- There should be 5 dataframes of:
    - Shape: 3221 rows, 6 columns
- There should be 2 dataframes of:
    - Shape: 3221 rows, 24 columns

In [None]:
# Report how many data frames were collected.
frame_count = len(dfs)
print(f"Number of data frames: {frame_count}")

# Then list each data frame's name together with its dimensions.
print("Data Frames in dfs:")
for frame_name, frame in dfs.items():
    n_rows, n_cols = frame.shape
    print(
        f"Data frame '{frame_name}':",
        f"Shape: {n_rows} rows, {n_cols} columns",
        sep="\n",
    )

### Checking for data errors

I should find no missing data.

In [None]:
for df_name, frame in dfs.items():
    # Compute the null mask once and derive both views from it.
    null_mask = frame.isnull()
    rows_with_missing_data = frame[null_mask.any(axis=1)]
    columns_with_missing_data = frame.columns[null_mask.any()]

    # Clean frame: say so and move on to the next one.
    if rows_with_missing_data.empty and len(columns_with_missing_data) == 0:
        print(f"No missing data found in DataFrame '{df_name}'")
        continue

    print(f"Missing data found in DataFrame '{df_name}'")

    # Copy of the frame in which every NaN is replaced by the text 'NaN';
    # the frame stored in dfs is left untouched.
    filled = frame.replace({np.nan: 'NaN'})

    # Report the rows and columns that held nulls in the original frame.
    print("Rows with missing data:")
    print(rows_with_missing_data)
    print("Columns with missing data:")
    print(columns_with_missing_data)
    print()

    # Locate the 'NaN' text in the filled copy.
    nan_text_mask = filled.eq('NaN')
    rows_with_nan_string = filled[nan_text_mask.any(axis=1)]
    columns_with_nan_string = filled.columns[nan_text_mask.any()]

    print("Rows with 'NaN' string:")
    print(rows_with_nan_string)
    print("Columns with 'NaN' string:")
    print(columns_with_nan_string)
    print()

### Changing Column Names

Changing names of dataframe columns

In [None]:
# Replacement headers: one Estimate/Percent pair for each of ten income brackets.
# NOTE(review): only the first pair has a space after "Estimate"/"Percent";
# confirm whether that difference is intended.
fm_range_col_names = [
    'Estimate Less than $10,000', 'Percent Less than $10,000',
    'Estimate$10,000 to $14,999', 'Percent$10,000 to $14,999',
    'Estimate$15,000 to $24,999', 'Percent$15,000 to $24,999',
    'Estimate$25,000 to $34,999', 'Percent$25,000 to $34,999',
    'Estimate$35,000 to $49,999', 'Percent$35,000 to $49,999',
    'Estimate$50,000 to $74,999', 'Percent$50,000 to $74,999',
    'Estimate$75,000 to $99,999', 'Percent$75,000 to $99,999',
    'Estimate$100,000 to $149,999', 'Percent$100,000 to $149,999',
    'Estimate$150,000 to $199,999', 'Percent$150,000 to $199,999',
    'Estimate$200,000 or more', 'Percent$200,000 or more',
]

# Give every data frame in 'merged_dfs' the new column names. The frames are
# modified in place, so the dictionary entries do not need to be reassigned.
# NOTE(review): neither `merged_dfs` nor `col_names` is defined in the part of
# the notebook visible here, and `fm_range_col_names` above is never used.
# Confirm where they come from, and whether `col_names` was meant to be
# `fm_range_col_names`, before relying on this cell after a kernel restart.
for item in merged_dfs:
    merged_dfs[item].columns = col_names