# Economics - Data Pull

## Import Needed Libraries

In [9]:
import pandas as pd
import requests
import textwrap
from time import sleep
import json
import xlsxwriter
import os
from openpyxl import workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import numpy as np

## Setting 'Key'

In [10]:
user_id = 'F370C269-2ED6-4B10-A31A-0D0C0CC18B61'

## Variables for API Calls

Assigning all the variables for each call to a python variable to be inserterted in the 
API call as a string.

In [11]:
api_vars = ['10','45','70','110','120','150','180','190','220','230','240',
            '250','260','270','280','290','300','310'
            ]

| Line Code| :DataFrame:  | Line Description     |
|  :---  |      :---:        |             ---         |
| 10     |CAINC30_10 | Personal income (thousands of dollars)   |
| 45     |CAINC30_45 | Net earnings by place of residence      |
| 70     |CAINC30_70 | Unemployment insurance compensation      |
| 110    |CAINC30_110| Per capita personal income      |
| 120    |CAINC30_120| Per capita net earnings      |
| 150    |CAINC30_150| Per capita unemployment insurance compensation      |
| 180    |CAINC30_180| Earnings by place of work      |
| 190    |CAINC30_190| Wages and salaries       |
| 220    |CAINC30_220| Farm proprietors' income      |
| 230    |CAINC30_230| Nonfarm proprietors' income       |
| 240    |CAINC30_240| Total employment (number of jobs)      |
| 250    |CAINC30_250| Wage and salary employment       |
| 260    |CAINC30_260| Proprietors employment       |
| 270    |CAINC30_270| Farm proprietors employment      |
| 280    |CAINC30_280| Nonfarm proprietors employment       |
| 290    |CAINC30_290| Average earnings per job (dollars)       |
| 300    |CAINC30_300| Average wages and salaries       |
| 310    |CAINC30_310| Average nonfarm proprietors' income        |

### Dataframe names to assign

Assigning all the dataframe names to a list to itterate through to name dataframes created during the API calls

In [12]:
df_names = ['DF_CAINC30_10','DF_CAINC30_45','DF_CAINC30_70','DF_CAINC30_110',
            'DF_CAINC30_120','DF_CAINC30_150','DF_CAINC30_180',
            'DF_CAINC30_190','DF_CAINC30_220','DF_CAINC30_230',
            'DF_CAINC30_240','DF_CAINC30_250','DF_CAINC30_260',
            'DF_CAINC30_270','DF_CAINC30_280','DF_CAINC30_290',
            'DF_CAINC30_300','DF_CAINC30_310'
            ]

### API Calls

In [13]:
# API endpoint URL
url = "https://apps.bea.gov/api/data"

# API parameters
dataset = "Regional"
method = "GetData"
year = "2020"
table_code = "CAINC30"
geofips = "COUNTY"

# Iterate over the api_vars and df_names simultaneously
for line, name in zip(api_vars, df_names):
    # API request
    payload = {
        "UserID": user_id,
        "method": method,
        "datasetname": dataset,
        "TableName": table_code,
        "GeoFIPS": geofips,
        "Year": year,
        "LineCode": line
    }
    response = requests.get(url, params=payload)

    # Check if the API call was successful
    if response.status_code == 200:
        try:
            # Convert JSON response to pandas DataFrame
            json_data = response.json()
            data = json_data['BEAAPI']['Results']['Data']
            df = pd.DataFrame(data)

            df.drop(['Code', 'TimePeriod', 'UNIT_MULT', 'NoteRef'], axis=1, inplace=True)

            # Rename columns if needed
            df.rename(columns={'GeoFips': 'FIPS', 'GeoName': 'Location', 'DataValue': 'Value', 'CL_UNIT': 'Measure In'},
                      inplace=True)

            # Set appropriate data types
            df['FIPS'] = df['FIPS'].astype(str)
            df['Location'] = df['Location'].astype(str)
            df['Value'] = df['Value'].astype(str)
            df['Measure In'] = df['Measure In'].astype(str)

            # Assign the DataFrame to the specified name
            globals()[name] = df

            # Print the DataFrame
            print(f"DataFrame '{name}':")
            print(df)
            print()
        except KeyError:
            print("Unexpected JSON structure. Check API response.")
    else:
        # API call failed
        print(f"API call failed for line {line}. Status code:", response.status_code)

DataFrame 'DF_CAINC30_10':
       FIPS        Location            Measure In       Value
0     01001     Autauga, AL  Thousands of dollars   2,664,063
1     01003     Baldwin, AL  Thousands of dollars  11,971,269
2     01005     Barbour, AL  Thousands of dollars     934,683
3     01007        Bibb, AL  Thousands of dollars     768,870
4     01009      Blount, AL  Thousands of dollars   2,265,818
...     ...             ...                   ...         ...
3135  56037  Sweetwater, WY  Thousands of dollars   2,326,633
3136  56039       Teton, WY  Thousands of dollars   6,786,373
3137  56041       Uinta, WY  Thousands of dollars     872,914
3138  56043    Washakie, WY  Thousands of dollars     438,250
3139  56045      Weston, WY  Thousands of dollars     323,424

[3140 rows x 4 columns]

DataFrame 'DF_CAINC30_45':
       FIPS        Location            Measure In      Value
0     01001     Autauga, AL  Thousands of dollars  1,609,537
1     01003     Baldwin, AL  Thousands of dollars  6,4

### Verify Pulls from API

I should have 18 Dataframes total.

- Dataframes ending in _0 should have:
    - Shape: 3140 rows, 4 columns

In [14]:
# Print the number of dataframes in df_names
print(f"Number of dataframes: {len(df_names)}")

print("Dataframes in df_names:")
for idx, name in enumerate(df_names):
    # Access the dataframe by name
    df = globals()[name]

    # Get the shape of the dataframe
    rows, cols = df.shape

    # Print the dataframe name and shape
    print(f"Dataframe '{name}':")
    print(f"Shape: {rows} rows, {cols} columns")

Number of dataframes: 18
Dataframes in df_names:
Dataframe 'DF_CAINC30_10':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_45':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_70':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_110':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_120':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_150':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_180':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_190':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_220':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_230':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_240':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_250':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_260':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_270':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_280':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_290':
Shape: 3140 rows, 4 columns
Dataframe 'DF_CAINC30_300':
Shape: 3140 rows, 4 columns
Da

### Checking for data errors

I should find no missing data.

In [15]:
for df_name in df_names:
    # Access the dataframe by name
    df = globals()[df_name]

    # Find the rows with missing data
    rows_with_missing_data = df[df.isnull().any(axis=1)]

    # Find the columns with missing data
    columns_with_missing_data = df.columns[df.isnull().any()]

    # Check if there is missing data in the dataframe
    if not rows_with_missing_data.empty or len(columns_with_missing_data) > 0:
        print(f"Missing data found in DataFrame '{df_name}'")

        # Replace NaN values with string 'NaN'
        df = df.replace({np.nan: 'NaN'})

        # Print the rows with missing data
        print("Rows with missing data:")
        print(rows_with_missing_data)

        # Print the columns with missing data
        print("Columns with missing data:")
        print(columns_with_missing_data)
        print()

        # Search for 'NaN' string in the dataframe
        rows_with_nan_string = df[df.eq('NaN').any(axis=1)]
        columns_with_nan_string = df.columns[df.eq('NaN').any()]

        # Print the rows with 'NaN' string
        print("Rows with 'NaN' string:")
        print(rows_with_nan_string)

        # Print the columns with 'NaN' string
        print("Columns with 'NaN' string:")
        print(columns_with_nan_string)
        print()
    else:
        print(f"No missing data found in DataFrame '{df_name}'")

No missing data found in DataFrame 'DF_CAINC30_10'
No missing data found in DataFrame 'DF_CAINC30_45'
No missing data found in DataFrame 'DF_CAINC30_70'
No missing data found in DataFrame 'DF_CAINC30_110'
No missing data found in DataFrame 'DF_CAINC30_120'
No missing data found in DataFrame 'DF_CAINC30_150'
No missing data found in DataFrame 'DF_CAINC30_180'
No missing data found in DataFrame 'DF_CAINC30_190'
No missing data found in DataFrame 'DF_CAINC30_220'
No missing data found in DataFrame 'DF_CAINC30_230'
No missing data found in DataFrame 'DF_CAINC30_240'
No missing data found in DataFrame 'DF_CAINC30_250'
No missing data found in DataFrame 'DF_CAINC30_260'
No missing data found in DataFrame 'DF_CAINC30_270'
No missing data found in DataFrame 'DF_CAINC30_280'
No missing data found in DataFrame 'DF_CAINC30_290'
No missing data found in DataFrame 'DF_CAINC30_300'
No missing data found in DataFrame 'DF_CAINC30_310'


### Export to excel
Create workbook and export wanted dataframes to excel as individual sheets.

In [18]:
# Define the file name
file_name = '2020_econ_data.xlsx'

# Get the file path in the current working directory
file_path = os.path.join(os.getcwd(), file_name)

# Check if the file exists
if os.path.exists(file_path):
    # Load the existing Excel file
    excel_file = pd.read_excel(file_path, engine='openpyxl')
    
    # Create a new ExcelWriter object using the existing file
    writer = pd.ExcelWriter(file_path, engine='openpyxl', if_sheet_exists = 'replace', mode= 'a')

    # Iterate through the dataframes in dfs
    for df_name, df in globals().items():
        # Check if the object is a DataFrame
        if isinstance(df, pd.DataFrame):
            # Check if the sheet already exists in the Excel file
            if df_name in writer.book.sheetnames:
                # Remove the existing sheet
                writer.book.remove(writer.book[df_name])
            
            # Write the dataframe to a separate sheet in the Excel file
            df.to_excel(writer, sheet_name=df_name, index=False)
    # Save and close the workbook
    writer.close()

else:
    # Create a new workbook
    writer = pd.ExcelWriter(file_path, engine='openpyxl')
    
    # Iterate through the dataframes in dfs
    for df_name, df in globals().items():
        # Check if the object is a DataFrame
        if isinstance(df, pd.DataFrame):
            # Write each dataframe to a separate sheet in the Excel file
            df.to_excel(writer, sheet_name=df_name, index=False)

    # Save and close the workbook
    writer.close()