In [13]:
import requests
import pandas as pd
import os
import pickle
import tempfile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Years and month abbreviations to request; each (year, month) pair becomes
# one API call and one cache file.
years = ["2010"]
months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

# Cache API responses in an 'api_cache' folder inside the OS temporary directory.
cache_dir = os.path.join(tempfile.gettempdir(), 'api_cache')
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# avoiding the race between a separate existence check and the creation.
os.makedirs(cache_dir, exist_ok=True)

# One shared HTTP session so every request reuses the same retry policy.
session = requests.Session()

# Retry policy: at most 3 retries, backoff factor 0.5, and responses with
# these 5xx status codes are treated as retryable.
retries = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[500, 502, 503, 504],
)

# Only HTTPS URLs are routed through the retrying adapter.
session.mount('https://', HTTPAdapter(max_retries=retries))

# Function to save and load cached data
def save_data(data, filename):
    """Pickle ``data`` to ``filename`` without ever leaving a partial file behind.

    The payload is first written to a temporary file in the same directory and
    then moved over ``filename`` with ``os.replace``. If pickling or writing
    fails, the temporary file is removed and ``filename`` is left untouched
    (or absent), so callers that treat "file exists" as "cache is valid" never
    pick up a truncated pickle.

    Args:
        data: Any picklable object.
        filename: Destination path of the cache file.

    Raises:
        Whatever ``pickle.dump`` or the underlying file operations raise
        (e.g. ``pickle.PicklingError``, ``OSError``); the error is re-raised
        after cleanup.
    """
    # The temp file must live in the destination directory so that the final
    # os.replace is a same-filesystem rename.
    target_dir = os.path.dirname(os.path.abspath(filename))
    fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix='.tmp')
    try:
        with os.fdopen(fd, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_path, filename)
    except BaseException:
        # Best-effort cleanup; the original error is what the caller needs.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def load_data(filename):
    """Read back and return the object pickled in ``filename``.

    NOTE(review): unpickling can execute arbitrary code, so this must only be
    pointed at cache files written by this notebook itself.

    Args:
        filename: Path of an existing pickle file.

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as cache_file:
        cached_object = pickle.load(cache_file)
    return cached_object

# Function to check and update geographic variables
def check_and_update_variables(year, month):
    """Return which geographic variable name the API lists for one year/month.

    The variables listing for the given dataset is fetched and searched for
    "GTCBSA" first, then "CBSA". The outcome is cached on disk under
    ``cache_dir`` so later runs skip the request. A ``None`` outcome (neither
    name listed) is cached as well.

    Args:
        year: Year as it appears in the API URL, e.g. "2010".
        month: Month abbreviation as it appears in the API URL, e.g. "jan".

    Returns:
        "GTCBSA", "CBSA", or ``None`` when neither is listed, the request
        fails, or the response does not have the expected structure.
    """
    filename = os.path.join(cache_dir, f"{year}-{month}-vars.pkl")
    if os.path.exists(filename):
        try:
            return load_data(filename)
        except (EOFError, pickle.UnpicklingError) as e:
            # A truncated or corrupt cache entry should not abort the run;
            # fall through and fetch the listing again.
            print(f"Ignoring unreadable cache file {filename}: {e}")

    url_vars = f"https://api.census.gov/data/{year}/cps/basic/{month}/variables.json"
    try:
        response_vars = session.get(url_vars, timeout=20)
        response_vars.raise_for_status()
        variables = response_vars.json()['variables']
        # Preference order: "GTCBSA" wins over "CBSA" when both are listed.
        key_to_use = next(
            (name for name in ("GTCBSA", "CBSA") if name in variables),
            None,
        )
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {year} {month}: {e}")
        return None
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or the JSON lacked a usable 'variables' mapping.
        # Handled like a failed request so one bad month is skipped instead
        # of stopping the whole download loop.
        print(f"Unexpected variables response for {year} {month}: {e!r}")
        return None

    save_data(key_to_use, filename)
    return key_to_use

# The API key is read from the environment so the credential is not stored in
# the notebook. When CENSUS_API_KEY is unset the request is sent without a
# key (the Census API accepts keyless requests, subject to a lower daily
# query limit).
census_api_key = os.environ.get("CENSUS_API_KEY")

# Per-month frames are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
monthly_frames = []

# Loop over the years and the months
for year in years:
    for month in months:
        geo_var = check_and_update_variables(year, month)
        if not geo_var:
            print(f"Skipping {year} {month} due to missing variables.")
            continue

        filename = os.path.join(cache_dir, f"{year}-{month}.pkl")

        if os.path.exists(filename):
            # Served from the on-disk cache; no request is made.
            data = load_data(filename)
        else:
            url = f"https://api.census.gov/data/{year}/cps/basic/{month}"
            params = {
                "get": f"PEMLR,{geo_var},HUBUS,PELAYFTO,PELKDUR,PECYC,PEABSRSN",
                "for": "state:*",
            }
            if census_api_key:
                params["key"] = census_api_key

            try:
                response = session.get(url, params=params, timeout=20)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching data for {year} {month}: {e}")
                continue

            try:
                data = response.json()
            except ValueError:  # includes JSONDecodeError
                print(f"Error decoding JSON for {year} {month}: {response.text}")
                continue

            # Only successfully decoded responses are cached. Saving happens
            # outside the try blocks so a write failure is not misreported
            # as a JSON decoding problem.
            save_data(data, filename)

        if not data:
            continue

        # The first row of the response holds the column names; the rest are records.
        temp_df = pd.DataFrame(data[1:], columns=data[0])
        temp_df['Year'] = year
        temp_df['Month'] = month
        monthly_frames.append(temp_df)

# Combine everything in a single pass; fall back to an empty frame when no
# month could be loaded.
df = pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()

# Print the first few rows of the DataFrame
print(df.head())


Error fetching data for 2010 apr: HTTPSConnectionPool(host='api.census.gov', port=443): Read timed out.
Error fetching data for 2010 jun: HTTPSConnectionPool(host='api.census.gov', port=443): Read timed out.
Error fetching data for 2010 oct: HTTPSConnectionPool(host='api.census.gov', port=443): Read timed out.
  PEMLR GTCBSA HUBUS PELAYFTO PELKDUR PECYC PEABSRSN state  Year Month
0     1      0     2       -1      -1     3       -1     1  2010   jan
1     3      0     2        1      -1    -1       -1     1  2010   jan
2     5      0     2       -1      -1     5       -1     1  2010   jan
3     4  22460     2       -1      60    -1       -1     1  2010   jan
4     4  22460     2       -1       3     5       -1     1  2010   jan


In [14]:
# Destination CSV. Defaults to the original location; set the CPS_OUTPUT_CSV
# environment variable to write elsewhere without editing the notebook.
file_path = os.environ.get(
    "CPS_OUTPUT_CSV",
    "/Users/cmuhoza@unomaha.edu/Documents/Semester_Project/ten.csv",
)

# Create the destination folder if it does not exist yet (no-op otherwise).
output_dir = os.path.dirname(file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the DataFrame to CSV, leaving out the positional index column
df.to_csv(file_path, index=False)
