### How to get raw data of all subject tables using API
* Note: The remote host may drop the connection partway through the download. If that happens, rerun the cell starting from the year and table where it stopped.

In [None]:
import requests
import pandas as pd
import time

# Download every ACS 5-year subject table listed in all_subject_tables.csv, for
# every year in `years`, at ZIP Code Tabulation Area level, and store each
# table/year pair as ../Raw_Data/ACS-<table>-<year>-Data.csv.

# Table IDs to download are taken from the 'Table' column of this file
df_tables = pd.read_csv('all_subject_tables.csv')

years = range(2013, 2022)

max_attempts = 5  # Maximum number of tries per table/year

# (connect, read) timeouts in seconds. Without a timeout a stalled connection
# would block the whole run indefinitely instead of being retried.
request_timeout = (10, 300)

# Transient network failures that are worth retrying. The remote host is known
# to drop connections mid-download, which can surface as any of these.
retryable_errors = (
    requests.exceptions.ChunkedEncodingError,
    requests.exceptions.ConnectionError,
    requests.exceptions.Timeout,
)

for year in years:
    for table in df_tables['Table']:
        # Both values are fixed for a given table/year, so build them once
        # rather than on every retry.
        url = f"https://api.census.gov/data/{year}/acs/acs5/subject?get=NAME,group({table})&for=zip%20code%20tabulation%20area:*"
        output_file = f"../Raw_Data/ACS-{table}-{year}-Data.csv"

        attempts = 0
        wait_seconds = 1  # Initial wait between retries; doubled after each failure
        success = False

        while not success and attempts < max_attempts:
            try:
                # Make the API request
                response = requests.get(url, timeout=request_timeout)

                if response.status_code == 200:
                    data_json = response.json()

                    # The first row holds the column names, the remaining rows the data
                    column_names = data_json[0]
                    data = data_json[1:]

                    df = pd.DataFrame(data, columns=column_names)

                    # Drop every column whose name contains 'EA' or 'MA'
                    # (presumably the annotation columns - TODO confirm)
                    columns_to_exclude = df.filter(regex='(EA|MA)').columns
                    df_cleaned = df.drop(columns=columns_to_exclude)

                    # Save the cleaned table as a CSV file
                    df_cleaned.to_csv(output_file, index=False)
                    success = True
                else:
                    print(f"Failed to retrieve data. Status code: {response.status_code}")
                    print(table)
                    break  # A non-200 answer is not retried

            except retryable_errors as e:
                print(f"Connection error for table {table}, year {year}: {e}, attempting retry {attempts + 1}")
                time.sleep(wait_seconds)
                attempts += 1
                wait_seconds *= 2  # Exponential backoff

        if not success:
            print(f"Failed to process table {table}, year {year} after {max_attempts} attempts.")


### Download the variable definitions (including each variable's 'type') for every year

In [None]:
import json

# Years whose variable definitions are fetched
years = range(2015, 2023)

for year in years:
    # variables.json endpoint of the ACS 5-year subject tables for this year
    variables_url = f"https://api.census.gov/data/{year}/acs/acs5/subject/variables.json"
    reply = requests.get(variables_url)

    # Any status other than 200 is reported and the year is skipped
    if reply.status_code != 200:
        print("Failed to fetch the webpage")
        continue

    # Parse the JSON body before opening the output file
    payload = reply.json()

    # Store the parsed content, pretty-printed, next to the other metadata
    out_path = f"../Metadata/{year}-type.json"
    with open(out_path, 'w') as fh:
        json.dump(payload, fh, indent=4)

### Adding variable types (float or int) to metadata

In [None]:
def extract_predicate_type(name_id, json_lookup):
    """Return the 'predicateType' stored for *name_id* in json_lookup['variables'].

    'Unknown' is returned when the 'variables' key, the variable itself, or
    its 'predicateType' entry is missing.
    """
    variable_info = json_lookup.get('variables', {}).get(name_id, {})
    return variable_info.get('predicateType', 'Unknown')


# Add type (float or int) to metadata so that different types don't merge together
years = range(2015, 2023)

for year in years:
    # Metadata table for this year; its 'name' column holds the variable IDs
    df = pd.read_csv(f'../Metadata/ACS-ST5Y{year}-Metadata.csv')

    # Variable definitions saved for the same year
    with open(f'../Metadata/{year}-type.json', 'r') as type_file:
        json_data = json.load(type_file)

    # Look up the type of every variable and write the result to a new file,
    # leaving the original metadata CSV untouched
    df['type'] = df['name'].apply(extract_predicate_type, args=(json_data,))
    df.to_csv(f'../Metadata/ACS-ST5Y{year}-Metadata-type.csv', index=False)