# Data Collection

* The code below retrieves migration data from the Census Bureau and cost-burden data (percentage of income spent on housing) from HUD via their APIs.
* For all other downloaded data, see the [raw data directory](OneDrive/general_assembly_dsb/project_capstone/data/raw_data) and the [data_info directory](OneDrive/general_assembly_dsb/project_capstone/data/data_info).

In [2]:
# Import necessary libraries
import pandas as pd
import requests
import getpass
import json
import os
import time

## Migration Data from Census Bureau

In [4]:
# Function to retrieve the API key
def get_census_api_key(file_path='census_api_key.json'):
    """
    Return the Census Bureau API key, loading it from disk or prompting for it.

    The key is read from the ``census_api_key`` field of the JSON file at
    ``file_path``. If that file is missing, cannot be parsed as JSON, or does
    not contain a non-empty key, the user is prompted with ``getpass`` (the
    input is not echoed) and the entered key is written to ``file_path`` as
    plain-text JSON for future runs.

    Args:
        file_path (str): Location of the JSON key file. Defaults to
            'census_api_key.json' in the current working directory.

    Returns:
        str: The API key retrieved from the JSON file or input by the user.
    """
    api_key = None

    # Try the cached key first.
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as file:
                data = json.load(file)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError: treat an
            # unreadable key file the same as a missing one.
            data = None
        if isinstance(data, dict):
            api_key = data.get('census_api_key')

    if api_key:
        print("Census Bureau API Key loaded from file.")
        return api_key

    # No usable key on disk (file missing, corrupt, or key empty): ask the
    # user instead of silently returning None, then cache the answer.
    api_key = getpass.getpass("Enter your Census Bureau API Key: ")

    with open(file_path, 'w') as file:
        json.dump({'census_api_key': api_key}, file)
    print("API Key saved to file.")

    return api_key

In [6]:
# Function to fetch migration data
def fetch_migration(api_key, year, timeout=30):
    """
    Fetches migration data for the specified year from the Census API.

    Requests table group B07201 from the ACS 1-year endpoint for ``year``.
    Every failure mode (network error, non-200 status, undecodable body,
    empty payload) is reported with ``print`` and yields an empty DataFrame,
    so a caller looping over years can simply skip that year.

    Args:
        api_key (str): The API key for authentication.
        year (int): The year of the migration data to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        pd.DataFrame: A DataFrame containing the migration data for the given
        year (first row of the response is used as the header), or an empty
        DataFrame if nothing could be retrieved.
    """
    # Construct the URL
    base_url = f"https://api.census.gov/data/{year}/acs/acs1"
    params = {
        "get": "group(B07201)",  # Requesting the specific group of migration data
        "ucgid": "pseudo(0100000US$3100000)",  # Pseudo-geography used to request metro-level data
        "key": api_key  # API key for authentication
    }

    # Make the request; a connection problem for one year must not abort
    # the whole multi-year download.
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to fetch data for {year}. Request error: {exc}")
        return pd.DataFrame()

    if response.status_code != 200:
        print(f"Failed to fetch data for {year}. Status Code: {response.status_code}")
        print(f"Response Content:\n{response.text}")
        return pd.DataFrame()

    try:
        data = response.json()
    except ValueError:
        # Every JSON decode error raised by requests/json/simplejson is a
        # ValueError subclass, so this is the portable catch.
        print(f"Error decoding JSON for {year}. Response:\n{response.text}")
        return pd.DataFrame()

    if not data:
        print(f"No data available for {year}.")
        return pd.DataFrame()

    # Convert JSON to DataFrame
    columns = data[0]  # Header row
    values = data[1:]  # Data rows
    return pd.DataFrame(values, columns=columns)

In [8]:
# Function to save DataFrame to CSV
def save_census_df_to_csv(dataframe, year, directory="../data/raw_data/demand/migration"):
    """
    Write one year of Census data to ``<directory>/inflow-outflow_<year>.csv``.

    The columns are written in the reverse of their order in ``dataframe``;
    the caller's DataFrame itself is left untouched. The row index is not
    written.

    Args:
        dataframe (pd.DataFrame): The data to write.
        year (int): Year the data belongs to; used in the file name.
        directory (str): Output directory, created if it does not exist yet.
    """
    out_path = os.path.join(directory, f"inflow-outflow_{year}.csv")

    # Create the output directory tree on first use.
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Select the columns back-to-front and write the result straight to disk.
    flipped_columns = dataframe.columns[::-1]
    dataframe[flipped_columns].to_csv(out_path, index=False)

    print(f"Data for {year} saved to {out_path}.")

In [10]:
# Main function to fetch and save migration data for all years
def census_main(start_year=2005, end_year=2023, skip_years=(2020,)):
    """
    Fetch and save Census migration data for every year in a range.

    For each year from ``start_year`` to ``end_year`` (inclusive) the data is
    fetched with ``fetch_migration`` and, when non-empty, written with
    ``save_census_df_to_csv``. Years listed in ``skip_years`` are not
    requested at all.

    Args:
        start_year (int): First year to download. Defaults to 2005.
        end_year (int): Last year to download (inclusive). Defaults to 2023.
        skip_years (iterable of int): Years to skip because no data was
            published for them. Defaults to ``(2020,)``.
    """
    # Retrieve the API key
    api_key = get_census_api_key()

    # Loop through each year and process the data
    for year in range(start_year, end_year + 1):
        if year in skip_years:
            print(f"Skipping year {year} (no data published).")
            continue

        print(f"Fetching data for year {year}...")
        df = fetch_migration(api_key, year)

        if df.empty:
            print(f"No data returned for {year}, skipping save.")
            continue

        save_census_df_to_csv(df, year)

In [12]:
# Run the Census migration download only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    census_main()

Enter your Census Bureau API Key:  ········


API Key saved to file.
Fetching data for year 2005...
Data for 2005 saved to ../data/raw_data/demand/migration\inflow-outflow_2005.csv.
Fetching data for year 2006...
Data for 2006 saved to ../data/raw_data/demand/migration\inflow-outflow_2006.csv.
Fetching data for year 2007...
Data for 2007 saved to ../data/raw_data/demand/migration\inflow-outflow_2007.csv.
Fetching data for year 2008...
Data for 2008 saved to ../data/raw_data/demand/migration\inflow-outflow_2008.csv.
Fetching data for year 2009...
Data for 2009 saved to ../data/raw_data/demand/migration\inflow-outflow_2009.csv.
Fetching data for year 2010...
Data for 2010 saved to ../data/raw_data/demand/migration\inflow-outflow_2010.csv.
Fetching data for year 2011...
Data for 2011 saved to ../data/raw_data/demand/migration\inflow-outflow_2011.csv.
Fetching data for year 2012...
Data for 2012 saved to ../data/raw_data/demand/migration\inflow-outflow_2012.csv.
Fetching data for year 2013...
Data for 2013 saved to ../data/raw_data/de

## Cost Burden Data from HUD's Office of Policy Development & Research

In [24]:
# Function to retrieve the HUD API key
def get_hud_api_key(file_path='hud_api_key.json'):
    """
    Return the HUD API key, loading it from disk or prompting for it.

    The key is read from the ``hud_api_key`` field of the JSON file at
    ``file_path``. If that file is missing, cannot be parsed as JSON, or does
    not contain a non-empty key, the user is prompted with ``getpass`` (the
    input is not echoed) and the entered key is written to ``file_path`` as
    plain-text JSON for future runs.

    Args:
        file_path (str): Location of the JSON key file. Defaults to
            'hud_api_key.json' in the current working directory.

    Returns:
        str: The API key retrieved from the JSON file or input by the user.
    """
    api_key = None

    # Try the cached key first.
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as file:
                data = json.load(file)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError: treat an
            # unreadable key file the same as a missing one.
            data = None
        if isinstance(data, dict):
            api_key = data.get('hud_api_key')

    if api_key:
        print("HUD API Key loaded from file.")
        return api_key

    # No usable key on disk (file missing, corrupt, or key empty): ask the
    # user instead of silently returning None, then cache the answer.
    api_key = getpass.getpass("Enter your HUD API Key: ")
    with open(file_path, 'w') as file:
        json.dump({'hud_api_key': api_key}, file)
    print("HUD API Key saved to file.")
    return api_key

In [26]:
# Function to fetch cost burden data
def fetch_cost_burden(api_key, year_range, state_id, entity_id, timeout=30, max_retries=None):
    """
    Fetch cost burden (CHAS) data for one location and year range from HUD.

    Sends a GET request to the HUD User CHAS endpoint with ``type=5`` and a
    bearer-token header. A 429 (rate limit) response is retried after a
    pause; every other failure is reported with ``print`` and yields ``None``
    so the caller can move on to the next location.

    Args:
        api_key (str): HUD API token, sent as ``Authorization: Bearer ...``.
        year_range (str): Year range to request (e.g., '2006-2010').
        state_id (str): Value sent as the ``stateId`` query parameter.
        entity_id (str): Value sent as the ``entityId`` query parameter.
        timeout (float): Seconds to wait for the server on each attempt.
            Without a timeout ``requests`` may block indefinitely.
        max_retries (int or None): Maximum number of retries after a 429
            response. ``None`` (the default) retries without limit.

    Returns:
        pd.DataFrame or None: The response payload as a DataFrame, or
        ``None`` if the request failed or returned no data.
    """
    base_url = 'https://www.huduser.gov/hudapi/public/chas'
    params = {
        'type': 5,
        'year': year_range,
        'stateId': state_id,
        'entityId': entity_id
    }
    headers = {'Authorization': f'Bearer {api_key}'}

    retries = 0
    while True:
        # A connection problem for one location must not abort the whole run.
        try:
            response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            print(f"Request failed for year range {year_range}, State ID {state_id}, Entity ID {entity_id}: {exc}")
            return None

        if response.status_code == 429:
            if max_retries is not None and retries >= max_retries:
                print(f"Rate limit still exceeded after {retries} retries. Giving up.")
                return None
            retries += 1
            print("Rate limit exceeded. Waiting 60 seconds...")
            time.sleep(60)  # Wait for 60 seconds if rate limit exceeded
            # Short extra pause before re-sending. This is the only path that
            # loops; every other outcome below returns immediately.
            time.sleep(3)
            continue

        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code}.")
            print(f"Response content: {response.text}")
            return None

        try:
            data = response.json()
        except ValueError:
            # Every JSON decode error raised by requests/json/simplejson is a
            # ValueError subclass, so this is the portable catch.
            print(f"Error decoding JSON response for year range {year_range}.")
            return None

        if not data:
            print(f"No data available for year range {year_range}, State ID {state_id}, Entity ID {entity_id}.")
            return None
        return pd.DataFrame(data)

In [28]:
# Function to save all locations for each year range into one CSV
def save_hud_df_to_csv(all_dataframes, year_range, directory="../data/raw_data/affordability_metrics/cost_burden"):
    """
    Saves all data for a given year range into a single CSV file.

    The DataFrames are concatenated row-wise with a fresh 0..n-1 index and
    written to ``<directory>/affordability_<year_range>.csv`` without the
    index column. If ``all_dataframes`` is empty nothing is written.

    Args:
        all_dataframes (list of pd.DataFrame): A list of DataFrames to combine and save.
        year_range (str): The year range of the data (e.g., '2006-2010').
        directory (str): The directory where the CSV file will be saved.
    """
    # pd.concat raises ValueError on an empty list; report and skip instead.
    if not all_dataframes:
        print(f"No data to save for Year Range {year_range}. Skipping save.")
        return

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    # Combine all DataFrames into one
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    # Save the combined DataFrame to a CSV
    file_path = os.path.join(directory, f"affordability_{year_range}.csv")
    combined_df.to_csv(file_path, index=False)
    print(f"Data for Year Range {year_range} saved to {file_path}.")

In [30]:
def hud_main():
    """
    Download HUD cost burden data for every configured location and period.

    For each five-year window from '2006-2010' through '2017-2021', every
    (state ID, entity ID) pair below is requested with ``fetch_cost_burden``.
    All successful results for a window are written together with
    ``save_hud_df_to_csv``; a window with no results is skipped.
    """
    api_key = get_hud_api_key()

    # Rolling five-year windows: '2006-2010', '2007-2011', ..., '2017-2021'.
    year_ranges = [f"{first}-{first + 4}" for first in range(2006, 2018)]

    # State ID -> entity IDs to request within that state.
    locations = {
        "36": ["51000"],  # New York
        "34": ["10000", "36000", "51000"],  # New Jersey
        "42": ["60000"],  # Pennsylvania
        "6": ["44000", "43000", "2000"],  # California
        "17": ["14000", "51622", "23074"],  # Illinois
        "48": ["19000", "27000", "4000", "35000", "72656", "70808"],  # Texas
        "11": ["50000"],  # District of Columbia
        "51": ["3000", "1000"],  # Virginia
        "12": ["45000", "24000", "76600"],  # Florida
        "10": ["77580"],  # Delaware
        "13": ["4000", "68516", "49756"],  # Georgia
        "25": ["7000", "11000"],  # Massachusetts
        "33": ["50260"],  # New Hampshire
    }

    # Flatten once so each window walks the same (state, entity) sequence.
    targets = [
        (state_id, entity_id)
        for state_id, entity_ids in locations.items()
        for entity_id in entity_ids
    ]

    for year_range in year_ranges:
        collected = []
        for state_id, entity_id in targets:
            label = f"Year Range {year_range}, State ID {state_id}, Entity ID {entity_id}"
            print(f"Fetching data for {label}...")
            result = fetch_cost_burden(api_key, year_range, state_id, entity_id)
            if result is None:
                print(f"No data for {label}.")
                continue
            print(f"Data fetched for {label}.")
            collected.append(result)

        if not collected:
            print(f"No data collected for Year Range {year_range}. Skipping save.")
            continue

        print(f"Saving data for Year Range {year_range}...")
        save_hud_df_to_csv(collected, year_range)

In [32]:
# Run the HUD cost burden download only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    hud_main()

Enter your HUD API Key:  ········


HUD API Key saved to file.
Fetching data for Year Range 2006-2010, State ID 36, Entity ID 51000...
Data fetched for Year Range 2006-2010, State ID 36, Entity ID 51000.
Fetching data for Year Range 2006-2010, State ID 34, Entity ID 10000...
Data fetched for Year Range 2006-2010, State ID 34, Entity ID 10000.
Fetching data for Year Range 2006-2010, State ID 34, Entity ID 36000...
Data fetched for Year Range 2006-2010, State ID 34, Entity ID 36000.
Fetching data for Year Range 2006-2010, State ID 34, Entity ID 51000...
Data fetched for Year Range 2006-2010, State ID 34, Entity ID 51000.
Fetching data for Year Range 2006-2010, State ID 42, Entity ID 60000...
Data fetched for Year Range 2006-2010, State ID 42, Entity ID 60000.
Fetching data for Year Range 2006-2010, State ID 6, Entity ID 44000...
Data fetched for Year Range 2006-2010, State ID 6, Entity ID 44000.
Fetching data for Year Range 2006-2010, State ID 6, Entity ID 43000...
Data fetched for Year Range 2006-2010, State ID 6, Entity 