# Data Cleaning & Wrangling

In [3]:
# Import libraries
import pandas as pd
import zipfile
import os
import geopandas as gpd
import chardet
import fiona

In [4]:
# Global Variables
# Root folders for raw inputs and cleaned outputs. Both are relative paths, so
# they resolve against the current working directory.
RAW_DATA_DIR = "../data/raw_data"
CLEAN_DATA_DIR = "../data/clean_data"

# --- Housing supply: zipped geodatabase, its extraction folder, cleaned CSV ---
HOUSING_SUPPLY_ZIP = os.path.join(
    RAW_DATA_DIR,
    "housing_supply/Residential_Construction_Permits_by_County.gdb.zip",
)
HOUSING_SUPPLY_DIR = os.path.join(
    RAW_DATA_DIR,
    "housing_supply/Residential_Construction_Permits_by_County",
)
HOUSING_SUPPLY_OUTPUT = os.path.join(
    CLEAN_DATA_DIR,
    "housing_supply/residential_construction_permits_2000_2022.csv",
)

# --- Demand: population (historical CSV folder and projected workbook) ---
POPULATION_HISTORICAL_DIR = os.path.join(
    RAW_DATA_DIR, "demand/population/historical"
)
POPULATION_HISTORICAL_OUTPUT = os.path.join(
    CLEAN_DATA_DIR, "demand/population/historical/msa_historical.csv"
)

POPULATION_PROJECTED_FILE = os.path.join(
    RAW_DATA_DIR,
    "demand/population/projected/estimates_all_msas_2020-2023.xlsx",
)
POPULATION_PROJECTED_OUTPUT = os.path.join(
    CLEAN_DATA_DIR, "demand/population/projected/msa_projected.csv"
)

# --- Demand: economic indicators (wages, unemployment rate) ---
WAGES_DIR = os.path.join(RAW_DATA_DIR, "demand/economic/wages")
WAGES_OUTPUT = os.path.join(
    CLEAN_DATA_DIR, "demand/economic/wages/msa_wages.csv"
)

UNEMPLOYMENT_DIR = os.path.join(
    RAW_DATA_DIR, "demand/economic/unemployment_rate"
)
UNEMPLOYMENT_OUTPUT = os.path.join(
    CLEAN_DATA_DIR, "demand/economic/unemployment_rate/msa_unemployment.csv"
)

# --- Demand: migration (folder in, folder out) ---
MIGRATION_DIR = os.path.join(RAW_DATA_DIR, "demand/migration")
MIGRATION_OUTPUT_DIR = os.path.join(CLEAN_DATA_DIR, "demand/migration")

# --- Affordability metrics: cost burden (folder in, folder out) ---
COST_BURDEN_DIR = os.path.join(
    RAW_DATA_DIR, "affordability_metrics/cost_burden"
)
COST_BURDEN_OUTPUT_DIR = os.path.join(
    CLEAN_DATA_DIR, "affordability_metrics/cost_burden"
)

# --- Affordability metrics: housing and rental prices (single files) ---
HOUSING_PRICES_FILE = os.path.join(
    RAW_DATA_DIR, "affordability_metrics/housing_costs/metro_housing_prices.csv"
)
HOUSING_PRICES_OUTPUT = os.path.join(
    CLEAN_DATA_DIR, "affordability_metrics/housing_costs/metro_housing_prices.csv"
)

RENTAL_PRICES_FILE = os.path.join(
    RAW_DATA_DIR, "affordability_metrics/housing_costs/metro_rental_prices.csv"
)
RENTAL_PRICES_OUTPUT = os.path.join(
    CLEAN_DATA_DIR, "affordability_metrics/housing_costs/metro_rental_prices.csv"
)

In [5]:
def unzip_file(zip_file_path, extraction_dir):
    """
    Extract a zip archive into a directory, unless that path already exists.

    Args:
        zip_file_path (str): Location of the archive to extract.
        extraction_dir (str): Destination directory for the archive contents.
    """
    # When the destination already exists nothing is extracted and the
    # archive is never opened, so repeated calls are cheap no-ops.
    if os.path.exists(extraction_dir):
        return

    with zipfile.ZipFile(zip_file_path, mode='r') as archive:
        archive.extractall(extraction_dir)

In [6]:
def read_geodatabase(gdb_path):
    """
    Load the first layer of a GeoDatabase.

    Args:
        gdb_path (str): Path to the GeoDatabase.

    Returns:
        gpd.GeoDataFrame: Contents of the first layer reported for the
        GeoDatabase.
    """
    # Only the first listed layer is read; any further layers are ignored.
    layer_names = fiona.listlayers(gdb_path)
    first_layer = layer_names[0]
    return gpd.read_file(gdb_path, layer=first_layer)

In [7]:
def clean_housing_supply(zip_file_path, extraction_dir, output_path):
    """
    Extract the housing supply GeoDatabase and export its yearly columns as CSV.

    Args:
        zip_file_path (str): Path to the zip archive holding the GeoDatabase.
        extraction_dir (str): Directory the archive is extracted into.
        output_path (str): Destination CSV path for the cleaned table.
    """
    unzip_file(zip_file_path, extraction_dir)

    # Use the first ".gdb" entry found in the extraction directory.
    gdb_candidates = []
    for entry in os.listdir(extraction_dir):
        if entry.endswith('.gdb'):
            gdb_candidates.append(os.path.join(extraction_dir, entry))
    gdf = read_geodatabase(gdb_candidates[0])

    # A column counts as a "year column" when its name contains any year
    # from 2000 through 2022 as a substring.
    year_columns = []
    for col in gdf.columns:
        if any(str(year) in col for year in range(2000, 2023)):
            year_columns.append(col)

    identifier_columns = ["GEOID", "NAME", "STATE_NAME"]
    gdf_subset = gdf[identifier_columns + year_columns]

    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)
    gdf_subset.to_csv(output_path, index=False)

In [8]:
def clean_population_historical(directory, csv_files, output_path):
    """
    Merge per-MSA historical population CSVs into one long table.

    Each input file must have a 'DATE' column. Its first other column is
    renamed to 'POPULATION', and an 'MSA' column is added from the file name
    with "msa_" and ".csv" removed. The merged rows are sorted by MSA, then
    DATE.

    Args:
        directory (str): Folder holding the raw CSV files.
        csv_files (list): Names of the CSV files inside ``directory``.
        output_path (str): Destination CSV path for the merged table.
    """
    frames = []
    for csv_name in csv_files:
        frame = pd.read_csv(os.path.join(directory, csv_name))
        frame['DATE'] = pd.to_datetime(frame['DATE'])

        # The first non-DATE column is treated as the population series.
        non_date_columns = [col for col in frame.columns if col != 'DATE']
        frame = frame.rename(columns={non_date_columns[0]: 'POPULATION'})

        frame['MSA'] = csv_name.replace("msa_", "").replace(".csv", "")
        frames.append(frame)

    merged = pd.concat(frames, ignore_index=True)
    merged = merged.sort_values(by=['MSA', 'DATE'])

    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)
    merged.to_csv(output_path, index=False)

In [9]:
def clean_combine_save(directory, csv_files, output_path):
    """
    Stack several CSV files row-wise and write the result to one CSV.

    The files are read as-is and concatenated in the order given. The
    output directory is created if it does not exist.

    Args:
        directory (str): Folder holding the raw CSV files.
        csv_files (list): Names of the CSV files inside ``directory``.
        output_path (str): Destination CSV path for the combined table.
    """
    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]
    stacked = pd.concat(frames, ignore_index=True)

    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)
    stacked.to_csv(output_path, index=False)

In [10]:
def clean_population_projected(file_path, output_path, max_rows=396):
    """
    Cleans projected population data from an Excel file.

    The first three rows of the sheet are skipped and the columns are renamed
    to a fixed six-name layout. Rows without a geographic area, the
    "United States" / ".In Metropolitan Statistical Area" summary rows and
    any row whose area contains "Division" are removed. Leading dots and the
    "Metro Area" suffix are stripped from the area names.

    Args:
        file_path (str): Path to the raw projected population file.
        output_path (str): Path to save the cleaned data.
        max_rows (int | None): Maximum number of cleaned rows to keep, counted
            from the top. Defaults to 396, the cap the pipeline has always
            used. NOTE(review): presumably the number of metro-area rows in
            the source sheet; confirm. Pass None to keep every row.

    Raises:
        ValueError: If the sheet does not have exactly six columns (raised by
            pandas when the column names are assigned).
    """
    df = pd.read_excel(file_path, skiprows=3)
    df.columns = [
        "Geographic Area",
        "Base Population (2020)",
        "Population 2020",
        "Population 2021",
        "Population 2022",
        "Population 2023",
    ]
    df = df.dropna(subset=["Geographic Area"])

    # Summary rows are matched BEFORE the leading dots are stripped, because
    # one of the labels to remove itself starts with a dot.
    irrelevant_rows = ["United States", ".In Metropolitan Statistical Area"]
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not raise pandas' chained-assignment warning.
    df = df[~df["Geographic Area"].isin(irrelevant_rows)].copy()
    df["Geographic Area"] = df["Geographic Area"].str.lstrip(".")

    df = df[~df["Geographic Area"].str.contains("Division", na=False)].copy()
    df["Geographic Area"] = (
        df["Geographic Area"].str.replace("Metro Area", "", regex=False).str.strip()
    )

    df = df.iloc[:max_rows].reset_index(drop=True)

    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

In [11]:
def clean_migration(input_directory, output_directory):
    """
    Clean every migration CSV in a folder and write it under the same name.

    For each ".csv" file: drop columns that are entirely empty, drop the
    'ucgid' and 'GEO_ID' columns when present, drop every column whose name
    ends with 'M', and strip "Metro Area" from the 'NAME' column when
    that column exists. Other files in the folder are ignored.

    Args:
        input_directory (str): Folder holding the raw migration CSV files.
        output_directory (str): Folder the cleaned CSV files are written to.
    """
    os.makedirs(output_directory, exist_ok=True)

    for file_name in os.listdir(input_directory):
        if not file_name.endswith('.csv'):
            continue

        table = pd.read_csv(os.path.join(input_directory, file_name))
        table = table.dropna(axis=1, how='all')
        table = table.drop(columns=['ucgid', 'GEO_ID'], errors='ignore')

        # NOTE(review): columns ending in 'M' look like margin-of-error
        # fields; confirm against the data dictionary.
        m_suffixed = [col for col in table.columns if col.endswith('M')]
        table = table.drop(columns=m_suffixed)

        if 'NAME' in table.columns:
            shortened = table['NAME'].str.replace('Metro Area', '', regex=False)
            table['NAME'] = shortened.str.strip()

        table.to_csv(os.path.join(output_directory, file_name), index=False)

In [12]:
def clean_cost_burden(input_dir, output_dir):
    """
    Clean every cost burden CSV in a folder and write it under the same name.

    For each ".csv" file the 'geoname' column is renamed to 'city' and the
    'sumlevel' column is dropped when present. Other files are ignored.

    Args:
        input_dir (str): Folder holding the raw cost burden CSV files.
        output_dir (str): Folder the cleaned CSV files are written to.
    """
    os.makedirs(output_dir, exist_ok=True)

    csv_names = [name for name in os.listdir(input_dir) if name.endswith('.csv')]
    for csv_name in csv_names:
        table = pd.read_csv(os.path.join(input_dir, csv_name))
        table = table.rename(columns={'geoname': 'city'})
        table = table.drop(columns=['sumlevel'], errors='ignore')
        destination = os.path.join(output_dir, csv_name)
        table.to_csv(destination, index=False)

In [13]:
def clean_housing_prices(file_path, output_path):
    """
    Cleans housing price data.

    The raw file is read as tab-delimited text using an encoding sniffed
    from its first 10000 bytes. The 'Region' column is renamed to 'city'
    and the text "metro area" is stripped from its values.

    Args:
        file_path (str): Path to the raw housing price data file.
        output_path (str): Path to save cleaned data.
    """
    # Detect encoding from a leading sample only, to avoid reading the whole
    # file twice.
    with open(file_path, 'rb') as f:
        raw_data = f.read(10000)
    detected_encoding = chardet.detect(raw_data)['encoding']

    # The sample may be pure ASCII even when later bytes are not, and
    # detection can also come back empty. ASCII is a strict subset of UTF-8,
    # so UTF-8 decodes every ASCII file identically while also accepting
    # non-ASCII bytes beyond the sample.
    if detected_encoding is None or detected_encoding.lower() == 'ascii':
        detected_encoding = 'utf-8'

    # Read the file using the detected encoding
    df = pd.read_csv(file_path, delimiter='\t', encoding=detected_encoding)

    # Rename and clean columns
    df = df.rename(columns={'Region': 'city'})
    df['city'] = df['city'].str.replace('metro area', '', regex=False).str.strip()

    # Save the cleaned data. dirname is "" for a bare filename, and
    # os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

In [14]:
def clean_rental_prices(input_path, output_path):
    """
    Cleans rental price data.

    Drops the 'RegionID' and 'SizeRank' columns when present and writes
    the remaining columns unchanged.

    Args:
        input_path (str): Path to the raw rental price file.
        output_path (str): Path to save cleaned data.
    """
    df = pd.read_csv(input_path)
    df = df.drop(columns=['RegionID', 'SizeRank'], errors='ignore')

    # Create the destination folder like the other cleaners do, so this
    # function does not depend on another step having created it first.
    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

In [15]:
def clean_combine_save_wage(directory, csv_files, output_path):
    """
    Combines wage data files into a single CSV.

    The files are read as-is and concatenated row-wise in the order given;
    no further cleaning is applied.

    Args:
        directory (str): Path to the raw wage data directory.
        csv_files (list): List of CSV files in the directory.
        output_path (str): Path to save the cleaned combined dataset.

    Raises:
        ValueError: If ``csv_files`` is empty (raised by ``pd.concat``).
    """
    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]
    combined_df = pd.concat(frames, ignore_index=True)

    # Create the destination folder so a fresh checkout without the
    # clean-data tree does not fail on write. dirname is "" for a bare
    # filename, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    combined_df.to_csv(output_path, index=False)

In [16]:
def clean_combine_save_unemployment(directory, csv_files, output_path):
    """
    Combines unemployment data files into a single CSV.

    The files are read as-is and concatenated row-wise in the order given;
    no further cleaning is applied.

    Args:
        directory (str): Path to raw unemployment data directory.
        csv_files (list): List of CSV files in the directory.
        output_path (str): Path to save the cleaned data.

    Raises:
        ValueError: If ``csv_files`` is empty (raised by ``pd.concat``).
    """
    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]
    combined_df = pd.concat(frames, ignore_index=True)

    # Create the destination folder so a fresh checkout without the
    # clean-data tree does not fail on write. dirname is "" for a bare
    # filename, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    combined_df.to_csv(output_path, index=False)

In [17]:
def _list_csv_files(directory):
    """Return the names of the ".csv" files in ``directory``, sorted by name."""
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the combined outputs the same from one run to the next.
    return sorted(f for f in os.listdir(directory) if f.endswith('.csv'))


if __name__ == "__main__":
    # Run every cleaning step using the path constants defined at the top of
    # the file, so each location is spelled out in exactly one place.

    # Migration dataset
    clean_migration(
        input_directory=MIGRATION_DIR,
        output_directory=MIGRATION_OUTPUT_DIR,
    )

    # Cost burden dataset
    clean_cost_burden(
        input_dir=COST_BURDEN_DIR,
        output_dir=COST_BURDEN_OUTPUT_DIR,
    )

    # Housing prices dataset
    clean_housing_prices(
        file_path=HOUSING_PRICES_FILE,
        output_path=HOUSING_PRICES_OUTPUT,
    )

    # Rental prices dataset
    clean_rental_prices(
        input_path=RENTAL_PRICES_FILE,
        output_path=RENTAL_PRICES_OUTPUT,
    )

    # Population historical dataset
    clean_population_historical(
        directory=POPULATION_HISTORICAL_DIR,
        csv_files=_list_csv_files(POPULATION_HISTORICAL_DIR),
        output_path=POPULATION_HISTORICAL_OUTPUT,
    )

    # Population projected dataset
    clean_population_projected(
        file_path=POPULATION_PROJECTED_FILE,
        output_path=POPULATION_PROJECTED_OUTPUT,
    )

    # Housing supply dataset
    clean_housing_supply(
        zip_file_path=HOUSING_SUPPLY_ZIP,
        extraction_dir=HOUSING_SUPPLY_DIR,
        output_path=HOUSING_SUPPLY_OUTPUT,
    )

    # Wages dataset
    clean_combine_save_wage(
        directory=WAGES_DIR,
        csv_files=_list_csv_files(WAGES_DIR),
        output_path=WAGES_OUTPUT,
    )

    # Unemployment dataset
    clean_combine_save_unemployment(
        directory=UNEMPLOYMENT_DIR,
        csv_files=_list_csv_files(UNEMPLOYMENT_DIR),
        output_path=UNEMPLOYMENT_OUTPUT,
    )

# [truncated notebook cell output] return ogr_read(
