This notebook reshapes the metadata files (the Excel dictionaries that describe the survey variables). Every survey month comes with two metadata sheets that are used for decoding; the notebook extracts Sheet 1 and Sheet 2 for all years/months and saves them into new folders (NEW Metadata Sheet 1 CSV's and NEW Metadata Sheet 2 CSV's).

- It loads metadata files from the inventory.

- It extracts extended variable names and descriptions from Sheet 1 (columns E and F).

- It cleans empty rows and trims whitespace.

- It saves reshaped metadata into a dedicated folder (data/interim/NEW_metadata_sheet1/ for the local trial run).

- Batch process: Load raw metadata, reshape variables, save clean CSVs.

- Verification: Compare raw vs reshaped counts to ensure no variables or descriptions were lost.

**INTENT:**  Metadata often contains empty spaces and inconsistent formatting, which show up as NaN in analysis. By reshaping metadata first, we ensure that variable names and descriptions are clean and reliable.

#### Imports and Setup

In [1]:
import pandas as pd
import json
from pathlib import Path
import os

# Read the project settings from the shared config file
with Path("./data/interim/config.json").open() as f:
    cfg = json.load(f)

# Both locations come straight from the config and are wrapped as Path objects
BASE_PATH = Path(cfg["BASE_PATH"])
INTERIM_DIR = Path(cfg["INTERIM_DIR"])

print("Settings loaded.")

# Read the inventory produced earlier by 01_Inventory.ipynb
with (INTERIM_DIR / "inventory.json").open() as f:
    inventory = json.load(f)

print("Inventory loaded. Years available:", list(inventory.keys()))

# Make sure the local subfolder for reshaped Sheet 1 outputs exists
metadata_out = INTERIM_DIR / "NEW_metadata_sheet1"
metadata_out.mkdir(parents=True, exist_ok=True)

print("Reshaped metadata will be temporarily saved to:", metadata_out)


Settings loaded.
Inventory loaded. Years available: ['2018', '2019', '2022', '2023', '2024']
Reshaped metadata will be temporarily saved to: data\interim\NEW_metadata_sheet1


#### Load dataset (optimized)

In [2]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Read one file listed in the inventory and return it as a DataFrame.

    Args:
        year: inventory key for the year, e.g. "2018".
        month: inventory key for the month, e.g. "January".
        filetype: "survey" (read as CSV) or "metadata" (read as Excel).
        sheet_number: zero-based sheet index for Excel files
            (0 = Sheet 1, 1 = Sheet 2). When None, pandas' default
            sheet is read.

    Raises:
        ValueError: if the month has no inventory entry of the given filetype.
    """
    # Stop at the first inventory entry whose filetype matches
    for entry in inventory[year][month]:
        if entry["filetype"] == filetype:
            break
    else:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    source_path = BASE_PATH / year / entry["filename"]

    if filetype == "survey":
        return pd.read_csv(source_path, low_memory=False)

    # Anything that is not a survey file is read as an Excel workbook
    excel_options = {"engine": "openpyxl"}
    if sheet_number is not None:
        excel_options["sheet_name"] = sheet_number
    return pd.read_excel(source_path, **excel_options)


In [3]:
# Pull Sheet 1 (index 0) of the January 2018 metadata workbook
january_2018_metadata_sheet1 = load_dataset(
    "2018", "January", filetype="metadata", sheet_number=0
)

# Preview the top rows of the untouched sheet
print("=== January 2018 Metadata Sheet 1 (Raw) ===")
display(january_2018_metadata_sheet1.head())


=== January 2018 Metadata Sheet 1 (Raw) ===


Unnamed: 0,QUEST,Questionnaire,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,_IDS0,(Id Items),,
1,,,,,PUFREG,Region
2,,,,,PUFPRV,Province
3,,,,,PUFPRRCD,Province Recode
4,,,,,PUFHHNUM,Household Unique Sequential Number


### Sheet 1 Metadata Reshape Automation

#### Extract variables

In [4]:
def extract_variables(df):
    """
    Extract variable names and descriptions from metadata Sheet 1.

    Columns E and F (positional index 4 and 5) hold the variable codes and
    their descriptions. Rows whose variable cell is missing or blank are
    dropped, and surrounding whitespace is trimmed from both columns.

    A missing description stays missing (NaN) instead of being turned into
    the literal text 'nan', so it is written to CSV as an empty cell.

    Args:
        df: raw Sheet 1 DataFrame with at least six columns.

    Returns:
        DataFrame with columns ['Variable', 'Description'] and a fresh
        0..n-1 index.

    Raises:
        ValueError: if df has fewer than six columns.
    """
    if df.shape[1] < 6:
        raise ValueError(
            f"Expected at least 6 columns in metadata Sheet 1, got {df.shape[1]}"
        )

    df_vars = df.iloc[:, 4:6].copy()
    df_vars.columns = ['Variable', 'Description']

    # Drop rows without a usable variable code
    variable = df_vars['Variable']
    has_variable = variable.notna() & (variable.astype(str).str.strip() != '')
    df_vars = df_vars[has_variable].copy()

    # Clean whitespace
    df_vars['Variable'] = df_vars['Variable'].astype(str).str.strip()

    # Trim only the descriptions that exist; astype(str) on a missing cell
    # would otherwise produce the text 'nan'
    description = df_vars['Description'].astype(object)
    df_vars['Description'] = description.where(
        description.isna(), description.astype(str).str.strip()
    )

    return df_vars.reset_index(drop=True)


#### Batch process (year by year option)

In [5]:
def batch_process_sheet1_metadata(inventory, base_output_path, years=None):
    """
    Reshape Sheet 1 of every metadata workbook for the chosen years.

    Each reshaped table is written as Sheet1_<month>_<year>.csv under
    <base_output_path>/data/interim/NEW_metadata_sheet1/<year>/.
    A failure for one month is reported and counted, and the run continues
    with the next month. A summary is printed at the end.

    Args:
        inventory: mapping of year -> month -> list of file entries.
        base_output_path: folder under which data/interim/... is created.
        years: optional subset of years; every inventory year when omitted.
    """
    saved = failed = skipped = 0
    failures = []

    out_root = os.path.join(base_output_path, "data", "interim", "NEW_metadata_sheet1")
    os.makedirs(out_root, exist_ok=True)

    print("--- STARTING BATCH PROCESS ---")
    print(f"Target Directory: {out_root}")
    print("-" * 50)

    # Fall back to every inventory year when no subset was requested
    selected_years = inventory.keys() if not years else years

    for year in selected_years:
        year_dir = os.path.join(out_root, year)
        os.makedirs(year_dir, exist_ok=True)

        for month, entries in inventory[year].items():
            # The "Unmatched" bucket is not a real survey month
            if month == "Unmatched":
                continue

            # Months without any metadata entry are only counted as skipped
            metadata_found = False
            for entry in entries:
                if entry.get('filetype') == 'metadata':
                    metadata_found = True
                    break
            if not metadata_found:
                skipped += 1
                continue

            try:
                reshaped = extract_variables(load_dataset(year, month, "metadata", 0))

                csv_name = f"Sheet1_{month}_{year}.csv"
                reshaped.to_csv(os.path.join(year_dir, csv_name), index=False)

                print(f"[OK] Saved: {year}/{csv_name}")
                saved += 1

            except Exception as exc:
                # Best effort: record the problem and keep going
                print(f"[ERROR] Failed {month} {year}: {exc}")
                failures.append(f"{month} {year}: {str(exc)}")
                failed += 1

    print("\n=== PROCESSING SUMMARY REPORT ===")
    print(f"Total Successfully Saved: {saved}")
    print(f"Total Failed:             {failed}")
    print(f"Total Skipped (No File):  {skipped}")
    print("-" * 40)
    if failed:
        print("STATUS: COMPLETED WITH ERRORS")
        if failures:
            print("Error Details:")
            for detail in failures:
                print(f" - {detail}")
    else:
        print("STATUS: COMPLETE SUCCESS")


#### Verification

In [6]:
def batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_output_path, years=None):
    """
    Compare raw vs reshaped Sheet 1 metadata counts for assurance.

    For every month of the selected years (the "Unmatched" bucket is
    skipped) the raw workbook is loaded through load_dataset and the matching
    Sheet1_<month>_<year>.csv is read from
    <base_output_path>/data/interim/NEW_metadata_sheet1/<year>/.
    Filled cells (not missing, not blank after trimming) are counted for the
    variable and description columns on both sides. A month passes when both
    pairs of counts agree.

    Months whose raw sheet cannot be loaded or whose CSV is missing are
    reported with an [ERROR] line and left out of the result.

    Args:
        inventory: mapping of year -> month -> list of file entries.
        base_output_path: folder that contains data/interim/NEW_metadata_sheet1.
        years: optional subset of years; every inventory year when omitted.

    Returns:
        DataFrame sorted by Year and Month with the four counts and a
        PASS/FAIL 'Status' per month. The columns are present even when no
        month could be verified.
    """
    result_columns = [
        'Year', 'Month',
        'Raw Variable Count', 'Reshaped Variable Count',
        'Raw Description Count', 'Reshaped Description Count',
        'Status',
    ]

    def _count_filled(series):
        # Count cells that are neither missing nor blank after trimming
        cleaned = series.dropna().astype(str).str.strip()
        return int((cleaned != '').sum())

    results = []
    reshaped_folder = os.path.join(base_output_path, "data", "interim", "NEW_metadata_sheet1")

    years_to_process = years if years else inventory.keys()

    for year in years_to_process:
        for month, files_list in inventory[year].items():
            if month == "Unmatched":
                continue

            try:
                raw_df = load_dataset(year, month, "metadata", sheet_number=0)
            except Exception as e:
                print(f"[ERROR] {month} {year}: Could not load raw Sheet 1 ({e})")
                continue

            reshaped_file_path = os.path.join(reshaped_folder, year, f"Sheet1_{month}_{year}.csv")
            if not os.path.exists(reshaped_file_path):
                print(f"[ERROR] {month} {year}: Reshaped CSV missing!")
                continue

            # Treat only empty cells and the text 'nan' (what astype(str)
            # leaves behind for a missing cell) as missing, so the reshaped
            # side is counted the same way as the raw side and real texts
            # such as "NA" or "None" are not mistaken for gaps.
            reshaped_df = pd.read_csv(
                reshaped_file_path,
                dtype=str,
                keep_default_na=False,
                na_values=['', 'nan'],
            )

            raw_var_count = _count_filled(raw_df.iloc[:, 4])
            raw_desc_count = _count_filled(raw_df.iloc[:, 5])
            reshaped_var_count = _count_filled(reshaped_df['Variable'])
            reshaped_desc_count = _count_filled(reshaped_df['Description'])

            counts_match = (
                raw_var_count == reshaped_var_count
                and raw_desc_count == reshaped_desc_count
            )
            status = "PASS" if counts_match else "FAIL"
            if status == "FAIL":
                print(f"[MISMATCH] {month} {year} - Variables: {raw_var_count} vs {reshaped_var_count}, "
                      f"Descriptions: {raw_desc_count} vs {reshaped_desc_count}")

            results.append({
                'Year': year, 'Month': month,
                'Raw Variable Count': raw_var_count,
                'Reshaped Variable Count': reshaped_var_count,
                'Raw Description Count': raw_desc_count,
                'Reshaped Description Count': reshaped_desc_count,
                'Status': status
            })

    # Explicit columns keep sort_values from failing when nothing was verified
    df = pd.DataFrame(results, columns=result_columns)
    return df.sort_values(['Year', 'Month']).reset_index(drop=True)


#### Run batch and verify  (2018 trial)

In [7]:
# Trial run: only 2018, written under the local working directory
# (fast, avoids Drive write)
batch_process_sheet1_metadata(inventory, base_output_path=".", years=["2018"])

# Check the 2018 outputs against the raw sheets
verification_df = batch_verify_sheet1_variable_and_description_count_verbose(
    inventory, base_output_path=".", years=["2018"]
)

print("=== Verification Results (Local) ===")
display(verification_df)


--- STARTING BATCH PROCESS ---
Target Directory: .\data\interim\NEW_metadata_sheet1
--------------------------------------------------
[OK] Saved: 2018/Sheet1_April_2018.csv
[OK] Saved: 2018/Sheet1_July_2018.csv
[OK] Saved: 2018/Sheet1_January_2018.csv
[OK] Saved: 2018/Sheet1_October_2018.csv

=== PROCESSING SUMMARY REPORT ===
Total Successfully Saved: 4
Total Failed:             0
Total Skipped (No File):  0
----------------------------------------
STATUS: COMPLETE SUCCESS
=== Verification Results (Local) ===


Unnamed: 0,Year,Month,Raw Variable Count,Reshaped Variable Count,Raw Description Count,Reshaped Description Count,Status
0,2018,April,50,50,50,50,PASS
1,2018,January,50,50,50,50,PASS
2,2018,July,51,51,51,51,PASS
3,2018,October,51,51,51,51,PASS


In [8]:
# Read back the reshaped January 2018 Sheet 1 CSV written by the trial run
reshaped_jan_2018_path = INTERIM_DIR / "NEW_metadata_sheet1" / "2018" / "Sheet1_January_2018.csv"
reshaped_jan_2018 = pd.read_csv(reshaped_jan_2018_path)

print("=== January 2018 Metadata Sheet 1 (Reshaped) ===")
display(reshaped_jan_2018.head(10))


=== January 2018 Metadata Sheet 1 (Reshaped) ===


Unnamed: 0,Variable,Description
0,PUFREG,Region
1,PUFPRV,Province
2,PUFPRRCD,Province Recode
3,PUFHHNUM,Household Unique Sequential Number
4,PUFURB2K10,2010Urban-RuralFIES
5,PUFPWGTPRV,Final Weight Based on Projection (provincial p...
6,PUFSVYMO,Survey Month
7,PUFSVYYR,Survey Year
8,PUFPSU,Psu Number
9,PUFRPL,Replicate


#### Batch Automation (Redirect to Drive/Base_path as new folder)

In [9]:
def batch_process_sheet1_metadata_to_drive(inventory, base_output_path, years=None):
    """
    Process metadata Sheet 1 for selected years with a progress report.

    Reshaped CSVs are written as Sheet1_<month>_<year>.csv into
    <base_output_path>/NEW Metadata Sheet 1 CSV's/<year>/ (the notebook
    passes the Drive-backed BASE_PATH as base_output_path). A failure for one
    month is reported and counted, and the run continues with the next month.

    Args:
        inventory: mapping of year -> month -> list of file entries.
        base_output_path: folder in which the output folder is created.
        years: optional subset of years; every inventory year when omitted.
    """
    success_count, failure_count, skipped_count = 0, 0, 0
    errors_log = []

    main_folder_name = "NEW Metadata Sheet 1 CSV's"
    main_folder_path = os.path.join(base_output_path, main_folder_name)
    os.makedirs(main_folder_path, exist_ok=True)

    # Materialise once: the years are walked twice (counting, then processing)
    years_to_process = list(years) if years else list(inventory.keys())

    def _has_metadata(files_list):
        # True when the month lists at least one metadata file
        return any(f.get('filetype') == 'metadata' for f in files_list)

    # Count total tasks for progress tracking. "Unmatched" is excluded here
    # because the processing loop skips it too; otherwise the denominator of
    # the [i/total] progress lines would be too large.
    total_tasks = sum(
        1
        for year in years_to_process
        for month, files_list in inventory[year].items()
        if month != "Unmatched" and _has_metadata(files_list)
    )
    current_task = 0

    print("--- STARTING BATCH PROCESS TO DRIVE ---")
    print(f"Target Directory: {main_folder_path}")
    print("-" * 50)

    for year in years_to_process:
        year_folder_path = os.path.join(main_folder_path, year)
        os.makedirs(year_folder_path, exist_ok=True)

        for month, files_list in inventory[year].items():
            if month == "Unmatched":
                continue

            if not _has_metadata(files_list):
                skipped_count += 1
                continue

            current_task += 1
            try:
                raw_df = load_dataset(year, month, "metadata", 0)
                clean_df = extract_variables(raw_df)

                filename = f"Sheet1_{month}_{year}.csv"
                full_save_path = os.path.join(year_folder_path, filename)
                clean_df.to_csv(full_save_path, index=False)

                print(f"[{current_task}/{total_tasks}] [OK] Saved: {year}/{filename}")
                success_count += 1

            except Exception as e:
                # Best effort: record the problem and keep going
                print(f"[{current_task}/{total_tasks}] [ERROR] Failed {month} {year}: {e}")
                errors_log.append(f"{month} {year}: {str(e)}")
                failure_count += 1

    print("\n=== PROCESSING SUMMARY REPORT ===")
    print(f"Total Successfully Saved: {success_count}")
    print(f"Total Failed:             {failure_count}")
    print(f"Total Skipped (No File):  {skipped_count}")
    print("-" * 40)
    if failure_count == 0:
        print("STATUS: COMPLETE SUCCESS")
        print(f"All files are now located in: {main_folder_path}")
    else:
        print("STATUS: COMPLETED WITH ERRORS")
        if errors_log:
            print("Error Details:")
            for err in errors_log:
                print(f" - {err}")


In [10]:
# Process every inventory year and write the CSVs under BASE_PATH (Drive)
batch_process_sheet1_metadata_to_drive(inventory, base_output_path=str(BASE_PATH))

--- STARTING BATCH PROCESS TO DRIVE ---
Target Directory: G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\NEW Metadata Sheet 1 CSV's
--------------------------------------------------
[1/40] [OK] Saved: 2018/Sheet1_April_2018.csv
[2/40] [OK] Saved: 2018/Sheet1_July_2018.csv
[3/40] [OK] Saved: 2018/Sheet1_January_2018.csv
[4/40] [OK] Saved: 2018/Sheet1_October_2018.csv
[5/40] [OK] Saved: 2019/Sheet1_April_2019.csv
[6/40] [OK] Saved: 2019/Sheet1_July_2019.csv
[7/40] [OK] Saved: 2019/Sheet1_October_2019.csv
[8/40] [OK] Saved: 2019/Sheet1_January_2019.csv
[9/40] [OK] Saved: 2022/Sheet1_July_2022.csv
[10/40] [OK] Saved: 2022/Sheet1_June_2022.csv
[11/40] [OK] Saved: 2022/Sheet1_April_2022.csv
[12/40] [OK] Saved: 2022/Sheet1_August_2022.csv
[13/40] [OK] Saved: 2022/Sheet1_December_2022.csv
[14/40] [OK] Saved: 2022/Sheet1_February_2022.csv
[15/40] [OK] Saved: 2022/Sheet1_January_2022.csv
[16/40] [OK] Saved: 2022/Sheet1_March_2022.csv
[17/40] [OK] Saved: 2022/She

### Sheet 2 Metadata Reshape Automation

In [11]:
import os
import pandas as pd

def batch_process_sheet2_metadata(inventory, base_output_path, years=None):
    """
    Loop through the inventory to process 'Sheet 2' (Value Codes).

    Sheet 2 (sheet index 1) of each metadata workbook is loaded, every
    non-missing cell is converted to text and trimmed, and the result is
    written as Sheet2_<month>_<year>.csv into
    <base_output_path>/NEW Metadata Sheet 2 CSV's/<year>/.
    A failure for one month is reported and counted, and the run continues
    with the next month.

    Args:
        inventory: mapping of year -> month -> list of file entries.
        base_output_path: folder in which the output folder is created.
        years: optional subset of years; every inventory year when omitted.
    """

    success_count, failure_count, skipped_count = 0, 0, 0
    errors_log = []

    # 1. Define Main Folder Name (NEW folder for Sheet 2 outputs)
    main_folder_name = "NEW Metadata Sheet 2 CSV's"
    main_folder_path = os.path.join(base_output_path, main_folder_name)
    os.makedirs(main_folder_path, exist_ok=True)

    # Materialise once: the years are walked twice (counting, then processing)
    years_to_process = list(years) if years else list(inventory.keys())

    def _has_metadata(files_list):
        # True when the month lists at least one metadata file
        return any(f.get('filetype') == 'metadata' for f in files_list)

    def _strip_cell(value):
        # Missing cells stay missing; everything else becomes trimmed text
        return str(value).strip() if pd.notna(value) else value

    # DataFrame.map replaced DataFrame.applymap in pandas 2.1; use whichever
    # this pandas version provides so no deprecation warning is raised.
    elementwise_name = "map" if hasattr(pd.DataFrame, "map") else "applymap"

    # Count total tasks for progress tracking. "Unmatched" is excluded here
    # because the processing loop skips it too; otherwise the denominator of
    # the [i/total] progress lines would be too large.
    total_tasks = sum(
        1
        for year in years_to_process
        for month, files_list in inventory[year].items()
        if month != "Unmatched" and _has_metadata(files_list)
    )
    current_task = 0

    print("--- STARTING BATCH PROCESS (SHEET 2) ---")
    print(f"Target Directory: {main_folder_path}")
    print("-" * 50)

    # 2. Iterate through Inventory
    for year in years_to_process:
        year_folder_path = os.path.join(main_folder_path, year)
        os.makedirs(year_folder_path, exist_ok=True)

        for month, files_list in inventory[year].items():
            if month == "Unmatched":
                continue

            if not _has_metadata(files_list):
                skipped_count += 1
                continue

            current_task += 1
            try:
                # A. Load raw Sheet 2 (Value Codes)
                raw_df = load_dataset(year, month, "metadata", sheet_number=1)

                # B. Clean whitespace (optional, but intentional)
                raw_df = getattr(raw_df, elementwise_name)(_strip_cell)

                # C. Generate Filename
                filename = f"Sheet2_{month}_{year}.csv"
                full_save_path = os.path.join(year_folder_path, filename)

                # D. Save
                raw_df.to_csv(full_save_path, index=False)

                print(f"[{current_task}/{total_tasks}] [OK] Saved: {year}/{filename}")
                success_count += 1

            except Exception as e:
                # Best effort: record the problem and keep going
                print(f"[{current_task}/{total_tasks}] [ERROR] Failed {month} {year}: {e}")
                errors_log.append(f"{month} {year}: {str(e)}")
                failure_count += 1

    # 3. Final Report
    print("\n" + "="*40)
    print("      SHEET 2 PROCESSING SUMMARY")
    print("="*40)
    print(f"Total Saved:    {success_count}")
    print(f"Total Failed:   {failure_count}")
    print(f"Total Skipped:  {skipped_count}")
    print("-" * 40)

    if failure_count == 0:
        print("STATUS: COMPLETE SUCCESS")
        print(f"Files are syncing to: {main_folder_path}")
    else:
        print("STATUS: COMPLETED WITH ERRORS")
        for err in errors_log:
            print(f" - {err}")
    print("="*40)
