
This notebook decodes the column headers of survey datasets using metadata Sheet 1 and Sheet 2 definitions.  
It renames raw variable codes to human-readable descriptions so that headers are consistent across all survey files.

#### Dependencies
- Requires `00_Settings.ipynb` to be executed first (defines `inventory`, `base_path`, and global settings).
- Requires reshaped metadata outputs from `01_Metadata_Sheet1_Reshaper.ipynb`.
#### Outputs
- Decoded survey CSVs saved into the folder: **NEW Header Encoded Surveys**
- A translation report printed for each survey (coverage and any untranslated codes).
#### Notes
- This notebook is functional and does not require reruns once executed.
- Each notebook in the pipeline retains its function and respects assigned variables consistently.


In [None]:
import json
from pathlib import Path
import os
import pandas as pd

# ------------------------------------------------------------
# Read pipeline settings from config.json (produced by 00_Settings.ipynb)
# ------------------------------------------------------------
with Path("./data/interim/config.json").open() as f:
    cfg = json.load(f)

# Directory settings are wrapped in Path objects; MONTH_ORDER is kept as loaded.
BASE_PATH = Path(cfg["BASE_PATH"])
INTERIM_DIR = Path(cfg["INTERIM_DIR"])
PROCESSED_DIR = Path(cfg["PROCESSED_DIR"])
LOG_DIR = Path(cfg["LOG_DIR"])
MONTH_ORDER = cfg["MONTH_ORDER"]

# ------------------------------------------------------------
# Read the file inventory (produced by 01_Inventory.ipynb)
# ------------------------------------------------------------
with (INTERIM_DIR / "inventory.json").open() as f:
    inventory = json.load(f)

# String form of BASE_PATH, kept as an alias for the loader functions
# in this notebook, which build paths with os.path.join(base_path, ...).
base_path = str(BASE_PATH)


### Loader Function

In [None]:
import pandas as pd
import os

def load_dataset(year, month, filetype="survey"):
    """
    Find a dataset file in the global inventory and read it into a DataFrame.

    Uses the module-level 'inventory' mapping and the 'base_path' string.
    The first inventory entry for the given year/month whose 'filetype'
    equals `filetype` is read from <base_path>/<year>/<filename>: with
    pandas.read_csv when `filetype` is "survey", with pandas.read_excel
    for any other filetype.

    Raises:
        ValueError: the year/month pair is not present in the inventory.
        FileNotFoundError: no entry of the requested filetype is listed.
    """
    if not (year in inventory and month in inventory[year]):
        raise ValueError(f"No records found in inventory for {month} {year}.")

    # Take the first entry of the requested type, if there is one.
    match = None
    for entry in inventory[year][month]:
        if entry['filetype'] == filetype:
            match = entry
            break
    if not match:
        raise FileNotFoundError(f"No {filetype} file found for {month} {year}.")

    file_path = os.path.join(base_path, year, match['filename'])
    if filetype == "survey":
        return pd.read_csv(file_path, low_memory=False)
    return pd.read_excel(file_path)


def load_clean_sheet1(year, month):
    """
    Read the processed Sheet 1 variable definitions for one survey month.

    The file is expected at
    <base_path>/Metadata Sheet 1 CSV's/<year>/Sheet1_<month>_<year>.csv,
    where 'base_path' is the module-level base directory string.

    Raises:
        FileNotFoundError: the expected CSV does not exist.
    """
    metadata_dir = os.path.join(base_path, "Metadata Sheet 1 CSV's", year)
    file_path = os.path.join(metadata_dir, f"Sheet1_{month}_{year}.csv")

    if os.path.exists(file_path):
        return pd.read_csv(file_path)

    raise FileNotFoundError(f"Processed metadata file not found at {file_path}")


### Header translation

In [None]:
def apply_metadata_headers(survey_df, metadata_sheet1_df, year="Unknown", month="Survey"):
    """
    Rename raw survey columns using metadata definitions.

    Builds a mapping from the metadata 'Variable' column to its 'Description'
    column (both stripped of surrounding whitespace) and applies it to the
    survey's column names. A translation report showing coverage and any
    untranslated codes is printed.

    Args:
        survey_df: DataFrame whose column names are raw variable codes.
        metadata_sheet1_df: DataFrame with 'Variable' and 'Description'
            columns. It is not modified.
        year: Label used only in the printed report header.
        month: Label used only in the printed report header (upper-cased).

    Returns:
        A new DataFrame with translated column names; columns without a
        usable definition keep their raw code.
    """
    # Work on derived Series rather than assigning back into the caller's
    # frame, so the metadata passed in is left untouched.
    # Rows with a missing Variable or Description are dropped first:
    # astype(str) would otherwise turn them into the literal text "nan"/"None"
    # and the affected columns would be renamed to that text.
    definitions = metadata_sheet1_df[['Variable', 'Description']].dropna()
    variables = definitions['Variable'].astype(str).str.strip()
    descriptions = definitions['Description'].astype(str).str.strip()

    # Blank codes/descriptions are not usable translations either.
    # For duplicate Variable entries the last definition wins.
    header_map = {
        variable: description
        for variable, description in zip(variables, descriptions)
        if variable and description
    }

    original_cols = set(survey_df.columns)
    translated_cols = original_cols.intersection(header_map.keys())
    untranslated_cols = original_cols - header_map.keys()

    renamed_df = survey_df.rename(columns=header_map)

    print("\n" + "="*60)
    print(f"METADATA TRANSLATION REPORT: {month.upper()} {year}")
    print("="*60)
    print(f"Total Columns Detected:       {len(original_cols)}")
    print(f"Successfully Decoded:         {len(translated_cols)}")
    print(f"Remaining as Raw Codes:       {len(untranslated_cols)}")
    print("-" * 60)

    if not untranslated_cols:
        print("Status: SUCCESS (100% Metadata Coverage)")
    else:
        print("Status: PARTIAL SUCCESS")
        print("Untranslated Codes:", sorted(untranslated_cols))

    print("="*60 + "\n")
    return renamed_df


### Batch Automation

In [None]:
def _decode_and_save_survey(year, month, files_list, year_output_path):
    """
    Decode one month's survey and save it; return "success", "skipped" or "error".
    """
    try:
        survey_file_data = next((f for f in files_list if f['filetype'] == 'survey'), None)
        if not survey_file_data:
            print("   [SKIP] No raw survey CSV found.")
            return "skipped"

        # Each FileNotFoundError handler is scoped to a single step so the
        # report names the file that is actually missing.
        try:
            raw_survey = load_dataset(year, month, "survey")
        except FileNotFoundError as e:
            print(f"   [SKIP] Raw survey file could not be loaded: {e}")
            return "skipped"

        try:
            clean_metadata = load_clean_sheet1(year, month)
        except FileNotFoundError:
            print(f"   [SKIP] Missing Metadata Sheet 1 CSV for {month} {year}.")
            return "skipped"

        decoded_df = apply_metadata_headers(raw_survey, clean_metadata, year, month)

        save_path = os.path.join(year_output_path, survey_file_data['filename'])
        decoded_df.to_csv(save_path, index=False)
        print(f"   [OK] Saved File: {survey_file_data['filename']}")
        return "success"

    except Exception as e:
        # Best-effort batch: report the failure and carry on with the next month.
        print(f"   [ERROR] Failed to process: {e}")
        return "error"


def run_batch_header_translation(inventory, base_path):
    """
    Apply header translation to all survey CSVs and save results
    into 'NEW Header Encoded Surveys'.

    For every year in `inventory` an output sub-folder is created under
    <base_path>/NEW Header Encoded Surveys, and every month except the
    "Unmatched" bucket is processed. Decoded files keep the raw survey's
    filename. Progress and a final summary are printed.

    Note: the loader functions called here read the module-level
    'inventory' and 'base_path', so the arguments should be those same
    objects.

    Args:
        inventory: Mapping of year -> month -> list of file records, each
            with 'filetype' and 'filename' keys.
        base_path: Base directory under which the output folder is created.

    Returns:
        dict with the number of months per outcome, keyed
        "success", "skipped" and "error".
    """
    output_folder_name = "NEW Header Encoded Surveys"
    output_base_path = os.path.join(base_path, output_folder_name)
    os.makedirs(output_base_path, exist_ok=True)

    print("================================================")
    print("STARTING BATCH HEADER TRANSLATION")
    print(f"Output Directory: {output_base_path}")
    print("================================================\n")

    counts = {"success": 0, "skipped": 0, "error": 0}

    for year in sorted(inventory):
        year_output_path = os.path.join(output_base_path, year)
        os.makedirs(year_output_path, exist_ok=True)

        for month, files_list in inventory[year].items():
            if month == "Unmatched":
                continue
            print(f"Processing: {month.upper()} {year}...")

            outcome = _decode_and_save_survey(year, month, files_list, year_output_path)
            counts[outcome] += 1

            print("-" * 40)

    print("\n================================================")
    print("BATCH PROCESS COMPLETE")
    print(f"   Successful: {counts['success']}")
    print(f"   Skipped:    {counts['skipped']}")
    print(f"   Errors:     {counts['error']}")
    print("================================================")

    return counts


### Integrity Check

In [None]:
def verify_header_decoding_integrity(inventory, base_path):
    """
    Checks if all raw survey columns have been successfully decoded
    using metadata Sheet 1.

    Every month in `inventory` except the "Unmatched" bucket is checked by
    loading the raw survey and its Sheet 1 definitions and counting how
    many raw headers have a usable (non-missing, non-blank) definition.
    A month whose files cannot be loaded is recorded as a failure with
    the error text instead of aborting the whole check.

    Note: `base_path` is not used directly here; the loader functions read
    the module-level 'inventory' and 'base_path'.

    Returns a DataFrame with:
    Year | Month | Raw Headers Count | Decoded Headers Count | Integrity Status
    sorted by Year and Month (plain string order). The frame has these
    columns even when no month was checked.
    """
    report_columns = [
        "Year",
        "Month",
        "Raw Headers Count",
        "Decoded Headers Count",
        "Integrity Status",
    ]
    results = []

    for year, months_data in inventory.items():
        for month in months_data:
            if month == "Unmatched":
                continue

            try:
                raw_df = load_dataset(year, month, "survey")
                raw_headers = list(raw_df.columns)
                raw_count = len(raw_headers)

                meta_df = load_clean_sheet1(year, month)
                # Drop missing definitions before astype(str); otherwise they
                # become the literal text "nan" and would count as decoded.
                definitions = meta_df[['Variable', 'Description']].dropna()
                variables = definitions['Variable'].astype(str).str.strip()
                descriptions = definitions['Description'].astype(str).str.strip()
                header_map = {
                    variable: description
                    for variable, description in zip(variables, descriptions)
                    if variable and description
                }

                decoded_count = sum(col in header_map for col in raw_headers)
                status = "PASS" if raw_count == decoded_count else "FAIL"

                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Headers Count": raw_count,
                    "Decoded Headers Count": decoded_count,
                    "Integrity Status": status
                })

            except Exception as e:
                # Record the failure for this month and keep checking the rest.
                results.append({
                    "Year": year,
                    "Month": month,
                    "Raw Headers Count": "ERROR",
                    "Decoded Headers Count": "ERROR",
                    "Integrity Status": f"FAIL ({e})"
                })

    # Explicit columns keep the sort and the status lookup valid even when
    # `results` is empty (a column-less frame would raise KeyError).
    result_df = (
        pd.DataFrame(results, columns=report_columns)
        .sort_values(["Year", "Month"])
        .reset_index(drop=True)
    )

    print("\n===== HEADER DECODING INTEGRITY CHECK COMPLETE =====")
    total_failures = (result_df["Integrity Status"] != "PASS").sum()
    if result_df.empty:
        print("No survey months were found to check.")
    elif total_failures == 0:
        print("SUCCESS: All survey column headers have been fully decoded.")
    else:
        print(f"Completed with {total_failures} months failing integrity checks.")
    print("====================================================\n")

    return result_df


### Execution Block

In [None]:
if __name__ == "__main__":
    # Run the batch translation and then the integrity check, but only when
    # both required names already exist in the current namespace.
    if not ('inventory' in locals() and 'base_path' in locals()):
        print("Skipping execution: 'inventory' or 'base_path' not found in scope.")
    else:
        run_batch_header_translation(inventory, base_path)
        integrity_df = verify_header_decoding_integrity(inventory, base_path)
        # NOTE(review): `display` is not imported in this file; presumably the
        # IPython/Jupyter builtin - confirm if this is ever run as a script.
        display(integrity_df)
