
This notebook applies the Sheet 2 metadata definitions to decode survey values: it maps coded responses (e.g., 1, 2, 3) to human‑readable labels (e.g., Employed, Unemployed).

Dependencies:
- Run `00_Settings.ipynb` and `01_Inventory.ipynb` first.
- Requires outputs from `03_Metadata_Decoder.ipynb` (Header Encoded Surveys).
- Note: Survey CSVs already carry meaning from Sheet 2 reshaping. This notebook applies the final decoding logic.

Output:

- Fully decoded survey CSVs saved into **NEW Fully Decoded Surveys**.
- Reports per survey showing number of columns successfully decoded.

Notes:
- **Sheet 1 metadata** → header translation (column names).  
- **Sheet 2 metadata** → value translation (coded responses).  

**INTENT:** This notebook performs the **final decoding stage** of the Labor Force Survey pipeline.  
It applies **Sheet 2 metadata** to translate coded survey values into human‑readable labels.

- Next steps: duplicate variable detection, integrity checks, coverage scanning.


In [None]:
# Linter stubs (will be overwritten when 00_Settings.ipynb runs)
# These are bare annotations: they declare the names for static analysis only
# and do not bind any value at runtime.
BASE_PATH: str  # root path; aliased to `base_path` once settings are loaded
inventory: dict  # presumably populated by 01_Inventory.ipynb — TODO confirm

In [None]:
# Ensure settings are loaded
# Order matters: settings run first, then the inventory notebook.
# NOTE(review): the functions below also rely on names not defined in this
# notebook (os, pd, MONTHS, MONTH_PATTERN, the *_FOLDER constants,
# load_clean_sheet2, decode_survey_safe) — presumably supplied by these
# notebooks; confirm.
%run ./00_Settings.ipynb
%run ./01_Inventory.ipynb

# Alias for compatibility
# The functions in this notebook take the root path as a lowercase
# `base_path` argument.
base_path = BASE_PATH

### Interim Sample 

#### Intent: Interim Sample Decoder

This interim function generates a **single sample output** (e.g., `NEW Fully Decoded Survey Sample` for **January 2018**) using the same decoding logic as the batch runner.  

Unlike the full batch process, which redirects all decoded surveys to Google Drive, this sample run saves directly into the local **interim repository path**.  

This gives a quick preview of what a fully decoded CSV looks like without having to download the large files from Google Drive.  

Use this when:
- You want to validate decoding logic on a small subset before running the full batch.  
- You need a lightweight example for documentation, testing, or demonstration.  
- You want to inspect decoded values locally without waiting for the complete dataset.


In [None]:
# ============================================================
# Interim sample decoder (standalone, portable)
# ============================================================
def run_sample_decoding(base_path,
                        year="2018",
                        month="January",
                        interim_root=None):
    """
    Decode a single survey file (e.g., January 2018) for demonstration.
    Saves into '<base_path>/data/interim/NEW Fully Decoded Survey Sample/<year>/' by default.

    The survey CSV is looked up under
    '<base_path>/<HEADER_ENCODED_FOLDER>/<year>/' as the first file (in sorted
    order) whose name contains the month, case-insensitively. Progress and
    failures are reported with print(); nothing is returned.

    Parameters
    ----------
    base_path : str
        Root path defined in 00_Settings.ipynb
    year : str or int
        Year of the survey (default "2018"). Converted to a string so that
        an integer such as 2018 can be used as a path component.
    month : str
        Month of the survey (default "January"). Surrounding whitespace and
        capitalization are normalized before validation.
    interim_root : str, optional
        Custom interim path. If None, defaults to '<base_path>/data/interim'.

    Raises
    ------
    ValueError
        If the normalized month (upper-cased) is not in MONTHS.
    """
    # os.path.join only accepts strings, so tolerate year=2018 as well.
    year = str(year).strip()

    # Normalize month capitalization (e.g., january -> January)
    month = month.strip().capitalize()
    month_key = month.upper()
    if month_key not in MONTHS:
        raise ValueError(f"Invalid month: {month}. Expected one of: {', '.join(MONTHS)}")

    # Default interim path inside repo if not provided
    if interim_root is None:
        interim_root = os.path.join(base_path, "data", "interim")

    input_root = os.path.join(base_path, HEADER_ENCODED_FOLDER, year)
    output_root = os.path.join(interim_root, FULLY_DECODED_SAMPLE_FOLDER, year)

    # Find the survey file for the given month.
    # isdir (not exists) so that a stray regular file does not reach listdir.
    if not os.path.isdir(input_root):
        print(f"[SKIP] Input folder not found: {input_root}")
        return

    # Sorted so the chosen file is deterministic when several names match.
    files = sorted(f for f in os.listdir(input_root) if f.lower().endswith(".csv"))
    target_file = next((f for f in files if month_key in f.upper()), None)

    if not target_file:
        print(f"[SKIP] No survey file found for {month} {year} in {input_root}.")
        return

    # Create the destination only once there is something to write, so a
    # skipped run does not leave empty folders behind.
    os.makedirs(output_root, exist_ok=True)

    print("================================================")
    print(f"SAMPLE DECODING: {month_key} {year}")
    print(f"Source: {input_root}")
    print(f"Dest:   {output_root}")
    print("================================================\n")

    try:
        # 1. Load Survey
        survey_path = os.path.join(input_root, target_file)
        df_survey = pd.read_csv(survey_path, low_memory=False)

        # 2. Load Metadata
        df_meta = load_clean_sheet2(base_path, year, month)

        # 3. Decode
        df_final, count = decode_survey_safe(df_survey, df_meta)

        # 4. Save
        save_path = os.path.join(output_root, target_file)
        df_final.to_csv(save_path, index=False)

        print(f"   [OK] Decoded {count} columns.")
        print(f"   [SAVED] {save_path}")

    except FileNotFoundError as e:
        # Reported the same way as in the batch runner.
        # NOTE(review): assumes load_clean_sheet2 raises FileNotFoundError
        # when the metadata workbook is absent — confirm.
        print(f"   [SKIP] Metadata missing: {e}")
    except Exception as e:
        # Deliberate best-effort: a preview run reports the failure
        # instead of interrupting the notebook.
        print(f"   [ERROR] {e}")


In [None]:
# ============================================================
# EXECUTION CONTROL
# ============================================================

# Option 1: Run interim sample first (lightweight preview)
# interim_root is left at its default, so the decoded file is written under
# '<base_path>/data/interim'. The call prints its status and returns nothing.
run_sample_decoding(base_path, year="2018", month="January")

### Batch Runner

In [None]:
# ============================================================
# Full batch decoder (unchanged, uses same helpers)
# ============================================================
def run_batch_decoding(base_path):
    """
    Batch decode all survey CSVs using Sheet 2 metadata.
    Saves results into 'NEW Fully Decoded Surveys'.

    Walks every all-digit sub-folder (year) of
    '<base_path>/<HEADER_ENCODED_FOLDER>/', decodes each CSV whose filename
    matches MONTH_PATTERN, and writes the result under the same filename to
    '<base_path>/<FULLY_DECODED_FOLDER>/<year>/'. CSVs whose name carries no
    recognizable month are ignored silently. Progress is reported with
    print(); nothing is returned.

    Parameters
    ----------
    base_path : str
        Root path defined in 00_Settings.ipynb
    """
    input_root = os.path.join(base_path, HEADER_ENCODED_FOLDER)
    output_root = os.path.join(base_path, FULLY_DECODED_FOLDER)

    print("================================================")
    print("STARTING BATCH VALUE DECODING")
    print(f"Source: {input_root}")
    print(f"Dest:   {output_root}")
    print("================================================\n")

    # isdir (not exists) so that a stray regular file does not reach listdir.
    if not os.path.isdir(input_root):
        print(f"Error: Input folder not found: {input_root}")
        return

    # Only create the destination once the source is known to exist.
    os.makedirs(output_root, exist_ok=True)

    success, skipped, errors = 0, 0, 0

    year_folders = sorted(
        f for f in os.listdir(input_root)
        if f.isdigit() and os.path.isdir(os.path.join(input_root, f))
    )
    for year in year_folders:
        year_in = os.path.join(input_root, year)
        year_out = os.path.join(output_root, year)

        # Collect (filename, month) pairs up front, in sorted order, so the
        # run is deterministic and empty years create no output folder.
        surveys = []
        for filename in sorted(os.listdir(year_in)):
            if not filename.lower().endswith(".csv"):
                continue
            match = MONTH_PATTERN.search(filename)
            if match:
                surveys.append((filename, match.group(1).capitalize()))

        if not surveys:
            continue
        os.makedirs(year_out, exist_ok=True)

        for filename, month in surveys:
            print(f"Processing: {month.upper()} {year}...")

            try:
                df_survey = pd.read_csv(os.path.join(year_in, filename), low_memory=False)
                df_meta = load_clean_sheet2(base_path, year, month)
                df_final, count = decode_survey_safe(df_survey, df_meta)

                save_path = os.path.join(year_out, filename)
                df_final.to_csv(save_path, index=False)

                print(f"   [OK] Decoded {count} columns.")
                print(f"   [SAVED] {filename}")
                success += 1

            except FileNotFoundError as e:
                # Counted separately so the summary accounts for every file.
                print(f"   [SKIP] Metadata missing: {e}")
                skipped += 1
            except Exception as e:
                # Deliberate best-effort: one bad file must not stop the batch.
                print(f"   [ERROR] {e}")
                errors += 1

            print("-" * 40)

    print(f"\nCOMPLETED. Success: {success} | Errors: {errors} | Skipped: {skipped}")


In [None]:
# Option 2: Run full batch (heavy, Google Drive output)
# Processes every year folder found under the header-encoded input root.
# The call prints per-file status plus a final summary and returns nothing.
run_batch_decoding(base_path)