This notebook focuses on reshaping metadata files (the Excel dictionaries that describe survey variables). It extracts Sheet 1 of each metadata workbook (for context, every survey month has two metadata sheets that are used for decoding) across all years and months, and saves the results into a new folder (NEW Metadata Sheet 1 CSV's).

- It loads metadata files from the inventory.

- It extracts extended variable names and descriptions from Sheet 1 (columns E and F).

- It cleans empty rows and trims whitespace.

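- It saves reshaped metadata into a dedicated folder, one subfolder per year (`NEW Metadata Sheet 1 CSV's/<year>/Sheet1_<month>_<year>.csv`, created under the base path passed to the batch function).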

- Batch process: Load raw metadata, reshape variables, save clean CSVs.

- Verification: Compare raw vs reshaped counts to ensure no variables or descriptions were lost.

**INTENT:**  Metadata often contains empty spaces and inconsistent formatting, which show up as NaN in analysis. By reshaping metadata first, we ensure that variable names and descriptions are clean and reliable.

#### Imports and Setup

In [None]:
import pandas as pd
import json
from pathlib import Path
import os

# Settings: config.json supplies the two directories every later cell relies on.
cfg = json.loads(Path("./data/interim/config.json").read_text())
BASE_PATH, INTERIM_DIR = (Path(cfg[key]) for key in ("BASE_PATH", "INTERIM_DIR"))
print("Settings loaded.")

# Inventory: year -> month -> list of file records (built in 01_Inventory.ipynb).
inventory = json.loads((INTERIM_DIR / "inventory.json").read_text())
print("Inventory loaded. Years available:", [*inventory.keys()])

# Dedicated local subfolder for reshaped metadata outputs; created if absent.
# NOTE(review): the run cell at the bottom of this notebook passes BASE_PATH
# (not this folder) to the batch function -- confirm which destination is
# intended.
metadata_out = INTERIM_DIR / "NEW_metadata_sheet1"
metadata_out.mkdir(parents=True, exist_ok=True)
print("Reshaped metadata will be saved to:", metadata_out)


#### Load dataset (optimized)

In [None]:
def load_dataset(year, month, filetype="survey", sheet_number=None):
    """
    Read one file listed in the module-level ``inventory`` into a DataFrame.

    Parameters
    - year: str key into the inventory, e.g. "2018"
    - month: str key under that year, e.g. "January"
    - filetype: "survey" is read as CSV; any other value is read as Excel
    - sheet_number: Excel sheet index (0 = Sheet 1, 1 = Sheet 2). When given,
      only columns E and F of that sheet are read, so the result has exactly
      two columns. When omitted, the default sheet is read in full.

    Raises ValueError when the month has no record of the requested filetype.
    """
    # Take the first inventory record of the requested type (lazy scan).
    for entry in inventory[year][month]:
        if entry["filetype"] == filetype:
            break
    else:
        raise ValueError(f"No {filetype} file found for {month} {year}")

    source = BASE_PATH / year / entry["filename"]

    if filetype != "survey":
        if sheet_number is None:
            return pd.read_excel(source)
        # Optimization: read only columns E and F (positions 4 and 5).
        # NOTE(review): this also applies when sheet_number=1 (Sheet 2) --
        # confirm that Sheet 2 keeps its useful data in the same columns.
        return pd.read_excel(source, sheet_name=sheet_number, usecols=[4, 5])

    return pd.read_csv(source, low_memory=False)


#### Extract variables

In [None]:
def extract_variables(df):
    """
    Extract variable names and descriptions from metadata Sheet 1.

    Accepts either layout of the raw sheet:
    - the full sheet (six or more columns), where columns E and F
      (positions 4 and 5) hold the variable codes and descriptions, or
    - a frame already trimmed to exactly those two columns, which is what
      ``pd.read_excel(..., usecols=[4, 5])`` returns.

    Rows whose variable code is missing or whitespace-only are dropped.
    Both columns are stripped of surrounding whitespace. A missing
    description stays missing (NaN) instead of becoming the text "nan".

    Returns a new DataFrame with columns ['Variable', 'Description'] and a
    fresh 0..n-1 index. Raises ValueError for any other column count.
    """
    n_cols = df.shape[1]
    if n_cols >= 6:
        df_vars = df.iloc[:, 4:6].copy()
    elif n_cols == 2:
        # Already narrowed to columns E and F by the loader.
        df_vars = df.copy()
    else:
        raise ValueError(
            f"Expected Sheet 1 with columns E and F (2 pre-selected columns "
            f"or at least 6 columns), got {n_cols} column(s)"
        )
    df_vars.columns = ['Variable', 'Description']

    # Drop empty rows; .copy() so the assignments below do not write into a
    # slice of the filtered frame.
    variable = df_vars['Variable']
    has_variable = variable.notna() & (variable.astype(str).str.strip() != '')
    df_vars = df_vars[has_variable].copy()

    # Clean whitespace
    df_vars['Variable'] = df_vars['Variable'].astype(str).str.strip()

    # Strip only real values: astype(str) alone would turn NaN into "nan".
    description = df_vars['Description'].astype(object)
    df_vars['Description'] = description.where(
        description.isna(), description.astype(str).str.strip()
    )

    return df_vars.reset_index(drop=True)


#### Batch process (year by year option)

In [None]:
def batch_process_sheet1_metadata(inventory, base_output_path):
    """
    Reshape Sheet 1 of every metadata file in the inventory and write it to CSV.

    Output layout:
        <base_output_path>/NEW Metadata Sheet 1 CSV's/<year>/Sheet1_<month>_<year>.csv

    - inventory: mapping year -> month -> list of file records
    - base_output_path: directory under which the output tree is created

    The "Unmatched" month bucket is ignored. Months without a metadata record
    are counted as skipped. Any exception raised while loading, reshaping or
    saving a month is printed, counted as a failure, and the loop moves on.
    A summary report is printed at the end; nothing is returned.
    """
    saved = failed = skipped = 0
    failures = []

    root_dir = os.path.join(base_output_path, "NEW Metadata Sheet 1 CSV's")
    os.makedirs(root_dir, exist_ok=True)

    print("--- STARTING BATCH PROCESS ---")
    print(f"Target Directory: {root_dir}")
    print("-" * 50)

    for year, months_data in inventory.items():
        # The year folder is created up front, even if no month ends up saved.
        year_dir = os.path.join(root_dir, year)
        os.makedirs(year_dir, exist_ok=True)

        for month, files_list in months_data.items():
            if month == "Unmatched":
                continue

            # Look for the first metadata record; skip the month if none exists.
            for record in files_list:
                if record.get('filetype') == 'metadata':
                    break
            else:
                skipped += 1
                continue

            try:
                sheet1 = load_dataset(year, month, "metadata", 0)
                reshaped = extract_variables(sheet1)

                csv_name = f"Sheet1_{month}_{year}.csv"
                reshaped.to_csv(os.path.join(year_dir, csv_name), index=False)

                print(f"[OK] Saved: {year}/{csv_name}")
                saved += 1

            except Exception as exc:
                # Batch boundary: record the failure and keep going.
                print(f"[ERROR] Failed {month} {year}: {exc}")
                failures.append(f"{month} {year}: {exc}")
                failed += 1

    print("\n=== PROCESSING SUMMARY REPORT ===")
    print(f"Total Successfully Saved: {saved}")
    print(f"Total Failed:             {failed}")
    print(f"Total Skipped (No File):  {skipped}")
    print("-" * 40)

    if failed == 0:
        print("STATUS: COMPLETE SUCCESS")
        return

    print("STATUS: COMPLETED WITH ERRORS")
    if failures:
        print("Error Details:")
        for detail in failures:
            print(f" - {detail}")


#### Verification

In [None]:
def _count_nonblank(series):
    """Return how many entries of *series* are neither missing nor whitespace-only."""
    text = series.dropna().astype(str).str.strip()
    return int((text != '').sum())


def _raw_variable_and_description(raw_df):
    """Return the (variable, description) columns of a raw Sheet 1 frame.

    Handles both the full sheet (columns E and F at positions 4 and 5) and a
    frame that was already narrowed to exactly those two columns on load.
    Raises ValueError for any other column count.
    """
    n_cols = raw_df.shape[1]
    if n_cols >= 6:
        return raw_df.iloc[:, 4], raw_df.iloc[:, 5]
    if n_cols == 2:
        return raw_df.iloc[:, 0], raw_df.iloc[:, 1]
    raise ValueError(f"expected 2 or at least 6 columns, got {n_cols}")


def batch_verify_sheet1_variable_and_description_count_verbose(inventory, base_path):
    """
    Compares raw vs reshaped metadata counts for assurance.

    For every year/month in the inventory (the "Unmatched" bucket is ignored)
    the raw Sheet 1 is reloaded and compared with the CSV stored at
    <base_path>/NEW Metadata Sheet 1 CSV's/<year>/Sheet1_<month>_<year>.csv.
    A month passes when the number of non-blank variables and the number of
    non-blank descriptions are the same on both sides. Missing values are
    ignored on both sides, so a variable without a description is not
    reported as a mismatch.

    Months whose raw sheet cannot be loaded, or whose reshaped CSV is missing
    or lacks the expected columns, are reported on stdout and left out of the
    result.

    Returns a DataFrame with PASS/FAIL per year/month, sorted by Year and
    Month. The frame is empty, but still has all columns, when nothing could
    be compared.
    """
    result_columns = [
        'Year', 'Month',
        'Raw Variable Count', 'Reshaped Variable Count',
        'Raw Description Count', 'Reshaped Description Count',
        'Status',
    ]
    results = []
    reshaped_folder = os.path.join(base_path, "NEW Metadata Sheet 1 CSV's")

    for year, months_data in inventory.items():
        for month in months_data:
            if month == "Unmatched":
                continue

            try:
                raw_df = load_dataset(year, month, "metadata", sheet_number=0)
                raw_var_col, raw_desc_col = _raw_variable_and_description(raw_df)
            except Exception as e:
                print(f"[ERROR] {month} {year}: Could not load raw Sheet 1 ({e})")
                continue

            reshaped_file_path = os.path.join(reshaped_folder, year, f"Sheet1_{month}_{year}.csv")
            if not os.path.exists(reshaped_file_path):
                print(f"[ERROR] {month} {year}: Reshaped CSV missing!")
                continue

            reshaped_df = pd.read_csv(reshaped_file_path)
            missing_cols = [c for c in ('Variable', 'Description') if c not in reshaped_df.columns]
            if missing_cols:
                print(f"[ERROR] {month} {year}: Reshaped CSV lacks column(s) {missing_cols}")
                continue

            # Count both sides the same way: missing values are dropped before
            # the string conversion so NaN is never counted as the text "nan".
            raw_var_count = _count_nonblank(raw_var_col)
            raw_desc_count = _count_nonblank(raw_desc_col)
            reshaped_var_count = _count_nonblank(reshaped_df['Variable'])
            reshaped_desc_count = _count_nonblank(reshaped_df['Description'])

            counts_match = (raw_var_count == reshaped_var_count
                            and raw_desc_count == reshaped_desc_count)
            status = "PASS" if counts_match else "FAIL"
            if status == "FAIL":
                print(f"[MISMATCH] {month} {year} - Variables: {raw_var_count} vs {reshaped_var_count}, "
                      f"Descriptions: {raw_desc_count} vs {reshaped_desc_count}")

            results.append({
                'Year': year, 'Month': month,
                'Raw Variable Count': raw_var_count,
                'Reshaped Variable Count': reshaped_var_count,
                'Raw Description Count': raw_desc_count,
                'Reshaped Description Count': reshaped_desc_count,
                'Status': status
            })

    # Explicit columns keep sort_values from raising KeyError on an empty result.
    df = pd.DataFrame(results, columns=result_columns)
    return df.sort_values(['Year', 'Month']).reset_index(drop=True)


#### Run batch and verify  (2018 trial)

In [None]:
# Run batch process: reshape Sheet 1 for every year/month in the inventory.
# CSVs are written under BASE_PATH / "NEW Metadata Sheet 1 CSV's" / <year>.
batch_process_sheet1_metadata(inventory, BASE_PATH)

# Run verification: must receive the same base path as the batch step, because
# the verifier looks for the reshaped CSVs under that directory.
verification_df = batch_verify_sheet1_variable_and_description_count_verbose(inventory, BASE_PATH)

# display() is not imported in this notebook; it is available as a built-in
# inside IPython/Jupyter sessions.
print("=== Verification Results ===")
display(verification_df)
