In [1]:
# ===============================================================
# 00 — CONFIG + INVENTORY LOADING
# ===============================================================
import json
from pathlib import Path
import os
import pandas as pd
from rapidfuzz import fuzz
from collections import defaultdict
import re

# Load settings from config.json
with open(Path("./data/interim/config.json")) as f:
    cfg = json.load(f)

BASE_PATH = Path(cfg["BASE_PATH"])
INTERIM_DIR = Path(cfg["INTERIM_DIR"])
PROCESSED_DIR = Path(cfg["PROCESSED_DIR"])
LOG_DIR = Path(cfg["LOG_DIR"])
MONTH_ORDER = cfg["MONTH_ORDER"]

# Load inventory
with open(Path(INTERIM_DIR) / "inventory.json") as f:
    inventory = json.load(f)

# Paths
decoded_path = BASE_PATH / "NEW Fully Decoded Surveys"
metadata_path = BASE_PATH / "NEW Metadata Sheet 2 CSV's"


In [2]:
# ===============================================================
# 01 — COLLECT ALL VARIABLES + MONTH APPEARANCES
# ===============================================================
all_columns = []
variable_months = defaultdict(set)

for year in os.listdir(decoded_path):
    year_folder = decoded_path / year
    if not year_folder.is_dir():
        continue

    for file in os.listdir(year_folder):
        if not file.endswith(".CSV"):
            continue

        month = file.split("_")[0].capitalize()
        month_year = f"{month} {year}"
        file_path = year_folder / file

        try:
            df = pd.read_csv(file_path, low_memory=False)
            for col in df.columns:
                col_clean = col.strip()
                all_columns.append(col_clean)
                variable_months[col_clean].add(month_year)
        except Exception as e:
            print(f"[ERROR] {file} -> {e}")

all_columns = sorted(set(all_columns))


In [3]:
# ===============================================================
# 02 — CLUSTER SIMILAR VARIABLES (EXCLUSIVE GROUPING)
# ===============================================================
similarity_threshold = 85
filtered_groups = {}
processed_vars = set()

for i, var1 in enumerate(all_columns):
    if var1 in processed_vars:
        continue

    current_group = []

    for j in range(i + 1, len(all_columns)):
        var2 = all_columns[j]
        if var2 in processed_vars:
            continue

        similarity = fuzz.token_sort_ratio(var1.lower(), var2.lower())
        if similarity >= similarity_threshold:
            current_group.append(var2)
            processed_vars.add(var2)

    if current_group:
        filtered_groups[var1] = current_group
        processed_vars.add(var1)


In [4]:
# ===============================================================
# 03 — HELPERS
# ===============================================================
def load_per_month_labels(variable):
    results = defaultdict(set)
    for year in os.listdir(metadata_path):
        year_folder = metadata_path / year
        if not year_folder.is_dir():
            continue
        for file in os.listdir(year_folder):
            if not file.startswith("Sheet2_") or not file.endswith(".csv"):
                continue
            month = file.split("_")[1].capitalize()
            month_year = f"{month} {year}"
            file_path = year_folder / file
            try:
                df = pd.read_csv(file_path, dtype=str).fillna("")
            except Exception:
                continue
            if "Description" not in df.columns or "Label" not in df.columns:
                continue
            match = df[df["Description"].astype(str).str.strip().str.lower() == variable.lower()]
            if not match.empty:
                labels_raw = [str(x).strip() for x in match["Label"].tolist() if str(x).strip() != ""]
                results[month_year].update(labels_raw)
    return results

def normalize(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.replace('\xa0', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()


In [5]:
# ===============================================================
# 04 — PRINT GROUPED ANALYSIS
# ===============================================================
print("\n============================================================")
print("      GROUPS WITH DETAILED LABEL + MONTH DIFFERENCE CHECK      ")
print("============================================================\n")

group_number = 1

for key, group in filtered_groups.items():
    full_group = sorted(set([key] + group))

    print(f"\n----- Group {group_number} -----")
    print("Variables:")
    for var in full_group:
        month_count = len(variable_months[var])
        print(f"- {var} ({month_count} months)")

    group_labels = {}
    overall_label_union = {}
    pretty_print_map = {}

    for var in full_group:
        per_month = load_per_month_labels(var)
        group_labels[var] = per_month

        overall = set()
        for labels in per_month.values():
            overall.update(labels)
            for label in labels:
                pretty_print_map[normalize(label)] = label

        overall_label_union[var] = overall

        print(f"\nVariable: {var}")
        if not per_month:
            print("Labels: (No labels found in metadata)")

        months_to_show = sorted(
            [m for m in per_month.keys() if m in variable_months[var]],
            key=lambda x: (int(x.split()[-1]), MONTH_ORDER.get(x.split()[0], 99))
        )

        if not months_to_show:
            for m in sorted(variable_months[var], key=lambda x: (int(x.split()[-1]), MONTH_ORDER.get(x.split()[0], 99))):
                print(f"  {m}: (empty)")
        else:
            for month in months_to_show:
                labels = per_month.get(month, set())
                sorted_labels = sorted(list(labels))
                if len(sorted_labels) > 10:
                    label_str = ", ".join(sorted_labels[:10]) + f", ... (+{len(sorted_labels)-10} more)"
                else:
                    label_str = ", ".join(sorted_labels)
                label_str = label_str if labels else "(empty)"
                print(f"  {month}: {label_str}")

    # Fingerprint comparison
    fingerprints = {
        var: frozenset([normalize(s) for s in overall_label_union.get(var, set())])
        for var in full_group
    }

    reference_var = full_group[0]
    reference_set = fingerprints[reference_var]
    same_global_vocab = all(fingerprints[v] == reference_set for v in full_group)

    temporal_mismatches = []
    is_temporally_consistent = True

    for var in full_group:
        var_global_set = fingerprints[var]
        for month in variable_months[var]:
            month_labels_raw = group_labels[var].get(month, set())
            month_labels_normalized = set([normalize(s) for s in month_labels_raw])
            missing_in_month = var_global_set - month_labels_normalized
            if missing_in_month:
                is_temporally_consistent = False
                missing_readable = sorted([pretty_print_map.get(x, x) for x in missing_in_month])
                if len(missing_readable) > 5:
                    missing_str = ", ".join(missing_readable[:5]) + "..."
                else:
                    missing_str = ", ".join(missing_readable)
                temporal_mismatches.append(f"- {var} in {month} is missing: {missing_str}")

    identical = same_global_vocab and is_temporally_consistent
    print("\nIdentical coding scheme?: ", "YES" if identical else "NO")

    if not identical:
        print("Differences found:")
        if temporal_mismatches:
            print(">> TEMPORAL INCONSISTENCIES (Labels missing in specific months):")
            for mismatch in temporal_mismatches[:10]:
                print(mismatch)
            if len(temporal_mismatches) > 10:
                print(f"... and {len(temporal_mismatches) - 10} more months.")

        for var in full_group:
            if var == reference_var:
                continue
            cur_set = fingerprints[var]
            extra_overall = cur_set - reference_set
            missing_overall = reference_set - cur_set

            if extra_overall:
                readable_extra = [pretty_print_map.get(x, x) for x in extra_overall]
                print(f"- {var} has EXTRA overall labels: {', '.join(sorted(readable_extra))}")

            if missing_overall:
                readable_missing = [pretty_print_map.get(x, x) for x in missing_overall]
                print(f"- {var} is MISSING overall labels: {', '.join(sorted(readable_missing))}")

    group_number += 1



      GROUPS WITH DETAILED LABEL + MONTH DIFFERENCE CHECK      


----- Group 1 -----
Variables:
- 2010Urban-RuralFIES (8 months)
- 2015Urban-RuralFIES (13 months)

Variable: 2010Urban-RuralFIES
  January 2018: Rural, Urban
  April 2018: Rural, Urban
  July 2018: Rural, Urban
  October 2018: Rural, Urban
  January 2019: Rural, Urban
  April 2019: Rural, Urban
  July 2019: Rural, Urban
  October 2019: Rural, Urban

Variable: 2015Urban-RuralFIES
  July 2022: Rural, Urban
  August 2022: Rural, Urban
  September 2022: Rural, Urban
  October 2022: Rural, Urban
  November 2022: Rural, Urban
  December 2022: Rural, Urban
  January 2023: Rural, Urban
  February 2023: Rural, Urban
  March 2023: Rural, Urban
  April 2023: Rural, Urban
  May 2023: Rural, Urban
  June 2023: Rural, Urban
  July 2023: Rural, Urban

Identical coding scheme?:  YES

----- Group 2 -----
Variables:
- C08-Overseas Filipino Indicator (17 months)
- C10-Overseas Filipino Indicator (17 months)

Variable: C08-Overseas Filipin

In [6]:
# ===============================================================
# RENAMING MAP (Unified Variable Names)
# ===============================================================
RENAMING_MAP = {
    "Urban-RuralFIES": [
        "2010Urban-RuralFIES", 
        "2015Urban-RuralFIES"
    ],
    "Location of Work (Province, Municipality)": [
        "C11 - Location of Work (Province, Municipality)", 
        "C11-Location of Work (Province, Municipality)", 
        "C12A - Location of Work (Province, Municipality)"
    ],
    "Normal Working Hours per Day": [
        "C17-Normal Working Hours per Day", 
        "C18-Normal Working Hours per Day"
    ],
    "Want More Hours of Work": [
        "C19-Want More Hours of Work", 
        "C20-Want More Hours of Work"
    ],
    "Look for Additional Work": [
        "C20-Look for Additional Work", 
        "C21-Look for Additional Work"
    ],
    "Other Job Indicator": [
        "C22-Other Job Indicator", 
        "C26-Other Job Indicator"
    ],
    "Total Hours Worked for all Jobs": [
        "C23-Total Hours Worked for all Jobs", 
        "C28-Total Hours Worked for all Jobs"
    ],
    "Looked for Work or Tried to Establish Business During the Past Week": [
        "C25-Looked for Work or Tried to Establish Business during the past week", 
        "C30-Looked for Work or Tried to Establish Business during the past week"
    ],
    "First Time to Look for Work": [
        "C25B - First time to look for work", 
        "C31-First Time to Look for Work"
    ],
    "Previous Job Indicator": [
        "C28-Previous Job Indicator", 
        "C38-Previous Job Indicator"
    ],
    "Previous Occupation": [
        "C31-Previous Occupation", 
        "C40-Previous Occupation"
    ],
    "Kind of Business (Past Quarter)": [
        "C33-Kind of Business (past quarter)", 
        "C43-Kind of Business (past quarter)"
    ],
    "Province": [
        "Province", 
        "province"
    ],
    "Province Recode": [
        "Province Recode", 
        "province_recode"
    ]
}

# ===============================================================
# PATHS AND DIRECTORY SETUP
# ===============================================================
SOURCE_FOLDER = "NEW Fully Decoded Surveys"
DESTINATION_FOLDER = "NEW Renamed Fully Decoded Surveys"

decoded_path = BASE_PATH / SOURCE_FOLDER
renamed_path = BASE_PATH / DESTINATION_FOLDER

os.makedirs(renamed_path, exist_ok=True)
print(f"Output folder created/verified: {renamed_path}")

# ===============================================================
# INVERT RENAMING MAP: Old Name → New Name
# ===============================================================
REVERSE_RENAMING_MAP = {}
for new_name, old_names in RENAMING_MAP.items():
    for old_name in old_names:
        REVERSE_RENAMING_MAP[old_name.strip()] = new_name.strip()


Output folder created/verified: G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\NEW Renamed Fully Decoded Surveys


In [7]:
def rename_and_save_survey(source_filepath, dest_filepath, renaming_map):
    try:
        df = pd.read_csv(source_filepath, low_memory=False)
        columns_to_rename = {
            col: renaming_map[col.strip()]
            for col in df.columns if col.strip() in renaming_map
        }
        if columns_to_rename:
            df = df.rename(columns=columns_to_rename)
        df.to_csv(dest_filepath, index=False)
        return len(columns_to_rename)
    except Exception as e:
        print(f"[ERROR] Processing {source_filepath}: {e}")
        return -1

def run_batch_renaming(source_root, dest_root, renaming_map):
    total_files_processed = 0
    total_columns_unified = 0

    print("\n--- STARTING BATCH RENAMING AND SAVING ---")
    print(f"Source: {SOURCE_FOLDER}")
    print(f"Destination: {DESTINATION_FOLDER}")
    print(f"Total variables to unify: {len(renaming_map)}")
    print("-" * 50)

    for year in sorted(os.listdir(source_root)):
        year_source_folder = source_root / year
        if not year_source_folder.is_dir():
            continue

        year_dest_folder = dest_root / year
        os.makedirs(year_dest_folder, exist_ok=True)

        for filename in os.listdir(year_source_folder):
            if filename.lower().endswith(".csv"):
                source_filepath = year_source_folder / filename
                dest_filepath = year_dest_folder / filename

                renamed_count = rename_and_save_survey(
                    source_filepath, dest_filepath, renaming_map
                )

                if renamed_count >= 0:
                    total_files_processed += 1
                    total_columns_unified += renamed_count
                    if renamed_count > 0:
                        print(f"[OK] {year}/{filename}: Unified {renamed_count} column(s).")
                    else:
                        print(f"[OK] {year}/{filename}: Saved (No unification needed).")
                else:
                    print(f"[FAIL] {year}/{filename}: See error above.")

    print("-" * 50)
    print("BATCH RENAMING COMPLETE.")
    print(f"Total Files Processed: {total_files_processed}")
    print(f"Total Columns Unified Across All Files: {total_columns_unified}")
    print(f"Consolidated data is ready in '{DESTINATION_FOLDER}'.")


In [8]:
# Execute batch renaming
run_batch_renaming(decoded_path, renamed_path, REVERSE_RENAMING_MAP)



--- STARTING BATCH RENAMING AND SAVING ---
Source: NEW Fully Decoded Surveys
Destination: NEW Renamed Fully Decoded Surveys
Total variables to unify: 29
--------------------------------------------------
[OK] 2018/APRIL_2018.CSV: Unified 13 column(s).
[OK] 2018/JULY_2018.CSV: Unified 13 column(s).
[OK] 2018/JANUARY_2018.CSV: Unified 13 column(s).
[OK] 2018/OCTOBER_2018.CSV: Unified 13 column(s).
[OK] 2019/APRIL_2019.CSV: Unified 11 column(s).
[OK] 2019/JULY_2019.CSV: Unified 11 column(s).
[OK] 2019/OCTOBER_2019.CSV: Unified 11 column(s).
[OK] 2019/JANUARY_2019.CSV: Unified 11 column(s).
[OK] 2022/JULY_2022.CSV: Unified 11 column(s).
[OK] 2022/JUNE_2022.csv: Unified 12 column(s).
[OK] 2022/APRIL_2022.csv: Unified 11 column(s).
[OK] 2022/AUGUST_2022.CSV: Unified 12 column(s).
[OK] 2022/DECEMBER_2022.CSV: Unified 12 column(s).
[OK] 2022/FEBRUARY_2022.csv: Unified 11 column(s).
[OK] 2022/JANUARY_2022.csv: Unified 11 column(s).
[OK] 2022/MARCH_2022.csv: Unified 11 column(s).
[OK] 2022/MAY_

In [9]:
def check_duplicate_headers(source_root):
    if not os.path.exists(source_root):
        print(f"[ERROR] Source folder not found: {source_root}")
        return

    print("\n--- STARTING DUPLICATE HEADER CHECK ---")
    print("-" * 50)

    total_files_checked = 0
    files_with_duplicates = 0

    for year in sorted(os.listdir(source_root)):
        year_source_folder = os.path.join(source_root, year)
        if not os.path.isdir(year_source_folder):
            continue

        for filename in os.listdir(year_source_folder):
            if filename.lower().endswith(".csv"):
                source_filepath = os.path.join(year_source_folder, filename)
                try:
                    df = pd.read_csv(source_filepath, low_memory=False)
                    total_files_checked += 1

                    seen = set()
                    duplicates = set()
                    for col in df.columns:
                        if col in seen:
                            duplicates.add(col)
                        seen.add(col)

                    if duplicates:
                        files_with_duplicates += 1
                        print(f"[DUPLICATE FOUND] {year}/{filename}")
                        print(f"    Duplicated Headers: {sorted(list(duplicates))}")
                except Exception as e:
                    print(f"[ERROR] Failed to read {year}/{filename}: {e}")

    print("-" * 50)
    print("DUPLICATION CHECK COMPLETE.")
    print(f"Total files checked: {total_files_checked}")
    if files_with_duplicates > 0:
        print(f"Total files with duplicates: {files_with_duplicates} (REQUIRES CONSOLIDATION)")
    else:
        print("No duplicate headers found across all files. Ready for FMI.")


In [10]:
# Execute duplication check
check_duplicate_headers(renamed_path)



--- STARTING DUPLICATE HEADER CHECK ---
--------------------------------------------------
--------------------------------------------------
DUPLICATION CHECK COMPLETE.
Total files checked: 40
No duplicate headers found across all files. Ready for FMI.
