Fraction of Missing Information

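This notebook computes FMI diagnostics for renamed survey files. Here FMI is computed per variable as the share of values detected as missing (missing values ÷ total rows).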
It builds on:
- Config loader (`config.json`) for BASE_PATH, INTERIM_DIR, PROCESSED_DIR, LOG_DIR
- Inventory (`inventory.json`) for locating renamed survey files
- Consistency profile (`consistency_profile.csv`) for tagging variables

Outputs:
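- Per-month FMI reports in `BASE_PATH/NEW FMI Reports V2/<year>/FMI_<Month>_<year>.csv`
- Consolidated FMI summary in `NEW FMI Reports V2/fmi_profile.csv`, mirrored to `NEW Variable Consistency Check V2/fmi_profile.csv` (plus `fmi_profile_all.csv` and `fmi_profile_consistent.csv` in `NEW FMI Reports V2/`)
- Log file in `LOG_DIR/fmi_report_latest.log` (overwritten on every run)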

Key improvements over old FMI automation:
- No hardcoded paths (uses config loader from 00_Settings.ipynb)
- Canonical variable alignment
- Integration with consistency profile
- Modular missingness detection
- Unified outputs and logging

In [1]:
import json
from pathlib import Path
import os
import pandas as pd
import numpy as np
import shutil
from datetime import datetime

# --- Load config ---
# The config path is relative, so it resolves against the kernel's current
# working directory. JSON is UTF-8 by specification; the encoding is pinned so
# the read does not depend on the platform's locale default.
with open(Path("./data/interim/config.json"), encoding="utf-8") as f:
    cfg = json.load(f)

BASE_PATH = Path(cfg["BASE_PATH"])
INTERIM_DIR = Path(cfg["INTERIM_DIR"])
PROCESSED_DIR = Path(cfg["PROCESSED_DIR"])
LOG_DIR = Path(cfg["LOG_DIR"])
MONTH_ORDER = cfg["MONTH_ORDER"]

# --- Paths ---
# Input tree (one sub-folder per year), consistency-check folder, and the
# output tree for the FMI reports; all live directly under BASE_PATH.
RENAMED_ROOT = BASE_PATH / "NEW Renamed Fully Decoded Surveys V2"
CONSISTENCY_ROOT = BASE_PATH / "NEW Variable Consistency Check V2"
FMI_MONTHLY_ROOT = BASE_PATH / "NEW FMI Reports V2"

# --- IDEMPOTENT CLEANUP LOGIC ---
# Removes the folder entirely if it exists to prevent "(1)" duplicates in GDrive.
# WARNING: this deletes everything under FMI_MONTHLY_ROOT, including the
# fmi_profile*.csv files written by later cells on a previous run.
if FMI_MONTHLY_ROOT.exists():
    print(f" Cleaning existing FMI folder to prevent duplicates: {FMI_MONTHLY_ROOT}")
    shutil.rmtree(FMI_MONTHLY_ROOT)

os.makedirs(FMI_MONTHLY_ROOT, exist_ok=True)
print(f" Fresh folder created: {FMI_MONTHLY_ROOT}")

# --- Load consistency profile ---
# Lookup table: variable name -> consistency tag, built from the "Variable"
# and "ConsistencyTag" columns of consistency_profile.csv.
consistency_df = pd.read_csv(CONSISTENCY_ROOT / "consistency_profile.csv")
consistency_tags = dict(zip(consistency_df["Variable"], consistency_df["ConsistencyTag"]))

# --- Missingness detector ---
# Text placeholders (compared after whitespace stripping) and numeric codes
# that are counted as "missing".
TEXT_MISSING = {"", " ", "NA", "N/A", "NaN", "nan", ".", "-", "_"}
NUMERIC_SENTINELS = {9, 99, 999, 9999, -9, -99, -999, -9999}


def build_missing_mask(series: pd.Series, include_numeric_sentinels: bool = True) -> pd.Series:
    """Return a boolean mask marking the values of ``series`` treated as missing.

    A value is missing when it is null, when its stripped string form is one
    of ``TEXT_MISSING``, or (optionally) when it parses as a number equal to
    one of ``NUMERIC_SENTINELS``.

    Parameters
    ----------
    series : pd.Series
        Column to inspect; any dtype.
    include_numeric_sentinels : bool, default True
        Also flag values whose numeric interpretation is a sentinel code.

    Returns
    -------
    pd.Series
        Boolean mask aligned with ``series``.

    Notes
    -----
    NOTE(review): the sentinel codes are applied to every column alike, so a
    genuine value of 9 or 99 is counted as missing too. Confirm against the
    survey codebook.
    """
    is_null = series.isna()
    as_text = series.astype(str).str.strip()
    # The empty string is itself a member of TEXT_MISSING, so one membership
    # test covers both blank cells and the explicit placeholders.
    is_placeholder = as_text.isin(TEXT_MISSING)

    if not include_numeric_sentinels:
        return is_null | is_placeholder

    # Non-numeric entries coerce to NaN and therefore never match a sentinel.
    as_number = pd.to_numeric(series, errors="coerce")
    return is_null | is_placeholder | as_number.isin(NUMERIC_SENTINELS)

 Cleaning existing FMI folder to prevent duplicates: G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\NEW FMI Reports V2
 Fresh folder created: G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\NEW FMI Reports V2


In [2]:
def fmi_scan_csv(file_path: Path, year: str, month: str) -> pd.DataFrame:
    """Profile per-column missingness for a single survey CSV.

    Parameters
    ----------
    file_path : Path
        CSV file to read.
    year, month : str
        Labels copied unchanged into the ``Year`` and ``Month`` columns.

    Returns
    -------
    pd.DataFrame
        One row per column of the CSV with ``Year``, ``Month``, ``Variable``
        (header with surrounding whitespace stripped), ``Missing``, ``Total``,
        ``FMI`` (missing / total rounded to 6 decimals, 0.0 for a file with no
        rows), ``Flag``, ``Recommendation`` and ``ConsistencyTag`` ("unknown"
        when the stripped header is not a key of ``consistency_tags``).
    """
    # Severity bands for the quality audit. Upper bounds are exclusive; a rate
    # at or above the last bound falls through to the critical label.
    severity_bands = (
        (0.05, ("Low", "Keep")),
        (0.20, ("Moderate", "Consider imputation")),
        (0.40, ("High", "Strongly consider imputation")),
    )
    critical = ("Critical", "Candidate to drop (validate with business logic)")

    survey = pd.read_csv(file_path, low_memory=False)
    n_rows = int(len(survey))  # every column shares the frame's row count

    records = []
    for header in survey.columns:
        n_missing = int(build_missing_mask(survey[header]).sum())
        share = (n_missing / n_rows) if n_rows > 0 else 0.0

        flag, rec = next(
            (labels for bound, labels in severity_bands if share < bound),
            critical,
        )

        variable = header.strip()
        records.append({
            "Year": year,
            "Month": month,
            "Variable": variable,
            "Missing": n_missing,
            "Total": n_rows,
            "FMI": round(share, 6),
            "Flag": flag,
            "Recommendation": rec,
            "ConsistencyTag": consistency_tags.get(variable, "unknown"),
        })
    return pd.DataFrame(records)

In [3]:
# --- Batch runner ---
# Walks RENAMED_ROOT/<year>/ for CSV files, writes one FMI report per file to
# FMI_MONTHLY_ROOT/<year>/, and keeps every report in `all_reports` so the
# consolidation cell can combine them without re-reading from disk.
# Use a static filename to prevent duplication in LOG_DIR
log_file = LOG_DIR / "fmi_report_latest.log"
success_count, error_count = 0, 0
all_reports = []

# Overwrite log file each run
with open(log_file, "w", encoding="utf-8") as log:
    log.write("STARTING FMI REPORT\n")
    log.write(f"Source: {RENAMED_ROOT}\n")
    log.write(f"Dest:   {FMI_MONTHLY_ROOT}\n")
    log.write("===============================================\n\n")

    # Each directory directly under RENAMED_ROOT is treated as a year;
    # plain files at that level are skipped.
    for year in sorted(os.listdir(RENAMED_ROOT)):
        year_folder = RENAMED_ROOT / year
        if not year_folder.is_dir():
            continue

        year_out = FMI_MONTHLY_ROOT / year
        os.makedirs(year_out, exist_ok=True)

        # The extension check is case-insensitive, so both .csv and .CSV match.
        for file in sorted(os.listdir(year_folder)):
            if not file.lower().endswith(".csv"):
                continue

            # Month label is the text before the first underscore, e.g.
            # "APRIL_2018.CSV" -> "April".
            month = file.split("_")[0].capitalize()
            file_path = year_folder / file

            # Best-effort per file: a failure is counted and logged, and the
            # loop moves on to the next file.
            try:
                report = fmi_scan_csv(file_path, year, month)
                out_file = year_out / f"FMI_{month}_{year}.csv"
                
                # Overwrites existing file directly
                report.to_csv(out_file, index=False)
                all_reports.append(report)

                success_count += 1
                msg = f"[OK] {year}/{file} → {len(report)} variables"
                print(msg); log.write(msg + "\n")
            except Exception as e:
                error_count += 1
                msg = f"[ERROR] {file} → {e}"
                print(msg); log.write(msg + "\n")

    # Final tally, echoed to both the notebook output and the log file.
    summary_msg = f"COMPLETED. Success: {success_count} | Errors: {error_count}"
    print(summary_msg); log.write("\n" + summary_msg + "\n")

[OK] 2018/APRIL_2018.CSV → 50 variables
[OK] 2018/JANUARY_2018.CSV → 50 variables
[OK] 2018/JULY_2018.CSV → 51 variables
[OK] 2018/OCTOBER_2018.CSV → 51 variables
[OK] 2019/APRIL_2019.CSV → 49 variables
[OK] 2019/JANUARY_2019.CSV → 49 variables
[OK] 2019/JULY_2019.CSV → 49 variables
[OK] 2019/OCTOBER_2019.CSV → 49 variables
[OK] 2022/APRIL_2022.csv → 52 variables
[OK] 2022/AUGUST_2022.CSV → 42 variables
[OK] 2022/DECEMBER_2022.CSV → 42 variables
[OK] 2022/FEBRUARY_2022.csv → 41 variables
[OK] 2022/JANUARY_2022.csv → 52 variables
[OK] 2022/JULY_2022.CSV → 52 variables
[OK] 2022/JUNE_2022.csv → 42 variables
[OK] 2022/MARCH_2022.csv → 41 variables
[OK] 2022/MAY_2022.csv → 42 variables
[OK] 2022/NOVEMBER_2022.CSV → 42 variables
[OK] 2022/OCTOBER_2022.CSV → 52 variables
[OK] 2022/SEPTEMBER_2022.CSV → 42 variables
[OK] 2023/APRIL_2023.CSV → 52 variables
[OK] 2023/AUGUST_2023.CSV → 41 variables
[OK] 2023/DECEMBER_2023.CSV → 41 variables
[OK] 2023/FEBRUARY_2023.CSV → 42 variables
[OK] 2023/JAN

In [4]:
# --- Consolidate in-memory reports ---
# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the batch run produced nothing.
if not all_reports:
    raise ValueError(
        "No monthly FMI reports were produced; nothing to consolidate. "
        "Check the batch-run log for errors."
    )
combined = pd.concat(all_reports, ignore_index=True)

# One row per (Variable, ConsistencyTag): pooled counts across all files, the
# unweighted mean of the per-file rates, and the number of files in which the
# variable appeared.
FMI_summary = (
    combined.groupby(["Variable", "ConsistencyTag"])
    .agg(
        TotalMissing=("Missing", "sum"),
        TotalRows=("Total", "sum"),
        AvgFMI=("FMI", "mean"),
        MonthsObserved=("Year", "count"),
    )
    .reset_index()
)

# Pooled rate (row-weighted, unlike AvgFMI). A variable with zero pooled rows
# gets 0.0 rather than NaN, matching the per-file convention in fmi_scan_csv;
# otherwise NaN would fall through every threshold and be flagged "Critical".
FMI_summary["OverallFMI"] = (
    FMI_summary["TotalMissing"] / FMI_summary["TotalRows"]
).where(FMI_summary["TotalRows"] > 0, 0.0)

def flag_and_rec(fmi):
    """Map a missingness rate to a ``(flag, recommendation)`` pair.

    Bands use exclusive upper bounds: below 0.05 is "Low", below 0.20 is
    "Moderate", below 0.40 is "High"; anything else is "Critical". A NaN rate
    fails every comparison and therefore also yields "Critical".
    """
    bands = (
        (0.05, "Low", "Keep"),
        (0.20, "Moderate", "Consider imputation"),
        (0.40, "High", "Strongly consider imputation"),
    )
    for upper_bound, label, advice in bands:
        if fmi < upper_bound:
            return label, advice
    return "Critical", "Candidate to drop (validate with business logic)"

# Classify every variable by its pooled rate. The (flag, recommendation) pairs
# are computed in a single pass and assigned as plain columns, which avoids
# building one pd.Series per row inside .apply().
flag_rec_pairs = [flag_and_rec(rate) for rate in FMI_summary["OverallFMI"]]
FMI_summary["Flag"] = [pair[0] for pair in flag_rec_pairs]
FMI_summary["Recommendation"] = [pair[1] for pair in flag_rec_pairs]

# Save to primary location and mirror to consistency root
final_file_primary = FMI_MONTHLY_ROOT / "fmi_profile.csv"
FMI_summary.to_csv(final_file_primary, index=False)

final_file_mirror = CONSISTENCY_ROOT / "fmi_profile.csv"
FMI_summary.to_csv(final_file_mirror, index=False)

# MonthsObserved counts report rows per variable, so its maximum is the
# largest number of files any single variable appeared in.
print(f"[OK] Consolidated FMI summary saved. Total Months Observed: {FMI_summary['MonthsObserved'].max()}")

[OK] Consolidated FMI summary saved. Total Months Observed: 40


In [5]:
# --- Load consolidated summary ---
# Re-read the profile written by the consolidation cell, so this cell works
# from the file on disk rather than from the in-memory FMI_summary.
fmi_df = pd.read_csv(FMI_MONTHLY_ROOT / "fmi_profile.csv")

# Full FMI (all variables)
# Same rows as fmi_profile.csv, saved under a second name.
full_out = FMI_MONTHLY_ROOT / "fmi_profile_all.csv"
fmi_df.to_csv(full_out, index=False)

# Consistent-only FMI (Crucial for Imputation stage)
# Keeps only rows whose tag is exactly "consistent" (case-sensitive match).
fmi_consistent = fmi_df[fmi_df["ConsistencyTag"] == "consistent"].copy()
consistent_out = FMI_MONTHLY_ROOT / "fmi_profile_consistent.csv"
fmi_consistent.to_csv(consistent_out, index=False)

print(f"[OK] Full FMI profile saved to {full_out}")
print(f"[OK] Consistent-only FMI profile saved to {consistent_out}")

[OK] Full FMI profile saved to G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\NEW FMI Reports V2\fmi_profile_all.csv
[OK] Consistent-only FMI profile saved to G:\.shortcut-targets-by-id\1VctTphaltRx4xcPxmTJlRTrxLalyuEt8\Labor Force Survey\NEW FMI Reports V2\fmi_profile_consistent.csv
