Fraction of Missing Information

This notebook computes FMI diagnostics for renamed survey files.
It builds on:
- Config loader (`config.json`) for BASE_PATH, INTERIM_DIR, PROCESSED_DIR, LOG_DIR
- Inventory (`inventory.json`) for locating renamed survey files
- Consistency profile (`consistency_profile.csv`) for tagging variables

Outputs:
- Per‑month FMI reports in NEW Variable Consistency Check/FMI_Reports/ (one FMI_<Month>_<Year>.csv per survey file)
- Consolidated FMI summary in NEW Variable Consistency Check/fmi_profile.csv
- Log file in LOG_DIR/fmi_report_<YYYYMMDD_HHMMSS>.log (a new file per run)

Key improvements over old FMI automation:
- No hardcoded paths (uses config loader from 00)
- Canonical variable alignment
- Integration with consistency profile
- Modular missingness detection
- Unified outputs and logging


In [None]:

import json
from pathlib import Path
import os
import pandas as pd
import numpy as np
from datetime import datetime

# --- Load config ---
# NOTE: the config path is relative to the notebook's working directory.
# JSON is read as UTF-8 explicitly so that non-ASCII folder names decode the
# same way on every platform instead of depending on the locale default.
with open(Path("./data/interim/config.json"), encoding="utf-8") as f:
    cfg = json.load(f)

BASE_PATH = Path(cfg["BASE_PATH"])
INTERIM_DIR = Path(cfg["INTERIM_DIR"])
PROCESSED_DIR = Path(cfg["PROCESSED_DIR"])
LOG_DIR = Path(cfg["LOG_DIR"])
MONTH_ORDER = cfg["MONTH_ORDER"]

# --- Load inventory ---
with open(INTERIM_DIR / "inventory.json", encoding="utf-8") as f:
    inventory = json.load(f)

# --- Paths ---
RENAMED_ROOT = BASE_PATH / "NEW Renamed Fully Decoded Surveys"
CONSISTENCY_ROOT = BASE_PATH / "NEW Variable Consistency Check"
OUTPUT_PATH = CONSISTENCY_ROOT / "FMI_Reports"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# --- Load consistency profile ---
consistency_df = pd.read_csv(CONSISTENCY_ROOT / "consistency_profile.csv")
# Variable names are stripped here because fmi_scan_csv looks tags up by the
# stripped column name; a key with stray whitespace could otherwise never match
# and the variable would silently be tagged "unknown".
consistency_tags = dict(
    zip(
        consistency_df["Variable"].astype(str).str.strip(),
        consistency_df["ConsistencyTag"],
    )
)


In [None]:
# Text tokens that count as "missing" once surrounding whitespace is removed.
TEXT_MISSING = {"", " ", "NA", "N/A", "NaN", "nan", ".", "-", "_"}
# Numeric codes that count as "missing" when sentinel detection is enabled.
NUMERIC_SENTINELS = {9, 99, 999, 9999, -9, -99, -999, -9999}


def build_missing_mask(series: pd.Series,
                       include_numeric_sentinels: bool = True) -> pd.Series:
    """Return a boolean mask marking the entries of *series* treated as missing.

    An entry is flagged when it is null, when its whitespace-stripped string
    form is empty or listed in ``TEXT_MISSING``, or (only if
    *include_numeric_sentinels* is true) when it parses as a number contained
    in ``NUMERIC_SENTINELS``.

    Args:
        series: Column to inspect.
        include_numeric_sentinels: Whether values such as 9 / 99 / -999 are
            also counted as missing.

    Returns:
        Boolean Series aligned with *series*; True means "missing".
    """
    as_text = series.astype(str).str.strip()
    is_missing = as_text.isin(TEXT_MISSING) | as_text.eq("") | series.isna()

    if not include_numeric_sentinels:
        return is_missing

    # Values that cannot be parsed become NaN and therefore never match a sentinel.
    numeric_view = pd.to_numeric(series, errors="coerce")
    return is_missing | numeric_view.isin(NUMERIC_SENTINELS)


#### FMI Scanner per file

In [None]:
def fmi_scan_csv(file_path: Path, year: str, month: str) -> pd.DataFrame:
    """Build a per-column missingness report for one survey CSV.

    The file is read with pandas, each column is passed to
    ``build_missing_mask``, and the share of flagged entries is classified
    into a severity band. The consistency tag is looked up in the module-level
    ``consistency_tags`` mapping by stripped column name ("unknown" if absent).

    Args:
        file_path: CSV file to scan.
        year: Label stored in the "Year" column of every row.
        month: Label stored in the "Month" column of every row.

    Returns:
        DataFrame with one row per column of the CSV and the fields Year,
        Month, Variable, Missing, Total, FMI, Flag, Recommendation and
        ConsistencyTag.
    """
    # (exclusive upper bound, flag, recommendation), checked in ascending order.
    severity_bands = (
        (0.05, "Low", "Keep"),
        (0.20, "Moderate", "Consider imputation"),
        (0.40, "High", "Strongly consider imputation"),
    )
    top_band = ("Critical", "Candidate to drop (validate with business logic)")

    frame = pd.read_csv(file_path, low_memory=False)
    records = []
    for column in frame.columns:
        n_missing = int(build_missing_mask(frame[column]).sum())
        n_total = int(len(frame[column]))
        # A column without rows is reported as 0.0 rather than dividing by zero.
        ratio = n_missing / n_total if n_total > 0 else 0.0

        # The band is chosen from the unrounded ratio; rounding only affects the stored FMI.
        flag, rec = next(
            ((label, advice) for bound, label, advice in severity_bands if ratio < bound),
            top_band,
        )

        variable = column.strip()
        records.append({
            "Year": year,
            "Month": month,
            "Variable": variable,
            "Missing": n_missing,
            "Total": n_total,
            "FMI": round(ratio, 6),
            "Flag": flag,
            "Recommendation": rec,
            "ConsistencyTag": consistency_tags.get(variable, "unknown"),
        })
    return pd.DataFrame(records)


#### Batch FMI Report Generator

In [None]:
# Codeblock 4 — Batch FMI Runner with UTF-8 logging (fixed)
#
# Walks every year folder under RENAMED_ROOT, scans each CSV with fmi_scan_csv,
# writes one FMI_<Month>_<Year>.csv per file into OUTPUT_PATH, and collects the
# reports in `all_reports` for the summary cell. Progress and per-file errors
# go to a timestamped log file in LOG_DIR.

import os
import pandas as pd
from datetime import datetime

# open() does not create missing parent folders, so make sure LOG_DIR exists.
os.makedirs(LOG_DIR, exist_ok=True)
log_file = LOG_DIR / f"fmi_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

success_count, error_count = 0, 0
all_reports = []

with open(log_file, "w", encoding="utf-8") as log:
    log.write("STARTING FMI REPORT\n")
    log.write(f"Source: {RENAMED_ROOT}\n")
    log.write(f"Dest:   {OUTPUT_PATH}\n")
    log.write("===============================================\n\n")

    # os.listdir returns entries in arbitrary order; sorting keeps the log and
    # the order of all_reports reproducible between runs.
    for year in sorted(os.listdir(RENAMED_ROOT)):
        year_folder = RENAMED_ROOT / year
        if not year_folder.is_dir():
            continue

        for file in sorted(os.listdir(year_folder)):
            # Match the extension case-insensitively so ".csv" files are not
            # silently skipped alongside ".CSV" ones.
            if not file.lower().endswith(".csv"):
                continue

            # The month label is the part of the file name before the first underscore.
            month = file.split("_")[0].capitalize()
            file_path = year_folder / file

            # Best-effort batch: a failing file is logged and counted, and the
            # run continues with the remaining files.
            try:
                report = fmi_scan_csv(file_path, year, month)
                out_file = OUTPUT_PATH / f"FMI_{month}_{year}.csv"
                report.to_csv(out_file, index=False)
                all_reports.append(report)

                success_count += 1
                log.write(f"[OK] {file} → {len(report)} variables\n")
            except Exception as e:
                error_count += 1
                log.write(f"[ERROR] {file} → {e}\n")

    log.write("\n===============================================\n")
    log.write(f"COMPLETED. Success: {success_count} | Errors: {error_count}\n")


#### Weighted Summary Across All Months

In [None]:
# Stack the per-file reports collected by the batch runner into one long table.
combined = pd.concat(all_reports, ignore_index=True)

# One row per (Variable, ConsistencyTag): summed counts, the unweighted mean of
# the per-file FMI values, and the number of report rows that contributed.
FMI_summary = combined.groupby(["Variable", "ConsistencyTag"], as_index=False).agg(
    TotalMissing=pd.NamedAgg(column="Missing", aggfunc="sum"),
    TotalRows=pd.NamedAgg(column="Total", aggfunc="sum"),
    AvgFMI=pd.NamedAgg(column="FMI", aggfunc="mean"),
    MonthsObserved=pd.NamedAgg(column="Year", aggfunc="count"),
)

# Row-weighted FMI: total missing entries over total entries across all files.
FMI_summary["OverallFMI"] = FMI_summary["TotalMissing"].div(FMI_summary["TotalRows"])

def flag_and_rec(fmi):
    """Map an FMI ratio to a ``(flag, recommendation)`` pair.

    Upper bounds are exclusive: below 0.05 is "Low", below 0.20 is "Moderate",
    below 0.40 is "High". Anything else, including NaN (for which every ``<``
    comparison is false), falls through to "Critical".
    """
    bands = (
        (0.05, ("Low", "Keep")),
        (0.20, ("Moderate", "Consider imputation")),
        (0.40, ("High", "Strongly consider imputation")),
    )
    for upper_bound, verdict in bands:
        if fmi < upper_bound:
            return verdict
    return "Critical", "Candidate to drop (validate with business logic)"

# Classify every variable by its row-weighted FMI, then split the resulting
# (flag, recommendation) pairs into the two output columns.
_verdicts = [flag_and_rec(value) for value in FMI_summary["OverallFMI"]]
FMI_summary["Flag"] = [flag for flag, _ in _verdicts]
FMI_summary["Recommendation"] = [advice for _, advice in _verdicts]

# Persist the consolidated profile next to the consistency profile.
FMI_summary.to_csv(CONSISTENCY_ROOT / "fmi_profile.csv", index=False)


In [None]:
# Codeblock 5 — Consolidated FMI Summary with Drive folder creation
#
# Reloads the per-file FMI reports from disk (replacing the in-memory
# `all_reports` list) and stacks them into `combined` for aggregation.

import os
import pandas as pd

# --- Paths ---
RENAMED_ROOT = BASE_PATH / "NEW Renamed Fully Decoded Surveys"
CONSISTENCY_ROOT = BASE_PATH / "NEW Variable Consistency Check"
FMI_REPORTS_ROOT = CONSISTENCY_ROOT / "NEW_FMI_Reports"
if not FMI_REPORTS_ROOT.is_dir():
    # Nothing in this notebook creates "NEW_FMI_Reports"; the batch runner
    # writes flat FMI_<Month>_<Year>.csv files into "FMI_Reports". Fall back
    # to that folder so this cell can consume the notebook's own output.
    FMI_REPORTS_ROOT = CONSISTENCY_ROOT / "FMI_Reports"
FINAL_SUMMARY_DIR = BASE_PATH / "NEW_FMI_Summary"   # new folder in Drive
os.makedirs(FINAL_SUMMARY_DIR, exist_ok=True)

# --- Load all monthly FMI reports ---
# Both layouts are accepted: reports stored directly in the root and reports
# stored inside per-year subfolders. Sorting keeps the load order reproducible.
all_reports = []
for entry in sorted(os.listdir(FMI_REPORTS_ROOT)):
    entry_path = FMI_REPORTS_ROOT / entry
    if entry_path.is_dir():
        candidates = [entry_path / name for name in sorted(os.listdir(entry_path))]
    else:
        candidates = [entry_path]
    for file_path in candidates:
        if file_path.is_file() and file_path.name.endswith(".csv"):
            all_reports.append(pd.read_csv(file_path))

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the folder that was searched instead.
if not all_reports:
    raise FileNotFoundError(f"No FMI report CSVs found under {FMI_REPORTS_ROOT}")

# --- Combine all reports ---
combined = pd.concat(all_reports, ignore_index=True)

# --- Aggregate per variable across all years/months ---
# Collapse the stacked reports to one row per (Variable, ConsistencyTag).
# Each entry maps an output column to (source column, reducer).
FMI_summary = combined.groupby(["Variable", "ConsistencyTag"], as_index=False).agg(
    **{
        "TotalMissing": ("Missing", "sum"),
        "TotalRows": ("Total", "sum"),
        "AvgFMI": ("FMI", "mean"),
        "MonthsObserved": ("Year", "count"),
    }
)

# Overall FMI is weighted by row counts: summed missing entries over summed rows.
FMI_summary["OverallFMI"] = FMI_summary["TotalMissing"].div(FMI_summary["TotalRows"])

# Flag severity based on OverallFMI
def flag_and_rec(fmi):
    """Return the ``(flag, recommendation)`` pair for an FMI ratio.

    The first band whose exclusive upper bound exceeds *fmi* wins
    (0.05 → Low, 0.20 → Moderate, 0.40 → High). Values matching no band,
    NaN included, are classified as "Critical".
    """
    thresholds = (
        (0.05, "Low", "Keep"),
        (0.20, "Moderate", "Consider imputation"),
        (0.40, "High", "Strongly consider imputation"),
    )
    return next(
        ((flag, advice) for limit, flag, advice in thresholds if fmi < limit),
        ("Critical", "Candidate to drop (validate with business logic)"),
    )

# Classify each variable by OverallFMI and unpack the (flag, recommendation)
# pairs into their own columns.
_verdicts = [flag_and_rec(value) for value in FMI_summary["OverallFMI"]]
FMI_summary["Flag"] = [flag for flag, _ in _verdicts]
FMI_summary["Recommendation"] = [advice for _, advice in _verdicts]

# --- Save consolidated summary ---
final_file = FINAL_SUMMARY_DIR / "fmi_profile.csv"
FMI_summary.to_csv(final_file, index=False)

print(f"[OK] Consolidated FMI summary saved to {final_file}")


In [None]:
# Codeblock 6 — Dual FMI Outputs (all variables and consistent variables)
#
# Reads the consolidated profile from FINAL_SUMMARY_DIR and writes two files:
# a copy containing every variable, and a subset restricted to rows whose
# ConsistencyTag equals "consistent".

import os
import pandas as pd

# --- Paths ---
CONSISTENCY_ROOT = BASE_PATH / "NEW Variable Consistency Check"
FINAL_SUMMARY_DIR = BASE_PATH / "NEW_FMI_Summary"
FINAL_SUMMARY_DIR.mkdir(parents=True, exist_ok=True)

fmi_profile_path = FINAL_SUMMARY_DIR / "fmi_profile.csv"
full_out = FINAL_SUMMARY_DIR / "fmi_profile_all.csv"
consistent_out = FINAL_SUMMARY_DIR / "fmi_profile_consistent.csv"

# --- Load consolidated FMI from Codeblock 5 and re-export it unchanged ---
fmi_df = pd.read_csv(fmi_profile_path)
fmi_df.to_csv(full_out, index=False)

# --- Keep only the variables tagged exactly "consistent" ---
fmi_consistent = fmi_df.loc[fmi_df["ConsistencyTag"].eq("consistent")].copy()
fmi_consistent.to_csv(consistent_out, index=False)

print(f"[OK] Full FMI profile saved to {full_out}")
print(f"[OK] Consistent-only FMI profile saved to {consistent_out}")
