In [1]:
!pip install factor-analyzer



In [2]:
import json
from pathlib import Path
import pandas as pd
import numpy as np
from factor_analyzer import FactorAnalyzer, calculate_kmo, calculate_bartlett_sphericity
import matplotlib.pyplot as plt

# ------------------------------------------------------------
# Load settings from config.json
# ------------------------------------------------------------
with open(Path("./data/interim/config.json")) as f:
    cfg = json.load(f)

BASE_PATH = Path(cfg["BASE_PATH"])
CONSISTENCY_ROOT = BASE_PATH / "NEW Variable Consistency Check"
IMPUTED_ROOT = BASE_PATH / "Imputed Data for Analysis"
OUTPUT_RESULTS = BASE_PATH / "FA_Results"
OUTPUT_RESULTS.mkdir(exist_ok=True)


In [3]:
def normalize_name(name: str) -> str:
    return (
        str(name)
        .strip()
        .lower()
        .replace("\xa0", " ")
        .replace("-", " ")
        .replace("_", " ")
    )


This codeblock normalizes column names to avoid mismatches (e.g., “Available for Work” vs “available_for_work” given that they have the same meaning). This guarantees consistent variable matching across years and pipelines. **If you tweak this, keep transformations symmetric with imputation (view 09_Imputation. I applied the same rules for 10_Factor_Analysis). Changing normalization can break matching.**

In [4]:
consistency_df = pd.read_csv(CONSISTENCY_ROOT / "consistency_profile.csv")
consistent_vars = consistency_df[consistency_df["ConsistencyTag"] == "consistent"]["Variable"].tolist()
consistent_vars_norm = [normalize_name(v) for v in consistent_vars]


This code restricts analysis strictly to variables tagged “consistent”. **Factor analysis assumes that the variables being analyzed represent the same constructs across time.** If we include variables that change definition, coding, or availability between survey rounds, the resulting factors could be misleading. Restricting to consistent variables enforces the methodological guard  we  set during manual factor formation as it guarantees that the latent dimensions (sensitivity, resilience, exposure) are built on a stable foundation.



Changing which variables are tagged “consistent” (if ever this was tweaked) directly alters the set of inputs to factor analysis. That means the latent structure itself could shift — for example, adding a new employment indicator might create or split factors, while removing a resilience proxy could weaken interpretability. Any adjustment must be documented and justified, because it changes the theoretical and empirical basis of your Regional Financial Vulnerability Index.

In [6]:
results = []

for year_folder in sorted(IMPUTED_ROOT.iterdir()):
    if not year_folder.is_dir():
        continue
    
    print(f"\n=== Processing Year: {year_folder.name} ===")
    
    for file in year_folder.glob("imputed_*.csv"):
        print(f"Dataset: {file.name}")
        df = pd.read_csv(file, low_memory=False)
        
        # Normalize columns
        df.columns = [normalize_name(c) for c in df.columns]
        
        # Match consistent variables
        matched_vars = [v for v in consistent_vars_norm if v in df.columns]
        unmatched_vars = [v for v in consistent_vars_norm if v not in df.columns]
        
        if unmatched_vars:
            print("[WARNING] Unmatched variables:", unmatched_vars[:5], "...")
        
        # Restrict to matched consistent variables
        df_consistent = df[matched_vars].dropna()
        
        # Optimization: sample rows if dataset is huge
        if len(df_consistent) > 20000:
            df_consistent = df_consistent.sample(n=10000, random_state=42)
        
        # --- Cleaning step ---
        for col in df_consistent.columns:
            if not pd.api.types.is_numeric_dtype(df_consistent[col]):
                df_consistent[col] = pd.Categorical(df_consistent[col]).codes
        
        df_consistent = df_consistent.replace([np.inf, -np.inf], np.nan)
        df_consistent = df_consistent.dropna()
        df_consistent = df_consistent.loc[:, df_consistent.nunique() > 1]  # drop constant cols
        
        print("Shape after cleaning:", df_consistent.shape)
        
        # Suitability tests
        try:
            kmo_all, kmo_model = calculate_kmo(df_consistent)
            chi_square_value, p_value = calculate_bartlett_sphericity(df_consistent)
        except Exception as e:
            print("Suitability test failed:", e)
            continue
        
        print(f"KMO={kmo_model:.3f}, Bartlett p={p_value:.5f}")
        
        # Factor analysis
        fa = FactorAnalyzer(rotation="varimax")
        fa.fit(df_consistent)
        
        ev, v = fa.get_eigenvalues()
        
        # Save results
        results.append({
            "Year": year_folder.name,
            "Dataset": file.name,
            "KMO": kmo_model,
            "Bartlett_p": p_value,
            "Eigenvalues": ev.tolist(),
            "Loadings": pd.DataFrame(fa.loadings_, index=df_consistent.columns)
        })



=== Processing Year: 2018 ===
Dataset: imputed_APRIL_2018.csv
Shape after cleaning: (10000, 29)
KMO=0.623, Bartlett p=0.00000
Dataset: imputed_JULY_2018.csv




Shape after cleaning: (10000, 29)
KMO=0.643, Bartlett p=0.00000
Dataset: imputed_JANUARY_2018.csv




Shape after cleaning: (10000, 29)
KMO=0.594, Bartlett p=0.00000
Dataset: imputed_OCTOBER_2018.csv




Shape after cleaning: (10000, 29)
KMO=0.593, Bartlett p=0.00000

=== Processing Year: 2019 ===
Dataset: imputed_APRIL_2019.csv




Shape after cleaning: (10000, 29)
KMO=0.608, Bartlett p=0.00000
Dataset: imputed_JULY_2019.csv




Shape after cleaning: (10000, 29)
KMO=0.606, Bartlett p=0.00000
Dataset: imputed_OCTOBER_2019.csv




Shape after cleaning: (10000, 29)
KMO=0.591, Bartlett p=0.00000
Dataset: imputed_JANUARY_2019.csv




Shape after cleaning: (10000, 29)
KMO=0.643, Bartlett p=0.00000

=== Processing Year: 2022 ===
Dataset: imputed_JULY_2022.csv




Shape after cleaning: (10000, 29)
KMO=0.616, Bartlett p=0.00000
Dataset: imputed_AUGUST_2022.csv




Shape after cleaning: (10000, 29)
KMO=0.562, Bartlett p=0.00000
Dataset: imputed_DECEMBER_2022.csv




Shape after cleaning: (10000, 29)
KMO=0.577, Bartlett p=0.00000
Dataset: imputed_NOVEMBER_2022.csv




Shape after cleaning: (10000, 29)
KMO=0.609, Bartlett p=0.00000
Dataset: imputed_OCTOBER_2022.csv




Shape after cleaning: (10000, 29)
KMO=0.625, Bartlett p=0.00000
Dataset: imputed_SEPTEMBER_2022.csv




Shape after cleaning: (10000, 29)
KMO=0.556, Bartlett p=0.00000

=== Processing Year: 2023 ===
Dataset: imputed_APRIL_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.547, Bartlett p=0.00000
Dataset: imputed_AUGUST_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.637, Bartlett p=0.00000
Dataset: imputed_DECEMBER_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.658, Bartlett p=0.00000
Dataset: imputed_FEBRUARY_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.585, Bartlett p=0.00000
Dataset: imputed_JANUARY_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.539, Bartlett p=0.00000
Dataset: imputed_JULY_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.637, Bartlett p=0.00000
Dataset: imputed_JUNE_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.562, Bartlett p=0.00000
Dataset: imputed_MARCH_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.569, Bartlett p=0.00000
Dataset: imputed_NOVEMBER_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.633, Bartlett p=0.00000
Dataset: imputed_OCTOBER_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.612, Bartlett p=0.00000




Dataset: imputed_SEPTEMBER_2023.csv
Shape after cleaning: (10000, 29)
KMO=0.637, Bartlett p=0.00000
Dataset: imputed_MAY_2023.csv




Shape after cleaning: (10000, 29)
KMO=0.571, Bartlett p=0.00000

=== Processing Year: 2024 ===
Dataset: imputed_FEBRUARY_2024.csv




Shape after cleaning: (10000, 29)
KMO=0.652, Bartlett p=0.00000
Dataset: imputed_APRIL_2024.csv




Shape after cleaning: (10000, 29)
KMO=0.612, Bartlett p=0.00000
Dataset: imputed_JANUARY_2024.csv




Shape after cleaning: (10000, 29)
KMO=0.603, Bartlett p=0.00000
Dataset: imputed_AUGUST_2024.csv




Shape after cleaning: (10000, 29)
KMO=0.649, Bartlett p=0.00000
Dataset: imputed_JULY_2024.csv




Shape after cleaning: (10000, 29)
KMO=0.629, Bartlett p=0.00000
Dataset: imputed_MARCH_2024.csv




Shape after cleaning: (10000, 29)
KMO=0.631, Bartlett p=0.00000
Dataset: imputed_MAY_2024.csv




Shape after cleaning: (10000, 28)
KMO=0.636, Bartlett p=0.00000
Dataset: imputed_JUNE_2024.csv




Shape after cleaning: (10000, 29)
KMO=0.645, Bartlett p=0.00000




In [7]:
for res in results:
    year = res["Year"]
    dataset = res["Dataset"]
    
    # Scree plot
    plt.figure()
    plt.scatter(range(1, len(res["Eigenvalues"])+1), res["Eigenvalues"])
    plt.plot(range(1, len(res["Eigenvalues"])+1), res["Eigenvalues"])
    plt.title(f"Scree Plot - {year} {dataset}")
    plt.xlabel("Factors")
    plt.ylabel("Eigenvalue")
    plt.grid(True)
    plt.savefig(OUTPUT_RESULTS / f"scree_{year}_{dataset}.png")
    plt.close()
    
    # Loadings
    loadings_file = OUTPUT_RESULTS / f"loadings_{year}_{dataset}.csv"
    res["Loadings"].to_csv(loadings_file)


In [8]:
summary = pd.DataFrame([{
    "Year": r["Year"],
    "Dataset": r["Dataset"],
    "KMO": r["KMO"],
    "Bartlett_p": r["Bartlett_p"],
    "Factors>1": sum(np.array(r["Eigenvalues"]) > 1)
} for r in results])

summary_file = OUTPUT_RESULTS / "FA_summary.csv"
summary.to_csv(summary_file, index=False)
print(summary)


    Year                     Dataset       KMO  Bartlett_p  Factors>1
0   2018      imputed_APRIL_2018.csv  0.622814         0.0         11
1   2018       imputed_JULY_2018.csv  0.643198         0.0          9
2   2018    imputed_JANUARY_2018.csv  0.593688         0.0         11
3   2018    imputed_OCTOBER_2018.csv  0.592693         0.0         10
4   2019      imputed_APRIL_2019.csv  0.608462         0.0         11
5   2019       imputed_JULY_2019.csv  0.606434         0.0         11
6   2019    imputed_OCTOBER_2019.csv  0.590601         0.0         11
7   2019    imputed_JANUARY_2019.csv  0.642515         0.0         11
8   2022       imputed_JULY_2022.csv  0.615851         0.0         10
9   2022     imputed_AUGUST_2022.csv  0.561541         0.0         12
10  2022   imputed_DECEMBER_2022.csv  0.576514         0.0         12
11  2022   imputed_NOVEMBER_2022.csv  0.608900         0.0         11
12  2022    imputed_OCTOBER_2022.csv  0.624810         0.0         11
13  2022  imputed_SE

Validity Checks in Factor Analysis

KMO statistic
- Measures sampling adequacy.
- Rule of thumb: >0.6 is acceptable, >0.7 is good, >0.8 is great.
- Justification: Shows that correlations among variables are strong enough to support factor extraction.

Bartlett’s test of sphericity

- Tests whether your correlation matrix is significantly different from an identity matrix.
- A significant p‑value (<0.05) means factor analysis is valid because variables are correlated.
- Justification: Ensures you’re not forcing factors out of unrelated variables.

Scree plot

- Visualizes eigenvalues.
- The “elbow” point helps decide how many factors to retain.
- Justification: Prevents over‑factoring and supports transparent retention decisions.

Factor loadings

- Show how strongly each variable contributes to a factor.
- Loadings >0.4 are typically considered meaningful.
- Justification: Provides interpretability — you can argue that factors represent real constructs (e.g., resilience, sensitivity).

Cross‑year stability

- Running FA across multiple years lets you check if factor structures are consistent.
- Justification: Strengthens validity by showing constructs aren’t artifacts of one dataset.