In [11]:
#!/usr/bin/env python3
"""
Create classification‑report CSVs for every central‑bank inference run,
skip any 'Llama‑4‑*', average over seeds, and summarise the best model per
(bank, feature).
"""

import ast
import json
import re
from collections import defaultdict
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from sklearn.metrics import classification_report

# ───────────────────────── CONFIG ───────────────────────── #
INFERENCE_ROOT = Path(
    "/Users/hp/Desktop/global-central-banks/llm_inference_output_few_shot"
)
METRICS_ROOT = Path(
    "/Users/hp/Desktop/global-central-banks/new_metrics_few_shot"
)
METRICS_ROOT.mkdir(parents=True, exist_ok=True)

# Skip *any* variant containing “llama‑4”
SKIP_PATTERN = re.compile(r"llama[-_]?4", re.I)

FENCE_RE  = re.compile(r"^```(?:json)?\s*|```", re.I)
LABEL_RE  = re.compile(r'"?label"?\s*:\s*"?([a-zA-Z]+)"?', re.I)
JSON_ERR  = "error"
# ─────────────────────────────────────────────────────────── #

# ═════════════════════ PARSING UTILITIES ═══════════════════ #
JSON_BLOCK_RE = re.compile(r"\{[\s\S]*?\}", re.M)


def _extract_json_block(text: str) -> str:
    m = JSON_BLOCK_RE.search(text)
    return m.group(0) if m else text


def _clean_pred(raw: str) -> str:
    """
    Normalise LLM response to a valid JSON string:
    • Strip ``` fences
    • Keep only the first {...} block
    • Collapse doubled quotes => "
    • Replace fancy quotes
    """
    raw = raw.strip()
    raw = FENCE_RE.sub("", raw).strip()

    brace = raw.find("{")
    if brace != -1:
        raw = raw[brace:]

    raw = re.sub(r'""([^"]+)""', r'"\1"', raw)  # collapse ""key""
    raw = raw.replace("“", '"').replace("”", '"')
    raw = _extract_json_block(raw)
    raw = raw.replace('""', '"') 
    raw = raw.rstrip(",")  # CSV‑escaped quotes
    return raw


def parse_llm_responses(df: pd.DataFrame, file_path: Path) -> list[str]:
    labels: list[str] = []
    fp = str(file_path.resolve())

    for i, raw_pred in enumerate(df["llm_responses"]):
        cleaned = _clean_pred(str(raw_pred))

        try:
            label = json.loads(cleaned).get("label", "").strip().lower()
        except json.JSONDecodeError:
            try:
                parsed = ast.literal_eval(cleaned)
                label = (
                    str(parsed.get("label", "")).strip().lower()
                    if isinstance(parsed, dict)
                    else ""
                )
            except Exception:
                m = LABEL_RE.search(cleaned)
                label = m.group(1).lower() if m else JSON_ERR

        if not label:
            label = JSON_ERR

        if label == JSON_ERR:
            print(f"[Row {i}] {fp}: stored as 'error' (parse failed)")

        labels.append(label)

    return labels


# ═══════════════════ EVALUATION ROUTINE ════════════════════ #
def evaluate_one_csv(
    file_path: Path,
    bank: str,
    feature_provider: str,
    model_dir: str,
    model_name: str,
) -> tuple[str, float, pd.DataFrame] | None:
    try:
        df = pd.read_csv(file_path)
        _, seed_str = file_path.stem.rsplit("_", 1)

        # -------- actual labels -------- #
        if "actual_labels" in df.columns:
            actual = [x.strip().lower() for x in df["actual_labels"]]
        else:
            feature = feature_provider.split("_", 1)[0]
            ds = load_dataset(
                f"gtfintechlab/{bank}",
                seed_str,
                trust_remote_code=True,
                split="test",
            )
            actual = [x.strip().lower() for x in ds[f"{feature}_label"]]

        # -------- predicted labels ----- #
        predicted = parse_llm_responses(df, file_path)

        clean_pairs = [(a, p) for a, p in zip(actual, predicted) if p != "error"]
        if not clean_pairs:
            raise ValueError("All rows in this file failed to parse!")

        actual, predicted = zip(*clean_pairs)
        skipped = len(df) - len(actual)
        if skipped:
            print(f"[INFO] {file_path.name}: skipped {skipped} unparsable rows")

        report = classification_report(
            actual, predicted, output_dict=True, zero_division=0
        )
        weighted_f1 = report["weighted avg"]["f1-score"]

        metrics_df = pd.DataFrame(report).T
        metrics_df["model"] = model_name
        metrics_df["seed"] = seed_str
        metrics_df["weighted_f1"] = weighted_f1

        out_dir = METRICS_ROOT / bank / feature_provider / model_dir
        out_dir.mkdir(parents=True, exist_ok=True)
        metrics_df.to_csv(out_dir / f"{model_name}_metrics_{seed_str}.csv")

        return seed_str, weighted_f1, metrics_df

    except Exception as e:
        print(f"[ERROR] {file_path.resolve()}: {e}")
        return None


# ═════════════════════════ MAIN LOOP ═══════════════════════ #
per_model_runs: defaultdict[
    tuple[str, str, str, str], list[tuple[str, float, pd.DataFrame]]
] = defaultdict(list)

for bank_dir in INFERENCE_ROOT.iterdir():
    if not bank_dir.is_dir():
        continue
    bank = bank_dir.name

    for featprov_dir in bank_dir.iterdir():
        if not featprov_dir.is_dir():
            continue
        feature_provider = featprov_dir.name

        for model_dir_path in featprov_dir.iterdir():
            if not model_dir_path.is_dir():
                continue
            model_dir = model_dir_path.name

            if SKIP_PATTERN.search(model_dir):
                print(f"Skipping directory {model_dir_path}")
                continue

            for csv_file in model_dir_path.glob("*.csv"):
                # extra guard: skip file itself if pattern appears in the path
                if SKIP_PATTERN.search(str(csv_file)):
                    # print(f"Skipping file {csv_file}")
                    continue

                model_name = "_".join(csv_file.stem.split("_")[:-1])  # drop seed

                res = evaluate_one_csv(
                    csv_file, bank, feature_provider, model_dir, model_name
                )
                if res:
                    per_model_runs[
                        (bank, feature_provider, model_dir, model_name)
                    ].append(res)

# ═══════════ AVERAGE ACROSS SEEDS & SUMMARY CSV ═══════════ #
summary_rows: list[dict] = []

for (bank, featprov, model_dir, model_name), runs in per_model_runs.items():
    concat_df = pd.concat([r[2] for r in runs])
    numeric_df = concat_df.select_dtypes(include="number")
    avg_df = numeric_df.groupby(level=0).mean()

    # scalar weighted‑F1
    avg_weighted_f1 = float(avg_df.loc["weighted avg", "weighted_f1"])

    # add back descriptors
    avg_df["model"] = model_name
    avg_df["seeds"] = ",".join(r[0] for r in runs)
    avg_df["weighted_f1"] = avg_weighted_f1

    out_dir = METRICS_ROOT / bank / featprov / model_dir
    out_df_path = out_dir / f"{model_name}_average_metrics.csv"
    avg_df.to_csv(out_df_path)

    feature = featprov.split("_", 1)[0]
    summary_rows.append(
        {
            "bank": bank,
            "feature": feature,
            "model_family": model_dir,
            "model_name": model_name,
            "avg_weighted_f1": avg_weighted_f1,
        }
    )

summary_df = pd.DataFrame(summary_rows)

# -------- best model per (bank, feature) -------- #
best_df = (
    summary_df.sort_values("avg_weighted_f1", ascending=False)
    .groupby(["bank", "feature"], as_index=False)
    .first()
)

best_csv = METRICS_ROOT / "best_model_per_bank_feature.csv"
best_df.to_csv(best_csv, index=False)

print(f"\nAll done! ➜  {best_csv}")



All done! ➜  /Users/hp/Desktop/global-central-banks/new_metrics_few_shot/best_model_per_bank_feature.csv


In [2]:
#!/usr/bin/env python3
"""
Re‑compute FINMA classification reports and place them in the new_metrics/
directory structure, keeping per‑seed files and one averaged file.

Input  tree:
    finma_results/<bank>/<feature>_finma-7b-full_<YYYYMMDD>_<SEED>.csv

Output tree (created fresh):
    new_metrics/<bank>/<feature>_finma/finma-7b-full/
        finma-7b-full_<YYYYMMDD>_<SEED>_metrics.csv
        finma-7b-full_<YYYYMMDD>_average_metrics.csv
"""

import json
from pathlib import Path
import shutil
import pandas as pd
from sklearn.metrics import classification_report

# ───────── PATHS ───────── #
RESULTS_ROOT = Path("/Users/hp/Desktop/global-central-banks/finma_results")
OUTPUT_ROOT  = Path("/Users/hp/Desktop/global-central-banks/new_metrics")
MODEL_NAME   = "finma-7b-full"
MODEL_FAMILY = "finma-7b-full"           # one more level, mirrors example

# ───────── HELPERS ───────── #
def safe_label(pred, valid):
    """
    Extract label from JSON or raw string; unseen → 'error'.
    """
    try:
        if isinstance(pred, str) and pred.strip().startswith("{"):
            pred = json.loads(pred)
        label = (pred.get("label") if isinstance(pred, dict) else str(pred)).strip().lower()
    except Exception:
        label = "error"
    return label if label in valid else "error"

def write_metrics(report_dict: dict, out_file: Path):
    pd.DataFrame(report_dict).transpose().to_csv(out_file)
    print(f"[✓] {out_file.relative_to(OUTPUT_ROOT.parent)}")

def average_reports(csv_files, out_file: Path):
    dfs = [pd.read_csv(f, index_col=0) for f in csv_files]
    # align indices then average numeric cols
    avg = pd.concat(dfs).groupby(level=0).mean(numeric_only=True)
    avg.to_csv(out_file)
    print(f"[✓] {out_file.relative_to(OUTPUT_ROOT.parent)}  (average)")

# ───────── CLEAN OLD ───────── #
if OUTPUT_ROOT.exists():
    shutil.rmtree(OUTPUT_ROOT)
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# ───────── MAIN ───────── #
for csv_path in RESULTS_ROOT.rglob("*.csv"):
    bank     = csv_path.parent.name
    stem     = csv_path.stem  # e.g. certain_finma-7b-full_20250501_5768
    parts    = stem.split("_")

    if len(parts) < 4 or not parts[1].startswith(MODEL_NAME):
        print(f"[SKIP] {csv_path.name}")
        continue

    feature, date_part, seed_part = parts[0], parts[-2], parts[-1]

    # load and compute metrics
    df     = pd.read_csv(csv_path)
    y_true = df["actual_labels"].str.lower()
    valid  = set(y_true.unique())
    y_pred = [safe_label(x, valid) for x in df["llm_responses"]]

    report = classification_report(
        y_true, y_pred,
        labels=sorted(valid | {"error"}),
        output_dict=True,
        zero_division=0
    )

    # build output path
    out_dir = (
        OUTPUT_ROOT
        / bank
        / f"{feature}_finma"
        / MODEL_FAMILY
    )
    out_dir.mkdir(parents=True, exist_ok=True)

    per_seed_file = out_dir / f"{MODEL_NAME}_{date_part}_{seed_part}_metrics.csv"
    write_metrics(report, per_seed_file)

# ─── produce per‑date averages (four files total per folder) ─── #
for feature_dir in OUTPUT_ROOT.rglob("*_finma"):
    per_seed_csvs = list((feature_dir / MODEL_FAMILY).glob(f"{MODEL_NAME}_*_metrics.csv"))
    if not per_seed_csvs:
        continue

    # all per‑seed files share same <DATE>; extract from first file name
    first_parts = per_seed_csvs[0].stem.split("_")
    date_part   = first_parts[-2]

    avg_file = (feature_dir / MODEL_FAMILY) / f"{MODEL_NAME}_{date_part}_average_metrics.csv"
    average_reports(per_seed_csvs, avg_file)

print("Finished. Each model folder now contains exactly 4 files.")


[✓] new_metrics/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_944601_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_5768_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/certain_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/certain_finma/finma-7b-full/finma-7b-full_20250501_5768_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_944601_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_5768_metrics.csv
[✓] new_metrics/central_reserve_bank_of_peru/certain_finma/finma-7b-full/finma-7b-full_20250501_94460

In [3]:
#!/usr/bin/env python3
"""
Replace FINMA inference CSVs in llm_inference_output_no_guide/ with the
fresh runs currently sitting in finma_results/.
"""

import shutil
from pathlib import Path

RESULTS_ROOT = Path("/Users/hp/Desktop/global-central-banks/finma_results")
OUTPUT_ROOT  = Path("/Users/hp/Desktop/global-central-banks/llm_inference_output_no_guide")
MODEL_FAMILY = "finma-7b-full"

# ───────── 1. delete existing CSVs ───────── #
for csv_path in OUTPUT_ROOT.rglob(f"*{MODEL_FAMILY}/*.csv"):
    print(f"[–] deleting {csv_path.relative_to(OUTPUT_ROOT.parent)}")
    csv_path.unlink()

# ───────── 2. move fresh files over ───────── #
for src_csv in RESULTS_ROOT.rglob("*.csv"):
    bank_dir  = src_csv.parent.name                     # e.g. bank_negara_malaysia
    stem      = src_csv.stem                            # certain_finma-7b-full_20250501_5768

    # derive feature directory name  (first token before '_')
    feature   = stem.split("_", 1)[0]                   # certain / stance / time
    tgt_dir   = (
        OUTPUT_ROOT
        / bank_dir
        / f"{feature}_finma"
        / MODEL_FAMILY
    )
    tgt_dir.mkdir(parents=True, exist_ok=True)

    dst_csv = tgt_dir / src_csv.name
    print(f"[→] moving  {src_csv.relative_to(RESULTS_ROOT.parent)}")
    print(f"    to      {dst_csv.relative_to(OUTPUT_ROOT.parent)}")
    shutil.move(src_csv, dst_csv)

print("FINMA outputs refreshed.")


[–] deleting llm_inference_output_no_guide/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[–] deleting llm_inference_output_no_guide/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_944601_metrics.csv
[–] deleting llm_inference_output_no_guide/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_5768_metrics.csv
[–] deleting llm_inference_output_no_guide/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[–] deleting llm_inference_output_no_guide/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_944601_metrics.csv
[–] deleting llm_inference_output_no_guide/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_5768_metrics.csv
[–] deleting llm_inference_output_no_guide/central_reserve_bank_of_peru/certain_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[–] deleting llm_inferenc

In [7]:
#!/usr/bin/env python3
"""
Re‑compute FINMA classification reports and store them in
new_metrics/<bank>/<feature>_finma/finma-7b-full/,
matching the existing schema:
precision, recall, f1‑score, support, model, seed, weighted_f1
"""

import json
from pathlib import Path
import pandas as pd
from sklearn.metrics import classification_report

# ───────── Paths & constants ───────── #
RESULTS_ROOT = Path("/Users/hp/Desktop/global-central-banks/finma_results")
OUTPUT_ROOT  = Path("/Users/hp/Desktop/global-central-banks/new_metrics")
MODEL_FAMILY = "finma-7b-full"        # directory name
MODEL_TAG    = "finma-7b-full"        # file prefix

# ───────── Helper functions ───────── #
def safe_pred(pred, valid_set):
    """Parse label from JSON or raw str; unseen → 'error'."""
    try:
        if isinstance(pred, str) and pred.strip().startswith("{"):
            pred = json.loads(pred)
        label = (pred.get("label") if isinstance(pred, dict) else str(pred)).strip().lower()
    except Exception:
        label = "error"
    return label if label in valid_set else "error"

def build_df(report, model_tag, seed):
    """Convert report dict → DataFrame + extra cols, drop 'error' row if present."""
    df = pd.DataFrame(report).transpose()
    if "error" in df.index:
        df = df.drop(index="error")

    weighted_f1 = report["weighted avg"]["f1-score"]
    df["model"]       = model_tag
    df["seed"]        = seed
    df["weighted_f1"] = weighted_f1
    return df

def delete_old_finma_files(folder: Path):
    """Remove only the FINMA csvs; leave others intact."""
    for f in folder.glob(f"{MODEL_TAG}_*"):
        if f.is_file():
            f.unlink()

# ───────── Main processing ───────── #
per_date_files = {}      # (bank, feature, date) → list[csv_path]

for src in RESULTS_ROOT.rglob("*.csv"):
    stem = src.stem                       # e.g. certain_finma-7b-full_20250420_5768
    parts = stem.split("_")
    if len(parts) < 4 or parts[1] != MODEL_TAG:
        continue                          # skip non‑FINMA files

    feature, date_part, seed_part = parts[0], parts[-2], parts[-1]
    bank    = src.parent.name

    # load data
    df_in   = pd.read_csv(src)
    y_true  = df_in["actual_labels"].str.lower()
    valid   = set(y_true.unique())
    y_pred  = [safe_pred(x, valid) for x in df_in["llm_responses"]]

    report  = classification_report(
        y_true, y_pred,
        output_dict=True,
        zero_division=0
    )

    # output dir
    tgt_dir = OUTPUT_ROOT / bank / f"{feature}_finma" / MODEL_FAMILY
    if not tgt_dir.exists():
        tgt_dir.mkdir(parents=True, exist_ok=True)
    # remove old FINMA files in this folder once
    if not (tgt_dir / ".cleaned").exists():
        delete_old_finma_files(tgt_dir)
        (tgt_dir / ".cleaned").touch()     # sentinel

    out_csv = tgt_dir / f"{MODEL_TAG}_{date_part}_{seed_part}_metrics.csv"
    build_df(report, f"{MODEL_TAG}_{date_part}", seed_part).to_csv(out_csv, index=True)
    print(f"[metrics] {out_csv.relative_to(OUTPUT_ROOT.parent)}")

    per_date_files.setdefault((bank, feature, date_part), []).append(out_csv)

# ───────── Cross‑seed averages ───────── #
for (bank, feature, date_part), file_list in per_date_files.items():
    if len(file_list) < 2:                # need ≥2 seeds to average
        continue

    dfs  = [pd.read_csv(p, index_col=0) for p in file_list]
    avg  = pd.concat(dfs).groupby(level=0).mean(numeric_only=True)

    # copy text cols from first df; overwrite seed & model cols
    tmpl = dfs[0]
    avg["model"] = tmpl["model"].iloc[0]
    avg["seed"]  = ",".join(sorted(p.stem.split("_")[-1] for p in file_list))
    avg["weighted_f1"] = avg["weighted_f1"]         # already mean

    avg_csv = (
        OUTPUT_ROOT / bank / f"{feature}_finma" / MODEL_FAMILY /
        f"{MODEL_TAG}_{date_part}_average_metrics.csv"
    )
    avg.to_csv(avg_csv)
    print(f"[average] {avg_csv.relative_to(OUTPUT_ROOT.parent)}")

print("FINMA metrics rewritten in the desired schema (other models untouched).")


FINMA metrics rewritten in the desired schema (other models untouched).


In [8]:
#!/usr/bin/env python3
"""
Patch existing FINMA metrics CSVs *in place* so they match the schema:
precision, recall, f1-score, support, model, seed, weighted_f1
"""

from pathlib import Path
import pandas as pd

ROOT = Path("/Users/hp/Desktop/global-central-banks/new_metrics")
MODEL_FAMILY = "finma-7b-full"

for csv_path in ROOT.rglob(f"{MODEL_FAMILY}/*_metrics.csv"):
    df = pd.read_csv(csv_path, index_col=0)

    # 1) drop 'error' row if present
    if "error" in df.index:
        df = df.drop(index="error")

    # 2) extract DATE + SEED from filename
    stem_parts = csv_path.stem.split("_")              # finma-7b-full_<DATE>_<SEED>_metrics
    if len(stem_parts) < 4:
        print(f"[skip] unexpected name {csv_path.name}")
        continue
    date_part, seed_part = stem_parts[-3], stem_parts[-2]

    # 3) add/replace extra columns
    weighted_f1 = df.loc["weighted avg", "f1-score"]
    df["model"]       = f"{MODEL_FAMILY}_{date_part}"
    df["seed"]        = seed_part
    df["weighted_f1"] = weighted_f1

    df.to_csv(csv_path)
    print(f"[fixed] {csv_path.relative_to(ROOT.parent)}")

print("All FINMA metrics files patched.")


[fixed] new_metrics/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_78516_average_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_944601_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/stance_finma/finma-7b-full/finma-7b-full_20250501_5768_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_78516_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_78516_average_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_944601_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/time_finma/finma-7b-full/finma-7b-full_20250501_5768_metrics.csv
[fixed] new_metrics/central_reserve_bank_of_peru/certain_finma/finma-7b-fu