# Failure Landscape Notebook (B-v2-step2-2)

목적: **Stage-structured failure를 회복하기 위한 Diff-Formatter 실험 관찰**
> 아래 구조를 통해 git apply 가능한 unified diff로 정규화(normalize)하는 시도

- single sLM
- 1차: Generator
- 2차: Formatter (동일 모델 2-call)

In [1]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

# ---- load ----
# 이미 results_csv 변수가 있으면 그걸 쓰고, 없으면 latest를 찾음
def find_latest_results_csv(runs_dir: Path) -> Path:
    """Return the most recently modified ``results.csv`` one level below *runs_dir*.

    Only files matching ``<runs_dir>/<subdir>/results.csv`` are considered;
    recency is judged by each file's modification time.

    Raises:
        FileNotFoundError: If no matching ``results.csv`` exists.
    """
    def modified_at(csv_path: Path) -> float:
        return csv_path.stat().st_mtime

    candidates = list(runs_dir.glob('*/results.csv'))
    if not candidates:
        raise FileNotFoundError(f'No results.csv found under: {runs_dir}')
    # max() keeps the first of several equally-new files, matching a stable
    # descending sort followed by taking element 0.
    return max(candidates, key=modified_at)

# The runs directory sits next to the parent of the current working directory.
RUNS_DIR = Path('..').resolve().joinpath("runs")

# Keep a results_csv that was already defined earlier in the session;
# only when the name is unbound do we fall back to the newest run.
try:
    results_csv
except NameError:
    results_csv = find_latest_results_csv(RUNS_DIR)

df = pd.read_csv(results_csv)
print("loaded:", results_csv)
n_rows, n_cols = df.shape
print("rows:", n_rows, "cols:", n_cols)

loaded: /home/dibaeck/workspace/project_IR_sLM_MAS/runs/exp1_init_qwen2p5_baseline_20260213_060256/results.csv
rows: 200 cols: 23


In [2]:
# ---- safe normalize ----
# Columns the analysis below relies on; any that the CSV lacks are created
# as all-NaN so later cells can index them unconditionally.
expected_columns = ("format_used", "format_ok", "format_reason", "signature", "error_type")
missing_columns = [name for name in expected_columns if name not in df.columns]
for name in missing_columns:
    df[name] = np.nan

def to_bool_series(s):
    """Coerce a flag-like Series to booleans.

    A Series that already has ``bool`` dtype is returned after
    ``fillna(False)``. Any other Series is converted to strings, stripped of
    surrounding whitespace, lower-cased and compared against the accepted
    truthy tokens; every other value (including missing values) becomes
    ``False``.

    Args:
        s: pandas Series holding flag-like values such as booleans, 0/1
            numbers, or strings like ``"true"`` / ``"yes"``.

    Returns:
        A boolean Series with the same index as ``s``.
    """
    if s.dtype == bool:
        return s.fillna(False)
    # "1.0" must count as truthy: a 0/1 column that contains missing values
    # is held as float64, so its truthy cells stringify to "1.0", not "1".
    truthy_tokens = ["1", "1.0", "true", "t", "yes", "y"]
    normalized = s.astype(str).str.strip().str.lower()
    return normalized.isin(truthy_tokens)

# Flag columns: treat missing values as False, then coerce to booleans.
for flag_col in ("format_used", "format_ok"):
    df[flag_col] = to_bool_series(df[flag_col].fillna(False))

# Text columns: treat missing values as the empty string and force str type.
for text_col in ("format_reason", "signature", "error_type"):
    df[text_col] = df[text_col].fillna("").astype(str)


In [3]:
# ---- metrics ----
# Counts first: all rows, rows where the formatter ran, and rows where it
# ran and succeeded.
total = len(df)
fmt_used = int(df["format_used"].sum())
used_and_ok = df["format_used"] & df["format_ok"]
fmt_ok = int(used_and_ok.sum())

# Ratios fall back to 0.0 when their denominator is zero.
fmt_used_ratio = 0.0 if not total else fmt_used / total
fmt_ok_ratio = 0.0 if not fmt_used else fmt_ok / fmt_used

summary = pd.Series(
    dict(
        total_rows=total,
        formatter_used_count=fmt_used,
        formatter_used_ratio=round(fmt_used_ratio, 4),
        formatter_success_count=fmt_ok,
        formatter_success_rate_given_used=round(fmt_ok_ratio, 4),
    )
)
display(summary)

total_rows                           200.00
formatter_used_count                   4.00
formatter_used_ratio                   0.02
formatter_success_count                3.00
formatter_success_rate_given_used      0.75
dtype: float64

In [4]:
# ---- top-k reasons before the formatter ----
# On rows where the formatter was invoked, format_reason plays the role of
# the "pre" (trigger) invalid reason.
used_reasons = df.loc[df["format_used"], "format_reason"]
# Empty reasons are shown under an explicit placeholder label.
pre_reason = used_reasons.where(used_reasons != "", "UNKNOWN_PRE_REASON")
print("\n[Top-K] Pre (trigger) invalid reasons (format_reason) among formatter_used:")
pre_reason_counts = pre_reason.value_counts()
display(pre_reason_counts.head(10))



[Top-K] Pre (trigger) invalid reasons (format_reason) among formatter_used:


format_reason
too_many_files(3)                                                                                                                                                                                                                                                                                           3
formatter_exception: Error code: 400 - {'error': {'message': "'max_tokens' or 'max_completion_tokens' is too large: 2048. This model's maximum context length is 4096 tokens and your request has 2224 input tokens (2048 > 4096 - 2224). None", 'type': 'BadRequestError', 'param': None, 'code': 400}}    1
Name: count, dtype: int64

In [5]:
# ---- top-k invalid reasons after the formatter ----
# "Post invalid" rows went through the formatter but still ended up as
# GEN_FAIL (invalid/empty).
# Criterion: format_used=True AND error_type==GEN_FAIL
#            AND signature in {invalid_diff_format, empty_diff}
still_gen_fail = df["error_type"].eq("GEN_FAIL")
bad_signature = df["signature"].isin(["invalid_diff_format", "empty_diff"])
post_mask = df["format_used"] & still_gen_fail & bad_signature

# Make the post reason as specific as possible:
# 1) use format_reason when it carries a formatter_exception / cause string
# 2) otherwise fall back to signature
post_rows = df.loc[post_mask]
has_reason = post_rows["format_reason"].str.len().gt(0)
post_reason = post_rows["format_reason"].where(has_reason, post_rows["signature"])
post_reason = post_reason.replace("", "UNKNOWN_POST_REASON")

print("\n[Top-K] Post invalid reasons among formatter_used BUT still GEN_FAIL:")
post_reason_counts = post_reason.value_counts()
display(post_reason_counts.head(10))


[Top-K] Post invalid reasons among formatter_used BUT still GEN_FAIL:


format_reason
formatter_exception: Error code: 400 - {'error': {'message': "'max_tokens' or 'max_completion_tokens' is too large: 2048. This model's maximum context length is 4096 tokens and your request has 2224 input tokens (2048 > 4096 - 2224). None", 'type': 'BadRequestError', 'param': None, 'code': 400}}    1
Name: count, dtype: int64

In [6]:
# ---- bonus: top-k pre reasons among the cases the formatter fixed ----
ok_mask = df["format_used"] & df["format_ok"]
ok_reasons = df.loc[ok_mask, "format_reason"]
# Empty reasons are shown under an explicit placeholder label.
ok_pre_reason = ok_reasons.mask(ok_reasons == "", "UNKNOWN_PRE_REASON")
print("\n[Top-K] Pre reasons for which formatter succeeded:")
ok_pre_reason_counts = ok_pre_reason.value_counts()
display(ok_pre_reason_counts.head(10))


[Top-K] Pre reasons for which formatter succeeded:


format_reason
too_many_files(3)    3
Name: count, dtype: int64