# Failure Landscape Notebook (B-v2-step2-3)

목적: **Stage-structured failure를 회복하기 위한 Diff-Formatter 실험 관찰**
> 아래 구조를 통해 git apply 가능한 unified diff로 정규화(normalize)하는 시도

- single sLM
- 1차: Generator
- 2차: Formatter (동일 모델 2-call)

P0 : trigger 기반 formatter   
P1 : always formatter

In [1]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

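# ---- load ----
# Helper for locating the most recently modified results.csv under a runs directory.
# (Original intent: reuse an existing results_csv variable if set, otherwise fall back to the latest run.)
# NOTE: the loading code below does not call this helper; it reads RUNS_DIR / "results.csv" directly.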
def find_latest_results_csv(runs_dir: Path) -> Path:
    """Return the most recently modified ``results.csv`` one level below *runs_dir*.

    Only paths matching ``<runs_dir>/*/results.csv`` are considered, and
    recency is judged by each file's ``st_mtime``.

    Raises:
        FileNotFoundError: if no matching file exists.
    """
    candidates = list(runs_dir.glob('*/results.csv'))
    if not candidates:
        raise FileNotFoundError(f'No results.csv found under: {runs_dir}')
    # max() keeps the first of several equally-new files, same as a stable
    # descending sort followed by [0].
    return max(candidates, key=lambda path: path.stat().st_mtime)

# Paths are anchored at the parent of the current working directory.
RUNS_DIR = Path('..').resolve()
RESULTS_CSV = RUNS_DIR.joinpath("results.csv")
ANALYSIS_DIR = RUNS_DIR.joinpath("analysis")

print("Using results:", RESULTS_CSV)

# Load the run results and report the table size.
df = pd.read_csv(RESULTS_CSV)
_n_rows, _n_cols = df.shape
print("rows:", _n_rows, "cols:", _n_cols)

Using results: /home/dibaeck/workspace/project_IR_sLM_MAS/runs_archive/exp1_B-v2-step2-3_Always-Formatter_p1/results.csv
rows: 200 cols: 25


### P1 : 
- format_used가 거의 항상 True가 됨(“항상 formatter 1회” 정책).
- “pre(trigger) reason”이라는 개념이 약해짐 → 대신 아래 세 가지를 중심으로 본다:
    1. formatter 자체 품질(formatter_invalid / formatter_exception) -> 실패 원인 top-k
    2. apply-check 실패(bucketed reason) -> bucket/top-hint
    3. 최종 PATCH_FAIL 분포 변화(formatter 성공/실패와 어떤 관계인지)

In [2]:
# --- normalize missing cols ---
# Any column the analysis below reads but this results.csv lacks is created
# as an all-NaN column, so the code that follows can index it unconditionally.
_EXPECTED_COLS = (
    "format_used",
    "format_ok",
    "format_reason",
    "apply_check_ok",
    "apply_check_reason",
    "stage",
    "error_type",
    "signature",
)
for c in _EXPECTED_COLS:
    if c in df.columns:
        continue
    df[c] = np.nan

def to_bool_series(s):
    """Coerce a flag column to a boolean Series.

    A value is True when, after ``str()``, stripping and lower-casing, it is
    one of ``1 / true / t / yes / y`` or parses as a number equal to 1.
    Everything else (including NaN and blank strings) becomes False.

    The numeric check matters for float-typed flags: ``1.0`` stringifies as
    ``"1.0"``, which the token list alone would silently treat as False.

    Args:
        s: pandas Series holding the raw flag values.

    Returns:
        Boolean Series with the same index as *s*.
    """
    if s.dtype == bool:
        # Already boolean; nothing to parse.
        return s.fillna(False)
    text = s.astype(str).str.strip().str.lower()
    # Non-numeric tokens ("true", "nan", "") coerce to NaN and compare False.
    as_number = pd.to_numeric(text, errors="coerce")
    return text.isin(["1", "true", "t", "yes", "y"]) | as_number.eq(1)

# Formatter flags: a missing value counts as False.
for _flag_col in ("format_used", "format_ok"):
    df[_flag_col] = to_bool_series(df[_flag_col].fillna(False))

# A missing apply_check_ok is treated as True (compatibility with older result files).
df["apply_check_ok"] = to_bool_series(df["apply_check_ok"].fillna(True))

# Text columns: NaN becomes "" and every value is forced to str.
for _text_col in ("format_reason", "apply_check_reason", "stage", "error_type", "signature"):
    df[_text_col] = df[_text_col].fillna("").astype(str)


In [3]:
# --- (0) corrupt_patch ratio ---
# Row counts for each (stage, error_type, signature) combination, top 20.
# Left as a bare expression so the notebook renders the result.
df.value_counts(subset=['stage', 'error_type', 'signature']).head(20)

stage  error_type  signature          
GEN    GEN_FAIL    invalid_diff_format    189
                   llm_call_fail            9
                   formatter_exception      2
Name: count, dtype: int64

In [4]:
# Signature distribution over all PATCH_FAIL rows
_is_patch_fail = df["stage"].eq("PATCH") & df["error_type"].eq("PATCH_FAIL")
patch = df.loc[_is_patch_fail]
print("\nPATCH_FAIL rows:", len(patch))
print("\n=== PATCH_FAIL signature top ===")
_patch_sig_counts = patch["signature"].value_counts()
display(_patch_sig_counts.head(10))


PATCH_FAIL rows: 0

=== PATCH_FAIL signature top ===


Series([], Name: count, dtype: int64)

In [5]:
# --- (1) formatter summary ---
total = df.shape[0]
fmt_used = int(df["format_used"].sum())
_used_and_ok = df["format_ok"] & df["format_used"]
fmt_ok = int(_used_and_ok.sum())

# Guard both ratios against a zero denominator.
if total:
    _used_ratio = round(fmt_used / total, 4)
else:
    _used_ratio = 0.0
if fmt_used:
    _success_rate = round(fmt_ok / fmt_used, 4)
else:
    _success_rate = 0.0

formatter_summary = pd.Series(
    [total, fmt_used, _used_ratio, fmt_ok, _success_rate],
    index=[
        "total_rows",
        "formatter_used_count",
        "formatter_used_ratio",
        "formatter_success_count",
        "formatter_success_rate_given_used",
    ],
)
display(formatter_summary)

# Formatter failure types (top-k): rows where the formatter ran but did not succeed.
fmt_fail_mask = df["format_used"] & ~df["format_ok"]
_failed_reasons = df.loc[fmt_fail_mask, "format_reason"]
# Blank reasons get an explicit label so they show up in the counts.
fmt_fail_reason = _failed_reasons.replace("", "UNKNOWN_FMT_FAIL")
print("\n[Top-K] Formatter failures (format_reason) among format_used & !format_ok:")
display(fmt_fail_reason.value_counts().head(15))

total_rows                           200.0000
formatter_used_count                 191.0000
formatter_used_ratio                   0.9550
formatter_success_count              186.0000
formatter_success_rate_given_used      0.9738
dtype: float64


[Top-K] Formatter failures (format_reason) among format_used & !format_ok:


format_reason
formatter_invalid:too_many_files(3)                                                                                                                                                                                                                                                                        3
formatter_exception:Error code: 400 - {'error': {'message': "'max_tokens' or 'max_completion_tokens' is too large: 2048. This model's maximum context length is 4096 tokens and your request has 2129 input tokens (2048 > 4096 - 2129). None", 'type': 'BadRequestError', 'param': None, 'code': 400}}    1
formatter_exception:Error code: 400 - {'error': {'message': "'max_tokens' or 'max_completion_tokens' is too large: 2048. This model's maximum context length is 4096 tokens and your request has 2488 input tokens (2048 > 4096 - 2488). None", 'type': 'BadRequestError', 'param': None, 'code': 400}}    1
Name: count, dtype: int64

In [6]:
# --- (2) apply-check failure tally (important under P1) ---
# apply_check_ok == False marks rows that still could not be applied after the formatter ran.
_apply_check_failed = df["apply_check_ok"].eq(False)
ac_fail_mask = df["format_used"] & _apply_check_failed
print("\napply-check failed count:", int(ac_fail_mask.sum()))

# Extract only the bucket prefix, for the top-k tally.
def prefix_bucket(x: str) -> str:
    """Return the bucket name of an apply-check reason string.

    The bucket is the text before the first ``:``, stripped of surrounding
    whitespace. When there is no colon, the whole stripped string is the
    bucket.

    Args:
        x: reason string; ``None`` and non-string values (e.g. NaN) are
            treated as empty.

    Returns:
        The bucket name, or ``"UNKNOWN_APPLY_CHECK_REASON"`` when the bucket
        would be blank. That includes reasons starting with ``:``, which
        would otherwise produce an empty bucket label.
    """
    text = x if isinstance(x, str) else ""
    # partition() yields the whole string as the head when no ":" is present.
    bucket = text.partition(":")[0].strip()
    return bucket or "UNKNOWN_APPLY_CHECK_REASON"

# Reasons of the rows that failed apply-check; reused for both tallies below.
_ac_failed_reasons = df.loc[ac_fail_mask, "apply_check_reason"]

ac_bucket = _ac_failed_reasons.map(prefix_bucket)
print("\n[Top-K] apply-check failed buckets (prefix only):")
display(ac_bucket.value_counts().head(10))

# Optionally, also look at the hints (top-k): the text after the first ":".
# partition() leaves the tail empty when there is no ":", so such rows get NO_HINT.
ac_hint = _ac_failed_reasons.map(lambda reason: reason.partition(":")[2].strip())
ac_hint = ac_hint.replace("", "NO_HINT")
print("\n[Top-K] apply-check failed hints (first line excerpt):")
display(ac_hint.value_counts().head(15))


apply-check failed count: 191

[Top-K] apply-check failed buckets (prefix only):


apply_check_reason
git_apply_corrupt_patch       175
git_apply_hunk_failed           9
UNKNOWN_APPLY_CHECK_REASON      5
git_apply_path_missing          2
Name: count, dtype: int64


[Top-K] apply-check failed hints (first line excerpt):


apply_check_reason
error: corrupt patch at line 13    42
error: corrupt patch at line 12    28
error: corrupt patch at line 14    11
error: corrupt patch at line 26    11
error: corrupt patch at line 20     8
error: corrupt patch at line 15     7
error: corrupt patch at line 11     7
error: corrupt patch at line 19     6
error: corrupt patch at line 10     6
error: corrupt patch at line 21     5
NO_HINT                             5
error: corrupt patch at line 22     5
error: corrupt patch at line 30     4
error: corrupt patch at line 9      4
error: corrupt patch at line 33     3
Name: count, dtype: int64

In [7]:
# --- (3) check how the final PATCH_FAIL 4-bucket distribution changed ---
patch_mask = df["stage"].eq("PATCH") & df["error_type"].eq("PATCH_FAIL")
# Blank signatures get an explicit label so they show up in the counts.
patch_sig = df.loc[patch_mask, "signature"].replace("", "UNKNOWN_PATCH_SIG")

print("\n[PATCH_FAIL] signature distribution:")
display(patch_sig.value_counts())

# How the PATCH signature differs depending on formatter success/failure.
# (Under P1 most rows go through the formatter, so apply-check failures end up
# in the GEN stage instead.)
_patch_rows = df.loc[patch_mask, ["format_used", "format_ok", "signature"]].copy()
_patch_rows["fmt_group"] = np.where(_patch_rows["format_ok"], "fmt_ok", "fmt_fail_or_na")
_group_sizes = _patch_rows.groupby(["fmt_group", "signature"]).size()
patch_by_fmt = _group_sizes.sort_values(ascending=False)


[PATCH_FAIL] signature distribution:


Series([], Name: count, dtype: int64)

In [8]:
# Show the PATCH_FAIL counts per (fmt_group, signature) held in patch_by_fmt.
print("\n[PATCH_FAIL] by formatter group:")
display(patch_by_fmt)


[PATCH_FAIL] by formatter group:


Series([], dtype: int64)