In [None]:
# Mount Google Drive at /content/drive; the dataset and the model output
# directories referenced by later cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip -q install -U "transformers==4.45.2" "accelerate>=0.34.0" "sentencepiece" "safetensors" "scikit-learn"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m119.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Sanity check: report the library versions in use and whether a CUDA device
# is visible before starting a training run.
import transformers, torch, sys
print("Transformers:", transformers.__version__)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

Transformers: 4.45.2
Torch: 2.8.0+cu126
CUDA available: True


In [None]:
# Show the header and the first two rows of the sentence-pair CSV
# (columns: noise, clean).
!head -n 3 "/content/drive/MyDrive/GEC_dataset/sentence_pairs_2000000.csv"

noise,clean
Much many brands and sellers still in the market.,Many brands and sellers still in the market.
"Fairy Or Not, I'm the Godmother: no just look, but my outfit for taking the part as godmother.","Fairy Or Not, I'm the Godmother: Not just a look, but my outfit for taking on the role as godmother."


In [None]:
%%writefile /content/train_edit_tag_no_val.py
# -*- coding: utf-8 -*-
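# DeBERTa-v3-base + Edit-Tag fine-tuning
# - Uses a held-out validation split (to select the best step by eval_loss)
# - No EarlyStopping
# - save_total_limit=1 (note: together with load_best_model_at_end the Trainer
#   may keep two checkpoints on disk -- the best one and the most recent one)
# - Supports resuming training via --resume_from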

import os, re, argparse, random
from dataclasses import dataclass
from typing import Optional, List, Tuple

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)

# Label written on token positions that carry no tag (special tokens and
# non-initial sub-tokens of a word).
IGNORE_INDEX = -100

# Closed inventory of word-level edit tags; a tag's list position is its class id.
EDIT_TAGS = [
    "KEEP",
    "DELETE",
    "APPEND_s",
    "APPEND_es",
    "APPEND_ed",
    "APPEND_ing",
    "REPLACE_VBZ",
    "REPLACE_VBD",
    "REPLACE_VBG",
    "REPLACE_NNS",
    "REPLACE_ART_A",
    "REPLACE_ART_AN",
    "MLM_REPLACE",
    "MLM_APPEND",
]

# Forward and reverse lookup tables between tag names and class ids.
TAG2ID = dict(zip(EDIT_TAGS, range(len(EDIT_TAGS))))
ID2TAG = dict(enumerate(EDIT_TAGS))

def read_pairs(path: str, limit: Optional[int] = None) -> pd.DataFrame:
    """Load a table of (noise, clean) sentence pairs.

    The reader is picked from the file extension (case-insensitive):
    ``.csv``, ``.tsv``/``.tab`` (tab separated) or ``.jsonl`` (JSON lines).

    Args:
        path: Location of the pair file.
        limit: If truthy, keep only the first ``limit`` rows. The cut is made
            *before* incomplete rows are dropped, so fewer rows may come back.

    Returns:
        A frame with a fresh 0..n-1 index in which neither ``noise`` nor
        ``clean`` is missing.

    Raises:
        ValueError: The extension is not one of the supported ones.
        AssertionError: The table lacks a ``noise`` or ``clean`` column.
    """
    loaders = {
        ".csv": lambda p: pd.read_csv(p),
        ".tsv": lambda p: pd.read_csv(p, sep="\t"),
        ".tab": lambda p: pd.read_csv(p, sep="\t"),
        ".jsonl": lambda p: pd.read_json(p, lines=True),
    }
    _, suffix = os.path.splitext(path)
    loader = loaders.get(suffix.lower())
    if loader is None:
        raise ValueError(f"Unsupported file: {path}")

    frame = loader(path)
    assert {"noise", "clean"} <= set(frame.columns), "CSV must have 'noise','clean'"

    if limit:
        frame = frame.head(limit)
    complete = frame.dropna(subset=["noise", "clean"])
    return complete.reset_index(drop=True)

def word_diff_ops(src_words: List[str], tgt_words: List[str]) -> List[Tuple[str,int,Optional[str]]]:
    """Align two word sequences and describe the target as edits on the source.

    The alignment comes from ``difflib.SequenceMatcher`` (autojunk disabled).
    Every returned triple is ``(kind, src_index, payload)``:

    * ``("KEEP", i, None)``    - source word ``i`` is unchanged.
    * ``("DELETE", i, None)``  - source word ``i`` is removed.
    * ``("REPLACE", i, word)`` - source word ``i`` becomes ``word``.
    * ``("INSERT", i, word)``  - ``word`` is inserted, anchored on source word ``i``.

    Insertions are anchored on the source word that precedes them. An
    insertion in front of the very first word has no predecessor and is
    anchored on index 0 instead.

    Args:
        src_words: Tokens of the noisy sentence.
        tgt_words: Tokens of the corrected sentence.

    Returns:
        Operations sorted by source index; at equal index, INSERT operations
        come after all other kinds (the sort is stable otherwise).
    """
    from difflib import SequenceMatcher

    matcher = SequenceMatcher(a=src_words, b=tgt_words, autojunk=False)
    ops: List[Tuple[str, int, Optional[str]]] = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            ops.extend(("KEEP", i, None) for i in range(i1, i2))
        elif tag == "delete":
            ops.extend(("DELETE", i, None) for i in range(i1, i2))
        elif tag == "insert":
            anchor = max(i1 - 1, 0)
            ops.extend(("INSERT", anchor, tgt_words[j]) for j in range(j1, j2))
        elif tag == "replace":
            n_src, n_tgt = i2 - i1, j2 - j1
            paired = min(n_src, n_tgt)
            # Pair words position by position for as long as both sides last.
            for k in range(paired):
                ops.append(("REPLACE", i1 + k, tgt_words[j1 + k]))
            # Source longer than target: the surplus source words are deleted.
            for k in range(paired, n_src):
                ops.append(("DELETE", i1 + k, None))
            # Target longer than source: the surplus target words follow the
            # replaced span, so they are anchored on its LAST source word
            # (i2 - 1). Anchoring on i1 - 1 would place them before the span.
            anchor = i2 - 1
            for k in range(paired, n_tgt):
                ops.append(("INSERT", anchor, tgt_words[j1 + k]))
    ops.sort(key=lambda op: (op[1], 1 if op[0] == "INSERT" else 0))
    return ops

# Article patterns. map_op_to_tag no longer consults them (see its docstring);
# they stay defined so that the module's existing names remain available.
_RE_ART_A  = re.compile(r"^(a)\b", flags=re.I)
_RE_ART_AN = re.compile(r"^(an)\b", flags=re.I)

def map_op_to_tag(src_word: str, op: Tuple[str,int,Optional[str]]) -> str:
    """Translate one diff operation into a tag from the edit-tag inventory.

    Args:
        src_word: The source word the operation is anchored on.
        op: ``(kind, index, payload)`` where ``kind`` is KEEP, DELETE, INSERT
            or REPLACE and ``payload`` is the target word (None for
            KEEP/DELETE). The index is not used here.

    Returns:
        The tag name. Unknown kinds fall back to ``"KEEP"``.

    Notes:
        * The article tags are produced only when the *target* word is
          ``a``/``an``. Replacing a source article by some other word
          (``a`` -> ``the``) is a generic replacement and goes through the
          suffix rules / ``MLM_REPLACE``.
        * ``REPLACE_NNS`` is never produced: without part-of-speech
          information every unmatched ``-s`` target is labelled
          ``REPLACE_VBZ``.
    """
    kind, _, payload = op
    if kind == "KEEP":
        return "KEEP"
    if kind == "DELETE":
        return "DELETE"

    w = (payload or "").lower()

    if kind == "INSERT":
        if w == "s":   return "APPEND_s"
        if w == "es":  return "APPEND_es"
        if w == "ed":  return "APPEND_ed"
        if w == "ing": return "APPEND_ing"
        if w in ("a", "an"):
            return "REPLACE_ART_AN" if w == "an" else "REPLACE_ART_A"
        return "MLM_APPEND"

    if kind == "REPLACE":
        sw = (src_word or "").lower()
        # Article tag only when the correction itself is an article.
        if w in ("a", "an"):
            return "REPLACE_ART_AN" if w == "an" else "REPLACE_ART_A"
        # Target is the source plus a suffix. "es" is tested before "s" so
        # that go -> goes yields APPEND_es rather than APPEND_s.
        if w.endswith("es") and (sw == w[:-2] or w[:-2].endswith(sw)): return "APPEND_es"
        if w.endswith("s")  and (sw == w[:-1] or w[:-1].endswith(sw)): return "APPEND_s"
        if w.endswith("ed") and (sw == w[:-2] or w[:-2].endswith(sw)): return "APPEND_ed"
        if w.endswith("ing") and (sw == w[:-3] or w[:-3].endswith(sw)): return "APPEND_ing"
        # Otherwise classify by the target's ending alone.
        if w.endswith("s"):   return "REPLACE_VBZ"
        if w.endswith("ed"):  return "REPLACE_VBD"
        if w.endswith("ing"): return "REPLACE_VBG"
        return "MLM_REPLACE"

    return "KEEP"

@dataclass
class EditTagDataset(Dataset):
    """Map-style dataset that turns (noise, clean) rows into token-classification examples.

    Each item tokenizes the whitespace-split ``noise`` sentence and attaches one
    edit-tag label to the first sub-token of every word; all other positions
    receive ``IGNORE_INDEX``.
    """
    # Frame holding the "noise" and "clean" columns.
    df: pd.DataFrame
    # Tokenizer; called with is_split_into_words=True and must expose word_ids().
    tok: any
    # Truncation length handed to the tokenizer.
    max_len: int = 160

    def __len__(self):
        return len(self.df)

    def _word_tags(self, noise: str, clean: str) -> List[str]:
        """Return one tag name per whitespace-separated word of ``noise``."""
        source = noise.strip().split()
        target = clean.strip().split()
        word_tags = ["KEEP"] * len(source)
        # Operations arrive sorted by index with INSERT last, so when several
        # operations hit the same word the later one overwrites the earlier.
        for op in word_diff_ops(source, target):
            kind, pos, _payload = op
            if pos >= len(word_tags):
                continue
            if kind in ("KEEP", "DELETE"):
                word_tags[pos] = kind
            else:
                word_tags[pos] = map_op_to_tag(source[pos], op)
        return word_tags

    def __getitem__(self, i):
        row = self.df.iloc[i]
        noise, clean = str(row["noise"]), str(row["clean"])
        word_tags = self._word_tags(noise, clean)

        enc = self.tok(
            noise.strip().split(),
            is_split_into_words=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            return_offsets_mapping=False,
        )

        labels = []
        previous = None
        for wid in enc.word_ids():
            if wid is None or wid == previous:
                # Special token, or a continuation piece of the current word.
                labels.append(IGNORE_INDEX)
            else:
                tag = word_tags[wid] if wid < len(word_tags) else "KEEP"
                labels.append(TAG2ID.get(tag, TAG2ID["KEEP"]))
            previous = wid
        enc["labels"] = labels
        return {key: torch.tensor(value) for key, value in enc.items()}

def split_df(df: pd.DataFrame, val_ratio: float, seed: int=42):
    """Split ``df`` into a training part and a validation part.

    Row positions 0..n-1 are shuffled with ``random.Random(seed)`` and the
    first ``int(n * val_ratio)`` of them form the validation set. Rows keep
    their original relative order inside each part.

    Rows are selected with ``df.loc`` using integers 0..n-1 as labels.

    Returns:
        ``(df_train, df_val)``, both re-indexed from 0.
    """
    total = len(df)
    order = list(range(total))
    rng = random.Random(seed)
    rng.shuffle(order)

    held_out = set(order[: int(total * val_ratio)])
    train_rows, val_rows = [], []
    for row in range(total):
        bucket = val_rows if row in held_out else train_rows
        bucket.append(row)

    df_train = df.loc[train_rows].reset_index(drop=True)
    df_val = df.loc[val_rows].reset_index(drop=True)
    return df_train, df_val

def main():
    """Command-line entry point: fine-tune the edit-tag token classifier.

    Reads the sentence-pair file, holds out ``--val_ratio`` of it for
    validation, trains with the Hugging Face ``Trainer`` (evaluating and
    checkpointing on a step schedule), reloads the checkpoint with the lowest
    ``eval_loss`` at the end, and writes that model plus the tokenizer to
    ``--out_dir``.

    Raises:
        ValueError: The validation split is empty, so ``eval_loss`` -- on
            which best-checkpoint selection depends -- cannot be computed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--train_csv", type=str, required=True)
    ap.add_argument("--out_dir", type=str, required=True)
    ap.add_argument("--backbone", type=str, default="microsoft/deberta-v3-base")
    ap.add_argument("--epochs", type=int, default=3)
    ap.add_argument("--batch_size", type=int, default=16)
    ap.add_argument("--lr", type=float, default=2e-5)
    ap.add_argument("--max_len", type=int, default=64)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--train_limit", type=int, default=None)

    ap.add_argument("--val_ratio", type=float, default=0.05)
    ap.add_argument("--eval_steps", type=int, default=2000)
    ap.add_argument("--save_steps", type=int, default=2000)

    ap.add_argument("--resume_from", type=str, default=None)  # checkpoint path to resume training from
    args = ap.parse_args()

    set_seed(args.seed)
    os.makedirs(args.out_dir, exist_ok=True)

    tok = AutoTokenizer.from_pretrained(args.backbone)
    df_all = read_pairs(args.train_csv, limit=args.train_limit)
    df_tr, df_va = split_df(df_all, args.val_ratio, args.seed)
    if len(df_va) == 0:
        # Fail before any training time is spent.
        raise ValueError(
            f"validation split is empty ({len(df_all)} rows, val_ratio={args.val_ratio}); "
            "increase --val_ratio"
        )

    ds_tr = EditTagDataset(df=df_tr, tok=tok, max_len=args.max_len)
    ds_va = EditTagDataset(df=df_va, tok=tok, max_len=args.max_len)

    model = AutoModelForTokenClassification.from_pretrained(
        args.backbone,
        num_labels=len(EDIT_TAGS),
        problem_type="single_label_classification",
    )

    collator = DataCollatorForTokenClassification(tokenizer=tok, padding=True)

    # Honour --save_steps (it used to be parsed and then ignored). With
    # load_best_model_at_end the save interval has to be a multiple of the
    # evaluation interval; anything else falls back to saving at every
    # evaluation, which is what the script did before.
    save_steps = args.save_steps
    if args.eval_steps <= 0 or save_steps <= 0 or save_steps % args.eval_steps != 0:
        print(
            f"[warn] --save_steps={args.save_steps} is not a positive multiple of "
            f"--eval_steps={args.eval_steps}; saving every {args.eval_steps} steps instead"
        )
        save_steps = args.eval_steps

    targs = TrainingArguments(
        output_dir=args.out_dir,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        warmup_ratio=0.06,
        lr_scheduler_type="linear",
        fp16=torch.cuda.is_available(),
        report_to=[],
        # evaluation / checkpoint schedule
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=args.eval_steps,
        save_steps=save_steps,
        # Caps checkpoint rotation. Because the best checkpoint is always
        # retained, two checkpoints (best + most recent) may be on disk.
        save_total_limit=1,
        load_best_model_at_end=True,     # reload the best checkpoint when training ends
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    trainer = Trainer(
        model=model,
        args=targs,
        train_dataset=ds_tr,
        eval_dataset=ds_va,
        data_collator=collator,
        tokenizer=tok,
    )

    trainer.train(resume_from_checkpoint=args.resume_from)

    # At this point the model holds the weights of the best checkpoint.
    trainer.save_model(args.out_dir)
    tok.save_pretrained(args.out_dir)

    state = trainer.state
    print("✓ saved BEST model to:", args.out_dir)
    print(f"best_model_checkpoint = {state.best_model_checkpoint}")
    print(f"best_metric(eval_loss) = {state.best_metric}")
    # Report the best step, parsed from the "checkpoint-<step>" directory name.
    if state.best_model_checkpoint and state.best_model_checkpoint.rsplit('-',1)[-1].isdigit():
        print(f"best_step = {int(state.best_model_checkpoint.rsplit('-',1)[-1])}")
    else:
        print("best_step = (unavailable)")
    print("Done.")

if __name__ == "__main__":
    main()

Writing /content/train_edit_tag_no_val.py


In [None]:
%%writefile /content/infer_edit_tag.py
# -*- coding: utf-8 -*-
import argparse
from typing import List
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Edit-tag inventory; the order defines the class ids, so it has to match the
# list used by the training script.
EDIT_TAGS = [
    "KEEP",
    "DELETE",
    "APPEND_s",
    "APPEND_es",
    "APPEND_ed",
    "APPEND_ing",
    "REPLACE_VBZ",
    "REPLACE_VBD",
    "REPLACE_VBG",
    "REPLACE_NNS",
    "REPLACE_ART_A",
    "REPLACE_ART_AN",
    "MLM_REPLACE",
    "MLM_APPEND",
]
# Class id -> tag name.
ID2TAG = dict(enumerate(EDIT_TAGS))

def choose_article(word: str) -> str:
    """Pick the indefinite article for ``word`` from its spelling.

    The decision uses the first alphanumeric character, so leading quotes or
    brackets do not hide the initial letter. It is purely orthographic: vowel
    letters give "an", everything else "a" (so "hour" and "university" are not
    handled phonetically).

    Args:
        word: The word that follows the article.

    Returns:
        "an" or "a". A word without any alphanumeric character, including the
        empty string, yields "a".
    """
    for ch in word:
        if ch.isalnum():
            return "an" if ch.lower() in "aeiou" else "a"
    return "a"

@torch.no_grad()
def predict_tags(model, tok, text: str, max_len=64) -> List[str]:
    """Predict one edit tag per whitespace-separated word of ``text``.

    The words are tokenized as a pre-split sequence (truncated to ``max_len``
    tokens), scored by ``model``, and each word takes the arg-max tag of its
    first sub-token. Words cut off by truncation produce no tag, so the result
    can be shorter than the word list.
    """
    words = text.strip().split()
    batch = tok(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
        return_token_type_ids=False,
    )
    batch = batch.to(model.device)

    token_pred = model(**batch).logits.argmax(-1)[0].cpu().tolist()

    # word id -> position of the first token of that word, in order of appearance
    first_token_of = {}
    for pos, wid in enumerate(batch.word_ids()):
        if wid is not None:
            first_token_of.setdefault(wid, pos)

    return [ID2TAG[token_pred[pos]] for pos in first_token_of.values()]

# Tags realised by gluing a fixed suffix onto the word. The REPLACE_* tags are
# approximated by the regular suffix, as before.
_SUFFIX_FOR_TAG = {
    "APPEND_s": "s",
    "APPEND_es": "es",
    "APPEND_ed": "ed",
    "APPEND_ing": "ing",
    "REPLACE_VBZ": "s",
    "REPLACE_VBD": "ed",
    "REPLACE_VBG": "ing",
    "REPLACE_NNS": "s",
}

def apply_tags(text: str, tags: List[str]) -> str:
    """Rewrite ``text`` according to per-word edit tags.

    Args:
        text: Input sentence; words are separated by whitespace.
        tags: One tag per word. Words beyond the end of ``tags`` (for example
            those lost to truncation) are kept unchanged.

    Returns:
        The edited sentence, words joined by single spaces.

    Behaviour per tag:
        * KEEP, MLM_REPLACE, MLM_APPEND and unknown tags leave the word as is.
        * DELETE drops the word.
        * APPEND_* / REPLACE_VB* / REPLACE_NNS append the matching suffix.
        * REPLACE_ART_A / REPLACE_ART_AN swap an existing article; on any
          other word the article is inserted AFTER that word, mirroring the
          training data where an insertion is anchored on the preceding word.

    Afterwards every a/an that is followed by another word is re-chosen with
    ``choose_article``; the following word is always kept and the article's
    capitalisation is preserved.
    """
    words = text.strip().split()
    padded = list(tags) + ["KEEP"] * (len(words) - len(tags))

    out: List[str] = []
    for w, tg in zip(words, padded):
        if tg == "DELETE":
            continue
        if tg in _SUFFIX_FOR_TAG:
            out.append(w + _SUFFIX_FOR_TAG[tg])
        elif tg in ("REPLACE_ART_A", "REPLACE_ART_AN"):
            art = "an" if tg.endswith("AN") else "a"
            if w.lower() in ("a", "an"):
                out.append(art.capitalize() if w[:1].isupper() else art)
            else:
                out += [w, art]
        else:
            out.append(w)

    # a/an post-processing: agree each article with the word that follows it.
    res: List[str] = []
    for i, token in enumerate(out):
        if token.lower() in ("a", "an") and i + 1 < len(out):
            art = choose_article(out[i + 1])
            res.append(art.capitalize() if token[:1].isupper() else art)
        else:
            res.append(token)
    return " ".join(res)

def correct(model, tok, text: str, max_len: int = 64) -> str:
    """Run the tagger on ``text`` and apply the predicted edits.

    Args:
        model: Token-classification model passed through to ``predict_tags``.
        tok: Matching tokenizer.
        text: Sentence to correct.
        max_len: Token limit forwarded to ``predict_tags``. The default equals
            that function's own default, so existing three-argument calls
            behave as before.

    Returns:
        The corrected sentence.
    """
    return apply_tags(text, predict_tags(model, tok, text, max_len=max_len))

def main():
    """Command-line entry point for edit-tag inference.

    Corrects either a single sentence (``--text``) or every line of a file
    (``--in_file``). File results go to ``--out_file`` when given, otherwise
    to stdout. Blank input produces blank output without calling the model.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model_dir", type=str, required=True)
    ap.add_argument("--text", type=str, default=None)
    ap.add_argument("--in_file", type=str, default=None)
    ap.add_argument("--out_file", type=str, default=None)
    ap.add_argument("--max_len", type=int, default=160)
    args = ap.parse_args()

    # Validate before the model is loaded. Unlike an assert, this check is
    # not removed when Python runs with -O.
    if args.text is None and args.in_file is None:
        ap.error("--text 또는 --in_file을 지정하세요")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(args.model_dir)
    model = AutoModelForTokenClassification.from_pretrained(args.model_dir).to(device)
    model.eval()

    def fix(sentence: str) -> str:
        # --max_len used to be parsed but never reached the tokenizer;
        # forward it explicitly.
        if not sentence.strip():
            return ""
        return apply_tags(sentence, predict_tags(model, tok, sentence, max_len=args.max_len))

    if args.text is not None:
        print(fix(args.text)); return

    outs = []
    with open(args.in_file, "r", encoding="utf-8") as f:
        for line in f:
            outs.append(fix(line.strip()))
    if args.out_file:
        with open(args.out_file, "w", encoding="utf-8") as f:
            for o in outs: f.write(o + "\n")
    else:
        for o in outs: print(o)

if __name__ == "__main__":
    main()

Writing /content/infer_edit_tag.py


In [None]:
# Training run on the 500k-pair sample: 10 epochs, batch size 16, lr 2e-5,
# sequences truncated to 64 tokens. Checkpoints are written to Drive.
!python /content/train_edit_tag_no_val.py \
  --train_csv "/content/drive/MyDrive/GEC_dataset/sentence_pairs_sample500k.csv" \
  --out_dir   "/content/drive/MyDrive/DeBERTa_GECToR" \
  --epochs 10 --batch_size 16 --lr 2e-5 --max_len 64

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 54% 839/1563 [00:29<00:24, 29.55it/s][A
 54% 842/1563 [00:29<00:24, 29.58it/s][A
 54% 845/1563 [00:29<00:24, 29.28it/s][A
 54% 848/1563 [00:29<00:24, 29.15it/s][A
 54% 851/1563 [00:29<00:24, 29.17it/s][A
 55% 854/1563 [00:29<00:24, 29.05it/s][A
 55% 857/1563 [00:29<00:24, 29.15it/s][A
 55% 860/1563 [00:29<00:24, 29.10it/s][A
 55% 863/1563 [00:29<00:23, 29.22it/s][A
 55% 866/1563 [00:30<00:23, 29.15it/s][A
 56% 869/1563 [00:30<00:23, 29.27it/s][A
 56% 872/1563 [00:30<00:23, 29.38it/s][A
 56% 875/1563 [00:30<00:23, 29.08it/s][A
 56% 878/1563 [00:30<00:23, 29.04it/s][A
 56% 881/1563 [00:30<00:23, 28.98it/s][A
 57% 884/1563 [00:30<00:23, 28.87it/s][A
 57% 887/1563 [00:30<00:23, 28.90it/s][A
 57% 890/1563 [00:30<00:23, 29.08it/s][A
 57% 893/1563 [00:31<00:22, 29.30it/s][A
 57% 896/1563 [00:31<00:22, 29.41it/s][A
 58% 899/1563 [00:31<00:22, 29.41it/s][A
 58% 902/1563 [00:31<00:22, 29.29it/s][A
 58% 905/15

In [None]:
!python /content/train_edit_tag_no_val.py \
  --train_csv "/content/drive/MyDrive/GEC_dataset/sentence_pairs_sample500k.csv" \
  --out_dir   "/content/drive/MyDrive/DeBERTa_GECToR" \
  --backbone  "microsoft/deberta-v3-base" \
  --epochs    8 \
  --batch_size 16 \
  --lr        2e-5 \
  --max_len   64 \
  --val_ratio 0.0001 \
  --eval_steps 10000 \
  --save_steps 20000 \

2025-11-02 11:06:51.143045: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-02 11:06:51.160002: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762081611.181300   71274 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762081611.187678   71274 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762081611.203563   71274 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
# Smoke test with checkpoint-160000: a verb-form error ("study").
!python /content/infer_edit_tag.py \
  --model_dir "/content/drive/MyDrive/DeBERTa_GECToR/checkpoint-160000" \
  --text "She study English last night."

2025-11-02 22:06:53.056927: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-02 22:06:53.075233: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762121213.096349     805 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762121213.102741     805 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762121213.119759     805 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
# Smoke test with checkpoint-160000: a subject-verb agreement error ("They is").
!python /content/infer_edit_tag.py \
  --model_dir "/content/drive/MyDrive/DeBERTa_GECToR/checkpoint-160000" \
  --text "They is playing football now."

2025-11-02 22:07:46.635134: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-02 22:07:46.652714: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762121266.673546    1172 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762121266.679814    1172 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762121266.695746    1172 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
# Smoke test with the later checkpoint-190000. The recorded output equals the
# input sentence, i.e. no visible edit was applied to "was go".
!python /content/infer_edit_tag.py \
  --model_dir "/content/drive/MyDrive/DeBERTa_GECToR/checkpoint-190000" \
  --text "I was go to the market."

I was go to the market.


In [None]:
# Disk usage of /content, one directory level deep, largest first.
!du -h --max-depth=1 /content | sort -hr

72G	/content/drive
72G	/content
55M	/content/sample_data
140K	/content/.config


In [None]:
# Same report, two levels deep.
!du -h --max-depth=2 /content | sort -hr

72G	/content/drive/MyDrive
72G	/content/drive
72G	/content
55M	/content/sample_data
228K	/content/drive/.Encrypted
140K	/content/.config
84K	/content/.config/logs
12K	/content/drive/.Trash-0
8.0K	/content/.config/configurations
4.0K	/content/drive/.shortcut-targets-by-id


In [None]:
# Same report, three levels deep: shows the size of each folder under MyDrive,
# including the checkpoint directories.
!du -h --max-depth=3 /content | sort -hr

72G	/content/drive/MyDrive
72G	/content/drive
72G	/content
31G	/content/drive/MyDrive/intent2gen_models
15G	/content/drive/MyDrive/Colab Notebooks
6.9G	/content/drive/MyDrive/DeBERTa_GECToR
4.2G	/content/drive/MyDrive/DeBERTa_GECToR_2M_add1
2.1G	/content/drive/MyDrive/ged_spanbert
1.3G	/content/drive/MyDrive/ged__spanbert
1.2G	/content/drive/MyDrive/GEC_dataset
1.1G	/content/drive/MyDrive/DeBERTa
849M	/content/drive/MyDrive/rolebert_finetune_out
712M	/content/drive/MyDrive/DeBERTa_GECToR_add2
712M	/content/drive/MyDrive/DeBERTa_GECToR_2M
403M	/content/drive/MyDrive/python_codeNET_preprocess
386M	/content/drive/MyDrive/dacon_preprocess_data
262M	/content/drive/MyDrive/models
259M	/content/drive/MyDrive/rolebert_finetuned
55M	/content/sample_data
15M	/content/drive/MyDrive/unified_out
4.0M	/content/drive/MyDrive/Google AI Studio
228K	/content/drive/.Encrypted
220K	/content/drive/.Encrypted/MyDrive
140K	/content/.config
134K	/content/drive/MyDrive/Chrome에서 저장됨
84K	/content/.config/