In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
#@title  경로 설정
CSV_PATH = "/content/drive/MyDrive/LikeLion_NLP2/Project_2/data/sentence_pairs_1900000.csv"  #@param {type:"string"}
SRC_TXT  = "/content/data.src"            # 변환된 source(txt, 1문장/줄)
TGT_TXT  = "/content/data.tgt"            # 변환된 target(txt, 1문장/줄)
OUT_PREFIX = "/content/processed"         # preprocess_data.py 출력 prefix

# 샘플 드라이런용 (원본 검증 전 빠른 체크)
DRY_SRC  = "/content/dry.src"
DRY_TGT  = "/content/dry.tgt"
DRY_N    = 20000  # 샘플 라인 수


In [3]:
#@title  gector utils 다운로드
!set -e
!mkdir -p utils
%cd utils
!wget -q https://github.com/grammarly/gector/raw/master/utils/preprocess_data.py
!wget -q https://raw.githubusercontent.com/grammarly/gector/master/utils/helpers.py
%cd ..
!echo "Downloaded: utils/preprocess_data.py, utils/helpers.py"


/content/utils
/content
Downloaded: utils/preprocess_data.py, utils/helpers.py


In [4]:
!pip -q install git+https://github.com/gotutiyan/gector
# Donwload the verb dictionary in advance
!mkdir data
%cd data
!wget https://github.com/grammarly/gector/raw/master/data/verb-form-vocab.txt
%cd ..
!echo "Downloaded: data/verb-form-vocab.txt"

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for gector (pyproject.toml) ... [?25l[?25hdone
/content/data
--2025-10-27 06:02:23--  https://github.com/grammarly/gector/raw/master/data/verb-form-vocab.txt
Resolving github.com (github.com)... 140.82.112.4
Co

In [5]:
#@title  CSV(noise, clean) → txt 두 개로 변환 (스트리밍)
import csv, sys
from pathlib import Path
from tqdm.auto import tqdm

csv_path = Path(CSV_PATH)
assert csv_path.exists(), f"CSV가 존재하지 않아요: {csv_path}"

def extract_two_txt(csv_path, src_path, tgt_path,
                    noise_keys=("noise","source","noisy","input"),
                    clean_keys=("clean","target","gold","output")):
    # 대용량에 안전한 스트리밍 변환
    n_read, n_written, n_skipped = 0, 0, 0
    with open(csv_path, 'r', encoding='utf-8', errors='ignore', newline='') as fin, \
         open(src_path, 'w', encoding='utf-8') as fs, \
         open(tgt_path, 'w', encoding='utf-8') as ft:
        reader = csv.DictReader(fin)
        # 컬럼명 정규화(lower) 후 매핑
        field_map = {k.lower():k for k in reader.fieldnames}
        def pick(row, keys):
            for k in keys:
                if k in field_map:
                    return (row.get(field_map[k]) or "").strip()
            return ""
        for row in tqdm(reader, desc="Extracting src/tgt"):
            n_read += 1
            s = pick(row, noise_keys)
            t = pick(row, clean_keys)
            if not s or not t:
                n_skipped += 1
                continue
            fs.write(s + "\n")
            ft.write(t + "\n")
            n_written += 1
    return n_read, n_written, n_skipped

n_read, n_written, n_skipped = extract_two_txt(CSV_PATH, SRC_TXT, TGT_TXT)
print(f"rows_read={n_read:,}, rows_written={n_written:,}, skipped={n_skipped:,}")
print("SRC sample:")
!head -n 3 $SRC_TXT
print("TGT sample:")
!head -n 3 $TGT_TXT


Extracting src/tgt: 0it [00:00, ?it/s]

rows_read=1,900,000, rows_written=1,900,000, skipped=0
SRC sample:
Much many brands and sellers still in the market.
Fairy Or Not, I'm the Godmother: no just look, but my outfit for taking the part as godmother.
The Commission is of the view that it is too early to develop a policy for HD Radio technology even though it is in its initial stages in Canada; Commission will accept continued experimentation, voluntary participation or transition to HD Radio technology, and will monitor developments and review its approach accordingly.
TGT sample:
Many brands and sellers still in the market.
Fairy Or Not, I'm the Godmother: Not just a look, but my outfit for taking on the role as godmother.
The Commission is of the view that it is too early to develop a policy for HD Radio technology given that it is still in its initial stages in Canada. The Commission will allow continued experimentation, voluntary participation in or transition to HD Radio technology, and will monitor developments and re

In [6]:
#@title  드라이런용 샘플 만들기 (빠른 검증)
!head -n $DRY_N $SRC_TXT > $DRY_SRC
!head -n $DRY_N $TGT_TXT > $DRY_TGT
!wc -l $DRY_SRC $DRY_TGT
!echo "SRC dry head:" && head -n 2 $DRY_SRC
!echo "TGT dry head:" && head -n 2 $DRY_TGT


  20000 /content/dry.src
  20000 /content/dry.tgt
  40000 total
SRC dry head:
Much many brands and sellers still in the market.
Fairy Or Not, I'm the Godmother: no just look, but my outfit for taking the part as godmother.
TGT dry head:
Many brands and sellers still in the market.
Fairy Or Not, I'm the Godmother: Not just a look, but my outfit for taking on the role as godmother.


In [7]:
!pip install Levenshtein



In [8]:
#@title  preprocess_data.py 실행 (DRY-RUN: 샘플)
!set -e
!python utils/preprocess_data.py \
    -s "$DRY_SRC" \
    -t "$DRY_TGT" \
    -o "{OUT_PREFIX}.dry"

!echo "샘플 전처리 완료. 출력 파일 목록:"
!ls -lh {OUT_PREFIX}.dry*


The size of raw dataset is 20000
20000it [00:13, 1536.77it/s]
Overall extracted 20000. Original TP 19882. Original TN 118
샘플 전처리 완료. 출력 파일 목록:
-rw-r--r-- 1 root root 11M Oct 27 06:03 /content/processed.dry


In [9]:
!echo "precessed.dry :" && head -n 2 {OUT_PREFIX}.dry

precessed.dry :
$STARTSEPL|||SEPR$KEEP MuchSEPL|||SEPR$DELETE manySEPL|||SEPR$TRANSFORM_CASE_CAPITAL brandsSEPL|||SEPR$KEEP andSEPL|||SEPR$KEEP sellersSEPL|||SEPR$KEEP stillSEPL|||SEPR$KEEP inSEPL|||SEPR$KEEP theSEPL|||SEPR$KEEP market.SEPL|||SEPR$KEEP
$STARTSEPL|||SEPR$KEEP FairySEPL|||SEPR$KEEP OrSEPL|||SEPR$KEEP Not,SEPL|||SEPR$KEEP I'mSEPL|||SEPR$KEEP theSEPL|||SEPR$KEEP Godmother:SEPL|||SEPR$KEEP noSEPL|||SEPR$REPLACE_Not justSEPL|||SEPR$APPEND_a look,SEPL|||SEPR$KEEP butSEPL|||SEPR$KEEP mySEPL|||SEPR$KEEP outfitSEPL|||SEPR$KEEP forSEPL|||SEPR$KEEP takingSEPL|||SEPR$APPEND_on theSEPL|||SEPR$KEEP partSEPL|||SEPR$REPLACE_role asSEPL|||SEPR$KEEP godmother.SEPL|||SEPR$KEEP


In [20]:
#@title  preprocess_data.py 실행 (FULL)
!set -e
!python utils/preprocess_data.py \
    -s "$SRC_TXT" \
    -t "$TGT_TXT" \
    -o "{OUT_PREFIX}.full"

!echo "전체 전처리 완료. 출력 파일 목록:"
!ls -lh {OUT_PREFIX}.full*


The size of raw dataset is 1900000
1900000it [14:01, 2258.97it/s]
Overall extracted 1900000. Original TP 1888857. Original TN 11143
전체 전처리 완료. 출력 파일 목록:
-rw-r--r-- 1 root root 1.1G Oct 27 05:22 /content/processed.full


In [10]:
# GECToR preprocess_data.py 산출물(한 줄: 태그-토큰 스트림) → 2컬럼 TSV 변환기
import re
from pathlib import Path

IN_PATH  = "/content/processed.dry"   # ← 당신의 파일
OUT_TSV  = "/content/train_tags.tsv"  # 결과(토큰/태그 2컬럼)

SEP = "SEPL\\|\\|\\|SEPR"   # 'SEPL|||SEPR'를 안전하게 매칭하기 위한 정규식

def parse_line(line: str):
    line = line.strip()
    if not line:
        return [], []
    # 1) 쌍 단위로 분할
    parts = re.split(SEP, line)
    # parts 예) ["$START", "$KEEP Much", "$DELETE many", "$TRANSFORM_CASE_CAPITAL brands", ...]
    tokens, tags = [], []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        if p.startswith("$START"):
            # 문두 앵커 → 무시
            continue
        # 2) "$TAG [토큰]"으로 분해 (첫 공백 기준)
        if " " in p:
            tag, tok = p.split(" ", 1)
            tok = tok.strip()
        else:
            # 토큰이 비어있는 태그(거의 없지만 방어)
            tag, tok = p, ""
        # 3) 표준 태그 문자열로 정리(앞의 $ 제거)
        tag = tag.lstrip("$")
        # 예: TRANSFORM_CASE_CAPITAL → CASE_CAPITAL 로 단순화하고 싶다면 여기서 매핑
        # 여기서는 원형 그대로 둡니다.
        if tok == "":
            # 실토큰이 없으면 스킵
            continue
        tokens.append(tok)
        tags.append(tag)
    return tokens, tags

n = 0
bad = 0
with open(IN_PATH, encoding="utf-8") as fin, open(OUT_TSV, "w", encoding="utf-8") as fout:
    fout.write("tokens\ttags\n")
    for line in fin:
        toks, tg = parse_line(line)
        if not toks or not tg or len(toks) != len(tg):
            bad += 1
            continue
        fout.write("{}\t{}\n".format(" ".join(toks), " ".join(tg)))
        n += 1

print(f"wrote {n} lines to {OUT_TSV} (skipped {bad})")


wrote 20000 lines to /content/train_tags.tsv (skipped 0)


In [11]:
# 보기좋게 파싱 프린트
with open(IN_PATH, encoding="utf-8") as f:
    for i, line in enumerate(f, 1):
        toks, tg = parse_line(line)
        print(f"[Line {i}]")
        for a,b in zip(toks, tg):
            print(f"  {a:20s}  <--  {b}")
        if i >= 2: break


[Line 1]
  Much                  <--  KEEP
  many                  <--  DELETE
  brands                <--  TRANSFORM_CASE_CAPITAL
  and                   <--  KEEP
  sellers               <--  KEEP
  still                 <--  KEEP
  in                    <--  KEEP
  the                   <--  KEEP
  market.               <--  KEEP
[Line 2]
  Fairy                 <--  KEEP
  Or                    <--  KEEP
  Not,                  <--  KEEP
  I'm                   <--  KEEP
  the                   <--  KEEP
  Godmother:            <--  KEEP
  no                    <--  KEEP
  just                  <--  REPLACE_Not
  look,                 <--  APPEND_a
  but                   <--  KEEP
  my                    <--  KEEP
  outfit                <--  KEEP
  for                   <--  KEEP
  taking                <--  KEEP
  the                   <--  APPEND_on
  part                  <--  KEEP
  as                    <--  REPLACE_role
  godmother.            <--  KEEP
