# Preprocessing

## Environment & imports

In [1]:
# ---------------------------------------------
# 0)  Install (first‐time only) & import libs
# ---------------------------------------------
# !pip install -q datasets transformers emoji==2.10.0 tqdm

from pathlib import Path
import re
import random
import json
from collections import defaultdict
from typing import List, Dict, Tuple

import emoji
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


## Load SAMSum

In [2]:
# ---------------------------------------------------------
# 1) Load SAMSum — 14 732 / 819 / 818 dialogues
# ---------------------------------------------------------
raw_ds: DatasetDict = load_dataset("samsum")
print({k: len(v) for k, v in raw_ds.items()})

Using the latest cached version of the module from /home/drl-68/.cache/huggingface/modules/datasets_modules/datasets/samsum/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e (last modified on Mon May  5 10:16:55 2025) since it couldn't be found locally at samsum, or remotely on the Hugging Face Hub.


{'train': 14732, 'test': 819, 'validation': 818}


## Build an emoji vocabulary and speaker token & Build / extend the tokenizer

count [UNK] occurrences in one HF Dataset

In [3]:
from tqdm import tqdm
import numpy as np
import torch

def count_unk(ds, tokenizer, field="dialogue", batch_size=1024):
    unk_id = tokenizer.unk_token_id
    total_unk, total_tokens = 0, 0

    for i in tqdm(range(0, len(ds), batch_size), desc="Tokenising"):
        batch_texts = ds[i : i + batch_size][field]
        enc = tokenizer(batch_texts, add_special_tokens=True, padding=False, truncation=False)
        for ids in enc["input_ids"]:
            arr = np.array(ids)
            total_unk += np.sum(arr == unk_id)
            total_tokens += len(arr)
    return total_unk, total_tokens

BEFORE adding emojis

In [4]:
tok_base = AutoTokenizer.from_pretrained("bert-base-uncased")
unk_stats_before = {}
for split in ["train", "validation", "test"]:
    unk_stats_before[split] = count_unk(raw_ds[split], tok_base)
print("\n[UNK] counts BEFORE adding emoji tokens")
for split, (u, t) in unk_stats_before.items():
    print(f"{split:<10}: {u:8d}  ({u/t:.3%} of tokens)")

Tokenising:   0%|          | 0/15 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
Tokenising: 100%|██████████| 15/15 [00:00<00:00, 22.91it/s]
Tokenising: 100%|██████████| 1/1 [00:00<00:00, 38.01it/s]
Tokenising: 100%|██████████| 1/1 [00:00<00:00, 35.96it/s]


[UNK] counts BEFORE adding emoji tokens
train     :     3758  (0.185% of tokens)
validation:      191  (0.174% of tokens)
test      :      195  (0.170% of tokens)





สร้าง EMOJI_TOKENS

In [5]:
# ถ้า kernel เพิ่งรีสตาร์ต ตัวแปรจะหายหมด
# สร้างชุด emoji ใหม่จาก raw_ds
from typing import List
import emoji

def extract_emojis(text: str) -> List[str]:
    return [ch for ch in text if ch in emoji.EMOJI_DATA]

emoji_set = set()
for split in ["train", "validation", "test"]:
    for dlg in raw_ds[split]["dialogue"]:
        emoji_set.update(extract_emojis(dlg))

EMOJI_TOKENS = sorted(emoji_set)          # ≈ 300-320 รายการ
print(f"Unique emojis found: {len(EMOJI_TOKENS)}")

Unique emojis found: 305


Extend tokenizer with emojis + speaker tags

In [6]:
from transformers import AutoTokenizer

# ---------- 1) โหลด tokenizer ดั้งเดิม ----------
tok_base = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_orig = len(tok_base)

# ---------- 2) เตรียมชุด token ใหม่ ----------
#   • EMOJI_TOKENS  : ทุกอิโมจิที่ “พบอย่างน้อย 1 ครั้ง” ใน SAMSum
#   • SPEAKER_TOKENS: [S1] – [S10]
SPEAKER_TOKENS = [f"[S{i}]" for i in range(1, 11)]
new_tokens = EMOJI_TOKENS + SPEAKER_TOKENS

# ---------- 3) สร้าง tokenizer สำเนาแล้วเพิ่ม token ----------
tok_ext = AutoTokenizer.from_pretrained("bert-base-uncased")
added = tok_ext.add_tokens(new_tokens)
vocab_new = len(tok_ext)

# ---------- 4) แสดงผล ----------
print(f"Original vocab size : {vocab_orig}")
print(f"Added new tokens     : {added}  "
      f"(emoji = {len(EMOJI_TOKENS)}, speaker = {len(SPEAKER_TOKENS)})")
print(f"New vocab size       : {vocab_new}")

# (Optional) พิมพ์ตัวอย่างอิโมจิ 20 ตัวแรก
print("\nFirst 20 emoji tokens:", EMOJI_TOKENS[:20])

tok_ext.save_pretrained("tokenizer_samsum_su")   # โฟลเดอร์ใหม่

Original vocab size : 30522
Added new tokens     : 315  (emoji = 305, speaker = 10)
New vocab size       : 30836

First 20 emoji tokens: ['‼', '⏱', '☀', '☂', '☔', '☕', '☘', '☝', '☠', '☢', '☹', '☺', '♀', '♂', '♥', '♻', '⚪', '⚫', '⚰', '⚽']


('tokenizer_samsum_su/tokenizer_config.json',
 'tokenizer_samsum_su/special_tokens_map.json',
 'tokenizer_samsum_su/vocab.txt',
 'tokenizer_samsum_su/added_tokens.json',
 'tokenizer_samsum_su/tokenizer.json')

AFTER adding emojis

In [7]:
unk_stats_after = {}
for split in ["train", "validation", "test"]:
    unk_stats_after[split] = count_unk(raw_ds[split], tok_ext)
print("\n[UNK] counts AFTER adding emoji tokens")
for split, (u, t) in unk_stats_after.items():
    print(f"{split:<10}: {u:8d}  ({u/t:.3%} of tokens)")

Tokenising:   0%|          | 0/15 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
Tokenising: 100%|██████████| 15/15 [00:00<00:00, 22.52it/s]
Tokenising: 100%|██████████| 1/1 [00:00<00:00, 36.58it/s]
Tokenising: 100%|██████████| 1/1 [00:00<00:00, 36.09it/s]


[UNK] counts AFTER adding emoji tokens
train     :      451  (0.022% of tokens)
validation:        4  (0.004% of tokens)
test      :       24  (0.021% of tokens)





reduction check in UNKs

In [9]:
print("\nΔ [UNK] (before ➜ after):")
for split in ["train", "validation", "test"]:
    u0, _ = unk_stats_before[split]
    u1, _ = unk_stats_after[split]
    print(f"{split:<10}: {u0-u1:+d}  fewer UNKs  (↓{(u0-u1)/u0:.2%})")


Δ [UNK] (before ➜ after):
train     : +3307  fewer UNKs  (↓88.00%)
validation: +187  fewer UNKs  (↓97.91%)
test      : +171  fewer UNKs  (↓87.69%)


## Preprocess SAMSum Dateset

Speaker-name mapping → [S#]

In [10]:
# ---------------------------------------------------------
# 4) Helper to replace speaker names by [S#]
# ---------------------------------------------------------
SPEAKER_RE = re.compile(r"^([^:]+):\s*(.*)$")

def map_speakers(dialogue: str, max_speakers: int = 10
                 ) -> Tuple[str, Dict[str, str]]:
    """
    Returns dialogue with names replaced by [S#] and a mapping dict.
    """
    speaker_map, next_id = {}, 1
    new_lines = []
    for line in dialogue.split("\n"):
        m = SPEAKER_RE.match(line)
        if not m:                # safety – keep line as is
            new_lines.append(line)
            continue
        name, utt = m.groups()
        if name not in speaker_map:
            if next_id > max_speakers:      # truncate extra speakers
                name_token = "[SUNK]"
            else:
                name_token = f"[S{next_id}]"
                speaker_map[name] = name_token
                next_id += 1
        new_lines.append(f"{speaker_map.get(name, '[SUNK]')}: {utt}")
    return "\n".join(new_lines), speaker_map


Insert [SEP] after every utterance

In [11]:
def add_sep_every_utt(dialogue: str) -> str:
    lines = [l + " [SEP]" for l in dialogue.split("\n") if l.strip()]
    return " ".join(lines)

Switching-Utterance corruption
- Hyper-parameters: Pu = 1.0, Pn = 0/1

โดยที่

Pu (permute-utterance prob.) ความน่าจะเป็นที่ แต่ละ utterance จะถูกเลือก ใส่ลงในชุดที่นำไปสับตำแหน่ง

- pu = 1.0 แสดงว่าบังคับเลือกทุกบรรทัดแล้วค่อยสับคำแบบสุ่ม

Pn (name-mask prob.) ความน่าจะเป็นที่ token [S#] ด้านหน้าจะถูกเปลี่ยนเป็น [MASK]

- pn = 0.0 แสดงว่า ไม่ mask, โมเดลเห็น speaker tag

- pn = 1.0 แสดงว่า mask หมด, บังคับดู context

In [None]:
def make_switching_utterance(dialogue: str,
                             pu: float = 1.0,
                             pn: float = 0.0,
                             rng: random.Random = random
                            ) -> Tuple[str, List[int]]:
    """
    • dialogue  - speaker-tokenised, SEP-inserted string
    • pu        - prob. an utterance is selected for permutation
    • pn        - prob. we MASK the speaker token (⇒ [MASK])
    Returns:
        corrupted_dialogue, labels_per_utt  (1 = permuted (สลับบทพูด), 0 = original)
    """
    # 1) split back into utterances
    utts = [u.strip() for u in dialogue.split("[SEP]") if u.strip()]
    idxs = list(range(len(utts)))

    # 2) pick indices to permute
    perm_idx = [i for i in idxs if rng.random() < pu]
    shuffled = perm_idx.copy()
    rng.shuffle(shuffled)                 # in-place
    perm_map = dict(zip(perm_idx, shuffled))

    # 3) build new utterance list, labels
    new_utts, labels = [], []
    for i in idxs:
        src = perm_map.get(i, i)          # swapped or same
        u = utts[src]
        # optionally mask speaker token ([S#]: → [MASK]:)
        if rng.random() < pn:
            u = re.sub(r"^\[S\d+\]", "[MASK]", u)
        new_utts.append(u)
        labels.append(int(src != i))      # 1 if permuted
    corrupted = " [SEP] ".join(new_utts) + " [SEP]"
    return corrupted, labels


## Switching-Utterance (SU) pre-training dataset

In [None]:
# ---------------------------------------------------------
# 7) Create HF Datasets with tokenised inputs, attention,
#    SEP positions, and per-utterance labels
# ---------------------------------------------------------
MAX_LEN = 512                          # paper setting
Pu, Pn = 1.0, 0.0                      # best config in Table 2

def preprocess_example(example, split):
    # a) replace speakers & add SEP
    dlg, _ = map_speakers(example["dialogue"])
    dlg = add_sep_every_utt(dlg)

    # b) corruption
    corrupted, labels = make_switching_utterance(dlg, Pu, Pn)

    # c) tokenize (truncate if >512 tokens)
    enc = tok(corrupted,
              truncation=True, max_length=MAX_LEN,
              padding="max_length")
    
    # d) find SEP token positions (needed for loss later)
    sep_id = tok("[SEP]")["input_ids"][0]
    sep_positions = [i for i, id_ in enumerate(enc["input_ids"])
                     if id_ == sep_id][:len(labels)]  # clip if truncated

    enc["labels"] = labels[:len(sep_positions)]
    enc["sep_positions"] = sep_positions
    enc["dialogue_len"] = len(labels)
    return enc

su_ds = DatasetDict()
for split in ["train", "validation", "test"]:
    su_ds[split] = raw_ds[split].map(
        preprocess_example,
        fn_kwargs={"split": split},
        remove_columns=raw_ds[split].column_names,
        desc=f"Building SU {split}"
    )

su_ds.save_to_disk("data/samsum_switching_utterance")
print(su_ds)

Building SU train: 100%|██████████| 14732/14732 [00:07<00:00, 1850.04 examples/s]
Building SU validation: 100%|██████████| 818/818 [00:00<00:00, 1841.35 examples/s]
Building SU test: 100%|██████████| 819/819 [00:00<00:00, 1815.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 581300.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 204161.90 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 201261.72 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'sep_positions', 'dialogue_len'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'sep_positions', 'dialogue_len'],
        num_rows: 818
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'sep_positions', 'dialogue_len'],
        num_rows: 819
    })
})





ไฟล์ Arrow ถูกบันทึกไว้ที่ data/samsum_switching_utterance/ พร้อมฟิลด์ input_ids / attention_mask / labels / sep_positions / dialogue_len.

## Self-supervised Pre-training

ใช้ Dataset เฉพาะส่วนของ train ของ SAMSum มาทำการ pre_train แล้วใช้ validation ไว้ดู early-stopping / tuning ส่วน test ต้องไม่ถูกแตะ เพื่อไม่ให้โมเดล “เห็น” บทสนทนาที่จะใช้วัด ROUGE ภายหลัง

In [23]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

Imports & helpers

In [6]:
import math
import torch
import random
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from datasets import load_from_disk

# -------------------------------
# CONFIG
# -------------------------------
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16 # paper 128
MAX_LEN    = 512 # paper 512
LR         = 3e-5
WARMUP     = 500
MAX_STEPS  = 40000

  return torch._C._cuda_getDeviceCount() > 0


Dataset & collate

In [6]:
# -------------------------------
# LOAD DATASET
# -------------------------------
dataset = load_from_disk("data/samsum_switching_utterance")

# -------------------------------
# COLLATE FUNCTION
# -------------------------------
def collate_fn(batch):
    keys = ["input_ids", "token_type_ids", "attention_mask"]
    inputs = {k: torch.tensor([b[k] for b in batch]) for k in keys}
    labels = [torch.tensor(b["labels"], dtype=torch.float) for b in batch]
    sep_pos = [torch.tensor(b["sep_positions"]) for b in batch]
    return inputs, labels, sep_pos


Model

In [None]:
# -------------------------------
# MODEL
# -------------------------------
class SepClassifier(nn.Module):
    def __init__(self, model_name="bert-base-uncased", dropout=0.1):
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids, sep_positions):
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        ).last_hidden_state

        # Collect hidden states at each [SEP] position
        # sep_vecs = [hidden_states[i, pos] for i, pos in enumerate(sep_positions)]
        sep_vecs = []
        for i, pos_tensor in enumerate(sep_positions):
            pos_tensor = pos_tensor.to(hidden_states.device).long()  # <-- เพิ่มการ cast
            sep_vecs.append(hidden_states[i].index_select(0, pos_tensor))  # (U_i, H)

        sep_vecs = torch.cat(sep_vecs, dim=0)  # Shape: (total_seps, hidden_size)
        logits = self.classifier(self.dropout(sep_vecs)).squeeze(-1)
        return logits

Training loop (train model until the train loss converged (upper bounded by 5k steps)

In [24]:
# -------------------------------
# INITIALIZATION
# -------------------------------
model = SepClassifier().to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained("tokenizer_samsum_su")
model.bert.resize_token_embeddings(len(tokenizer)) # Adjust embedding size for extended tokens

train_loader = DataLoader(
    dataset["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)

optimizer = AdamW(model.parameters(), lr=LR)
total_steps = min(MAX_STEPS, len(train_loader))
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP,
    num_training_steps=total_steps
)
loss_fn = nn.BCEWithLogitsLoss()

# -------------------------------
# TRAINING LOOP
# -------------------------------
step = 0
running_loss = 0.0
model.train()

for epoch in range(100):  # loop until MAX_STEPS reached
    for inputs, label_lists, sep_lists in train_loader:
        if step >= MAX_STEPS:
            break

        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        flat_labels = torch.cat(label_lists).to(DEVICE)

        logits = model(**inputs, sep_positions=sep_lists)
        loss = loss_fn(logits, flat_labels)

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        running_loss += loss.item()
        step += 1

        if step % 100 == 0:
            print(f"Step {step:4d}/{total_steps} | Loss: {running_loss / 100:.4f}")
            running_loss = 0.0

    if step >= MAX_STEPS:
        break

# -------------------------------
# SAVE MODEL
# -------------------------------
torch.save(model.state_dict(), "bert_su_pretrained.pt")
print("Model saved to 'bert_su_pretrained.pt'")

Step  100/920 | Loss: 0.4475
Step  200/920 | Loss: 0.3658
Step  300/920 | Loss: 0.3322
Step  400/920 | Loss: 0.3086
Step  500/920 | Loss: 0.2784
Step  600/920 | Loss: 0.2073
Step  700/920 | Loss: 0.1651
Step  800/920 | Loss: 0.1274
Step  900/920 | Loss: 0.1344
Step 1000/920 | Loss: 0.1066
Step 1100/920 | Loss: 0.0985
Step 1200/920 | Loss: 0.0876
Step 1300/920 | Loss: 0.1076
Step 1400/920 | Loss: 0.1057
Step 1500/920 | Loss: 0.1119
Step 1600/920 | Loss: 0.1069
Step 1700/920 | Loss: 0.1009
Step 1800/920 | Loss: 0.0942
Step 1900/920 | Loss: 0.0999
Step 2000/920 | Loss: 0.1005
Step 2100/920 | Loss: 0.1036
Step 2200/920 | Loss: 0.0995
Step 2300/920 | Loss: 0.0981
Step 2400/920 | Loss: 0.1093
Step 2500/920 | Loss: 0.1070
Step 2600/920 | Loss: 0.0990
Step 2700/920 | Loss: 0.0996
Step 2800/920 | Loss: 0.1021
Step 2900/920 | Loss: 0.0949
Step 3000/920 | Loss: 0.1073
Step 3100/920 | Loss: 0.1104
Step 3200/920 | Loss: 0.1053
Step 3300/920 | Loss: 0.0948
Step 3400/920 | Loss: 0.1014
Step 3500/920 

In [31]:
torch.save(model.bert.state_dict(), "bert_su_pretrained.pt")

Validation & early-stop (optional)

- Use the same DataLoader/loop on su_ds["validation"], compute average BCE loss; if it plateaus you can stop earlier than 5 k steps (what the authors mean by “until train loss converged”).

# Create Summarization Dataset

ขั้นตอนการทำ preprocess
1. โหลดชุดข้อมูล SAMSum
2. ทำ preprocessing:
    - แทนชื่อ speaker ด้วย [S1]–[S10]
    - เติม [SEP] ท้ายทุกประโยค
    - ใช้ tokenizer เดิมจาก pretraining (tokenizer_samsum_su)
    - truncate/pad ความยาวที่ max_length = 512
3. แปลงให้อยู่ในรูปแบบที่พร้อมใช้สำหรับ Seq2SeqTrainer
4. Save เป็นไฟล์ .pt หรือ DatasetDict ที่พร้อมใช้งาน

Load SAMSum Dataset

In [12]:
raw_ds: DatasetDict = load_dataset("samsum")
print({k: len(v) for k, v in raw_ds.items()})

Using the latest cached version of the module from /home/drl-68/.cache/huggingface/modules/datasets_modules/datasets/samsum/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e (last modified on Mon May  5 10:16:55 2025) since it couldn't be found locally at samsum, or remotely on the Hugging Face Hub.


{'train': 14732, 'test': 819, 'validation': 818}


Load Pretrained Tokenizer (same as used during pretraining)

In [13]:
tokenizer = AutoTokenizer.from_pretrained("tokenizer_samsum_su")
MAX_LEN = 512

Speaker Normalization Helpers

In [15]:
SPEAKER_RE = re.compile(r"^([^:]+):\s*(.*)$")

def map_speakers(dialogue: str, max_speakers: int = 10) -> Tuple[str, Dict[str, str]]:
    """
    Replace speaker names with generic [S1], [S2], ... tokens.
    """
    speaker_map, next_id = {}, 1
    new_lines = []
    for line in dialogue.split("\n"):
        m = SPEAKER_RE.match(line)
        if not m:
            new_lines.append(line)
            continue
        name, utt = m.groups()
        if name not in speaker_map:
            if next_id > max_speakers:
                name_token = "[SUNK]"
            else:
                name_token = f"[S{next_id}]"
                speaker_map[name] = name_token
                next_id += 1
        name_token = speaker_map.get(name, "[SUNK]")
        new_lines.append(f"{name_token}: {utt}")
    return "\n".join(new_lines), speaker_map

def add_sep_every_utt(dialogue: str) -> str:
    lines = [l + " [SEP]" for l in dialogue.split("\n") if l.strip()]
    return " ".join(lines)


Preprocessing Function

In [16]:
def preprocess_fn(example):
    normed_dialogue, _ = map_speakers(example["dialogue"])
    sep_dialogue = add_sep_every_utt(normed_dialogue)

    inputs = tokenizer(
        sep_dialogue,
        truncation=True,
        padding='max_length',
        max_length=MAX_LEN,
    )
    targets = tokenizer(
        example["summary"],
        truncation=True,
        padding='max_length',
        max_length=MAX_LEN,
    )

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"]
    }

Apply Preprocessing

In [17]:
tokenized_ds = raw_ds.map(preprocess_fn, batched=False)
tokenized_ds.save_to_disk("samsum_finetune_ready")
print("Preprocessed dataset saved to 'samsum_finetune_ready'")

Map: 100%|██████████| 14732/14732 [00:08<00:00, 1649.04 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 1614.32 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 1669.57 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 323900.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 177123.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 159697.48 examples/s]

Preprocessed dataset saved to 'samsum_finetune_ready'





In [None]:
# MAX_LEN = 512  # paper setting

# def preprocess_example(example, split):
#     # a) replace speakers & add SEP (same as pretraining)
#     dlg, _ = map_speakers(example["dialogue"])  # แปลงชื่อให้เป็น token สั้น ๆ เช่น <USR1>
#     dlg = add_sep_every_utt(dlg)                # เพิ่ม [SEP] ทุกท้ายประโยค

#     # b) tokenize dialogue input
#     enc = tok_base(dlg,
#               truncation=True,
#               max_length=MAX_LEN,
#               padding="max_length")

#     # c) tok_baseenize target summary
#     with tok_base.as_target_tokenizer():
#         summary = example["summary"]
#         summary_enc = tok_base(summary,
#                           truncation=True,
#                           max_length=MAX_LEN,
#                           padding="max_length")
    
#     # d) pack input and label
#     enc["labels"] = summary_enc["input_ids"]
#     return enc

# # สร้าง dataset ใหม่สำหรับ fine-tune
# finetune_ds = DatasetDict()
# for split in ["train", "validation", "test"]:
#     finetune_ds[split] = raw_ds[split].map(
#         preprocess_example,
#         fn_kwargs={"split": split},
#         remove_columns=raw_ds[split].column_names,
#         desc=f"Building Fine-tuning {split}"
#     )

# finetune_ds.save_to_disk("data/samsum_finetune")
# print(finetune_ds)

Building Fine-tuning train: 100%|██████████| 14732/14732 [00:09<00:00, 1561.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 183818.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 129347.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 126710.99 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
})





# Fine-tuning 

### BERT + SU


**เทียบกับ Paper**

| **Parameter**       | **Code**                          | **Paper (Section 3.2)**          | **เหมือน / ไม่เหมือน**        |
| ------------------- | --------------------------------- | -------------------------------- | --------------       |
| Model               | BERT2BERT (EncoderDecoderModel)   | BERT2BERT                        | เหมือน                 |
| Tokenizer           | bert-base-uncased + custom tokens | ใช้ tokenizer ดัดแปลง              | เหมือน              |
| Batch Size          | 8                                 | **16 (per step)**                | ไม่เหมือน → เล็กกว่า  |
| Epochs              | 3                                 | 3                                | เหมือน              |
| Learning Rate       | 5e-5                              | **3e-5**                         | ไม่เหมือน → สูงกว่า   |
| Warmup Steps        | 500                               | ใช้ scheduler (แต่ไม่ระบุ exact)     | เหมือน (สมเหตุสมผล) |
| Max Length (input)  | 512                               | 512                              | เหมือน              |
| Max Length (output) | 128                               | 128                              | เหมือน              |
| Beam Search         | 4                                 | 4                                | เหมือน              |

---


In [None]:
import torch
from transformers import (
    BertTokenizerFast,
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import load_from_disk

# ------------------------------
# Load processed dataset & tokenizer
# ------------------------------
dataset = load_from_disk("data/samsum_finetune_ready")
tokenizer = BertTokenizerFast.from_pretrained("tokenizer_samsum_su")

# ------------------------------
# Load pretrained EncoderDecoderModel
# ------------------------------
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased"
)
model.encoder.resize_token_embeddings(len(tokenizer))
model.decoder.resize_token_embeddings(len(tokenizer))

# Load your pretrained encoder weights
model.encoder.load_state_dict(torch.load("bert_su_pretrained.pt", map_location="cpu"))
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.config.max_length = 128
model.config.num_beams = 4

# ------------------------------
# Define training arguments
# ------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="steps",
    logging_steps=500,
    save_steps=1000,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
)

# ------------------------------
# Data Collator & Trainer
# ------------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ------------------------------
# Start Training
# ------------------------------
trainer.train()
model.save_pretrained("bert_samsum_finetuned")
tokenizer.save_pretrained("tokenizer_samsum_su_finetune")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.enco

{'loss': 1.4418, 'grad_norm': 0.3646565079689026, 'learning_rate': 4.96e-05, 'epoch': 0.27}


                                                  
  9%|▉         | 501/5526 [01:41<2:22:17,  1.70s/it]

{'eval_loss': 0.22444657981395721, 'eval_runtime': 5.0061, 'eval_samples_per_second': 163.402, 'eval_steps_per_second': 20.575, 'epoch': 0.27}


 18%|█▊        | 1000/5526 [03:17<14:36,  5.17it/s] 

{'loss': 0.2172, 'grad_norm': 0.4135509133338928, 'learning_rate': 4.506565857540788e-05, 'epoch': 0.54}


                                                   


{'eval_loss': 0.19904808700084686, 'eval_runtime': 5.0413, 'eval_samples_per_second': 162.259, 'eval_steps_per_second': 20.431, 'epoch': 0.54}


 27%|██▋       | 1500/5526 [05:00<12:53,  5.21it/s]  

{'loss': 0.2008, 'grad_norm': 0.274005264043808, 'learning_rate': 4.009152407481098e-05, 'epoch': 0.81}


                                                   
 27%|██▋       | 1501/5526 [05:06<1:53:34,  1.69s/it]

{'eval_loss': 0.1895398199558258, 'eval_runtime': 4.9978, 'eval_samples_per_second': 163.672, 'eval_steps_per_second': 20.609, 'epoch': 0.81}


 36%|███▌      | 2000/5526 [06:42<11:18,  5.20it/s]  

{'loss': 0.1864, 'grad_norm': 0.31300443410873413, 'learning_rate': 3.511738957421409e-05, 'epoch': 1.09}


                                                   
 36%|███▌      | 2000/5526 [06:47<11:18,  5.20it/s]

{'eval_loss': 0.18359985947608948, 'eval_runtime': 4.9794, 'eval_samples_per_second': 164.278, 'eval_steps_per_second': 20.685, 'epoch': 1.09}


 45%|████▌     | 2500/5526 [08:25<09:41,  5.21it/s]  

{'loss': 0.17, 'grad_norm': 0.36899664998054504, 'learning_rate': 3.0143255073617192e-05, 'epoch': 1.36}


                                                   
 45%|████▌     | 2501/5526 [08:30<1:25:32,  1.70s/it]

{'eval_loss': 0.17844413220882416, 'eval_runtime': 5.0089, 'eval_samples_per_second': 163.311, 'eval_steps_per_second': 20.564, 'epoch': 1.36}


 54%|█████▍    | 3000/5526 [10:06<08:05,  5.20it/s]  

{'loss': 0.1689, 'grad_norm': 0.43778106570243835, 'learning_rate': 2.5169120573020293e-05, 'epoch': 1.63}


                                                   
 54%|█████▍    | 3000/5526 [10:11<08:05,  5.20it/s]

{'eval_loss': 0.17464406788349152, 'eval_runtime': 4.984, 'eval_samples_per_second': 164.126, 'eval_steps_per_second': 20.666, 'epoch': 1.63}


 63%|██████▎   | 3500/5526 [11:50<06:25,  5.26it/s]  

{'loss': 0.1632, 'grad_norm': 0.3436298966407776, 'learning_rate': 2.01949860724234e-05, 'epoch': 1.9}


                                                   
 63%|██████▎   | 3501/5526 [11:55<56:57,  1.69s/it]

{'eval_loss': 0.17154935002326965, 'eval_runtime': 4.9824, 'eval_samples_per_second': 164.179, 'eval_steps_per_second': 20.673, 'epoch': 1.9}


 72%|███████▏  | 4000/5526 [13:31<04:52,  5.21it/s]

{'loss': 0.1487, 'grad_norm': 0.4152975380420685, 'learning_rate': 1.5220851571826503e-05, 'epoch': 2.17}


                                                   
 72%|███████▏  | 4000/5526 [13:36<04:52,  5.21it/s]

{'eval_loss': 0.17133557796478271, 'eval_runtime': 4.98, 'eval_samples_per_second': 164.259, 'eval_steps_per_second': 20.683, 'epoch': 2.17}


 81%|████████▏ | 4500/5526 [15:14<03:17,  5.21it/s]  

{'loss': 0.1356, 'grad_norm': 0.35845550894737244, 'learning_rate': 1.0246717071229607e-05, 'epoch': 2.44}


                                                   
 81%|████████▏ | 4501/5526 [15:19<28:52,  1.69s/it]

{'eval_loss': 0.1692614108324051, 'eval_runtime': 4.9857, 'eval_samples_per_second': 164.068, 'eval_steps_per_second': 20.659, 'epoch': 2.44}


 90%|█████████ | 5000/5526 [16:55<01:41,  5.20it/s]

{'loss': 0.1402, 'grad_norm': 0.3714558780193329, 'learning_rate': 5.27258257063271e-06, 'epoch': 2.71}


                                                   
 90%|█████████ | 5000/5526 [17:00<01:41,  5.20it/s]

{'eval_loss': 0.1681322604417801, 'eval_runtime': 4.9793, 'eval_samples_per_second': 164.281, 'eval_steps_per_second': 20.686, 'epoch': 2.71}


100%|█████████▉| 5500/5526 [18:38<00:04,  5.21it/s]

{'loss': 0.1387, 'grad_norm': 0.4560355544090271, 'learning_rate': 2.984480700358138e-07, 'epoch': 2.99}


                                                   
100%|█████████▉| 5501/5526 [18:43<00:42,  1.69s/it]

{'eval_loss': 0.1668214201927185, 'eval_runtime': 4.9802, 'eval_samples_per_second': 164.251, 'eval_steps_per_second': 20.682, 'epoch': 2.99}


100%|██████████| 5526/5526 [18:50<00:00,  4.89it/s]


{'train_runtime': 1130.5819, 'train_samples_per_second': 39.091, 'train_steps_per_second': 4.888, 'train_loss': 0.28212739849194124, 'epoch': 3.0}


('tokenizer_samsum_su_finetune/tokenizer_config.json',
 'tokenizer_samsum_su_finetune/special_tokens_map.json',
 'tokenizer_samsum_su_finetune/vocab.txt',
 'tokenizer_samsum_su_finetune/added_tokens.json',
 'tokenizer_samsum_su_finetune/tokenizer.json')

### Evaluation BERT+SU

1. ROUGE (Recall-Oriented Understudy for Gisting Evaluation) ใช้วัดความคล้ายกันระหว่างสรุปที่โมเดลสร้างขึ้นกับสรุปอ้างอิง โดยเน้นไปที่ recall เป็นหลัก
	- ROUGE-1 (R-1) = Unigram overlap (คำเดี่ยว)
	- ROUGE-2 (R-2) = Bigram overlap (คำติดกัน 2 คำ)
	- ROUGE-L (R-L) = ใช้ Longest common subsequence (LCS) ในการวัดความคล้ายเชิงลำดับคำที่ยาวที่สุดที่ปรากฏในทั้งสองสรุป โดยคำนึงถึงลำดับคำด้วย

2. BLEU (Bilingual Evaluation Understudy) เดิมทีใช้ในงานแปลภาษา แต่ถูกประยุกต์ใช้ในงานสรุปข้อความได้เช่นกัน โดย BLEU จะเน้นการวัด precision คือดูว่า คำที่โมเดลสร้าง มีเท่าไรที่ตรงกับสรุปจริง ต่างจาก ROUGE ที่เน้น recall
	- BLEU วัดการทับซ้อนของ n-gram เช่น unigram, bigram, trigram
	- มีการใช้ brevity penalty หากสรุปสั้นกว่าที่ควรจะเป็น

3. BERTScore (BS) ใช้ embedding จากโมเดล BERT หรือ Transformer ตัวอื่น ๆ ในการวัด semantic similarity (ความใกล้เคียงด้านความหมาย) ระหว่างสรุปของโมเดลกับสรุปจริง โดยไม่จำเป็นต้องใช้คำเหมือนกันเป๊ะเหมือนกับ ROUGE หรือ BLEU แต่ BERTScore จะวัดว่าคำหรือวลีมีความหมายใกล้เคียงกันหรือไม่
	- วัดความคล้ายกันของคำใน embedding space เช่น "car" vs "vehicle" ก็ยังถือว่าใกล้เคียง
	- ใช้ precision / recall / F1 score ตามระยะห่างของ vector


In [1]:
from datasets import load_from_disk
from transformers import BertTokenizer, EncoderDecoderModel
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import torch
from bert_score import score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# โหลด dataset
dataset = load_from_disk("data/samsum_finetune_ready")

# โหลดโมเดลและ tokenizer
tokenizer = BertTokenizer.from_pretrained('tokenizer_samsum_su_finetune')
model = EncoderDecoderModel.from_pretrained('bert_samsum_finetuned')

EncoderDecoderModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [None]:
# ย้ายโมเดลและข้อมูลไปยังอุปกรณ์ที่เหมาะสม (GPU หรือ CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# วิเคราะห์ข้อมูลและทำการ summary ด้วย bert_samsum_finetuned
def generate_summary(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # ย้ายข้อมูลไปยัง device
    
    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'], 
            max_length=512, 
            num_beams=4, 
            early_stopping=True,
            decoder_start_token_id=model.config.decoder_start_token_id,  # กำหนดที่นี่
            pad_token_id=model.config.pad_token_id  # กำหนด pad_token_id ถ้าจำเป็น
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [4]:
# 1. ROUGE Score Calculation
def calculate_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)
    
    return {key: sum(value)/len(value) for key, value in scores.items()}

# 2. BLEU Score Calculation
def calculate_bleu(predictions, references):
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = [ref.split()]
        bleu_scores.append(sentence_bleu(ref_tokens, pred_tokens))
    return sum(bleu_scores) / len(bleu_scores)

# 3. BERTScore Calculation
def calculate_bertscore(predictions, references):
    P, R, F1 = score(predictions, references, lang='en')
    return P.mean().item(), R.mean().item(), F1.mean().item()

# การทดสอบกับ dataset
def evaluate_model(dataset):
    predictions = []
    references = []
    
    # ใช้ข้อมูลจาก train สำหรับทำนาย และข้อมูลจาก test สำหรับการเปรียบเทียบ
    for i in tqdm(range(len(dataset['test'])), desc="Evaluating", unit="sample"):
        # สร้างสรุปจากโมเดล
        input_text = dataset['train'][i]['dialogue']  # ใช้ 'dialogue' จาก train เพื่อสร้างสรุป
        reference_summary = dataset['test'][i]['summary']  # ใช้ 'summary' จาก test เป็นสรุปจริง
        pred_summary = generate_summary(input_text)  # สร้างสรุปจากโมเดล
        
        predictions.append(pred_summary)
        references.append(reference_summary)
    
    # ROUGE Score
    rouge_scores = calculate_rouge(predictions, references)
    print("ROUGE Scores:", rouge_scores)

    # BLEU Score
    bleu_score = calculate_bleu(predictions, references)
    print("BLEU Score:", bleu_score)

    # BERTScore
    P, R, F1 = calculate_bertscore(predictions, references)
    print("BERTScore - Precision:", P, "Recall:", R, "F1:", F1)

# เรียกใช้งาน
evaluate_model(dataset)

Evaluating: 100%|██████████| 819/819 [39:14<00:00,  2.88s/sample]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


ROUGE Scores: {'rouge1': 0.08103071050965537, 'rouge2': 0.005501493938462314, 'rougeL': 0.07293283175759344}
BLEU Score: 8.859648156109322e-05


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore - Precision: 0.8399370312690735 Recall: 0.8468782305717468 F1: 0.843207597732544


### BERT


**เทียบกับ Paper**

| **Parameter**       | **Code**                          | **Paper (Section 3.2)**          | **เหมือน / ไม่เหมือน**        |
| ------------------- | --------------------------------- | -------------------------------- | --------------       |
| Model               | BERT2BERT (EncoderDecoderModel)   | BERT2BERT                        | เหมือน                 |
| Tokenizer           | bert-base-uncased + custom tokens | ใช้ tokenizer ดัดแปลง              | เหมือน              |
| Batch Size          | 8                                 | **16 (per step)**                | ไม่เหมือน → เล็กกว่า  |
| Epochs              | 3                                 | 3                                | เหมือน              |
| Learning Rate       | 5e-5                              | **3e-5**                         | ไม่เหมือน → สูงกว่า   |
| Warmup Steps        | 500                               | ใช้ scheduler (แต่ไม่ระบุ exact)     | เหมือน (สมเหตุสมผล) |
| Max Length (input)  | 512                               | 512                              | เหมือน              |
| Max Length (output) | 128                               | 128                              | เหมือน              |
| Beam Search         | 4                                 | 4                                | เหมือน              |

---


In [3]:
import torch, gc; gc.collect(); torch.cuda.empty_cache()


In [None]:
import torch
from transformers import (
    BertTokenizerFast,
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import load_from_disk

# ------------------------------
# Load processed dataset & tokenizer
# ------------------------------
dataset = load_from_disk("data/samsum_finetune_ready")
tokenizer = BertTokenizerFast.from_pretrained("tokenizer_samsum_su")

# ------------------------------
# Load pretrained EncoderDecoderModel
# ------------------------------

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",         # encoder
    "bert-base-uncased",         # decoder (ใช้ weights เดิม-ชุดเดียวกัน)
    tie_encoder_decoder=True     # แชร์น้ำหนัก – เหมือนใน paper
)

# ปรับขนาด embedding หลังเพิ่ม emoji + speaker tokens
model.encoder.resize_token_embeddings(len(tokenizer))
model.decoder.resize_token_embeddings(len(tokenizer))
model.tie_weights()  

# Load your pretrained encoder weights
# model.encoder.load_state_dict(torch.load("bert_su_pretrained.pt", map_location="cpu"))
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.config.max_length = 128
model.config.num_beams = 4

# ------------------------------
# Define training arguments
# ------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    evaluation_strategy="steps",
    logging_steps=500,
    save_steps=1000,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    # fp16=torch.cuda.is_available(),
    fp16=False,
    save_total_limit=2,
)
model.config.use_cache = False
model.gradient_checkpointing_enable()

# ------------------------------
# Data Collator & Trainer
# ------------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ------------------------------
# Start Training
# ------------------------------
trainer.train()
model.save_pretrained("bert_samsum_finetuned_no_SU")
tokenizer.save_pretrained("tokenizer_samsum_finetune_no_SU")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.6.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.10.crossattention.self.query.bias', 'bert.encoder.layer.2.crossattention.self.key.bias', 'bert.encoder.layer.2.crossattention.self.query.bias', 'bert.encoder.layer.3.crossattention.self.key.bias', 'bert.encoder.layer.6.crossattention.self.query.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.7.crossattention.self.value.bias', 'bert.encoder.layer.4.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.10.crossattention.self.value.weight', 'bert.encoder.layer.11.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.en

{'loss': 1.5135, 'learning_rate': 5e-05, 'epoch': 0.07}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    
[A                                                  

  0%|          | 89/22098 [02:25<1:05:53,  5.57it/s]
[A
[A

{'eval_loss': 0.25244662165641785, 'eval_runtime': 16.8899, 'eval_samples_per_second': 48.431, 'eval_steps_per_second': 24.216, 'epoch': 0.07}


                                                    
  0%|          | 89/22098 [03:54<1:05:53,  5.57it/s]  

{'loss': 0.2455, 'learning_rate': 4.884248541531624e-05, 'epoch': 0.14}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    
[A                                                   

  0%|          | 89/22098 [04:11<1:05:53,  5.57it/s]
[A


{'eval_loss': 0.22957713901996613, 'eval_runtime': 16.7844, 'eval_samples_per_second': 48.736, 'eval_steps_per_second': 24.368, 'epoch': 0.14}


  return fn(*args, **kwargs)
                                                    
  0%|          | 89/22098 [05:41<1:05:53,  5.57it/s]  

{'loss': 0.236, 'learning_rate': 4.768497083063247e-05, 'epoch': 0.2}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    
[A                                                   

  0%|          | 89/22098 [05:58<1:05:53,  5.57it/s]
[A
[A

{'eval_loss': 0.2200525552034378, 'eval_runtime': 16.7523, 'eval_samples_per_second': 48.829, 'eval_steps_per_second': 24.415, 'epoch': 0.2}


                                                    
  0%|          | 89/22098 [07:31<1:05:53,  5.57it/s]  

{'loss': 0.2269, 'learning_rate': 4.6527456245948706e-05, 'epoch': 0.27}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    
[A                                                   

  0%|          | 89/22098 [07:48<1:05:53,  5.57it/s]
[A
[A

{'eval_loss': 0.21332186460494995, 'eval_runtime': 17.0525, 'eval_samples_per_second': 47.969, 'eval_steps_per_second': 23.985, 'epoch': 0.27}


  return fn(*args, **kwargs)
                                                    
  0%|          | 89/22098 [09:18<1:05:53,  5.57it/s]  

{'loss': 0.2156, 'learning_rate': 4.5369941661264933e-05, 'epoch': 0.34}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    
[A                                                   

  0%|          | 89/22098 [09:34<1:05:53,  5.57it/s]
[A
[A

{'eval_loss': 0.20742909610271454, 'eval_runtime': 16.3043, 'eval_samples_per_second': 50.171, 'eval_steps_per_second': 25.085, 'epoch': 0.34}


                                                    
  0%|          | 89/22098 [11:01<1:05:53,  5.57it/s]

{'loss': 0.2085, 'learning_rate': 4.421242707658116e-05, 'epoch': 0.41}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    
[A                                                 

  0%|          | 89/22098 [11:17<1:05:53,  5.57it/s]
[A
[A

{'eval_loss': 0.20444822311401367, 'eval_runtime': 15.8336, 'eval_samples_per_second': 51.662, 'eval_steps_per_second': 25.831, 'epoch': 0.41}


In [1]:
from inspect import signature
import transformers
print(transformers.__version__)               # 4.52.1
print("evaluation_strategy" in signature(transformers.Seq2SeqTrainingArguments).parameters)
# ต้องได้ True


  from .autonotebook import tqdm as notebook_tqdm


4.35.2
True


  _torch_pytree._register_pytree_node(
