# 4.3 事前学習: 4.1前処理 + Tokenizer学習 + GPT-2事前学習

このノートブックは、4.1節の前処理（正規化→連結→チャンク化）を適用したうえで、4.3節で
Byte-level BPE のトークナイザを学習し、Hugging Face Transformers の GPT-2 を用いて
Causal Language Modeling の事前学習を行います。

- データ: `globis-university/aozorabunko-clean`（train split）
- トークナイザ: ByteLevel BPE（train のみで学習）
- モデル: GPT-2（ランダム初期化、config は語彙サイズとコンテキスト長に合わせて作成）

注意: 大規模学習には時間とGPUが必要です。まずは小さな `max_steps` で動作確認してから、
徐々にスケールさせてください。

## インポートと設定

In [1]:
import os
import re
from typing import Iterable, Optional

import torch
from datasets import load_dataset, Dataset

try:
    import neologdn  # 日本語用正規化（任意）
except Exception:
    neologdn = None

from tokenizers import ByteLevelBPETokenizer
from transformers import (
    PreTrainedTokenizerFast,
    GPT2Config,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Fix the PyTorch RNG seed so repeated runs start from the same state.
torch.manual_seed(42)

# Name of the text column, and the marker inserted between joined documents.
TEXT_COL = "text"
SEP = "\n\n<|doc|>\n\n"

# --- Hyperparameters (change as needed) ---
block_size = 512            # e.g. 512 / 1024 / 2048
vocab_size = 30000          # e.g. 30k / 50k
train_split = 'train'
eval_ratio = 0.01
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
learning_rate = 5e-4
weight_decay = 0.1
warmup_steps = 100
max_steps = 200              # kept small for the demo; raise it for a real run
logging_steps = 20

# --- Output locations ---
# Resolved two levels above the current working directory, which is expected
# to be the repository root when the notebook runs from its own folder.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
tokenizer_dir = os.path.join(REPO_ROOT, 'data', 'processed', 'tokenizer')
output_dir = os.path.join(REPO_ROOT, 'models', 'gpt2-aozora')
for _path in (tokenizer_dir, output_dir):
    os.makedirs(_path, exist_ok=True)
tokenizer_dir, output_dir

('/home/akira-nagasawa/book-llm-from-scratch/data/processed/tokenizer',
 '/home/akira-nagasawa/book-llm-from-scratch/models/gpt2-aozora')

## 4.1: 前処理（正規化→連結→チャンク化）

In [2]:
def normalize_ja(text: str) -> str:
    """Lightly normalize Japanese text following the section 4.1 policy.

    Steps, applied in this order:

    - run ``neologdn.normalize`` first when the optional package was imported
    - map full-width ASCII letters and digits to their half-width forms
    - map the full-width double and single quote to ASCII quotes
    - replace three consecutive middle dots (half- or full-width) with ``…``
    - squeeze every whitespace run (newlines included) into one space and
      trim both ends
    """
    if neologdn is not None:
        text = neologdn.normalize(text)

    # Full-width alphanumerics sit exactly 0xFEE0 above their ASCII twins.
    text = re.sub(
        r'[Ａ-Ｚａ-ｚ０-９]',
        lambda m: chr(ord(m.group(0)) - 0xFEE0),
        text,
    )

    # Full-width quotes -> ASCII quotes.
    for wide, narrow in (('＂', '"'), ('＇', "'")):
        text = text.replace(wide, narrow)

    # Unify the ellipsis spellings.
    for dots in ('･･･', '・・・'):
        text = text.replace(dots, '…')

    # Collapse whitespace runs and trim.
    return re.sub(r'\s+', ' ', text).strip()

def apply_normalize(batch):
    """Normalize every string stored under ``TEXT_COL`` in *batch*.

    *batch* is indexed with ``TEXT_COL`` and must yield an iterable of
    strings. Returns a new dict holding only that column, with each entry
    passed through ``normalize_ja``.
    """
    normalized = list(map(normalize_ja, batch[TEXT_COL]))
    return {TEXT_COL: normalized}

def make_long_string(dsdict, split: str, col: str = TEXT_COL) -> str:
    """Join all texts of one split into a single string.

    Looks up ``dsdict[split][col]`` and concatenates the entries with the
    document separator ``SEP`` between them.
    """
    split_data = dsdict[split]
    return SEP.join(split_data[col])

def chunk_text(s: str, size: int) -> Iterable[str]:
    """Split *s* into consecutive pieces of at most *size* characters.

    Chunks are cut by character count, not by tokens. Every chunk has
    exactly *size* characters except possibly the last one; an empty *s*
    produces no chunks.

    Args:
        s: Text to split.
        size: Maximum number of characters per chunk; must be positive.

    Returns:
        A lazy iterable over the chunks, in order.

    Raises:
        ValueError: If *size* is zero or negative. The check runs at call
            time, so a bad size can no longer silently yield nothing
            (negative) or fail only once iteration starts (zero).
    """
    if size <= 0:
        raise ValueError(f'size must be a positive integer, got {size!r}')
    return (s[start:start + size] for start in range(0, len(s), size))

def load_texts_local() -> list[str]:
    """Read the training texts from ``<REPO_ROOT>/data``.

    ``train.jsonl`` is tried first: every non-blank line is parsed as JSON
    and its ``'text'`` field is collected. If that file is absent,
    ``train.txt`` is read in full and split on blank lines, with empty
    pieces dropped and the rest stripped.

    Raises:
        FileNotFoundError: If neither file exists.
    """
    import json
    import os

    base = os.path.join(REPO_ROOT, 'data')

    jsonl_path = os.path.join(base, 'train.jsonl')
    if os.path.exists(jsonl_path):
        with open(jsonl_path, 'r', encoding='utf-8') as fh:
            return [json.loads(row)['text'] for row in fh if row.strip()]

    txt_path = os.path.join(base, 'train.txt')
    if os.path.exists(txt_path):
        with open(txt_path, 'r', encoding='utf-8') as fh:
            paragraphs = fh.read().split('\n\n')
        stripped = (part.strip() for part in paragraphs)
        return [part for part in stripped if part]

    raise FileNotFoundError(f'Local data not found under {base}. Prepare data/train.jsonl or data/train.txt in 4.1.')

def load_texts_local_or_hf() -> list[str]:
    """Return the training texts, preferring local files over the HF Hub.

    Local files found by ``load_texts_local`` are used as-is. When none
    exist, the ``globis-university/aozorabunko-clean`` dataset is loaded
    for ``train_split`` and each entry is passed through ``normalize_ja``.
    """
    try:
        return load_texts_local()
    except FileNotFoundError:
        # NOTE(review): assumes the local files were already normalized in 4.1
        # while the Hub copy is raw, and that the Hub dataset exposes its body
        # text under TEXT_COL — confirm against the dataset card.
        hf_ds = load_dataset('globis-university/aozorabunko-clean', split=train_split)
        return [normalize_ja(t) for t in hf_ds[TEXT_COL]]


# Fetch the texts once (local files, else HF), then join and chunk them.
texts = load_texts_local_or_hf()
long_text = SEP.join(texts)
chunks = list(chunk_text(long_text, block_size))
train_text_ds = Dataset.from_dict({TEXT_COL: chunks})

# Hold out a small slice for evaluation. The split is seeded explicitly
# because torch.manual_seed does not control the shuffling done here.
eval_size = max(1, int(len(train_text_ds) * eval_ratio))
split = train_text_ds.train_test_split(test_size=eval_size, seed=42)
train_text_ds, eval_text_ds = split['train'], split['test']
len(train_text_ds), len(eval_text_ds)


(385410, 3893)

## 4.3: Byte-level BPE トークナイザの学習と保存
注意: トークナイザの学習には時間がかかります（TODO: 所要時間の目安を追記する）。

In [None]:
def train_bytelevel_bpe(iterator: Iterable[str], vocab_size: int, special_tokens: Optional[list[str]] = None):
    """Train a fresh ``ByteLevelBPETokenizer`` on the texts from *iterator*.

    Args:
        iterator: Source of training strings.
        vocab_size: Target vocabulary size passed to the trainer.
        special_tokens: Special tokens to register; when omitted or empty,
            ``[PAD]``, ``[BOS]``, ``[EOS]`` and ``[UNK]`` are used.

    Returns:
        The trained tokenizer.
    """
    if not special_tokens:
        special_tokens = ['[PAD]', '[BOS]', '[EOS]', '[UNK]']

    bpe = ByteLevelBPETokenizer()
    bpe.train_from_iterator(
        iterator,
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        show_progress=True,
    )
    return bpe

def wrap_transformers_tokenizer(bytelevel_tokenizer, save_dir: str) -> PreTrainedTokenizerFast:
    """Save a trained byte-level BPE tokenizer and wrap it for Transformers.

    Args:
        bytelevel_tokenizer: Trained tokenizer exposing ``save_model`` and
            ``save`` (e.g. the result of ``train_bytelevel_bpe``).
        save_dir: Directory that receives every tokenizer file; created if
            it does not exist.

    Returns:
        A ``PreTrainedTokenizerFast`` with ``[BOS]``/``[EOS]``/``[PAD]``/
        ``[UNK]`` registered as its special tokens, also saved to *save_dir*.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Still emit vocab.json / merges.txt, as before.
    bytelevel_tokenizer.save_model(save_dir)

    # The generic PreTrainedTokenizerFast has no slow tokenizer class to
    # convert from, so it cannot be built from vocab/merges files. Serialize
    # the full tokenizer to a single JSON file and load from that instead.
    tokenizer_file = os.path.join(save_dir, 'tokenizer.json')
    bytelevel_tokenizer.save(tokenizer_file)

    fast = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_file,
        bos_token='[BOS]',
        eos_token='[EOS]',
        pad_token='[PAD]',
        unk_token='[UNK]',
    )
    fast.save_pretrained(save_dir)
    return fast

# Learn the vocabulary from the training split only, then wrap and save it.
byte_bpe = train_bytelevel_bpe(
    iterator=iter(train_text_ds[TEXT_COL]),
    vocab_size=vocab_size,
    special_tokens=['[PAD]', '[BOS]', '[EOS]', '[UNK]'],
)
hf_tokenizer = wrap_transformers_tokenizer(byte_bpe, tokenizer_dir)

# Quick look: vocabulary size and the BOS/EOS tokens.
len(hf_tokenizer), hf_tokenizer.bos_token, hf_tokenizer.eos_token





## データセットのトークン化

In [None]:
def tokenize_function(examples):
    """Tokenize the ``TEXT_COL`` entries of *examples* with ``hf_tokenizer``.

    Sequences longer than ``block_size`` tokens are truncated. Returns
    whatever the tokenizer call produces.
    """
    batch_texts = examples[TEXT_COL]
    encoded = hf_tokenizer(batch_texts, truncation=True, max_length=block_size)
    return encoded

# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_train, tokenized_eval = (
    ds.map(tokenize_function, batched=True, remove_columns=[TEXT_COL])
    for ds in (train_text_ds, eval_text_ds)
)
tokenized_train[0].keys()

## GPT-2の構築と事前学習（Causal LM）

In [None]:
# Size a randomly initialized GPT-2 to the trained tokenizer and to the
# chunk length used during preprocessing.
config = GPT2Config(
    vocab_size=len(hf_tokenizer),
    n_positions=block_size,
    # NOTE(review): n_ctx appears to be ignored by recent Transformers
    # releases; kept so older releases still size the context correctly.
    n_ctx=block_size,
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
    # Set the pad id before the model is built: the model's generation
    # defaults are derived from the config at construction time, so a value
    # assigned to model.config afterwards may not reach them.
    pad_token_id=hf_tokenizer.pad_token_id,
)
model = GPT2LMHeadModel(config)

# mlm=False selects causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=hf_tokenizer, mlm=False)

import inspect

# Two keyword names changed across Transformers releases:
#   TrainingArguments: evaluation_strategy -> eval_strategy
#   Trainer:           tokenizer           -> processing_class
# Use whichever name the installed version accepts so the cell runs on both
# older and newer releases.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = 'eval_strategy' if 'eval_strategy' in _ta_params else 'evaluation_strategy'
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tok_key = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    logging_steps=logging_steps,
    # Evaluate and checkpoint once every five logging intervals.
    eval_steps=logging_steps * 5,
    save_steps=logging_steps * 5,
    report_to=['none'],
    **{_eval_key: 'steps'},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    **{_tok_key: hf_tokenizer},
)
trainer.train()

# Save the final model and the tokenizer side by side in output_dir.
trainer.save_model(output_dir)
hf_tokenizer.save_pretrained(output_dir)
'saved to: ' + output_dir

In [None]:
# 直近の学習ログを表示（必要に応じて調整）
import pandas as pd
pd.DataFrame(trainer.state.log_history).tail(20)


## 簡単な生成テスト（任意）

In [None]:
# Sample a short continuation as a sanity check of the trained model.
model.eval()
prompt = '吾輩は猫である'
# Training may have moved the model to an accelerator; put the inputs on the
# same device so generate() does not fail with a device mismatch.
inputs = hf_tokenizer(prompt, return_tensors='pt').to(model.device)
with torch.no_grad():
    out = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=100,
        temperature=0.8,
        top_k=50,
        do_sample=True,
        # Pass the pad id explicitly instead of relying on a fallback.
        pad_token_id=hf_tokenizer.pad_token_id,
    )
print(hf_tokenizer.decode(out[0], skip_special_tokens=True))