# 4.3 事前学習: 4.1前処理 + Tokenizer学習 + GPT-2事前学習

このノートブックは、4.1節の前処理（正規化→連結→チャンク化）を適用したうえで、4.3節で
Byte-level BPE のトークナイザを学習し、Hugging Face Transformers の GPT-2 を用いて
Causal Language Modeling の事前学習を行います。

- データ: `globis-university/aozorabunko-clean`（train split）
- トークナイザ: ByteLevel BPE（train のみで学習）
- モデル: GPT-2（ランダム初期化、config は語彙サイズとコンテキスト長に合わせて作成）

注意: 大規模学習には時間とGPUが必要です。まずは小さな `max_steps` で動作確認してから、
徐々にスケールさせてください。

## インポートと設定

In [None]:
import os
import re
from typing import Iterable, Optional

import torch
from datasets import load_dataset, Dataset

try:
    import neologdn  # 日本語用正規化（任意）
except Exception:
    neologdn = None

from tokenizers import ByteLevelBPETokenizer
from transformers import (
    PreTrainedTokenizerFast,
    GPT2Config,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

torch.manual_seed(42)
TEXT_COL = "text"
SEP = "\n\n<|doc|>\n\n"

# パラメータ（必要に応じて変更）
block_size = 512            # 例: 512/1024/2048
vocab_size = 30000          # 例: 30k/50k
train_split = 'train'
eval_ratio = 0.01
per_device_train_batch_size = 10
gradient_accumulation_steps = 8
learning_rate = 5e-4
weight_decay = 0.1
warmup_steps = 100
max_steps = 200              # デモ用に小さめ（本番は増やす）
logging_steps = 20

# 保存先（ノートブック相対パス -> リポジトリ直下に配置）
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
tokenizer_dir = os.path.join(REPO_ROOT, 'data', 'processed', 'tokenizer')
output_dir = os.path.join(REPO_ROOT, 'models', 'gpt2-aozora')
os.makedirs(tokenizer_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
tokenizer_dir, output_dir


('/home/akira-nagasawa/book-llm-from-scratch/data/processed/tokenizer',
 '/home/akira-nagasawa/book-llm-from-scratch/models/gpt2-aozora')

## セクション01の前処理済みデータを利用（連結→チャンク化のみ）

In [2]:
# セクション01の成果物（notebooks/chapter04/data）を読み込み、連結→チャンク化のみ実施
from pathlib import Path
import json

def chunk_text(s: str, size: int) -> Iterable[str]:
    for i in range(0, len(s), size):
        yield s[i : i + size]

# notebooks/chapter04/ からの相対パス
DATA_ROOT = Path('data')
CANDIDATES = [DATA_ROOT / 'aozora', DATA_ROOT]  # 優先順に探す

def load_docs(base: Path, split: str) -> list[str]:
    jsonl = base / f'{split}.jsonl'
    txt   = base / f'{split}.txt'
    if jsonl.exists():
        with jsonl.open('r', encoding='utf-8') as f:
            return [json.loads(line)['text'] for line in f if line.strip()]
    if txt.exists():
        raw = txt.read_text(encoding='utf-8')
        return [s.strip() for s in raw.split('\n\n') if s.strip()]
    return []

train_docs, val_docs = [], []
for base in CANDIDATES:
    if not train_docs:
        train_docs = load_docs(base, 'train')
    if not val_docs:
        val_docs = load_docs(base, 'val')

if not train_docs or not val_docs:
    raise FileNotFoundError('前処理済みデータが見つかりません。notebooks/chapter04/data/(aozora)/{train,val}.{jsonl,txt} を用意してください。')

# 文書をセパレータで連結し、block_size 文字ごとにチャンク
train_long = SEP.join(train_docs)
val_long   = SEP.join(val_docs)
train_chunks = list(chunk_text(train_long, block_size))
val_chunks   = list(chunk_text(val_long,   block_size))

# Hugging Face Datasets へ
train_text_ds = Dataset.from_dict({TEXT_COL: train_chunks})
eval_text_ds  = Dataset.from_dict({TEXT_COL: val_chunks})
len(train_text_ds), len(eval_text_ds)


(389303, 7009)

## 4.3: Byte-level BPE トークナイザの学習と保存
時間かかるので目安を書く

In [3]:
def train_bytelevel_bpe(iterator: Iterable[str], vocab_size: int, special_tokens: Optional[list[str]] = None):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(
        iterator,
        vocab_size=vocab_size,
        special_tokens=special_tokens or ['[PAD]', '[BOS]', '[EOS]', '[UNK]'],
        show_progress=True,
    )
    return tokenizer

byte_bpe = train_bytelevel_bpe(
    iterator=(ex for ex in train_text_ds[TEXT_COL]),
    vocab_size=vocab_size,
    special_tokens=['[PAD]', '[BOS]', '[EOS]', '[UNK]'],
)







KeyboardInterrupt: 

In [None]:
def wrap_transformers_tokenizer(bytelevel_tokenizer, save_dir: str) -> PreTrainedTokenizerFast:
    os.makedirs(save_dir, exist_ok=True)
    # Save vocab/merges (for reference) and also tokenizer.json for direct fast loading
    bytelevel_tokenizer.save_model(save_dir)
    bytelevel_tokenizer.save(os.path.join(save_dir, 'tokenizer.json'))
    # Load fast tokenizer directly from tokenizer.json to avoid slow->fast conversion
    fast = PreTrainedTokenizerFast(
        tokenizer_file=os.path.join(save_dir, 'tokenizer.json'),
        bos_token='[BOS]',
        eos_token='[EOS]',
        pad_token='[PAD]',
        unk_token='[UNK]',
    )
    fast.save_pretrained(save_dir)
    return fast


hf_tokenizer = wrap_transformers_tokenizer(byte_bpe, tokenizer_dir)
len(hf_tokenizer), hf_tokenizer.bos_token, hf_tokenizer.eos_token

(30000, '[BOS]', '[EOS]')

## データセットのトークン化

In [None]:
def tokenize_function(examples):
    return hf_tokenizer(
        examples[TEXT_COL],
        truncation=True,
        max_length=block_size,
    )

tokenized_train = train_text_ds.map(tokenize_function, batched=True, remove_columns=[TEXT_COL])
tokenized_eval = eval_text_ds.map(tokenize_function, batched=True, remove_columns=[TEXT_COL])
tokenized_train[0].keys()

Map:   0%|          | 0/389303 [00:00<?, ? examples/s]

Map:   0%|          | 0/7009 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

## GPT-2の構築と事前学習（Causal LM）

In [None]:
#検証用上書き
per_device_train_batch_size = 10


config = GPT2Config(
    vocab_size=len(hf_tokenizer),
    n_positions=block_size,
    n_ctx=block_size,
    bos_token_id=hf_tokenizer.bos_token_id,
    eos_token_id=hf_tokenizer.eos_token_id,
)
model = GPT2LMHeadModel(config)
model.config.pad_token_id = hf_tokenizer.pad_token_id

data_collator = DataCollatorForLanguageModeling(tokenizer=hf_tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    num_train_epochs=10,
    logging_steps=logging_steps,
    eval_strategy='steps',
    eval_steps=logging_steps * 5,
    save_steps=logging_steps * 5,
    report_to=['none'],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=hf_tokenizer,
)
trainer.train()
trainer.save_model(output_dir)
hf_tokenizer.save_pretrained(output_dir)
'saved to: ' + output_dir

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss,Validation Loss
100,2.072,8.308963
200,2.0018,8.044173
300,1.9448,7.829145
400,1.9169,7.650832
500,1.8844,7.516081
600,1.8418,7.413182
700,1.8234,7.340619
800,1.8215,7.285051
900,1.7863,7.212158
1000,1.7821,7.172904




RuntimeError: [enforce fail at inline_container.cc:664] . unexpected pos 376256192 vs 376256084

In [8]:
# 直近の学習ログを表示（必要に応じて調整）
import pandas as pd
pd.DataFrame(trainer.state.log_history).tail(20)


Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,2.3741,0.224923,9.5e-05,0.003288,20,,,,,,,,,
1,2.1858,0.777973,0.000195,0.006576,40,,,,,,,,,
2,2.0781,0.290171,0.000295,0.009864,60,,,,,,,,,
3,2.0673,0.21914,0.000395,0.013152,80,,,,,,,,,
4,2.072,0.285811,0.000495,0.01644,100,,,,,,,,,
5,,,,0.01644,100,8.308963,71.9135,97.464,12.195,,,,,
6,2.0415,0.225071,0.000405,0.019728,120,,,,,,,,,
7,2.0499,0.32368,0.000305,0.023015,140,,,,,,,,,
8,2.0463,0.169706,0.000205,0.026303,160,,,,,,,,,
9,2.0157,0.183917,0.000105,0.029591,180,,,,,,,,,


## 簡単な生成テスト（任意）

In [None]:
# 学習済みチェックポイントからの推論（簡単な生成テスト）
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 事前に定義済みの REPO_ROOT を利用してチェックポイントを指す
ckpt_dir = os.path.join(REPO_ROOT, 'models', 'gpt2-aozora', 'checkpoint-7000')
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
model = AutoModelForCausalLM.from_pretrained(ckpt_dir)

# pad_token が未設定の場合は eos を代用
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

device = 'cuda' if torch.cuda.is_available() else ('mps' if getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available() else 'cpu')
model = model.to(device).eval()

prompt = '吾輩は猫である。名前はまだ無い。'
inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=120,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# 入力部分を除いた新規生成のみを表示
gen_only = out[0][inputs['input_ids'].shape[1]:]
print(tokenizer.decode(gen_only, skip_special_tokens=True))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



「いや、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、


In [17]:
はprompt = ''
inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=120,
        do_sample=False,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# 入力部分を除いた新規生成のみを表示
gen_only = out[0][inputs['input_ids'].shape[1]:]
print(tokenizer.decode(gen_only, skip_special_tokens=True))


「いや、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、君、
