# 日本語要約モデル Ver.2
事前学習済み日本語T5モデルを、記事要約用にファインチューニングする。

※私的学習用に作成したもので、非商用利用になります。


## ドライブにアクセス

In [None]:
# Mount Google Drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Working directory on Drive; the data file and output directory paths are built from it.
SUMMARY_ML_SPACE = '/content/drive/My Drive/Colab Notebooks/summary/'

# Change into the working directory and print it to confirm.
%cd $SUMMARY_ML_SPACE
!pwd

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/summary
/content/drive/My Drive/Colab Notebooks/summary


In [None]:
from IPython.display import clear_output

!pip install transformers
!pip install datasets
!pip install evaluate

# Without sentencepiece, instantiating the tokenizer fails with
# "Couldn't instantiate the backend tokenizer from one"; install it, then restart the runtime.
!pip install sentencepiece
# Needed to use the evaluate-metric/rouge metric
!pip install rouge_score
# Text normalisation
!pip install neologdn

# Clear the pip output from the cell.
clear_output()

**ランタイム再起動**（上記パッケージのインストール後、ランタイムを再起動してから以降のセルを実行する）

In [None]:
# Repeated after the runtime restart so that Drive is mounted and
# SUMMARY_ML_SPACE / the working directory are set again.
from google.colab import drive
drive.mount('/content/drive')

# Working directory on Drive; the data file and output directory paths are built from it.
SUMMARY_ML_SPACE = '/content/drive/My Drive/Colab Notebooks/summary/'

# Change into the working directory and print it to confirm.
%cd $SUMMARY_ML_SPACE
!pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/summary
/content/drive/My Drive/Colab Notebooks/summary


In [None]:
import nltk
nltk.download('punkt')
import numpy as np
from datasets import load_dataset
import pandas as pd
import neologdn
import re
import evaluate
import transformers
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
import torch

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Fix the random seed via transformers.set_seed for reproducibility.
set_seed(42)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


## 情報抽出

In [None]:
# Use the news data prepared in advance.
data_file = SUMMARY_ML_SPACE + 'summaryDataset/summarynewsV2.csv'
# Loaded with pandas only to preview the first rows.
display_data = pd.read_csv(data_file)
display_data.head()

Unnamed: 0,text,summary
0,女性●田中敦子田中敦子さんは『攻殻機動隊 STAND ALONE COMPLEX』の草薙素子...,「その声で叱ってほしい」と思う女性声優を紹介している。「攻殻機動隊」の草薙素子役の田中敦子に...
1,1月下旬の夜、東京・世田谷区の住宅街にある居酒屋『Y』にいた記者の耳に、隣の個室から熱を帯び...,1月下旬、居酒屋の個室から小栗旬の声が聞こえてきたと筆者が述べている。「いちばん仕事したくな...
2,楽器は音だけでなく、そのデザインもこだわりのポイントだ。3作目ほぼ完成！！いろいろ失敗しまく...,ドラゴンクエストへの愛が溢れる自作ギターがTwitterで話題だ。ボディが「スライム」を模し...
3,NHK連続テレビ小説「まれ」でブレイクした女優の今月4日に行われた第39回日本アカデミー賞授...,16日、土屋太鳳が10歳の頃の写真をブログで公開し、反響を呼んでいる。力強い目元が印象的で、...
4,▽うどん好きで知られる日本代表MF▽29日に埼玉スタジアム2002で行われたロシア・ワールド...,香川真司が31日、どん兵衛とサポート契約を結んだことをブログで明かした。「うどんを愛するもの...


## データ分割

In [None]:
# The dataset builder name is derived from the file extension of data_file.
extension = data_file.split('.')[-1]

raw_datasets = load_dataset(extension, data_files=data_file, split='train')

# 90% train, then the remaining 10% is halved into validation and test.
# An explicit seed keeps the split identical across runs, so it no longer
# depends on implicit RNG state or on the order in which cells were executed.
raw_datasets = raw_datasets.train_test_split(train_size=0.9, seed=42)
dataset_dev = raw_datasets['test'].train_test_split(train_size=0.5, seed=42)

train_dataset = raw_datasets['train']
eval_dataset = dataset_dev['train']
predict_dataset = dataset_dev['test']



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1d88f3a4dd68073c/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1d88f3a4dd68073c/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


## パラメータ

In [None]:
# General-purpose settings

# MAX_SOURCE_LENGTH = 1024
# MAX_TARGET_LENGTH = 128

# Values to use when the ones above fail with "CUDA out of memory"
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 64

# Pretrained model
PRETRAINED_MODEL_NAME = 'sonoisa/t5-base-japanese'
# Directory that receives checkpoints, the saved model and the prediction export.
output_dir = SUMMARY_ML_SPACE + 'summaryOutputV2'

# NOTE(review): columns are selected by position; this assumes the first column
# holds the article body and the second its summary — confirm against the CSV.
column_names = train_dataset.column_names
text_column = column_names[0]
summary_column = column_names[1]

## データ前処理

In [None]:
# Preprocessing: load the tokenizer that belongs to the pretrained model.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

def preprocess_text(text):
    """Normalise a raw string before it is tokenised.

    Removes carriage returns, tabs, newlines and full-width spaces, applies
    ``neologdn.normalize``, lower-cases the result and trims surrounding
    whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_breaks = re.sub(r'[\r\t\n\u3000]', '', text)
    normalized = neologdn.normalize(without_breaks)
    return normalized.lower().strip()

def preprocess_tokenize(batch):
    """Clean and tokenise one batch of (text, summary) pairs.

    Rows whose text or summary is empty/falsy are left out. The remaining
    texts are tokenised as model inputs (truncated to ``MAX_SOURCE_LENGTH``)
    and the summaries as targets (truncated to ``MAX_TARGET_LENGTH``).

    Args:
        batch: Mapping from column name to a list of values, as supplied by a
            batched ``Dataset.map``.

    Returns:
        The tokenizer output for the inputs with the target token ids stored
        under ``"labels"``.
    """
    # Keep only complete pairs, cleaning both sides as we go.
    cleaned_pairs = [
        (preprocess_text(body), preprocess_text(abstract))
        for body, abstract in zip(batch[text_column], batch[summary_column])
        if body and abstract
    ]
    inputs = [body for body, _ in cleaned_pairs]
    targets = [abstract for _, abstract in cleaned_pairs]

    model_inputs = tokenizer(inputs, max_length=MAX_SOURCE_LENGTH, truncation=True)
    target_encoding = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# preprocess_tokenize leaves out rows with an empty text or summary. A batched
# map may only return a different number of rows when the original columns are
# not carried over, so they are removed here; otherwise a single empty row
# makes the map fail with a column-length mismatch.
train_dataset = train_dataset.map(
    preprocess_tokenize, batched=True, remove_columns=train_dataset.column_names
)
eval_dataset = eval_dataset.map(
    preprocess_tokenize, batched=True, remove_columns=eval_dataset.column_names
)
predict_dataset = predict_dataset.map(
    preprocess_tokenize, batched=True, remove_columns=predict_dataset.column_names
)

# Expose only the model inputs, returned as NumPy arrays.
for tokenized_split in (train_dataset, eval_dataset, predict_dataset):
    tokenized_split.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

Downloading:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## モデル定義・各種パラメータ設定


In [None]:
# Metric: ROUGE, loaded through the evaluate library
metric = evaluate.load("rouge")

def _split_sentences(text):
    """Split *text* into sentences, also breaking after Japanese terminators."""
    sentences = []
    for chunk in nltk.sent_tokenize(text):
        # Punkt only knows '.', '?' and '!' as sentence ends, so Japanese text
        # comes back as one chunk; break it after runs of "。", "！" or "？".
        # The second alternative keeps a trailing piece with no terminator.
        pieces = re.findall(r'[^。！？]*[。！？]+|[^。！？]+', chunk)
        sentences.extend(piece for piece in pieces if piece.strip())
    return sentences


def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE.

    Each string is stripped and its sentences are joined with newlines, which
    is the format rougeLsum uses to find sentence boundaries.

    Args:
        preds: Decoded predicted summaries.
        labels: Decoded reference summaries.

    Returns:
        Tuple ``(preds, labels)`` of newline-separated strings, in input order.
    """
    preds = ["\n".join(_split_sentences(pred.strip())) for pred in preds]
    labels = ["\n".join(_split_sentences(label.strip())) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays handed
            over by the trainer; predictions may arrive wrapped in a tuple.

    Returns:
        Dict with the ROUGE scores scaled to 0-100 and rounded to 4 decimals,
        plus ``gen_len``, the mean number of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded; map it to the pad token.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # The default ROUGE tokenizer keeps only ASCII letters and digits, which
    # throws away Japanese text, so scoring uses the model's own subword
    # tokens. The English-only stemmer is dropped for the same reason.
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=tokenizer.tokenize,
    )
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# Model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_MODEL_NAME)

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-05,
    evaluation_strategy='steps',
    save_steps=2000,
    save_total_limit=3,
    remove_unused_columns=True,  # drop columns the model does not use
    logging_steps=2000,
    eval_steps=2000,
    predict_with_generate=True,
    # Generate at most MAX_TARGET_LENGTH tokens during in-training evaluation,
    # matching the limit passed to trainer.evaluate / trainer.predict later on
    # instead of falling back to the model config's max_length.
    generation_max_length=MAX_TARGET_LENGTH,
)

# NOTE(review): labels are padded with the pad token instead of the collator's
# default -100, so padded label positions are presumably not masked out of the
# loss. The prediction cell decodes label_ids directly and relies on this, so
# revisit both places together before changing it.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

## 訓練（T5でファインチューニング）

In [None]:
# Fine-tune
train_result = trainer.train()

# Save the model
trainer.save_model(output_dir + '/model')

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9131
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9136
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
2000,5.2784,1.837656,19.5192,5.5254,19.1666,19.1895,54.944773
4000,1.7836,1.762343,19.8477,5.9856,19.5559,19.5694,55.940828
6000,1.6656,1.741351,22.1112,6.0447,21.7559,21.8075,55.629191
8000,1.6011,1.735333,21.2076,5.9314,20.9584,21.0551,55.246548


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 507
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/Colab Notebooks/summary/summaryOutputV2/checkpoint-2000
Configuration saved in /content/drive/My Drive/Colab Notebooks/summary/summaryOutputV2/checkpoint-2000/config.json
Model weights saved in /content/drive/My Drive/Colab Notebooks/summary/summaryOutputV2/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Colab Notebooks/summary/summaryOutputV2/checkpoint-2000/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Colab Notebooks/summary/summaryOutputV2/checkpoint-2000/special_tokens_map.json
Copy vocab file to /content/drive/My Drive/Colab Not

## 評価

In [None]:
# Evaluate on the trainer's eval dataset, generating at most MAX_TARGET_LENGTH tokens per summary.
trainer.evaluate(max_length=MAX_TARGET_LENGTH, metric_key_prefix="eval")

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 507
  Batch size = 8


{'eval_loss': 1.7385016679763794,
 'eval_rouge1': 22.1847,
 'eval_rouge2': 5.9635,
 'eval_rougeL': 21.9172,
 'eval_rougeLsum': 21.8835,
 'eval_gen_len': 55.145956607495066,
 'eval_runtime': 140.2067,
 'eval_samples_per_second': 3.616,
 'eval_steps_per_second': 0.456,
 'epoch': 8.0}

## テストデータの本文に対する要約生成

In [None]:
# Generate summaries for the test split.
predict_results = trainer.predict(predict_dataset, max_length=MAX_TARGET_LENGTH, metric_key_prefix="predict")

# Decode the tokenised model inputs back to text. Special tokens are skipped so
# markers such as the end-of-sequence token do not leak into the report.
decoded_sources = [
    tokenizer.decode(row['input_ids'], skip_special_tokens=True)
    for row in predict_dataset
]

predicted_ids = predict_results.predictions
if isinstance(predicted_ids, tuple):
    predicted_ids = predicted_ids[0]

# -100 marks ignored positions and cannot be decoded; map it to the pad token first.
predicted_ids = np.where(predicted_ids != -100, predicted_ids, tokenizer.pad_token_id)
reference_ids = np.where(
    predict_results.label_ids != -100, predict_results.label_ids, tokenizer.pad_token_id
)

decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
decoded_labels = tokenizer.batch_decode(reference_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

decoded_preds = [pred.strip() for pred in decoded_preds]
decoded_labels = [label.strip() for label in decoded_labels]

# Export source / reference summary / generated summary side by side.
output = pd.DataFrame({'本文': decoded_sources, '人が作成した要約': decoded_labels, '生成された要約': decoded_preds})
output.to_excel(output_dir + "/predictions.xlsx")

# Plain lists consumed by the result-inspection cell.
sources = list(decoded_sources)
labels = list(decoded_labels)
preds = list(decoded_preds)

The following columns in the test set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 508
  Batch size = 8


## 生成結果確認

In [None]:
# Print every generated summary next to its reference summary and source text.
for source, label, pred in zip(sources, labels, preds):
    print(f"生成された要約:      {pred}")
    print(f"人が作成した要約:    {label}")
    print(f"本文:               {source}")
    print()

## 学習済みモデルで任意の文章に対する要約

In [None]:
# Text to summarise (any news article)
input_text = """
ロシアによるウクライナ侵略で、露軍の占領下にある東部ドネツク州の要衝リマンの攻略作戦を展開していたウクライナ軍は１日、露軍を包囲し、市街地の入り口に達したことを明らかにした。ロシア国防省は同日、リマンからの撤退を始めたことを明らかにした。ロイター通信が報じた。

ロシアのプーチン大統領は９月３０日、ドネツク州を含むウクライナ東南部４州の併合を宣言したばかり。直後に支配地域を奪還されれば、プーチン氏の威信に傷がつく可能性がある。

ウクライナ軍は同日、リマンの包囲をほぼ完了したと発表していた。ロイターによると５０００人規模の露軍兵がいるとみられる。

露国防省は１０月１日、リマンの自軍が包囲されたことを認めた上で、「より優勢な線」へと引き下がると説明した。


米シンクタンクの戦争研究所は、露軍がリマンを喪失した場合、既に全域の制圧を宣言しているルガンスク州の中心都市セベロドネツクやリシチャンスクの支配を維持できなくなる可能性があると指摘していた。
"""


In [None]:
# Load the fine-tuned model and move it to the same device as the inputs.
# Placing the model explicitly avoids "Expected all tensors to be on the same
# device, but found at least two devices, cuda:0 and cpu!" without changing
# torch's global default tensor type, and it also works on a CPU-only runtime.
model_dir = output_dir + '/model'
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device)

with torch.no_grad():

    # Preprocess and tokenise, truncating to the same source length used for training.
    input_text = preprocess_text(input_text)
    tokenized_text = tokenizer(
        input_text,
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

    source_ids = tokenized_text['input_ids'].to(device, dtype=torch.long)
    source_mask = tokenized_text['attention_mask'].to(device, dtype=torch.long)

    # Generation
    generated_ids = model.generate(
        input_ids=source_ids,
        attention_mask=source_mask,
        max_length=MAX_TARGET_LENGTH,
        temperature=1.0,          # temperature parameter that adds randomness to generation
        # num_beams=10,             # beam-search width
        repetition_penalty=1.5,   # penalty against repeating the same sentence (mode collapse)
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

# Show the generated summary
print("\noutput:\n" + pred)

loading configuration file /content/drive/My Drive/Colab Notebooks/summary/summaryOutputV2/model/config.json
Model config T5Config {
  "_name_or_path": "/content/drive/My Drive/Colab Notebooks/summary/summaryOutputV2/model",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "eos_token_ids": [
    1
  ],
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "max_length": 512,
  "model_type": "t5",
  "n_positions": 512,
  "num_beams": 4,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.22.2",
  "use_cache": true,
  "vocab_size": 32128
}

loadi


output:
露軍の占領下にあるリマンの攻略作戦を展開していたウクライナ軍が1日、包囲し市街地の入り口に達した。ロシア国防省は同日、リマンからの撤退を始めたことを明らかにした。ロシア国防省は10月1日、より優勢な線へと引き下がると説明した
