In [26]:
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja から引用・一部改変
from __future__ import unicode_literals
import re
import unicodedata

def unicode_normalize(cls, s):
    pt = re.compile('([{}]+)'.format(cls))

    def norm(c):
        return unicodedata.normalize('NFKC', c) if pt.match(c) else c

    s = ''.join(norm(x) for x in re.split(pt, s))
    s = re.sub('－', '-', s)
    return s

def remove_extra_spaces(s):
    s = re.sub('[ 　]+', ' ', s)
    blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                      '\u3040-\u309F',  # HIRAGANA
                      '\u30A0-\u30FF',  # KATAKANA
                      '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                      '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                      ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s

def normalize_neologd(s, enable_remove_extra_spaces=True):
    s = s.strip()
    s = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    def maketrans(f, t):
        return {ord(x): ord(y) for x, y in zip(f, t)}

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]+', '〜', s)  # normalize tildes (modified by Isao Sonobe)
    s = re.sub('[″〝〟˝＂]', '”', s) # normalize double quotes


    # 半角のアルファベットを全角に変換
    s = s.translate(
        maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
                  'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ'))

    # 半角数字を全角数字に変換
    s = s.translate(
        maketrans('0123456789', '０１２３４５６７８９'))

    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
              '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))

    if enable_remove_extra_spaces:
        s = remove_extra_spaces(s)
    # s = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    # s = re.sub('[’]', '\'', s)
    # s = re.sub('[”]', '"', s)
    return s

In [27]:
import os
os.environ['HF_HOME'] = '/autofs/diamond3/share/cache/huggingface'

In [28]:
from datasets import load_dataset

dataset = load_dataset('wikipedia', date='20240801', language='ja')

In [38]:
example = dataset['train'].shuffle()[0]

In [39]:
len_max = 30
text = example['text']

texts = text.split('\n')
normalized_texts = []
for t in texts:
    if t == '':
        continue
    if '。' not in t:
        normalized_texts.append(normalize_neologd(t))
        continue
    sents = t.split('。')
    current_output = ''
    for s in sents:
        if s == '':
            continue
        if current_output == '':
            current_output = s + '。'
        elif len(current_output) + len(s) < len_max:
            current_output += s + '。'
        else:
            normalized_texts.append(normalize_neologd(current_output))
            current_output = s + '。'
    if current_output != '':
        normalized_texts.append(normalize_neologd(current_output))

In [40]:
from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody

text = "私の名前は藤江と言いますか？"
phoneme_list = pyopenjtalk_g2p_prosody(text)
# phoneme_list内の要素の "N" を "nn" に変換
phoneme_list = [phoneme.replace("N", "nn") for phoneme in phoneme_list]
phonemes = " ".join(phoneme_list)

print(text)
print(phonemes)

私の名前は藤江と言いますか？
^ w a [ t a sh i n o # n a [ m a e w a # f u [ j i e t o # i [ i m a ] s u k a ?


In [41]:
for t in normalized_texts:
    print(t)
    print(' '.join(pyopenjtalk_g2p_prosody(t)))
    print('---')

龍井一磨（りゅうせいかずま）は、日本のシンガーソングライター。
^ r o ] N ch i N # k a ] z u m a _ ry u [ u s e i # k a ] z u m a _ w a _ n i [ cl p o ] N n o # sh i [ N g a a s o N g u r a ] i t a a $
---
愛称：カズマックス。
^ a [ i sh o o _ k a [ z u m a ] cl k u s u $
---
アーティスト、ダンスのお兄さん、声優と幅広く活動中。
^ a ] a t i s u t o _ d a ] N s u n o # o [ n i ] i s a N _ s e [ e y u u t o # h a [ b a h i r o ] k u # k a [ ts u d o o ch u u $
---
２００１年から２０１０年大阪を拠点にバンド活動後、２０１０年から東京を拠点にソロ活動をしている。
^ n i [ s e ] N # i [ ch i ] n e N k a r a # n i [ s e ] N # j u [ u ] n e N # o [ o s a k a o # ky o [ t e N n i # b a [ N d o k a ts u d o o ] g o _ n i [ s e ] N # j u [ u ] n e N k a r a # t o [ o ky o o o # ky o [ t e N n i # s o [ r o k a ] ts u d o o o # sh i [ t e # i [ r u $
---
香川県出身、東京都稲城市在住で稲城市観光大使を担っている。
^ k a [ g a w a ] k e N # sh u [ cl sh i N _ t o [ o ky o o ] t o # i [ n a g i ] sh i # z a [ i j u u d e # i [ n a g i ] sh i # k a [ N k o o t a ] i sh i o # n i [ n a ] cl t e # i [ r u $
---
経歴
^ k e [ e r e k i $
---
脚注
^ ky a [



In [23]:
from transformers import MBartForConditionalGeneration, AutoTokenizer

model_name = "ku-nlp/bart-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/471 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/589k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/502M [00:00<?, ?B/s]

In [33]:
tokenizer.tokenize(normalized_texts[0])

['▁カ',
 'ッペ',
 'ッラ',
 '・',
 'デ',
 '・',
 'ピ',
 'チェ',
 'ナル',
 'ディ',
 '（',
 '）',
 'は',
 '、',
 'イ',
 'タリア',
 '共和国',
 'ロン',
 'バルディア',
 '州',
 'クレ',
 'モ',
 'ナ',
 '県',
 'に',
 'ある',
 '、',
 '人',
 '口',
 '約',
 '４００',
 '人',
 'の',
 '基',
 '礎',
 '自',
 '治',
 '体',
 '（',
 'コム',
 'ーネ',
 '）',
 '。']

In [5]:
from transformers import pipeline

generator = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_new_tokens=60)
generated = generator(text)
print(generated)

[{'generated_text': '私の名前は藤江と言いますか？？？ ？ 私 の 名前 は 私の 名前 。 私 の 姓名 は 私 の 名前 の 後 に … 。 私の 名前 の 後ろ に … 」 と ある の は 、 私 の こと である 。 私'}]


In [31]:
tokenizer.tokenize("ｎ Ｈ Ｋ")

['▁ｎ', '▁Ｈ', '▁Ｋ']

In [6]:
from datasets import load_dataset

dataset = load_dataset("shunk031/livedoor-news-corpus")

In [108]:
def preprocess(example):
    text = example["title"]
    text = text.replace("\t", " ")
    text = text.strip()
    text = normalize_neologd(text)
    # text = text.lower()
    phoneme_list = pyopenjtalk_g2p_prosody(text)
    # phoneme_list = [phoneme.replace("N", "nn") for phoneme in phoneme_list]
    phonemes = " ".join(phoneme_list)
    phonemes = normalize_neologd(phonemes, enable_remove_extra_spaces=False)
    example["text"] = text
    example["phonemes"] = phonemes
    return example

dataset = dataset.map(preprocess)

Map:   0%|          | 0/5894 [00:00<?, ? examples/s]



Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Map:   0%|          | 0/736 [00:00<?, ? examples/s]



In [109]:
def tokenize_function(examples):
    model_inputs = tokenizer(
        text=examples["text"],
        # max_length=model.config.max_length, # prob. 512
        max_length=512,
        padding="max_length",
        truncation=True)
    labels = tokenizer(
        text_target=examples["phonemes"],
        # max_length=model.config.max_length,
        max_length=512,
        padding="max_length",
        truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    examples.update(model_inputs)
    return examples

dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5894 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]

Map:   0%|          | 0/736 [00:00<?, ? examples/s]

In [110]:
dataset["train"][0]["text"]

'ＮＨＫの”韓流寄り”番組に批判の声'

In [111]:
tokenizer.decode(dataset["train"][0]['input_ids'])

'<s> ＮＨＫの”韓流寄り”番組に批判の声</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [112]:
tokenizer.decode(dataset["train"][0]["labels"])

'<s> ＾ ｅ ［ ｎ ｕ ｅ ｉ ｃｈ ｉ ］ ｋ ｅ ｉ ｎ ｏ ＿ ｋ ａ ［ Ｎ ｒｙ ｕ ｕ ｙ ｏ ｒ ｉ ＿ ｂ ａ ［ Ｎ ｇ ｕ ｍ ｉ ｎ ｉ ＃ ｈ ｉ ［ ｈ ａ Ｎ ｎ ｏ ＃ ｋ ｏ ］ ｅ ＄</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [113]:
exp_name = "base"
output_dir = f"exp/{exp_name}/results"
logging_dir = f"exp/{exp_name}/logs"

In [114]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    num_train_epochs=10,            # 最大10エポックとする
    per_device_train_batch_size=8,  # バッチサイズ
    auto_find_batch_size=True,    # バッチサイズを自動で見つける

    weight_decay=0.01,              # 重み減衰
    learning_rate=2e-5,             # 学習率
    warmup_steps=500,               # ウォームアップステップ数

    evaluation_strategy="epoch",    # 評価はエポックごとに行う
    # metric_for_best_model="accuracy", # 最良のモデルの評価指標
    # greater_is_better=True,           # 評価指標が大きいほど良い場合はTrue

    output_dir=output_dir,          # モデルの保存先
    save_strategy="epoch",          # モデルの保存はエポックごとに行う
    save_total_limit=3,             # 保存するモデルの数)

    logging_dir=logging_dir,        # ログの保存先
    logging_strategy="steps",       # ログの保存はエポックごとに行う
    logging_steps=100,              # 100ステップごとにログを出力する

    load_best_model_at_end=True,    # 最良のモデルを最後にロードする
)



In [115]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3, 
    early_stopping_threshold=0.001)

In [116]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # compute_metrics=compute_metrics,
    # callbacks=[early_stopping],
)

In [118]:
enable_training = True
if enable_training:
    trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4498,0.352919
2,0.2849,0.205058
3,0.1929,0.143917
4,0.1548,0.119547
5,0.1338,0.107196
6,0.1181,0.098133
7,0.1103,0.092281
8,0.1048,0.088649
9,0.0981,0.086607
10,0.0975,0.085987


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size

In [17]:
enable_model_loading = False
model_path = "exp/base/results/checkpoint-58940"
if enable_model_loading:
    model = T5ForConditionalGeneration.from_pretrained(model_path)

In [123]:
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

In [126]:
split = 'validation'
i = 1
predicted = generator(dataset[split][i]["text"])
print(f"input: {dataset[split][i]['text']}")
print(f"target: {dataset[split][i]['phonemes']}")
print(f"predicted: {predicted[0]['generated_text']}")

input: 「履歴書ってパソコンで作ったらダメなの？」パソコンＶＳ手書きについて考える【話題】
target: ＾ ｒ ｉ ［ ｒ ｅ ｋ ｉ ｓｈ ｏ ｃｌ ｔ ｅ ＃ ｐ ａ ［ ｓ ｏ ｋ ｏ Ｎ ｄ ｅ ＃ ｔｓ ｕ ［ ｋ ｕ ］ ｃｌ ｔ ａ ｒ ａ ＃ ｄ ａ ［ ｍ ｅ ］ ｎ ａ ｎ ｏ ＿ ｐ ａ ［ ｓ ｏ ｋ ｏ Ｎ ｂ ａ ａ ｓ ａ ｓ ｕ ｔ ｅ ］ ｇ ａ ｋ ｉ ｎ ｉ ＃ ｔｓ ｕ ］ ｉ ｔ ｅ ＃ ｋ ａ ［ Ｎ ｇ ａ ］ ｅ ｒ ｕ ＿ ｗ ａ ［ ｄ ａ ｉ ＄
predicted: ＾ ｒ ｅ ［ ｋ ｉ ｒ ｅ ］ ｓｈ ｉ ｔ ｅ ＃ ｐ ａ ［ ｓ ｏ ｋ ｏ Ｎ ｄ ｅ ＃ ｔｓ ｕ ［ ｋ ｕ ｃｌ ｔ ａ ］ ｒ ａ ＃ ｄ ａ ］ ｍ ｅ ｎ ａ ｎ ｏ ＿ ｐ ａ ］ ｓ ｏ Ｎ ｂ ｕ ｉ ＃ ｅ ］ ｓ ｕ ＃ ｓｈ ｕ ［ ｇ ａ ｋ ｉ ｎ ｉ ＃ ｔ ｅ ［ ｃｌ ｃｈ ｉ ＃ ｋ ａ ［ ｎ ａ ｅ ］ ｒ ｕ ＿ ｗ ａ ［ ｄ ａ ｉ ＄


In [132]:
generator("知らない人に道を聞いてみました")

[{'generated_text': '＾ ｓｈ ｉ ［ ｒ ａ ｎ ａ ｉ ＃ ｈ ｉ ［ ｔ ｏ ｎ ｉ ＃ ｄ ｏ ［ ｏ ｏ ＃ ｋ ｉ ［ ｉ ｔ ｅ ＃ ｍ ｉ ［ ｍ ａ ］ ｓｈ ｉ ｔ ａ ＄'}]