In [2]:
import os
import sys
sys.path.insert(0, os.path.abspath('.'))

# Load texts
from utils import load_json
from data.utils import parse_label


def load_texts(
    min_len: int = 2,
    texts_path: str = '../data/sequences/seq_texts.json',
) -> list:
    """Load sequence texts from JSON, drop short ones and normalise their labels.

    The JSON file is read with ``load_json`` and must yield an iterable of
    records that each have a ``'text'`` entry. Progress counts are printed
    after each filtering stage.

    Args:
        min_len: Minimum ``len()`` a sequence's text must have to be kept.
        texts_path: Path of the JSON file to read. Defaults to the location
            this notebook has always used, so existing calls are unaffected.

    Returns:
        A list with one string per kept sequence. Each string is the
        concatenation of ``parse_label(...)`` applied to every element of the
        sequence's text.
    """
    texts = [seq['text'] for seq in load_json(texts_path)]
    print(f'Loaded {len(texts)} sequences.')

    # Reported separately from the length filter so the number of empty
    # sequences is visible in the log (many sequences are empty).
    texts = [t for t in texts if len(t) > 0]
    print(f'# non-empty sequences: {len(texts)}')

    texts = [t for t in texts if len(t) >= min_len]
    print(f'Minimum length: {min_len}')
    print(f'# sequences with enough length: {len(texts)}')

    # NOTE(review): the meaning of the positional `True` flag is defined by
    # data.utils.parse_label and is not visible here. Combined and unknown
    # labels are both rendered as the same '…' placeholder.
    return [
        ''.join(
            parse_label(c, True, comb_token='…', unk_token='…') for c in seq
        )
        for seq in texts
    ]

texts = load_texts()
print('====== examples ======')
# Preview at most the first 12 sequences. Slicing (instead of indexing
# range(12)) avoids an IndexError when fewer than 12 sequences were loaded.
for i, text in enumerate(texts[:12]):
    if len(text) > 0:
        print(i, text)

Loaded 6068 sequences.
# non-empty sequences: 5043
Minimum length: 2
# sequences with enough length: 4599
0 上不乍二伐伐兄光兵出吉君君坪居居居左己𠭁𠭁𠭁𠭁旬旬星是是東畜相箸胃自雨雨首…………………
1 一三不乙二亥亥以以以以八出出利利利可可壬女女女子居左𠭁必日旬木欠死甬甲甲發白色行視軍量黃………………
2 其二在長…
3 …重鎰…足…重八鎰…鎰一銖
4 十月乙丑
5 之上與𫺕哲王之威俈…尹郘逯㠯王命賜舒方御歲愲
6 廷等
7 二褥席
8 無及也已入之或入之至之或至之
9 咎告尒某邑之社…又石曾孫某邑不幸命…敢用五器宮之以石…邑是…昌大縵…君夫君婦一
10 君高石奴君之神霝攸政民人句史四方之羣明歸曾孫某之邑者亓麥…亓麥徇見某乃𢝫…
11 …諥與悆奴隧重者女奴見亓父奴見亓母女見亓妻奴見亓子奴百湩川之歸…奴…內三


In [3]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Identifier passed to AutoModelForMaskedLM.from_pretrained below
# (presumably a Hugging Face Hub model id — confirm).
MODEL_NAME = "KoichiYasuoka/roberta-classical-chinese-base-char"
# Location passed to AutoTokenizer.from_pretrained below. It looks like a
# relative path, so it would depend on the working directory — confirm.
TOKENIZER_PATH = 'tokenization/tokenizer'
print('Loading tokenizer and model...')
# The tokenizer and the model are loaded from two different sources.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
# Resize the model's token embeddings to len(tokenizer). This is the last
# expression of the cell, so its return value is shown as the cell output.
model.resize_token_embeddings(len(tokenizer))

Loading tokenizer and model...


Embedding(26419, 768)

In [4]:
# Split data by 8:1:1
import random

from typing import Tuple, List
from dataset import ChujianMLMDataset


def get_dataset(
    texts: List[str]
) -> Tuple[ChujianMLMDataset, ChujianMLMDataset, ChujianMLMDataset]:
    """Shuffle the texts with a fixed seed and split them 8:1:1.

    The global ``random`` generator is seeded with 0 before shuffling, so the
    same input list always yields the same split. The input list itself is
    left untouched: a copy is shuffled, which keeps the split stable when
    this is called again with the same list (e.g. when the cell is re-run).

    Relies on the module-level ``tokenizer`` being defined before the call.

    Args:
        texts: The sequences to split.

    Returns:
        ``(train_data, dev_data, test_data)`` built from the first 80%, the
        next 10% and the remaining sequences of the shuffled copy.
    """
    random.seed(0)
    train_end = int(len(texts) * 0.8)
    dev_end = int(len(texts) * 0.9)
    split_idx = [train_end, dev_end]
    print(f'Splitting data into {split_idx}...')
    print(f'Train size: {train_end}')
    print(f'Dev size: {dev_end - train_end}')
    print(f'Test size: {len(texts) - dev_end}')

    # Shuffle a copy rather than the caller's list; the permutation is the
    # same one an in-place shuffle would have produced on the first call.
    shuffled = list(texts)
    random.shuffle(shuffled)
    train_texts = shuffled[:train_end]
    dev_texts = shuffled[train_end:dev_end]
    test_texts = shuffled[dev_end:]

    print('Building dataset...')
    train_data = ChujianMLMDataset(train_texts, tokenizer)
    dev_data = ChujianMLMDataset(dev_texts, tokenizer)
    test_data = ChujianMLMDataset(test_texts, tokenizer)
    return train_data, dev_data, test_data

# Build the train/dev/test datasets from the texts loaded above.
train_data, dev_data, test_data = get_dataset(texts)


Splitting data into [3679, 4139]...
Train size: 3679
Dev size: 460
Test size: 460
Building dataset...


# Training

In [6]:
from trainer import Trainer
from pathlib import Path
from transformers import DataCollatorForLanguageModeling

# Training hyper-parameters for this run.
num_epochs = 2
# NOTE(review): `lr` is only used to build `output_dir` below; it is not
# passed to Trainer in this cell, so the directory name may not reflect the
# learning rate actually used — confirm against Trainer's default.
lr = 2e-5
batch_size = 64
log_interval = 10
# Results go under a sub-directory named after the learning rate and batch size.
output_dir = Path(
    'result/roberta-classical-chinese-base-char', 
    f'lr{lr}-bs{batch_size}',
)

# Collator configured for masked language modelling with mlm_probability=0.15.
train_collate_fn = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

trainer = Trainer(
    model,
    output_dir,
    train_collate_fn=train_collate_fn,
    num_epochs=num_epochs,
    batch_size=batch_size,
    log_interval=log_interval
)
# NOTE(review): the recorded output of this cell ends, right after
# "Start epoch 0", with "AttributeError: 'str' object has no attribute 'to'".
# The cause is not visible in this cell (it lies in Trainer or the dataset /
# collate path) — TODO investigate before relying on this run.
trainer.train(train_data, dev_data)

  Num steps: 58
  Num examples: 3679
  Num epochs: 2
  Batch size: 64
  Log interval: 10
Start epoch 0


AttributeError: 'str' object has no attribute 'to'