In [9]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

dataset = load_dataset("dataset/my-own-dataset") #needed to construct a new Dataset class, to be done
num_of_labels = ...

tokenizer = BertTokenizerFast.from_pretrained("mshamrai/bert-base-ukr-eng-rus-uncased")
model = BertForTokenClassification.from_pretrained("mshamrai/bert-base-ukr-eng-rus-uncased", num_labels=num_of_labels)  # #number of labels


label_list = ...  #Collect all the labels like 'O', 'B-inscription', 'I-inscription'


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = examples["ner_tags"]
    aligned_labels = []
    for i, label_list in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_label = []
        previous_word_idx = None
        for word_idx in word_ids:
            
            if word_idx is None:
                aligned_label.append(-100)
            
            elif word_idx != previous_word_idx:
                aligned_label.append(label_list[word_idx])
            
            else:
                aligned_label.append(label_list[word_idx])
            previous_word_idx = word_idx
        aligned_labels.append(aligned_label)
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["validation"]
test_dataset = tokenized_dataset["test"]


DatasetNotFoundError: Dataset 'dataset/my-own-dataset' doesn't exist on the Hub. If the repo is private or gated, make sure to log in with `huggingface-cli login`.

In [30]:
import pandas as pd
data_df = pd.read_csv('data/final/data.tsv', sep='\t', names=['token', 'label'])

In [37]:
label_set = set(data_df['label'].tolist())

In [40]:
label_list = list(label_set)
label_list

['B-decoration',
 'B-inscription_type',
 'B-monument',
 'I-inscripiton type',
 'B-epigraphic_shorthand',
 'I-decoration',
 'B-material',
 'B-object_type',
 'I-object_type',
 'I-preservation_state',
 'B-inscripiton type',
 'I-monument',
 'B-other',
 'I-material',
 'O',
 'B-preservation_state',
 'I-inscription_type',
 'B-execution_technique',
 'B-dating_criteria',
 'B-inscription',
 'B-symbol',
 'I-other']

In [42]:
label_dict = {}
for idx in range(len(label_list)):
    label_dict[label_list[idx]] = idx
    

In [43]:
label_dict

{'B-decoration': 0,
 'B-inscription_type': 1,
 'B-monument': 2,
 'I-inscripiton type': 3,
 'B-epigraphic_shorthand': 4,
 'I-decoration': 5,
 'B-material': 6,
 'B-object_type': 7,
 'I-object_type': 8,
 'I-preservation_state': 9,
 'B-inscripiton type': 10,
 'I-monument': 11,
 'B-other': 12,
 'I-material': 13,
 'O': 14,
 'B-preservation_state': 15,
 'I-inscription_type': 16,
 'B-execution_technique': 17,
 'B-dating_criteria': 18,
 'B-inscription': 19,
 'B-symbol': 20,
 'I-other': 21}

In [78]:
with open('data/final/splits/dev.csv', 'r', encoding='utf-8') as file:
    dev_data = file.read().strip().split('\n\n')  # Splitting data by double newlines
dev_tokens = []
dev_ner_tags = []

for record in dev_data:
    #print(type(record))
    #print(record)
    lines = record.split('\n')
    token = []
    ner_tag = []
    for line in lines:
        word = line.split('\t')
        token.append(word[0])
        
        ner_tag.append(label_dict[word[1]])
        
    dev_tokens.append(token)
    dev_ner_tags.append(ner_tag)
with open('data/final/splits/train.csv', 'r', encoding='utf-8') as file:
    train_data = file.read().strip().split('\n\n')  # Splitting data by double newlinestr
train_tokens = []
train_ner_tags = []

for record in train_data:
    #print(type(record))
    #print(record)
    lines = record.split('\n')
    token = []
    ner_tag = []
    for line in lines:
        word = line.split('\t')
        token.append(word[0])
        
        ner_tag.append(label_dict[word[1]])
        
    train_tokens.append(token)
    train_ner_tags.append(ner_tag)
    
with open('data/final/splits/test.csv', 'r', encoding='utf-8') as file:
    test_data = file.read().strip().split('\n\n')  # Splitting data by double newlinestr
test_tokens = []
test_ner_tags = []

for record in test_data:
    #print(type(record))
    #print(record)
    lines = record.split('\n')
    token = []
    ner_tag = []
    for line in lines:
        word = line.split('\t')
        token.append(word[0])
        
        ner_tag.append(label_dict[word[1]])
        
    test_tokens.append(token)
    test_ner_tags.append(ner_tag)

In [82]:
import json
train_json_dict = {"tokens": train_tokens, "ner_tags": train_ner_tags}
dev_json_dict = {"tokens": dev_tokens, "ner_tags": dev_ner_tags}
test_json_dict = {"tokens": test_tokens, "ner_tags": test_ner_tags}
train_path = 'data/bert/train.json'
dev_path = 'data/bert/validation.json'
test_path = 'data/bert/test.json'
with open(train_path, "w", encoding="utf-8") as f:
    json.dump(train_json_dict, f, ensure_ascii=False, indent=4)
with open(dev_path, "w", encoding="utf-8") as f:
    json.dump(dev_json_dict, f, ensure_ascii=False, indent=4)
with open(test_path, "w", encoding="utf-8") as f:
    json.dump(test_json_dict, f, ensure_ascii=False, indent=4)

In [85]:
from datasets import Dataset

# 读取 JSON 文件
with open(f'{dataset_path}/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# 构建包含 'tokens' 和 'ner_tags' 字段的字典
train_dict = {'tokens': train_tokens, 'ner_tags': train_ner_tags}

# 创建 Dataset 对象
train_dataset = Dataset.from_dict(train_dict)


In [86]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 2510
})

In [13]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import numpy as np
import json

class CustomDataset:
    def __init__(self, dataset_path, tokenizer_name, label_list):
        self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name, max_length=32)
        self.label_list = label_list
        self.num_labels = len(label_list)
        self.train_data = self.load_data(f'{dataset_path}/train.json')
        self.validation_data = self.load_data(f'{dataset_path}/validation.json')
        self.test_data = self.load_data(f'{dataset_path}/test.json')
        self.tokenized_train_data = self.tokenize_and_align_labels(self.train_data)
        self.tokenized_validation_data = self.tokenize_and_align_labels(self.validation_data)
        self.tokenized_test_data = self.tokenize_and_align_labels(self.test_data)

    def load_data(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data

    def tokenize_and_align_labels(self, data):
        tokenized_data = self.tokenizer(data["tokens"], truncation=True, is_split_into_words=True)
        labels = data["ner_tags"]
        aligned_labels = []
        for i, label_list in enumerate(labels):
            word_ids = tokenized_data.word_ids(batch_index=i)
            aligned_label = []
            previous_word_idx = None
            for word_idx in word_ids:
                if word_idx is None:
                    aligned_label.append(-100)
                elif word_idx != previous_word_idx:
                    aligned_label.append(label_list[word_idx])
                else:
                    aligned_label.append(label_list[word_idx])
                previous_word_idx = word_idx
            aligned_labels.append(aligned_label)
        tokenized_data["labels"] = aligned_labels
        return tokenized_data

    def get_train_dataset(self):
        return Dataset.from_dict(self.tokenized_train_data)

    def get_validation_dataset(self):
        return Dataset.from_dict(self.tokenized_validation_data)

    def get_test_dataset(self):
        return Dataset.from_dict(self.tokenized_test_data)

# 参数配置
dataset_path = "data/bert"
tokenizer_name = "mshamrai/bert-base-ukr-eng-rus-uncased"

# 创建 CustomDataset 对象
dataset = CustomDataset(dataset_path, tokenizer_name, label_list)

# 加载模型
model = BertForTokenClassification.from_pretrained(tokenizer_name, num_labels=len(label_list))

# 获取数据集
train_dataset = dataset.get_train_dataset()
val_dataset = dataset.get_validation_dataset()
test_dataset = dataset.get_test_dataset()

# 训练参数配置
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
)

# 数据处理器
data_collator = DataCollatorForTokenClassification(BertTokenizerFast.from_pretrained(tokenizer_name, max_length=32))

# 计算指标
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
    return results

# 创建 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # 将数据处理器传递给 Trainer
)

# 开始训练
trainer.train()

# 评估模型
trainer.evaluate()

# 打印测试集的分类报告
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

true_labels = [
    [label_list[l] for l in label if l != -100]
    for label in labels
]
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

print(classification_report(true_labels, true_predictions))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at mshamrai/bert-base-ukr-eng-rus-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1968,0.12781,0.915254,0.698276,0.792176
2,0.1212,0.125853,0.913793,0.685345,0.783251
3,0.0812,0.108655,0.918478,0.728448,0.8125
4,0.0655,0.100946,0.883838,0.75431,0.813953
5,0.062,0.097996,0.888325,0.75431,0.815851


Checkpoint destination directory ./results\checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results\checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results\checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


                     precision    recall  f1-score   support

    dating_criteria       0.91      0.84      0.87        37
         decoration       0.29      0.20      0.24        10
execution_technique       0.00      0.00      0.00         9
        inscription       0.98      0.94      0.96       126
           material       0.00      0.00      0.00         1
           monument       0.00      0.00      0.00         2
        object_type       1.00      0.12      0.22         8
              other       0.63      0.47      0.54        36
             symbol       0.00      0.00      0.00         1

          micro avg       0.89      0.73      0.80       230
          macro avg       0.42      0.29      0.31       230
       weighted avg       0.83      0.73      0.77       230



  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
bert_model_dir = "model/bert2"
model.save_pretrained(bert_model_dir)