### Imports

In [1]:
import json
import os
import numpy as np
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    MarkupLMForTokenClassification,
    MarkupLMFeatureExtractor,
    MarkupLMProcessor,
    TrainingArguments,
    Trainer
)
from evaluate import load
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Параметры

In [2]:

MODEL_NAME = "microsoft/markuplm-large"
LABEL_MAP_PATH = "../label_map.json"
MAX_LENGTH = 512
BATCH_SIZE = 8
NUM_EPOCHS = 5
LEARNING_RATE = 3e-5
SEED = 42

### Загрузка label map

In [3]:
with open(LABEL_MAP_PATH) as f:
    label_map = json.load(f)

# Для id2label преобразуем строковые ключи в int
id2label = {int(k): v for k, v in label_map["id2label"].items()}

# Для label2id оставляем как есть (label: id)
label2id = label_map["label2id"]

num_labels = len(id2label)

### Вариант 1: Загрузка предразделенного датасета

In [4]:
DATA_DIR = "../test_datasets"

dataset = load_dataset(
    "json",
    data_files={
        "train": f"{DATA_DIR}/train.jsonl",
        "test": f"{DATA_DIR}/test.jsonl",
        "validation": f"{DATA_DIR}/val.jsonl"
    }
)

FileNotFoundError: Unable to find 'F:/учеба 6 курс/DIPLOMA/dataset_folder/markuplm-dataset-creator/src\../test_datasets/train.jsonl'

### Вариант 2: Разделение единого файла

In [4]:
from datasets import load_from_disk
from sklearn.model_selection import train_test_split

# Укажите путь к сохраненному датасету
SINGLE_FILE_PATH = "../test_datasets/merged_dataset"
TEST_SIZE = 0.2
VAL_SIZE = 0.1  # Относительно оставшихся после test

# Загрузка датасета с диска
full_dataset = load_from_disk(SINGLE_FILE_PATH)

# Генерация индексов для разделения
indices = range(len(full_dataset))

# Первое разделение: train_val + test
train_val_idx, test_idx = train_test_split(
    indices,
    test_size=TEST_SIZE,
    random_state=SEED
)

# Второе разделение: train и validation
train_idx, val_idx = train_test_split(
    train_val_idx,
    test_size=VAL_SIZE,
    random_state=SEED
)

# Создание подмножеств через selection
train_dataset = full_dataset.select(train_idx)
val_dataset = full_dataset.select(val_idx)
test_dataset = full_dataset.select(test_idx)

# Создание DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print(f"Размеры датасетов:")
print(f"Train: {len(dataset['train'])}")
print(f"Validation: {len(dataset['validation'])}")
print(f"Test: {len(dataset['test'])}")

Размеры датасетов:
Train: 90
Validation: 10
Test: 25


### Инициализация процессора

In [5]:
feature_extractor = MarkupLMFeatureExtractor()
processor = MarkupLMProcessor.from_pretrained(
    MODEL_NAME,
    parse_html=False,
    max_length=MAX_LENGTH
)

### Обработка данных

In [6]:
def process_examples(examples):
    processed = processor(
        nodes=examples["tokens"],
        xpaths=examples["xpaths"],
        node_labels=examples["node_labels"],
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors="pt",
    )
    return processed

tokenized_datasets = dataset.map(
    process_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

### Метрики

In [7]:
seqeval = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        mode="strict",
        scheme="IOB2"
    )
    
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

### Инициализация модели

In [8]:
model = MarkupLMForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of MarkupLMForTokenClassification were not initialized from the model checkpoint at microsoft/markuplm-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Настройка обучения

In [9]:
torch.cuda.is_available()

True

In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    report_to="none",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

### Запуск обучения

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.26707,0.0,0.0,0.0,0.949612
2,No log,0.142,0.0,0.0,0.0,0.955426
3,No log,0.075965,0.0,0.0,0.0,0.97093
4,No log,0.096366,1.0,0.8,0.888889,0.978682
5,No log,0.074756,1.0,0.8,0.888889,0.984496


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=60, training_loss=0.6786595026652018, metrics={'train_runtime': 789.7761, 'train_samples_per_second': 0.57, 'train_steps_per_second': 0.076, 'total_flos': 435124565452800.0, 'train_loss': 0.6786595026652018, 'epoch': 5.0})

### Сохранение модели

In [12]:
output_dir = "./fine_tuned_markuplm_large"
trainer.save_model(output_dir)
processor.save_pretrained(output_dir)

[]

### Оценка на тестовом наборе

In [None]:
results = trainer.evaluate(tokenized_datasets["test"])
print("Test results:")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")
print(f"F1 Score: {results['eval_f1']:.4f}")
print(f"Accuracy: {results['eval_accuracy']:.4f}")

Test results:
Precision: 0.9245
Recall: 0.7206
F1 Score: 0.8099
Accuracy: 0.9856


  _warn_prf(average, modifier, msg_start, len(result))


: 