In [1]:
! pip install transformers[torch]

[0m

In [2]:
! rm -r ~/.cache/huggingface/datasets

rm: cannot remove '/root/.cache/huggingface/datasets': No such file or directory


In [3]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
import json
import os
import numpy as np
from datasets import load_dataset, Features, Value, Sequence
from transformers import (
    MarkupLMForTokenClassification,
    MarkupLMProcessor,
    TrainingArguments,
    Trainer
)
from evaluate import load
import torch

# Параметры модели и пути
MODEL_NAME = "microsoft/markuplm-base"  # или "microsoft/markuplm-large"
LABEL_MAP_PATH = "label_map.json"
DATA_DIR = "dataset/markuplm_dataset_finetuning/"
MODEL_SAVE_PATH = "models"
MAX_LENGTH = 512
BATCH_SIZE = 1
NUM_EPOCHS = 5
LEARNING_RATE = 3e-5
SEED = 42

# Загрузка label map
with open(LABEL_MAP_PATH) as f:
    label_map = json.load(f)

# Преобразуем id2label, ключи делаем int
id2label = {int(k): v for k, v in label_map["id2label"].items()}
label2id = label_map["label2id"]
num_labels = len(id2label)
print(f"Number of labels: {num_labels}")

# Задаём expected_fields — список всех возможных полей (если потребуется)
expected_fields = ['title', 'author', 'date', 'doi', 'issn', 'eissn', 'journal',
                   'publisher', 'pages', 'first_page', 'last_page', 'language', 'volume',
                   'issue', 'abstract', 'affiliation', 'keyword', 'doc_type', 'isbn',
                   'eisbn', 'editor', 'orcid', 'book_version', 'subtitle', 'conference_title',
                   'book_series', 'book_title']

features = Features({
    "id": Value("string"),
    "source_file": Value("string"),
    "resource": Value("string"),
    "doc_type": Value("string"),
    "html": Value("string"),
    "tokens": Sequence(Value("string")),
    "xpaths": Sequence(Value("string")),
    "metadata": Features({
        field: Features({
            "text": Sequence(Value("string")),
            "xpaths": Sequence(Value("string"))
        }) for field in expected_fields
    }),
    "node_labels": Sequence(Value("int64")),
    "processing_time": Value("string")
})


Number of labels: 55


In [5]:
# Загружаем датасет в режиме streaming
dataset = load_dataset(
    "parquet",
    data_files={
        "train": f"{DATA_DIR}/train/*.parquet",
        "validation": f"{DATA_DIR}/val/*.parquet",
        "test": f"{DATA_DIR}/test/*.parquet"
    },
    features=features,
    streaming=True
)

# Применяем shuffle с буфером для каждой части.
# Размер буфера можно изменять в зависимости от объёма и доступной памяти.
train_dataset = dataset["train"].shuffle(buffer_size=10000, seed=SEED)
validation_dataset = dataset["validation"].shuffle(buffer_size=5000, seed=SEED)
test_dataset = dataset["test"].shuffle(buffer_size=5000, seed=SEED)


Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

In [6]:
train_dataset = train_dataset.take(30000)  # Ограничение для теста
validation_dataset = validation_dataset.take(3000)
test_dataset = test_dataset.take(3000)

In [7]:
processor = MarkupLMProcessor.from_pretrained(
    MODEL_NAME,
    parse_html=False,
    max_length=MAX_LENGTH
)
processor.parse_html = False

#def split_into_chunks(nodes, xpaths, labels, max_length):
#    chunks = []
#    start = 0
#    n = len(nodes)
#    
#    while start < n:
#        # Определяем конец чанка
#        end = start + max_length
#        # Ищем последнюю значимую метку в текущем диапазоне
#        last_sig_in_chunk = -1
#        for i in range(start, min(end, n)):
#            if labels[i] != 0:
#                last_sig_in_chunk = i
#                
#        # Если есть значимые метки, корректируем конец
#        if last_sig_in_chunk != -1:
#            end = last_sig_in_chunk + 1  # Включаем последнюю значимую метку
#        else:
#            end = min(end, n)  # Просто обрезаем до max_length
#            
#        # Добавляем чанк
#        chunks.append({
#            "nodes": nodes[start:end],
#            "xpaths": xpaths[start:end],
#            "labels": labels[start:end]
#        })
#        
#        start = end  # Переходим к следующему чанку
#        
#    return chunks

def split_into_chunks(nodes, xpaths, labels, max_length):
    chunks = []
    start = 0
    n = len(nodes)
    
    while start < n:
        end = start + max_length
        # Упрощенная логика без поиска меток
        chunks.append({
            "nodes": nodes[start:end],
            "xpaths": xpaths[start:end],
            "labels": labels[start:end]
        })
        start = end
        
    return chunks

def process_examples(examples):
    all_nodes = []
    all_xpaths = []
    all_labels = []
    
    # Обрабатываем каждый пример в батче
    for i in range(len(examples["tokens"])):
        nodes = examples["tokens"][i]
        xpaths = examples["xpaths"][i]
        labels = examples["node_labels"][i]
        
        # Разбиваем на чанки
        chunks = split_into_chunks(nodes, xpaths, labels, MAX_LENGTH)
        
        # Собираем все чанки
        for chunk in chunks:
            all_nodes.append(chunk["nodes"])
            all_xpaths.append(chunk["xpaths"])
            all_labels.append(chunk["labels"])
    
    # Обрабатываем все чанки через процессор
    processed = processor(
        nodes=all_nodes,
        xpaths=all_xpaths,
        node_labels=all_labels,
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors="pt",
    )
    
    # Маскируем метки для padding токенов
    labels = processed["labels"]
    labels[processed["attention_mask"] == 0] = -100
    processed["labels"] = labels

    torch.cuda.empty_cache()
    return processed

# Обрабатываем датасеты. При streaming датасетах метод .map возвращает IterableDataset.
train_dataset = train_dataset.map(
    process_examples,
    batched=True,
    remove_columns=list(dataset["train"].features.keys())
)
validation_dataset = validation_dataset.map(
    process_examples,
    batched=True,
    remove_columns=list(dataset["validation"].features.keys())
)
test_dataset = test_dataset.map(
    process_examples,
    batched=True,
    remove_columns=list(dataset["test"].features.keys())
)


In [8]:
torch.cuda.is_available()

True

In [9]:
# Загружаем метрику seqeval для оценки
seqeval = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        mode="strict",
        scheme="IOB2"
    )
    
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Загружаем модель MarkupLM для задачи токенной классификации
model = MarkupLMForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True  # если размеры эмбеддингов не совпадают
)

training_args = TrainingArguments(
    output_dir="./results",
    logging_strategy="steps",
    eval_strategy="steps",        # Переключаем на оценку по шагам
    save_strategy="steps",        # Сохранение по шагам
    eval_steps=3000,  # Оценка после каждой эпохи
    save_steps=6000,   # Сохранение после каждой эпохи
    logging_steps=3000,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    report_to="none",
    seed=SEED,
    max_steps=40000, #~112500 if we gonna use full dataset
    #use_cpu=True # for debug
)


Some weights of MarkupLMForTokenClassification were not initialized from the model checkpoint at microsoft/markuplm-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

# Запуск обучения
trainer.train()

# Сохраняем модель и processor
output_dir = "./fine_tuned_markuplm_base"
trainer.save_model(output_dir)
processor.save_pretrained(output_dir)




Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
3000,0.3604,0.370523,0.539863,0.167113,0.255223,0.959662
6000,0.0712,0.372104,0.645291,0.488308,0.55593,0.963226
9000,0.052,0.357707,0.648391,0.421943,0.511213,0.9669
12000,0.048,0.314948,0.719739,0.627369,0.670387,0.971675
15000,0.0511,0.230355,0.655815,0.710371,0.682004,0.975811
18000,0.0266,0.296451,0.806095,0.657639,0.724339,0.979226
21000,0.0277,0.25931,0.800873,0.771744,0.786039,0.981718
24000,0.0208,0.264992,0.861811,0.756863,0.805935,0.98044
27000,0.0178,0.260588,0.873862,0.745032,0.804321,0.981101
30000,0.0122,0.268272,0.781554,0.806729,0.793942,0.980203


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[]

In [11]:
results = trainer.evaluate(test_dataset)
print("Test results:")
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")
print(f"F1 Score: {results['eval_f1']:.4f}")
print(f"Accuracy: {results['eval_accuracy']:.4f}")


  _warn_prf(average, modifier, msg_start, len(result))


Test results:
Precision: 0.8617
Recall: 0.7479
F1 Score: 0.8008
Accuracy: 0.9800
