In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

from commonfn import align_labels

# Read the two local FNDEE JSON files as the 'train' and 'test' splits.
dataset = load_dataset(
    "json",
    data_files={
        'train': './data/FNDEE_train1.json',
        'test': './data/FNDEE_valid.json',
    },
)

# Tokenizer loaded from the local xlm-roberta-large model directory.
tokenizer = AutoTokenizer.from_pretrained("../models/xlm-roberta-large")

# Label name -> integer class index. "O" gets index 0 and the event types
# are numbered in the order they are listed here.
label_to_index = {
    name: index
    for index, name in enumerate(
        (
            "O",
            "Experiment",
            "Manoeuvre",
            "Deploy",
            "Support",
            "Accident",
            "Exhibit",
            "Conflict",
            "Injure",
        )
    )
}

def tokenize_and_align_labels(examples):
    """Tokenize one record and attach per-token label indices.

    Used with ``dataset.map(..., batched=False)``, so ``examples`` is a
    single record.

    Args:
        examples: mapping with a ``"text"`` entry and an ``"event_list"``
            entry; each event provides ``"event_type"`` and a ``"trigger"``
            mapping with ``"text"`` and ``"offset"``.

    Returns:
        The tokenizer output for the text, with an added ``"labels"`` entry
        holding the ``label_to_index`` value of every label returned by
        ``align_labels``.
    """
    text = examples["text"]
    encoding = tokenizer(text, is_split_into_words=False)

    # One (event type, trigger text, trigger offset) triple per event.
    triggers = [
        (event["event_type"], event["trigger"]["text"], event["trigger"]["offset"])
        for event in examples["event_list"]
    ]

    # Split the text into tokens.
    pieces = tokenizer.tokenize(text)

    # Align the event labels with the tokenized text.
    # NOTE(review): `encoding` and `pieces` come from two separate tokenizer
    # calls; confirm that align_labels yields exactly one label per entry of
    # encoding["input_ids"], otherwise labels and inputs differ in length.
    tag_names = align_labels(text, triggers, pieces)

    # Convert the label names to their indices.
    encoding["labels"] = [label_to_index[tag] for tag in tag_names]
    return encoding

# Apply the preprocessing function one record at a time (batched=False).
preprocessed_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=False,
)
# To inspect one processed record: preprocessed_dataset['train'][0]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

from commonfn import align_labels

# Read the two local FNDEE JSON files as the 'train' and 'test' splits.
dataset = load_dataset(
    "json",
    data_files={
        'train': './data/FNDEE_train1.json',
        'test': './data/FNDEE_valid.json',
    },
)

# Tokenizer loaded from the local xlm-roberta-base model directory.
tokenizer = AutoTokenizer.from_pretrained("../models/xlm-roberta-base")

# Label name -> integer class index. "O" gets index 0 and the event types
# are numbered in the order they are listed here.
label_to_index = {
    name: index
    for index, name in enumerate(
        (
            "O",
            "Experiment",
            "Manoeuvre",
            "Deploy",
            "Support",
            "Accident",
            "Exhibit",
            "Conflict",
            "Injure",
        )
    )
}

def tokenize_and_align_labels(examples):
    """Tokenize one record and attach per-token label indices.

    Used with ``dataset.map(..., batched=False)``, so ``examples`` is a
    single record.

    Args:
        examples: mapping with a ``"text"`` entry and an ``"event_list"``
            entry; each event provides ``"event_type"`` and a ``"trigger"``
            mapping with ``"text"`` and ``"offset"``.

    Returns:
        The tokenizer output for the text, with an added ``"labels"`` entry
        holding the ``label_to_index`` value of every label returned by
        ``align_labels``.
    """
    tokenized_inputs = tokenizer(examples["text"], is_split_into_words=False)

    # Collect one (event type, trigger text, trigger offset) triple per event.
    labels = []
    for event in examples["event_list"]:
        event_type = event["event_type"]
        trigger_text = event["trigger"]["text"]
        trigger_offset = event["trigger"]["offset"]
        labels.append((event_type, trigger_text, trigger_offset))

    # Split the text into tokens. The token ids are already available in
    # `tokenized_inputs`, so they are not recomputed here.
    tokens = tokenizer.tokenize(examples["text"])

    # Align the event labels with the tokenized text.
    # NOTE(review): `tokenized_inputs` and `tokens` come from two separate
    # tokenizer calls; confirm that align_labels yields exactly one label per
    # entry of tokenized_inputs["input_ids"], otherwise labels and inputs
    # differ in length.
    aligned_labels = align_labels(examples["text"], labels, tokens)

    # Convert the label names to their indices.
    label_ids = [label_to_index[label] for label in aligned_labels]
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Apply the preprocessing function one record at a time (batched=False).
preprocessed_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=False,
)

from transformers import DataCollatorForTokenClassification
# Collator built around the same tokenizer that produced the inputs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Token-classification model from the local xlm-roberta-base directory, with
# one output class per entry of label_to_index.
model = AutoModelForTokenClassification.from_pretrained(
    "../models/xlm-roberta-base",
    num_labels=len(label_to_index),
)

import numpy as np
import evaluate
# seqeval metric object; compute_metrics calls its compute() method.
metric = evaluate.load("seqeval")

import numpy as np


def compute_metrics(eval_preds):
    """Score one evaluation pass with the seqeval metric.

    Args:
        eval_preds: a ``(logits, labels)`` pair. ``logits`` is reduced with an
            argmax over its last axis to obtain predicted class indices;
            ``labels`` holds the reference class indices, where -100 marks
            positions that are excluded from scoring.

    Returns:
        dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``,
        taken from the metric's ``overall_*`` results.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # label_to_index maps name -> index, but predictions and references are
    # indices; build the reverse mapping so they can be turned back into names.
    index_to_label = {index: name for name, index in label_to_index.items()}

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [
        [index_to_label[int(l)] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [index_to_label[int(p)] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # NOTE(review): seqeval is designed around IOB-style tags (e.g. "B-X",
    # "I-X"); the bare label names used here may be scored in unexpected
    # ways -- confirm the reported numbers mean what is intended.
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Training configuration: checkpoints go to ../results/models and evaluation
# runs once per epoch.
training_args = TrainingArguments(
    output_dir="../results/models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Train on the 'train' split and evaluate on the 'test' split, scoring each
# evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_dataset['train'],
    eval_dataset=preprocessed_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Use the device resolved by TrainingArguments instead of a hard-coded
# "cuda", so the cell does not fail on a host without a CUDA device.
trainer.model.to(training_args.device)
trainer.train()


In [1]:
from datasets import load_dataset, Dataset

# Load FNDEE_valid.json on its own; with a single path the data ends up in
# one split named "train" (see the displayed output).
dataset = load_dataset(
    "json",
    data_files="./data/FNDEE_valid.json",
)
# Bare last expression so the notebook displays the dataset summary.
dataset


Found cached dataset json (/home/cokkiy/.cache/huggingface/datasets/json/default-e7b7cebfcc7b1b9d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'coref_arguments', 'event_list'],
        num_rows: 2000
    })
})