In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import ast
import evaluate
import numpy as np
from sklearn.metrics import f1_score

In [2]:
def add_ast(example):
    example["ner_tags"] = ast.literal_eval(example["ner_tags"])
    example["tokens"] = ast.literal_eval(example["tokens"])
    return example

In [3]:
dataset = load_dataset("csv", data_files="data/train.csv")["train"]
dataset = dataset.remove_columns("processed_text")
dataset = dataset.remove_columns("target_labels_positions")
dataset = dataset.remove_columns("label")
dataset = dataset.remove_columns("strat")
dataset = dataset.train_test_split(test_size=0.15)
dataset = dataset.rename_column("label_ids", "ner_tags")
dataset = dataset.rename_column("processed_text_split", "tokens")
dataset = dataset.map(add_ast)

Map:   0%|          | 0/423 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'tokens'],
        num_rows: 423
    })
    test: Dataset({
        features: ['ner_tags', 'tokens'],
        num_rows: 75
    })
})

In [5]:
model_name = "ai-forever/sbert_large_mt_nlu_ru"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/423 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [8]:
seqeval = evaluate.load("seqeval")

In [9]:
label2id = {
    "O": 0,
    "B-discount": 1,
    "B-value": 2,
    "I-value": 3
}
id2label = {
    0: "O",
    1: "B-discount",
    2: "B-value",
    3: "I-value"
}

In [10]:
def f1(trues, preds):
    weights = {
        1: 1,
        2: 2,
        3: 2,
        0: 0
    }
    samples_weight = []
    for t in trues:
        samples_weight.append(weights[t])

    score = f1_score(trues, preds, sample_weight=samples_weight, average="weighted")
    return score

def compute_metrics(p):
    predictions, labels = p
    # print(len(labels))
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for prediction, label in zip(predictions, labels):
        for pr, le in zip(prediction, label):
            if le == -100:
                continue
            else:
                true_labels.append(le)
                true_predictions.append(pr)

    # true_predictions = [
    #     [p for (p, l) in zip(prediction, label) if l != -100]
    #     for prediction, label in zip(predictions, labels)
    # ]
    # true_labels = [
    #     [l for (p, l) in zip(prediction, label) if l != -100]
    #     for prediction, label in zip(predictions, labels)
    # ]
    # print(len(true_predictions), len(true_labels))
    # print(true_labels)

    results = f1(true_labels, true_predictions)
    return {
        "f1": results,
    }

In [11]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=4, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/sbert_large_mt_nlu_ru and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
training_args = TrainingArguments(
    output_dir="baseline_sbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/270 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.022394193336367607, 'eval_f1': 0.6058950782254452, 'eval_runtime': 1.6029, 'eval_samples_per_second': 46.791, 'eval_steps_per_second': 3.119, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.015408470295369625, 'eval_f1': 0.8780529972977766, 'eval_runtime': 1.6681, 'eval_samples_per_second': 44.962, 'eval_steps_per_second': 2.997, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.014678948558866978, 'eval_f1': 0.9032343868851236, 'eval_runtime': 1.6233, 'eval_samples_per_second': 46.203, 'eval_steps_per_second': 3.08, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.015517398715019226, 'eval_f1': 0.9082228032730467, 'eval_runtime': 1.6472, 'eval_samples_per_second': 45.533, 'eval_steps_per_second': 3.036, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.015750650316476822, 'eval_f1': 0.8796184390790539, 'eval_runtime': 1.6049, 'eval_samples_per_second': 46.73, 'eval_steps_per_second': 3.115, 'epoch': 5.0}
