In [1]:
import numpy as np
import os

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, ClassLabel

import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def preprocess_function(examples):
    full_str = "[CLS]"
    full_str += examples["query"]
    
    for key in ["authors", "title", "abstract", "journal"]:
        if examples[key] != None:
            full_str += "[SEP] " + examples[key]
    
    return tokenizer(full_str, padding="max_length", max_length=512, truncation=True)

data_path = f"./data/model-data"

train_file = os.path.join(data_path, 'train', r'tweets-query-pairs.tsv')
# valid_file = os.path.join(data_path, 'valid', r'tweets-query-pairs.tsv')

train_dataset = load_dataset(data_path, split="train")
test_dataset = load_dataset(data_path, split="test")

id2label = {0: "non-match", 1: "match"}
label2id = {"non-match": 0, "match": 1}

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_cased", num_labels=2, id2label=id2label, label2id=label2id)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
train_dataset_tokenized = train_dataset.map(preprocess_function)

Map:   0%|          | 0/68412 [00:00<?, ? examples/s]

In [3]:
test_dataset_tokenized = test_dataset.map(preprocess_function)

Map:   0%|          | 0/17106 [00:00<?, ? examples/s]

In [4]:

training_args = TrainingArguments(
    output_dir="models/cross-embedding",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
    optim="adamw_torch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=test_dataset_tokenized,
    compute_metrics=compute_metrics
)

trainer.train(resume_from_checkpoint=False)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0734,0.06185,0.978838
2,0.0387,0.067199,0.981059


TrainOutput(global_step=4276, training_loss=0.06474901985296921, metrics={'train_runtime': 6928.2951, 'train_samples_per_second': 19.749, 'train_steps_per_second': 0.617, 'total_flos': 3.599990703857664e+16, 'train_loss': 0.06474901985296921, 'epoch': 2.0})