In [1]:
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from sentence_transformers.losses import OnlineContrastiveLoss
from sentence_transformers.training_args import BatchSamplers

# Names of the pretrained resources this notebook builds on.
METRIC_NAME = "accuracy"
BASE_MODEL_NAME = "allenai/scibert_scivocab_cased"

# Metric object consumed by compute_metrics.
metric = evaluate.load(METRIC_NAME)

# Bi-encoder backbone. The recorded run shows that no sentence-transformers
# model exists under this name, so one is created with mean pooling.
model = SentenceTransformer(model_name_or_path=BASE_MODEL_NAME)

# Training objective, bound to the model it will update.
loss = OnlineContrastiveLoss(model=model)

def compute_metrics(eval_pred):
    """Score one evaluation result with the module-level ``metric``.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when the argmax predictions are
        compared against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


# Load the labelled tweet/abstract pairs from the tab-separated file.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv("data/bi-encoder-model-data/tweet-abs-pairs.tsv", sep="\t")

dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation. A fixed seed makes the shuffle
# deterministic, so re-running the notebook yields the same train/test rows
# and evaluation numbers stay comparable between runs.
SPLIT_SEED = 42
split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = split["train"]
eval_dataset = split["test"]
print(split)

No sentence-transformers model found with name allenai/scibert_scivocab_cased. Creating a new one with mean pooling.


DatasetDict({
    train: Dataset({
        features: ['query', 'abstract', 'label'],
        num_rows: 61694
    })
    test: Dataset({
        features: ['query', 'abstract', 'label'],
        num_rows: 15424
    })
})


In [None]:
# FP16 mixed precision is only requested when a CUDA device is present. The
# previous hard-coded fp16=True makes argument construction fail on machines
# without a supported GPU.
USE_FP16 = torch.cuda.is_available()

# Evaluation and checkpointing share one interval: load_best_model_at_end
# needs a saved checkpoint for every evaluated step it may pick as best.
EVAL_SAVE_STEPS = 200

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/bi-encoder",
    # Optional training parameters:
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=USE_FP16,
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # NOTE(review): NO_DUPLICATES mainly helps losses that use in-batch
    # negatives; confirm it is still wanted with OnlineContrastiveLoss.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=EVAL_SAVE_STEPS,
    save_strategy="steps",
    save_steps=EVAL_SAVE_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
)

# Always start a fresh run instead of resuming from a checkpoint in output_dir.
trainer.train(resume_from_checkpoint=False)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss


KeyboardInterrupt: 