In [1]:
import numpy as np
import pandas as pd
import os

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from sentence_transformers.losses import OnlineContrastiveLoss
from sentence_transformers.training_args import BatchSamplers
from datasets import load_dataset, Dataset

import evaluate

# Bi-encoder to fine-tune. The checkpoint is a plain transformer rather than a
# packaged sentence-transformers model, so the library reports that it is
# creating a new model with mean pooling (see the recorded cell output).
model = SentenceTransformer(model_name_or_path="allenai/scibert_scivocab_cased")

# Training objective; it is handed the model it will optimise.
loss = OnlineContrastiveLoss(model=model)

# Accuracy scorer consumed by compute_metrics.
metric = evaluate.load(path="accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-maxed predictions
        measured against ``labels``.

    Note:
        This helper is not passed to the trainer in the cells shown here.
    """
    raw_scores, gold_labels = eval_pred
    # Collapse the last axis to the index of the highest score, i.e. the predicted class.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)


# Location of the tab-separated pair file. The recorded output of this cell
# shows the columns 'query', 'paper_data' and 'label'.
PAIRS_TSV = "data/bi-encoder-model-data/tweet-abs-pairs.tsv"
# Fixed seed so the train/eval partition is identical after Restart & Run All.
SPLIT_SEED = 42

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv(PAIRS_TSV, sep="\t")

dataset = Dataset.from_pandas(df)
# Hold out 20% of the rows for evaluation. Seeding the shuffle keeps validation
# losses comparable between runs instead of being measured on a different split each time.
split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = split["train"]
eval_dataset = split["test"]
print(split)

No sentence-transformers model found with name allenai/scibert_scivocab_cased. Creating a new one with mean pooling.


DatasetDict({
    train: Dataset({
        features: ['query', 'paper_data', 'label'],
        num_rows: 61694
    })
    test: Dataset({
        features: ['query', 'paper_data', 'label'],
        num_rows: 15424
    })
})


In [2]:
# Optimisation settings for the single fine-tuning epoch.
optimisation_options = dict(
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # switch to False if the GPU cannot run FP16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # keep duplicate samples out of a batch
)

# Evaluation and checkpointing share the same 250-step cadence, which
# load_best_model_at_end relies on to pick a saved checkpoint.
checkpoint_options = dict(
    eval_strategy="steps",
    eval_steps=250,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
)

args = SentenceTransformerTrainingArguments(
    output_dir="models/bi-encoder",  # the only required argument
    **optimisation_options,
    **checkpoint_options,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
)

# Always start from the current weights rather than resuming an earlier checkpoint.
train_output = trainer.train(resume_from_checkpoint=False)
train_output


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
250,No log,0.089881
500,0.194100,0.035965
750,0.194100,0.017219
1000,0.025200,0.010068
1250,0.025200,0.006716
1500,0.011000,0.004378
1750,0.011000,0.007629
2000,0.011400,0.007363
2250,0.011400,0.008017
2500,0.008800,0.004143


TrainOutput(global_step=3856, training_loss=0.03372743030183048, metrics={'train_runtime': 3781.9477, 'train_samples_per_second': 16.313, 'train_steps_per_second': 1.02, 'total_flos': 0.0, 'train_loss': 0.03372743030183048, 'epoch': 1.0})