In [1]:
# fake data generator; only needed for the blueprint
import faker

# seed faker's shared random source so the blueprint data is identical on every
# Restart & Run All (the seed is set on the class, not on the instance)
faker.Faker.seed(0)

generator: faker.Faker = faker.Faker()

In [2]:
# select the GPU that is visible to the training process
# the index depends on the machine's architecture (default should be 0)
# NOTE(review): presumably this has to run before the ML libraries are imported
# in the following cell - keep this cell first
import os

os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [3]:
import pandas
import datasets
import setfit

# further information on the training paradigm:
# https://huggingface.co/docs/setfit/index

In [4]:
# TODO: configure training setup 
#

# sentence transformers used as a base model
# selection of compatible models on hugging face: 
# https://huggingface.co/models?library=sentence-transformers&sort=trending
MODEL_SLUG: str = "BAAI/bge-m3"

# NOTE: saving the model in the huggingface cloud allows easy access and sharing
# how to setup: https://huggingface.co/docs/transformers/v4.15.0/en/model_sharing
MODEL_UPLOAD_SLUG: str | None = None

# portion of the original dataset used for evaluation/testing
DATA_TEST_FRAC: float = 0.05

TRAINER_ARGS: setfit.TrainingArguments = setfit.TrainingArguments(
    # --- fixed: evaluate and checkpoint once per epoch; the flag below asks the
    # --- trainer to load the best model when training ends
    load_best_model_at_end=True,
    save_strategy="epoch",
    eval_strategy="epoch",
    # --- tunable: adapt these as necessary
    num_epochs=4,
    batch_size=16,
)

In [5]:
# TODO: replace with the actual data and column names
TEXT_COL: str = "text"
CLASS_COL: str = "class"

# blueprint placeholder: 100 rows of generated text, each with a random digit as class
# the records are keyed by TEXT_COL / CLASS_COL (not by hard-coded names) so that
# renaming the columns above keeps the frame consistent with the later cells
DATASET: pandas.DataFrame = pandas.DataFrame([
    {TEXT_COL: generator.text(), CLASS_COL: generator.random_digit()}
    for _ in range(100)
])

# fail early if the (replaced) data does not provide the configured columns
assert {TEXT_COL, CLASS_COL} <= set(DATASET.columns), "DATASET is missing TEXT_COL and/or CLASS_COL"

DATASET

Unnamed: 0,text,class
0,Prevent maintain anyone page discussion ball r...,6
1,Become drug any stop future simple human. War ...,5
2,Religious various employee accept toward top. ...,3
3,Here choose power relationship. Situation poli...,1
4,Director improve recognize son thank real. Nat...,0
...,...,...
95,Throw indicate personal area. Should statement...,7
96,Main scene ball baby contain return your. Side...,9
97,Month process middle different successful. Ins...,8
98,Cultural call out. Understand production maint...,7


In [6]:
# fixed seed so the train/eval split is the same after every kernel restart
SPLIT_SEED: int = 42

# draw the training rows at random; DATA_TEST_FRAC of the data is held back
train_dataset: pandas.DataFrame = DATASET.sample(
    frac=1.0 - DATA_TEST_FRAC,
    random_state=SPLIT_SEED,
)
# everything that was not drawn for training is used for evaluation
eval_dataset: pandas.DataFrame = DATASET.loc[DATASET.index.difference(train_dataset.index)]

# both parts together must cover DATASET exactly once; this fails, for example,
# when the index contains duplicate labels and the label-based lookup drops rows
assert len(train_dataset) + len(eval_dataset) == len(DATASET), "train/eval split lost or duplicated rows"

len(train_dataset), len(eval_dataset)

(95, 5)

In [7]:
trainer = setfit.Trainer(
    model=setfit.SetFitModel.from_pretrained(
        MODEL_SLUG,
        # sorted list of native Python values: the label order no longer depends on
        # which class happens to appear first in DATASET, and the values are plain
        # Python objects instead of numpy scalars
        # NOTE(review): SetFit appears to map predicted class ids to this list by
        # position - confirm the order matches the integer class ids of your data
        labels=sorted(DATASET[CLASS_COL].unique().tolist()),
    ),
    args=TRAINER_ARGS,
    # preserve_index=False: after sampling, the pandas index is no longer a plain
    # range and would otherwise be carried along as an additional dataset column
    train_dataset=datasets.Dataset.from_pandas(train_dataset, preserve_index=False),
    eval_dataset=datasets.Dataset.from_pandas(eval_dataset, preserve_index=False),
    metric="accuracy",
    # rename the configured columns to the names "text" and "label"
    column_mapping={TEXT_COL: "text", CLASS_COL: "label"},
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/95 [00:00<?, ? examples/s]

In [8]:
# run the training configured via TRAINER_ARGS; the training and validation
# loss per epoch is reported in the cell output
trainer.train()

***** Running training *****
  Num unique pairs = 8038
  Batch size = 16
  Num epochs = 4


Epoch,Training Loss,Validation Loss
1,0.0015,0.089805
2,0.0008,0.078737
3,0.0006,0.076644
4,0.0005,0.077687


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [9]:
# score the trained model on eval_dataset with the metric passed to the trainer
# ("accuracy"); the result is displayed as a dict
# NOTE: the blueprint data is generated text with random digit labels, so the
# value shown here is not meaningful until real data is plugged in
trainer.evaluate()

***** Running evaluation *****


{'accuracy': 0.2}

In [None]:
# optional: upload the trained model under MODEL_UPLOAD_SLUG;
# skipped while MODEL_UPLOAD_SLUG is None (or an empty string)
if MODEL_UPLOAD_SLUG:
    trainer.push_to_hub(MODEL_UPLOAD_SLUG)