In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

# Set <model_checkpoint> to the ESM-2 checkpoint to fine-tune: either a path to
# a checkpoint or the name of a pre-trained model.
model_checkpoint = "<model_checkpoint>"

# Load the tokenizer and the token-classification model from the same
# checkpoint; the model is configured for 3 label classes.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
)


In [None]:
import pandas as pd

# Read the training CSV (replace <path_to_dataset> with its path) and expose the
# label column (replace <column_name> with its current name) as "labels".
dataset = pd.read_csv("<path_to_dataset>").rename(
    columns={"<column_name>": "labels"}
)

# Tokenize every entry of the "input_text" column in one call: sequences are cut
# at 512 tokens, padded to a common length, and returned as PyTorch tensors.
tokenized_inputs = tokenizer(
    dataset["input_text"].tolist(),
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Plain Python list holding the "labels" column values, in row order.
labels = dataset["labels"].tolist()

In [None]:
# Fine-tuning hyperparameters.
#
# Periodic evaluation during training is intentionally left at its default
# (disabled): no evaluation dataset exists at this point in the notebook, and
# asking for step-wise evaluation without one makes the Trainer fail. To enable
# it, pass `eval_dataset=...` to `Trainer` below and set the evaluation strategy
# (`eval_strategy` in recent transformers releases, formerly
# `evaluation_strategy`) together with `eval_steps`.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=2e-5,
)

# Guard against stale kernel state: labels and tokenized rows must line up 1:1.
assert len(labels) == len(tokenized_inputs["input_ids"]), (
    "labels and tokenized_inputs have different numbers of examples"
)

# `Trainer` has no `train_labels` argument and needs a map-style dataset
# (`len()` = number of examples, integer indexing -> one example as a dict).
# The tokenizer output is a mapping of whole-batch tensors instead, so split it
# into one dict per example and attach that example's label under "labels".
# NOTE(review): the token-classification model computes its loss per token, so
# each entry of `labels` presumably has to be a sequence of integer class ids
# as long as the padded `input_ids` row (commonly -100 on padding/special
# tokens) -- confirm how the CSV label column is encoded and parsed.
train_dataset = [
    {
        **{name: values[row] for name, values in tokenized_inputs.items()},
        "labels": label,
    }
    for row, label in enumerate(labels)
]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [None]:
# Start training with the model, arguments, and data the `trainer` was
# constructed with. As the last expression in the cell, whatever `train()`
# returns is shown as the cell's output.
trainer.train()

In [None]:
# Replace <path_to_validation_set> with the path to your validation set file,
# and <column_name> with the name of its label column.
validation_set = pd.read_csv("<path_to_validation_set>")
validation_set = validation_set.rename(columns={"<column_name>": "labels"})

# Tokenize the validation set inputs with the same settings as the training data.
tokenized_validation_inputs = tokenizer(
    list(validation_set["input_text"]),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Label column values as a plain list, in row order.
validation_labels = list(validation_set["labels"])

# `Trainer.evaluate` accepts ONE dataset whose examples already contain their
# labels; its second positional parameter is `ignore_keys`, not the labels.
# Build one dict per example from the batched tokenizer output and attach the
# matching label under "labels" so the evaluation loss can be computed.
# NOTE(review): for a token-classification model each label entry presumably
# has to be a per-token sequence of integer class ids matching the padded
# `input_ids` length -- confirm the label column format.
validation_dataset = [
    {
        **{name: values[row] for name, values in tokenized_validation_inputs.items()},
        "labels": label,
    }
    for row, label in enumerate(validation_labels)
]

# Evaluate the model on the validation set.
eval_results = trainer.evaluate(eval_dataset=validation_dataset)
print(eval_results)

In [None]:
# Persist the fine-tuned model: set <path_to_saved_model> to the location it
# should be written to.
trainer.save_model(output_dir="<path_to_saved_model>")