# Finetuning the ESM2 token classification model

### 0. Libraries & path

In [6]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

In [4]:
# absolute path to project
absolute_path = '/Users/dimi/Documents/GitHub/PhageDEPOdetection/'

### 1. Functions

In [2]:
def get_labels(df , label = 1) :
    labels_df = []
    for _,row in df.iterrows():
        info = row["Boundaries"]
        seq_length = len(row["Full_seq"])
        if info == "Negative" :
            labels = [label] * seq_length
            labels_df.append(labels)
        elif info == "full_protein" or info == "full" :
            labels = [label] * seq_length
            labels_df.append(labels)
        elif info.count(":") > 0 : 
            start = int(info.split(":")[0])
            end = int(info.split(":")[1])
            labels = [0 if i < start or i >= end else label for i in range(seq_length)]
            labels_df.append(labels)
        else :
            start = int(info.split("_")[-2])
            end = int(info.split("_")[-1])
            labels = [0 if i < start or i >= end else label for i in range(seq_length)]
            labels_df.append(labels)
    return labels_df

In [3]:
def compute_metrics(p):
    predictions, labels = p
    labels = list(labels)
    predictions = list(np.argmax(predictions, axis=2))
    # Remove ignored index (special tokens)
    #true_predictions = [
    #    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    #    for prediction, label in zip(predictions, labels)]
    #true_labels = [
    #    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    #    for prediction, label in zip(predictions, labels)]
    return {"accuracy": accuracy_score(labels, predictions),
            "precision": precision_score(labels, predictions),
            "recall": recall_score(labels, predictions),
            "f1": f1_score(labels, predictions)}

### 2. Code to finetune

In [7]:
# load data
df_depo = pd.read_csv(absolute_path+'data/Phagedepo.Dataset.2007.tsv' , sep = '\t' , header = 0)

In [12]:
# assign and get the labels
df_beta_helix = df_depo[df_depo["Fold"] == "right-handed beta-helix"]
df_beta_prope = df_depo[df_depo["Fold"] == "6-bladed beta-propeller"]
df_beta_triple =  df_depo[df_depo["Fold"] == "triple-helix"]
df_negative = df_depo[df_depo["Fold"] == "Negative"]

labels_beta_helix = get_labels(df_beta_helix , label = 1)
seq_beta_helix = df_beta_helix["Full_seq"].to_list()
labels_beta_propeller = get_labels(df_beta_prope , label = 2)
seq_beta_propeller = df_beta_prope["Full_seq"].to_list()
labels_triple_helix = get_labels(df_beta_triple , label = 1)
seq_triple_helix = df_beta_triple["Full_seq"].to_list()
labels_negative = get_labels(df_negative , label = 0)
seq_negative = df_negative["Full_seq"].to_list()

# final input data
sequences = seq_beta_helix + seq_beta_propeller + seq_triple_helix + seq_negative
labels = labels_beta_helix + labels_beta_propeller + labels_triple_helix + labels_negative

In [15]:
# train-test split
train_sequences, test_sequences, train_labels, test_labels = train_test_split(sequences, labels, test_size=0.2, random_state = 243)
train_esm2 , train_CNV , esm2_labels , CNV_labels = train_test_split(train_sequences, train_labels, test_size=0.25, random_state = 243)

In [20]:
# initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
data_collator = DataCollatorForTokenClassification(tokenizer)
model = AutoModelForTokenClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=3)

Some weights of the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing EsmForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing EsmForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmForTokenClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task 

In [22]:
# tokenize the data and make a dataset from it
train_tokenized = tokenizer(train_esm2, truncation=True)
test_tokenized = tokenizer(test_sequences, truncation=True)

train_dataset = Dataset.from_dict(train_tokenized)
test_dataset = Dataset.from_dict(test_tokenized)
train_dataset = train_dataset.add_column("labels", esm2_labels)
test_dataset = test_dataset.add_column("labels", test_labels)

In [29]:
# set the training arguments
training_args = TrainingArguments(
    output_dir=absolute_path+'data/finetune',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4, #16
    per_device_eval_batch_size=8, #16
    num_train_epochs=1,
    weight_decay=0.01,
    #load_best_model_at_end=True,
    metric_for_best_model='f1',
)

In [30]:
# set the trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

***** Running training *****
  Num examples = 1373
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 344
  Number of trainable parameters = 7738364


Epoch,Training Loss,Validation Loss


ValueError: expected sequence of length 1024 at dim 1 (got 1113)

In [None]:
# save the model & tokenizer
trainer.save_model('data/finetune')
tokenizer.save_pretrained('data/finetune')