<a href="https://colab.research.google.com/github/fcf2/amp-bert/blob/main/Copy_of_AMP_BERT_Fine_tune_Abel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!pip install --upgrade accelerate

In [None]:
!pip install wandb

In [None]:
!pip install pandas scikit-learn

In [None]:
!pip install matplotlib

In [None]:
!pip3 install torch torchvision torchaudio

In [None]:
!pip install simpletransformers

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import re

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ok else torch.device("cpu")
print(device)

from transformers import AutoTokenizer, Trainer, TrainingArguments, BertForSequenceClassification, AdamW

In [None]:
print(torch.version.cuda)

In [None]:
# define a class for the AMP data that will correctly format the sequence information
# for fine-tuning with huggingface API
# the input dataframe columns must be formatted the same way as the given example

class amp_data():
    """Map-style dataset that tokenizes peptide sequences for the Hugging Face Trainer.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``aa_seq`` column (amino-acid sequences as strings) and
        an ``AMP`` column that can be cast to ``int`` (the class label).
    tokenizer_name : str
        Checkpoint name handed to ``AutoTokenizer.from_pretrained``.
    max_len : int
        Every sequence is truncated/padded to this many tokens.
    """

    def __init__(self, df, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        # Keep the frame that was passed in. Reading a notebook-level `df`
        # instead would make every instance wrap the same rows, no matter
        # which split the caller asked for.
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.max_len = max_len

        self.seqs, self.labels = self.get_seqs_labels()

    def get_seqs_labels(self):
        """Return ``(sequences, labels)`` as two parallel lists taken from ``self.df``."""
        seqs = list(self.df['aa_seq'])
        labels = list(self.df['AMP'].astype(int))
        return seqs, labels

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of tensors including ``labels``."""
        # Strip any whitespace, then put exactly one space between residues
        # so that each residue becomes its own token.
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_len)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample

In [None]:
# Load the full AMP table and shuffle its rows with a fixed seed,
# then wrap it in the amp_data dataset class.

data_url = './allAMP.csv'
df = pd.read_csv(data_url, index_col=0).sample(frac=1, random_state=0)
print(df.head(7))

train_dataset = amp_data(df)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the DataFrame from the CSV file and shuffle all rows (frac=1);
# random_state makes the shuffle reproducible.
data_url = './allAMP.csv'
df = pd.read_csv(data_url, index_col=0).sample(frac=1, random_state=0)

# Hold out 20% of the rows for evaluation; the other 80% is for training.
train_df, eval_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap both splits in amp_data so they can be fed to the language model.
train_dataset, eval_dataset = amp_data(train_df), amp_data(eval_df)

In [None]:
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [None]:
# Reload the dataset and shuffle it with a fixed seed.
data_url = './allAMP.csv'
df = pd.read_csv(data_url, index_col=0).sample(frac=1, random_state=0)
train_data = pd.DataFrame(df)

# Cross-validation setup: n shuffled folds with a fixed seed.
n = 5
kf = KFold(n_splits=n, shuffle=True, random_state=42)

# Container for the per-fold scores.
results = []

In [None]:
# define the necessary metrics for performance evaluation

def compute_metrics(pred):
    """Score one evaluation pass.

    `pred` must expose `label_ids` (true classes) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    Precision, recall and F1 use binary averaging.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')
    scores = {'accuracy': accuracy_score(y_true, y_hat)}
    scores['f1'] = prf[2]
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

In [None]:
# define the model initializing function for Trainer in huggingface

def model_init():
    """Return a new sequence classifier loaded from the 'Rostlab/prot_bert_bfd' checkpoint."""
    checkpoint = 'Rostlab/prot_bert_bfd'
    classifier = BertForSequenceClassification.from_pretrained(checkpoint)
    return classifier

In [None]:
# Weights & Biases settings, passed through environment variables:
#   WANDB_PROJECT   - project this run is logged under
#   WANDB_LOG_MODEL - save the trained model checkpoint to wandb
#   WANDB_WATCH     - "false" turns off watch so logging is faster
os.environ.update({
    "WANDB_PROJECT": "my-awesome-project",
    "WANDB_LOG_MODEL": "true",
    "WANDB_WATCH": "false",
})

In [None]:
#@title Default title text
# Hyper-parameters for fine-tuning the sequence classifier.
# Evaluation and checkpointing both run once per epoch, which is what
# load_best_model_at_end needs in order to pick a checkpoint.

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate = 2e-5,
    # One sequence per step, accumulated over 64 steps before each update.
    per_device_train_batch_size=1,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=100,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    gradient_accumulation_steps=64,
    # Mixed precision is only requested when a CUDA device is present, so the
    # notebook's CPU fallback does not trip over a hard-coded fp16=True.
    fp16=torch.cuda.is_available(),
    fp16_opt_level="O2",
    run_name="AMP-BERT",
    seed=0,
    load_best_model_at_end = True
)

In [None]:
# Start from an empty score list so that re-running this cell does not mix in
# scores left over from a previous run.
results = []

for train_index, val_index in kf.split(train_data):
    # Row subsets of the DataFrame for this fold.
    train_df = train_data.iloc[train_index]
    val_df = train_data.iloc[val_index]
    train_dataset = amp_data(train_df)
    eval_dataset = amp_data(val_df)

    # The Trainer obtains its model through model_init.
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Validate on the held-out fold and record its accuracy.
    result = trainer.evaluate(eval_dataset)
    print(result['eval_accuracy'])
    results.append(result['eval_accuracy'])


print("results", results)
# The collected scores are accuracies, so the mean is labelled as such.
print(f"Mean accuracy: {sum(results) / len(results)}")

In [None]:
#trainer.train()

In [None]:
# Evaluate the current trainer on eval_dataset.
eval_results = trainer.evaluate(eval_dataset=eval_dataset)

# Get the model's loss on the evaluation set and print it with two decimals.
eval_loss = eval_results['eval_loss']
print(f"Loss en el conjunto de evaluación: {eval_loss:.2f}")

In [None]:
# Run the current trainer on train_dataset; trainer.predict is unpacked into
# predictions, label ids and a metrics object, and the metrics are displayed.

predictions, label_ids, metrics = trainer.predict(train_dataset)
metrics

In [None]:
# Same as the previous cell, but on eval_dataset. This overwrites
# predictions, label_ids and metrics.
predictions, label_ids, metrics = trainer.predict(eval_dataset)
metrics

In [None]:
# Save the model held by the current trainer, if desired.
# The directory written here is the one the next cell loads the classifier from.

trainer.save_model('./Fine-tuned_model_Gaby_dados_crossValidation/')

In [None]:
# Load what is needed to predict AMP/non-AMP for individual sequences.

# The tokenizer comes from the base checkpoint; the classifier weights come
# from the directory the fine-tuned model was saved to.
tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False)
model = BertForSequenceClassification.from_pretrained("./Fine-tuned_model_Gaby_dados_crossValidation")

--------

In [None]:
# predict AMP/non-AMP for a single example (default ex. is from external test data: DRAMP00126)

#@markdown **Input peptide sequence (upper case only)**
input_seq = 'GILSDFMGMVA' #@param {type:"string"}
# Put one space between residues and map U/Z/O/B to X.
input_seq_spaced = re.sub(r'[UZOB]', 'X', ' '.join(input_seq))
input_seq_tok = tokenizer(input_seq_spaced, return_tensors='pt')

# Inference only, so no gradients need to be tracked.
with torch.no_grad():
    output = model(**input_seq_tok)
logits = output[0]

# extract AMP class probability and make binary prediction.
# compute_metrics picks the class by argmax over the two logits, so the
# probability is taken from a softmax across them: p(AMP) > 0.5 then matches
# "argmax == 1". A per-logit sigmoid would ignore the non-AMP logit and could
# disagree with the metric used during training.
y_prob = torch.softmax(logits, dim=-1)[:, 1].detach().numpy()
y_pred = y_prob > 0.5
input_class = 'AMP' if y_pred[0] else 'non-AMP'
print(y_pred)
print(y_prob)
print('Input peptide sequence: ' + input_seq)
print('Class prediction: ' + input_class)

------------

## Masked language model (MLM) fine-tuning

In [None]:
#!apt install git-lfs

In [None]:
# Load the CD-HIT AMP table, shuffle its rows with a fixed seed,
# and wrap it in the amp_data dataset class.

data_url2 = './AMP_cdhit.csv'
df = pd.read_csv(data_url2, index_col=0).sample(frac=1, random_state=0)
print(df.head(7))

train_dataset2 = amp_data(df)

In [None]:
# Dataset
data_url = './AMP_cdhit.csv'
df = pd.read_csv(data_url, index_col=0)
train_data = pd.DataFrame(df)

# prepare cross validation
n = 5
kf = KFold(n_splits=n, shuffle=True, random_state=42)

results = []

# Only one split is used below: the index arrays of the LAST of the n folds.
# NOTE(review): confirm that a single split (rather than a full
# cross-validation) is what is wanted here.
train_index, val_index = list(kf.split(train_data))[-1]
train_df = train_data.iloc[train_index]
val_df = train_data.iloc[val_index]

train_dataset2 = amp_data(train_df)
eval_dataset2 = amp_data(val_df)

In [None]:
from transformers import AutoModelForMaskedLM
model2 = AutoModelForMaskedLM.from_pretrained('Rostlab/prot_bert_bfd')

In [None]:
from transformers import BertTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False)

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
model_checkpoint = 'Rostlab/prot_bert_bfd'

In [None]:
# Hyper-parameters for the masked-language-model run.
# Neither evaluation nor checkpoint saving happens during training.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate = 2e-5,
    per_device_train_batch_size=2,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=100,
    do_train=True,
    do_eval=True,
    evaluation_strategy="no",
    save_strategy='no',
    gradient_accumulation_steps=64,
    # Mixed precision is only requested when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    fp16_opt_level="O2",
    run_name="AMP-BERT_MASK",
    seed=0,
    # With evaluation and saving both set to "no" there is never a best
    # checkpoint to reload, so the flag is switched off rather than left as a
    # misleading no-op.
    load_best_model_at_end = False
)

In [None]:
# Trainer for the masked-language-model objective.
# compute_metrics is deliberately not passed: it scores argmax predictions
# with binary precision/recall/F1, which does not fit the token-level
# predictions of a masked LM.
trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=train_dataset2,
    eval_dataset=eval_dataset2,
    data_collator=data_collator,
)


In [None]:
trainer2.train()

In [None]:
# save the model, if desired

trainer2.save_model('./Fine-tuned_MASK3_model_Gaby_dataset/')

In [None]:
from transformers import AutoModelForMaskedLM

model_mask = AutoModelForMaskedLM.from_pretrained("./Fine-tuned_MASK3_model_Gaby_dataset")

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
# Tokenizer of the base checkpoint (this rebinds the notebook-level `tokenizer`).
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False )
#model_mask = BertForMaskedLM.from_pretrained("Rostlab/prot_bert_bfd")
# Fill-mask pipeline on model_mask; the query masks one position of a
# space-separated peptide.
unmasker = pipeline('fill-mask', model=model_mask, tokenizer=tokenizer)
unmasker('G Q A D [MASK] I L K A L G')

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
#tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False )
# Same fill-mask query against the base 'Rostlab/prot_bert_bfd' checkpoint, for comparison.
# Uses whatever `tokenizer` is currently bound at notebook level.
model_mask2 = BertForMaskedLM.from_pretrained("Rostlab/prot_bert_bfd")
unmasker = pipeline('fill-mask', model=model_mask2, tokenizer=tokenizer)
unmasker('G Q A D [MASK] I L K A L G')

-----

Predict multiple peptides

In [None]:
# Read the whole FASTA file first, so the file handle is closed before any
# model inference starts.
with open("new_desing8.fasta", "r") as file:
    file_contents = file.read()

# Every record starts with ">"; anything before the first ">" is discarded.
sequences = file_contents.split(">")[1:]

# Inference only, so no gradients need to be tracked.
with torch.no_grad():
    for record in sequences:
        # The first line is the record header (sequence ID); the remaining
        # lines hold the peptide sequence, possibly wrapped over several lines.
        seq_id, *seq_lines = record.split("\n")
        seq = "".join(line.strip() for line in seq_lines)
        if not seq:
            # Header without residues (e.g. a stray ">"): nothing to score.
            continue

        # One space between residues; map U/Z/O/B to X.
        seq_spaced = re.sub(r'[UZOB]', 'X', ' '.join(seq))
        seq_tok = tokenizer(seq_spaced, return_tensors='pt')

        output = model(**seq_tok)
        logits = output[0]

        # extract AMP class probability and make binary prediction.
        # A softmax across the two logits keeps p(AMP) > 0.5 equivalent to the
        # argmax rule used by compute_metrics.
        y_prob = torch.softmax(logits, dim=-1)[:, 1].detach().numpy()
        y_pred = y_prob > 0.5
        input_class = 'AMP' if y_pred[0] else 'non-AMP'
        print(y_prob)



-------------------

In [None]:
!pip install lime

In [None]:
import numpy as np
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Names of the classifier's two output columns, in index order.
# Labels come from the integer 'AMP' column and the prediction cells read
# column 1 as the AMP probability, so index 0 is non-AMP and index 1 is AMP.
class_names = ['non-AMP', 'AMP']

In [None]:
def predictor(texts, batch_size=32):
    """Return class probabilities for a list of space-separated sequences.

    Parameters
    ----------
    texts : str or sequence of str
        Input texts; a single string is treated as a batch of one.
    batch_size : int
        Number of texts sent through the model per forward pass. LIME asks for
        thousands of perturbed texts at once, so they are scored in chunks to
        keep memory bounded.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(texts), n_classes); each row sums to 1.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    # Run on whatever device the model currently lives on.
    target_device = next(model.parameters()).device
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoded = tokenizer(texts[start:start + batch_size], return_tensors="pt", padding=True)
            encoded = {key: val.to(target_device) for key, val in encoded.items()}
            logits = model(**encoded).logits
            # Explicit dim: softmax over the class axis.
            chunks.append(F.softmax(logits, dim=-1).cpu().numpy())
    probas = np.concatenate(chunks, axis=0)
    return probas

# bow=False makes LIME treat every position as its own feature. In the default
# bag-of-words mode, repeated residues (G, L and K each occur twice below)
# would be merged into a single feature.
explainer = LimeTextExplainer(class_names=class_names, bow=False)

input_seq_spaced = 'G L F S T V K G I L K'
# One feature per residue position (11 for this sequence).
exp = explainer.explain_instance(
    input_seq_spaced,
    predictor,
    num_features=len(input_seq_spaced.split()),
)
exp.show_in_notebook(text=input_seq_spaced)

--------------

In [None]:
input_seq_spaced = 'G L F S T V K G I L K'

In [None]:
# Imported here so this cell does not depend on a later cell having been run.
from transformers import TextClassificationPipeline

# Classification pipeline that returns the score of every class for each input.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
print(pipe([input_seq_spaced]))

In [None]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
def _pipe_predict_proba(texts):
    """Adapt `pipe` to the (n_samples, n_classes) probability array LIME needs.

    `pipe` yields, per text, a list of {'label', 'score'} dicts. The scores
    are ordered by class id using model.config.label2id so that column i
    always belongs to class i.
    """
    label2id = model.config.label2id
    rows = pipe(list(texts))
    return np.array([
        [entry["score"] for entry in sorted(row, key=lambda entry: label2id[entry["label"]])]
        for row in rows
    ])

exp = explainer.explain_instance(input_seq_spaced, _pipe_predict_proba, num_features=6)

In [None]:
exp.show_in_notebook(text=input_seq_spaced)

### Applying transformers interpret

---------------------

----------------

**SHAP**

In [None]:
#!pip install --quiet shap==0.39
!pip install shap
!pip install xformers

In [None]:
!pip install --upgrade Numba

In [None]:
!pip install NumPy==1.23

In [None]:
import shap

In [None]:

import transformers

from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TextClassificationPipeline)

In [None]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)


In [None]:
def score_and_visualize(text):
  """Print the pipeline scores for `text`, then render a SHAP text plot for it."""
  print(pipe([text])[0])

  # Build an explainer around the pipeline and explain the same single text.
  shap_values = shap.Explainer(pipe)([text])
  shap.plots.text(shap_values)


In [None]:
score_and_visualize(input_seq_spaced)

-----

In [None]:
# Move the classifier to the GPU when one is present; otherwise keep it on the
# CPU instead of failing. The module is moved in place, so `model` and
# `model_gpu` refer to the same object.
model_gpu = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
# Label names ordered by their class id.
labels = sorted(model.config.label2id, key=model.config.label2id.get)

def model_prediction_gpu(x):
    """Score a batch of texts with `model_gpu` and return per-class log-odds.

    Parameters
    ----------
    x : iterable of str
        Texts to score; each is encoded with the notebook-level `tokenizer`,
        padded/truncated to 512 tokens.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(x), n_classes) holding logit(softmax(model output)).
    """
    # Follow the model to whichever device it is on, instead of assuming CUDA.
    target_device = next(model_gpu.parameters()).device
    token_ids = [
        tokenizer.encode(v, padding='max_length', max_length=512, truncation=True)
        for v in x
    ]
    tv = torch.tensor(token_ids, device=target_device)

    # Mask out padding using the tokenizer's own pad id (falls back to 0 when
    # the tokenizer does not define one).
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    attention_mask = (tv != pad_id).type(torch.int64)

    # Inference only, so no gradients need to be tracked.
    with torch.no_grad():
        outputs = model_gpu(tv, attention_mask=attention_mask)[0]
    scores = torch.nn.Softmax(dim=-1)(outputs)
    val = torch.logit(scores).detach().cpu().numpy()

    return val

In [None]:
# Build a SHAP explainer around the prediction function, passing the tokenizer
# as the second argument and the model's label names as output names.
gpu_explainer = shap.Explainer(model_prediction_gpu, tokenizer, output_names=labels)

# Explain the example sequence and render the text plot.
shap_values = gpu_explainer([input_seq_spaced])
output = shap.plots.text(shap_values)

--------

In [None]:
!pip install transformers_interpret

In [None]:
from transformers_interpret import MultiLabelClassificationExplainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel

In [None]:
# Build a transformers_interpret explainer from the loaded model and tokenizer,
# then compute word attributions for the example sequence.
cls_explainer = MultiLabelClassificationExplainer(model, tokenizer)

word_attributions = cls_explainer(input_seq_spaced)

# Display the attributions as the cell output.
word_attributions

