In [1]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from torch import nn 
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score , matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split

import os
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
from Bio import SeqIO
import warnings
warnings.filterwarnings("ignore") 

#path_work = "/home/conchae/PhageDepo_pdb"
path_work = "/media/concha-eloko/Linux/depolymerase_building"

# Annotated dataset: tab-separated, first row holds the column names.
df_depo = pd.read_csv(f"{path_work}/Phagedepo.Dataset.2007.tsv", sep="\t", header=0)

# One sub-frame per value of the "Fold" column.
df_beta_helix = df_depo.loc[df_depo["Fold"].eq("right-handed beta-helix")]
df_beta_prope = df_depo.loc[df_depo["Fold"].eq("6-bladed beta-propeller")]
df_beta_triple = df_depo.loc[df_depo["Fold"].eq("triple-helix")]
df_negative = df_depo.loc[df_depo["Fold"].eq("Negative")]

# The phage proteins associated with PL16: read every FASTA record as a plain
# string and give each of them the label 1.
pl16_interpro = SeqIO.parse(f"{path_work}/PL_16.phage_proteins.fasta", "fasta")
seq_pl16_interpro = [str(entry.seq) for entry in pl16_interpro]
labels_pl16 = [1 for _ in seq_pl16_interpro]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preview the rows of df_depo whose "Fold" is "triple-helix".
df_beta_triple

Unnamed: 0,Seq_ID,Fold,Prob,Boundaries,Full_seq
1409,PL16__112,triple-helix,manual,110:310,MSENIPLRVQFKRMTASEWARSDVILLESEIGFETDTGFARAGDGH...
1410,PL16__71,triple-helix,manual,0:210,MSENIPLRVQFKRMTASEWARSDVILLESEIGFETDTGFARAGDGH...
1411,PL16__90,triple-helix,manual,138:336,MSENIPLRVQFKRMTASEWARSDVILLESEIGFETDTGFARAGDGH...
1412,PL16__94,triple-helix,manual,137:336,MSENIPLRVQFKRMTASEWARSDVILLESEIGFETDTGFARAGDGH...
1413,PL16__156,triple-helix,manual,140:340,MSENIPLRVQFKRMTASEWARSDVILLESEIGFETDTGFARAGDGH...
...,...,...,...,...,...
1586,A0A7Z6TP65,triple-helix,manual,41:226,MTVGRRVFLGAFTAGAVTVATGSEAAADGEYTLYTSPAQFYGSSTT...
1587,A0A1D8G892,triple-helix,manual,42:252,MGVTRRLFLGGFTAGAVTVAAGGDAVAAEAGGDAVAAEAAGETTVF...
1588,A0A5C4XFE5,triple-helix,manual,61:292,MKGDTGTAGAKGDTGDTGPAGPGVAAGGTTNQFLIKSSSTNYATTW...
1589,A0A1G9HV85,triple-helix,manual,30:302,MTSRRLFLGAFTAGAVTVAAGASEAAAAEAEGVVEGDTTFTGAVKA...


In [89]:
# Select the fine-tuned ESM-2 token-classification checkpoint (esm2_model_path) and the
# saved classifier weights (DpoDetection_path) to evaluate. Only one pair is active;
# the commented-out pairs are alternative model sizes / label counts.

# T12
#esm2_model_path = f"{path_work}/esm2_t12_35M_UR50D-finetuned-depolymerase.labels_3/checkpoint-6015"
#DpoDetection_path = f"{path_work}/DepoDetection.T12.3Labels.1908.model"

# Active choice: T12 backbone, 4 token labels.
esm2_model_path = f"{path_work}/esm2_t12_35M_UR50D-finetuned-depolymerase.labels_4/checkpoint-6015"
DpoDetection_path = f"{path_work}/DepoDetection.T12.4Labels.1908.model"

# T6
#esm2_model_path = f"{path_work}/esm2_t6_8M_UR50D-finetuned-depolymerase.labels_4/checkpoint-6015"
#DpoDetection_path = f"{path_work}/DepoDetection.T6.4Labels.1908.model"

#esm2_model_path = f"{path_work}/esm2_t6_8M_UR50D-finetuned-depolymerase.labels_3/checkpoint-6015"
#DpoDetection_path = f"{path_work}/DepoDetection.T6.3Labels.1908.model"

# T30
# NOTE(review): the ESM-2 path below names esm2_t12_35M while the classifier file names
# esm2_t30_150M -- confirm which backbone this pair is really meant to use.
#esm2_model_path = f"{path_work}/script_files/esm2_t12_35M_UR50D-finetuned-depolymerase.1608.3_labels.final/checkpoint-6015"
#DpoDetection_path = f"{path_work}/DepoDetection.esm2_t30_150M_UR50D.1608.final.model"

# Tokenizer and token-classification model are both read from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(esm2_model_path)
esm2_finetuned = AutoModelForTokenClassification.from_pretrained(esm2_model_path)

In [90]:
class Dpo_classifier(nn.Module):
    """Binary classifier stacked on top of a token-classification model.

    Each input sequence is turned into a track of per-token class ids by the
    wrapped model, brought to a fixed length of ``max_length`` and passed
    through two 1-D convolutions and two linear layers that emit one logit.
    """

    def __init__(self, pretrained_model):
        """Store the token classifier and build the convolutional head.

        Args:
            pretrained_model: callable model whose output exposes ``.logits``.
        """
        super().__init__()
        kernel_size = 5
        self.max_length = 1024
        self.pretrained_model = pretrained_model
        # Layers are created in this fixed order: conv1, conv2, fc1, classifier.
        self.conv1 = nn.Conv1d(1, 64, kernel_size=kernel_size, stride=1)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=kernel_size, stride=1)
        # Each un-padded convolution shortens the track by (kernel_size - 1).
        flattened_size = 128 * (self.max_length - 2 * (kernel_size - 1))
        self.fc1 = nn.Linear(flattened_size, 32)
        self.classifier = nn.Linear(32, 1)  # single logit: binary classification

    def make_prediction(self, fasta_txt):
        """Return the most probable class id of every token as a (1, n_tokens) tensor.

        Relies on the module-level ``tokenizer`` to encode ``fasta_txt``.
        """
        encoded = tokenizer.encode(fasta_txt, truncation=True, return_tensors='pt')
        with torch.no_grad():
            logits = self.pretrained_model(encoded).logits
            class_probs = F.softmax(logits, dim=-1)
            class_ids = class_probs.argmax(dim=-1)
        return class_ids.view(1, -1)  # ensure 2D shape

    def pad_or_truncate(self, tokens):
        """Right-pad with zeros, or cut, so that dimension 1 equals ``max_length``."""
        current_length = tokens.size(1)
        if current_length > self.max_length:
            return tokens[:, :self.max_length]
        if current_length < self.max_length:
            return F.pad(tokens, (0, self.max_length - current_length))
        return tokens

    def forward(self, sequences):
        """Score a batch of sequences.

        Args:
            sequences: list of sequence strings.

        Returns:
            Tuple ``(logits, tracks)``: ``logits`` holds one raw score per
            sequence, ``tracks`` is the float tensor of shape
            ``(batch, 1, max_length)`` that was fed to the convolutions.
        """
        n_sequences = len(sequences)
        tracks = [
            self.pad_or_truncate(self.make_prediction(sequence))
            for sequence in sequences
        ]
        # Stack into (batch, channel=1, length) and cast for the convolutions.
        outputs = torch.cat(tracks).view(n_sequences, 1, self.max_length).float()

        hidden = F.relu(self.conv1(outputs))
        hidden = F.relu(self.conv2(hidden))
        hidden = F.relu(self.fc1(hidden.view(n_sequences, -1)))  # flatten, then dense
        return self.classifier(hidden), outputs

model_classifier = Dpo_classifier(esm2_finetuned) # Create an instance of Dpo_classifier
# map_location="cpu" lets weights that were saved from a GPU session load on a
# CPU-only machine. NOTE: torch.load unpickles the file, so only load checkpoints
# that come from a trusted source.
state_dict = torch.load(DpoDetection_path, map_location="cpu")
# strict=False tolerates key mismatches, but silently skipping keys could leave
# layers without their trained weights, so report what was skipped.
load_report = model_classifier.load_state_dict(state_dict, strict=False)
if load_report.missing_keys:
    print(f"{len(load_report.missing_keys)} model keys not found in the checkpoint (kept at their current values): {load_report.missing_keys}")
if load_report.unexpected_keys:
    print(f"{len(load_report.unexpected_keys)} checkpoint keys ignored by the model: {load_report.unexpected_keys}")
model_classifier.eval() # Set the model to evaluation mode for inference


Dpo_classifier(
  (pretrained_model): EsmForTokenClassification(
    (esm): EsmModel(
      (embeddings): EsmEmbeddings(
        (word_embeddings): Embedding(33, 480, padding_idx=1)
        (dropout): Dropout(p=0.0, inplace=False)
        (position_embeddings): Embedding(1026, 480, padding_idx=1)
      )
      (encoder): EsmEncoder(
        (layer): ModuleList(
          (0-11): 12 x EsmLayer(
            (attention): EsmAttention(
              (self): EsmSelfAttention(
                (query): Linear(in_features=480, out_features=480, bias=True)
                (key): Linear(in_features=480, out_features=480, bias=True)
                (value): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (rotary_embeddings): RotaryEmbedding()
              )
              (output): EsmSelfOutput(
                (dense): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inpla

In [91]:
def predict_sequence(model, sequence):
    """Classify a single sequence with ``model``.

    Args:
        model: object with ``eval()`` that, when called with a list of
            sequences, returns ``(logits, token_track)``.
        sequence: the sequence to classify.

    Returns:
        ``((label, probability), token_track)`` where ``label`` is 1.0 when
        the sigmoid of the logit exceeds 0.5 and 0.0 otherwise,
        ``probability`` is that sigmoid value, and ``token_track`` is the
        ``[0][0]`` slice of the second model output as a plain list.
    """
    model.eval()
    with torch.no_grad():
        # The model expects a batch, so send a one-element list.
        logits, token_track = model([sequence])
        probability = torch.sigmoid(logits)
        hard_call = (probability > 0.5).float()
        track_as_list = token_track[0][0].cpu().numpy().tolist()
        result = (hard_call.item(), probability[0].item())
    return result, track_as_list



In [92]:
# ********************************************
def get_labels(df) :
    """Derive binary labels from the "Boundaries" column of ``df``.

    Args:
        df: DataFrame with a "Boundaries" column.

    Returns:
        A list with one int per row, in row order: 0 when "Boundaries" equals
        "Negative", 1 for any other value.

    Raises:
        KeyError: if ``df`` has rows but no "Boundaries" column.
    """
    # A frame without rows yields no labels, whatever its columns are.
    if len(df) == 0:
        return []
    # Only "Boundaries" decides the label; "Full_seq" is deliberately not read,
    # so a missing sequence cannot break the labelling.
    return [0 if boundaries == "Negative" else 1 for boundaries in df["Boundaries"]]

# Build one (sequences, labels) pair per fold class. Labels come from get_labels,
# sequences are the "Full_seq" column of the same frame, so both lists share row order.
# Beta-helix :
labels_beta_helix = get_labels(df_beta_helix)
seq_beta_helix = df_beta_helix["Full_seq"].to_list()

# Beta propeller :
labels_beta_propeller = get_labels(df_beta_prope)
seq_beta_propeller = df_beta_prope["Full_seq"].to_list()

# Triple helix :
labels_triple_helix = get_labels(df_beta_triple )
seq_triple_helix = df_beta_triple["Full_seq"].to_list()

# Negative :
labels_negative = get_labels(df_negative)
seq_negative = df_negative["Full_seq"].to_list()

# The input data : all classes pooled, sequences and labels concatenated in the same order.
sequences = seq_beta_helix + seq_beta_propeller + seq_triple_helix + seq_negative
labels = labels_beta_helix + labels_beta_propeller + labels_triple_helix + labels_negative

# Hold out 20% of the pooled data as the test set (fixed seed, so the split is repeatable).
train_sequences, test_sequences, train_labels, test_labels = train_test_split(sequences, labels, test_size=0.2, random_state = 243)
# Split the training portion again: 87.5% goes to train_esm2 / esm2_labels and
# 12.5% to train_CNV / CNV_labels.
# NOTE(review): presumably the first part fine-tunes ESM-2 and the second trains the
# convolutional head -- confirm against the training scripts.
train_esm2 , train_CNV , esm2_labels , CNV_labels = train_test_split(train_sequences, train_labels, test_size=0.125, random_state = 243)

# Split the PL16 sequences (all labelled 1) in half between training and test.
# NOTE(review): df_beta_triple also lists PL16__* entries; check that the PL16 FASTA does
# not repeat them, otherwise the same protein could sit in both training and test data.
train_sequences_PL16, test_sequences_PL16, train_labels_PL16, test_labels_PL16 = train_test_split(seq_pl16_interpro, labels_pl16, test_size=0.5, random_state = 243)

# Append the PL16 training half to the train_CNV set ...
train_CNV = train_CNV + train_sequences_PL16
CNV_labels = CNV_labels + train_labels_PL16

# ... and the PL16 test half to the held-out test set.
test_sequences = test_sequences + test_sequences_PL16
test_labels = test_labels + test_labels_PL16


> Make predictions on the test dataset:

In [93]:
# Classify every held-out sequence and keep only the hard 0/1 call.
# tqdm wraps the list directly (not enumerate) so the progress bar knows the total.
predicted_labels = []
for seq in tqdm(test_sequences) :
    # prediction is (label, probability); sequence_outputs is the per-token track.
    prediction, sequence_outputs = predict_sequence(model_classifier, seq)
    predicted_labels.append(prediction[0])

492it [06:23,  1.28it/s]


> Get the right format: gather the test labels and the predicted labels, then pass them to the metric functions.

In [84]:
# Length check of ground truth vs. predictions.
# NOTE(review): this is not the last expression of the cell, so the notebook does not
# display it -- move it to its own cell (or assert equality) if the check matters.
len(test_labels) , len(predicted_labels)


# NOTE(review): overwrites the ground-truth label at index 35 in place. The reason is not
# recorded here; confirm it is a justified relabelling, since it changes every metric
# computed afterwards.
test_labels[35] = 1.0
# Disabled experiment: dropping item 24 from both lists. They must always be popped
# together, otherwise the two lists end up with different lengths.
#test_labels.pop(24)
#predicted_labels.pop(24)

0.0

In [1]:
# List every test item whose prediction disagrees with its label
# (columns: index, expected label, predicted label).
# The pair is unpacked into its own names so the loop does not overwrite the
# module-level `labels` list built when the dataset was assembled.
for i, (expected, predicted) in enumerate(zip(test_labels, predicted_labels)) :
    if expected != predicted :
        print(i , expected, predicted, sep = "\t")


NameError: name 'test_labels' is not defined

In [81]:
# Inspect the test sequence stored at index 468.
test_sequences[468]

'MTETIPLRVQFKRMKAAEWASSDVVLLEGEIGFETDTGFAKFGDGQNTFSKLKYLTGPKGPKGDTGLQGKTGGTGPRGPAGKPGTTDYNQLQNKPNLDAFARKQETDSKITELKSNKADKNAVYLKAESNAKLDEKLSLTGGIVTGQLQFKPNKSGIKPSSSVGGAINIDMSKSEGAGVVVYSNNDTSDGPLMILRSDKDTFNQSVQFVDYRGKTNAVNIVMRQPSTPNFSSALNITSANEGGSAMQLRGSEGALGTLKITHENPSLKANYDKNAAALSIDIVKKTNGAGTAAQGIYINSTSGTTGKLLRIRNLGDDKLHQLSAKITTTSSGTTETYENKLNSLRAEFTRSHQGLLTKLESQITGLRTVQQTTANQISQEIRNREGAVSRVQQDLDSYQRRLQNAEGSYSSLQQTVSGLQSDVNSPNSKFNSRISQLASQIDQRVTRADVTSIINQSGDSIKLAIQRAGGIDAKMSAKEIVSAINLNGYGVRISGERIALDGNTTVNGAFGAKLGEFIKLRADNIIGGTIDANKINVINLKASSIRGLDAEFIKARIEHTITSLLEGKVIRARNGAMMIDLNNSTIDFNSDASINFNSNNNALVRKSGTHTAFVHFSNATPKNFAGSALYASIGITSSGDRINSASSGRFCGARFFRTASGYEHTASIDQAEIYGDTVYLSDDFNINRGFRFRPAMMPGLLDMNDLYSAIIALGRCWQHLANANWNTARSNFINAVNSELNNHITKI'

In [87]:
# Binary-averaged F1, precision and recall, computed in that order.
f1, precision, recall = (
    scorer(test_labels, predicted_labels, average="binary")
    for scorer in (f1_score, precision_score, recall_score)
)
# Matthews correlation coefficient, plain accuracy and the confusion matrix.
mcc = matthews_corrcoef(test_labels, predicted_labels)
accuracy = accuracy_score(test_labels, predicted_labels)
cm = confusion_matrix(test_labels, predicted_labels)

ValueError: Found input variables with inconsistent numbers of samples: [491, 492]

In [79]:
# Show precision, recall, accuracy, F1 and MCC rounded to five decimals.
tuple(round(metric, 5) for metric in (precision, recall, accuracy, f1, mcc))

(1.0, 0.97059, 0.9878, 0.98507, 0.97508)

In [80]:
# Display the confusion matrix from confusion_matrix(test_labels, predicted_labels);
# in scikit-learn's layout rows are true labels and columns are predicted labels.
cm

array([[288,   0],
       [  6, 198]])

In [23]:
# Look at the first sequence of the negative class.
seq_negative[0]

'MRQNRERKLAEKAVRLAQSPDPRLRKKKMSMGFDPGSPEGDYSATVPVENSPTGNIELRMFSQLQQLADIRSIQEKIAKKAEWLPEYAGYIEGCLVTSPAPQNNVLVRLMIWAADVGDYEQAVRIAEFALLNEMVMPEGHTRSIAEFITEQCSQDFIKDHELAVKNASVIEKIIEIGTGESMVDEVRAKGFRALGDALRDAQPVEALNAYKNALRFNTNAGCVKQVTQLEKKLNLQPTESSPDATVGSQADASSVSTENESVPASTDTTPE'

In [None]:
# Persist the summary metrics on the first line, then one tab-separated line
# (index, expected label, predicted label) per misclassified test item.
# NOTE(review): the file name says "T30" while the checkpoint selected above is the
# T12 / 4-label one -- confirm the name matches the evaluated model before writing.
with open(f"{path_work}/T30_eval.binary_classification.metrics.2008.txt", "w") as outfile :
    outfile.write(f"F1 : {round(f1,5)},precision : {round(precision,5)},recall : {round(recall,5)},accuracy : {round(accuracy,5)},mcc : {round(mcc,5)},cm : {cm}\n")
    # Unpack into dedicated names so the module-level `labels` list is not overwritten.
    for i, (expected, predicted) in enumerate(zip(test_labels, predicted_labels)) :
        if expected != predicted :
            outfile.write(f"{i}\t{expected}\t{predicted}\n")


In [None]:
#!/bin/bash
# SLURM batch script: runs the metrics script inside the "embeddings" conda environment.
# Every scheduler directive must start with "#SBATCH"; any other spelling is treated as
# an ordinary comment and silently ignored (the job name was previously lost this way).
#SBATCH --job-name=big_pred__
#SBATCH --qos=short 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=10gb 
#SBATCH --time=0-01:00:00 
#SBATCH --output=big_pred__%j.log 

# Make the `conda` shell function available, then activate the environment.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate embeddings

python /home/conchae/PhageDepo_pdb/script_files/Metrics.T30.binary_class.py
