# The goal here is to benchmark the DpoDetection tool with other depolymerase detection tools 
***
# I. Load the models :
#### 77 phages candidates 
>Detected Dpos <br>
>Missed Dpos <br>
***
# II. Getting the resulting predictions
#### Pires Dpos

***
# III. Make figures
***

I.
> Make the predictions : DpoDetection Tool :

In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/DepoDetection.S1.conv.model \
/media/concha-eloko/Linux/depolymerase_building

In [1]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from torch import nn 
import torch.nn.functional as F

import os
import numpy as np
import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

path_work = "/media/concha-eloko/Linux/depolymerase_building"

#esm2_model_path = f"{path_work}/esm2_t12_35M_UR50D-finetuned-depolymerase.1907.LR5.4_labels/checkpoint-4277"
#DpoDetection_path = f"{path_work}/DepoDetection.esm2_t12_35M_UR50D.LR5.4L.1907.model"

esm2_model_path = f"{path_work}/esm2_t12_35M_UR50D-finetuned-depolymerase.2007.3_labels/checkpoint-3784"
DpoDetection_path = f"{path_work}/DepoDetection.esm2_t12_35M_UR50D.LR5.3L.2007.model"

tokenizer = AutoTokenizer.from_pretrained(esm2_model_path)
esm2_finetuned = AutoModelForTokenClassification.from_pretrained(esm2_model_path)


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class Dpo_classifier(nn.Module):
    def __init__(self, pretrained_model):
        super(Dpo_classifier, self).__init__()
        self.max_length = 1024
        self.pretrained_model = pretrained_model
        self.conv1 = nn.Conv1d(1, 64, kernel_size=5, stride=1)  # Convolutional layer
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, stride=1)  # Convolutional layer
        self.fc1 = nn.Linear(128 * (self.max_length - 2 * (5 - 1)), 32)  # calculate the output shape after 2 conv layers
        self.classifier = nn.Linear(32, 2)  # Binary classification

    def make_prediction(self, fasta_txt):
        input_ids = tokenizer.encode(fasta_txt, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = self.pretrained_model(input_ids)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            token_probs, token_ids = torch.max(probs, dim=-1)            
            tokens = token_ids.view(1, -1) # ensure 2D shape
            return tokens

    def pad_or_truncate(self, tokens):
        if tokens.size(1) < self.max_length:
            tokens = F.pad(tokens, (0, self.max_length - tokens.size(1)))
        elif tokens.size(1) > self.max_length:
            tokens = tokens[:, :self.max_length]
        return tokens

    def forward(self, sequences):
        batch_size = len(sequences)
        tokens_batch = []
        for seq in sequences:
            tokens = self.make_prediction(seq)
            tokens = self.pad_or_truncate(tokens)
            tokens_batch.append(tokens)
        
        outputs = torch.cat(tokens_batch).view(batch_size, 1, self.max_length)  # ensure 3D shape
        outputs = outputs.float()  # Convert to float
        
        out = F.relu(self.conv1(outputs))
        out = F.relu(self.conv2(out))
        out = out.view(batch_size, -1)  # Flatten the tensor
        out = F.relu(self.fc1(out))
        out = self.classifier(out)
        return out, outputs


In [8]:
model_classifier = Dpo_classifier(esm2_finetuned) # Create an instance of Dpo_classifier
model_classifier.load_state_dict(torch.load(DpoDetection_path), strict = False) # Load the saved weights ; weird Error with some of the keys 
model_classifier.eval() # Set the model to evaluation mode for inference


Dpo_classifier(
  (pretrained_model): EsmForTokenClassification(
    (esm): EsmModel(
      (embeddings): EsmEmbeddings(
        (word_embeddings): Embedding(33, 480, padding_idx=1)
        (dropout): Dropout(p=0.0, inplace=False)
        (position_embeddings): Embedding(1026, 480, padding_idx=1)
      )
      (encoder): EsmEncoder(
        (layer): ModuleList(
          (0-11): 12 x EsmLayer(
            (attention): EsmAttention(
              (self): EsmSelfAttention(
                (query): Linear(in_features=480, out_features=480, bias=True)
                (key): Linear(in_features=480, out_features=480, bias=True)
                (value): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (rotary_embeddings): RotaryEmbedding()
              )
              (output): EsmSelfOutput(
                (dense): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inpla

In [9]:
def predict_sequence(model, sequence):
    model.eval()  
    with torch.no_grad():   
        outputs, sequence_outputs = model([sequence])
        probas = torch.nn.functional.softmax(outputs, dim=-1)
        predictions = torch.argmax(probas, dim=1)  
        sequence_outputs_list = sequence_outputs.cpu().numpy().tolist()[0][0]  
        prob_predicted = probas[0][predictions].item()
        return (predictions.item(), prob_predicted), sequence_outputs_list

def plot_token(tokens) :
    tokens = np.array(tokens)  # convert your list to numpy array for convenience
    plt.figure(figsize=(10,6))
    for i in range(len(tokens) - 1):
        if tokens[i] == 0:
            color = 'black'
        elif tokens[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        plt.plot([i, i+1], [tokens[i], tokens[i+1]], color=color, marker='o')
    plt.xlabel('Token')
    plt.ylabel('Label')
    plt.title('Label for each token')
    plt.xticks(rotation='vertical')
    plt.yticks(np.arange(2), ['0', '1'])  
    plt.grid(True)
    plt.show()

***
# VS Pires 2016

> Make Pires predictions for the missing Phage genomes : 

In [8]:
path_out = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking"
from Bio import SeqIO
from tqdm import tqdm 
import os 

path_bench_1 = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking"
path_bench_2 = "/media/concha-eloko/Linux/PhageDEPOdetection/data_Benchmarking/pires_dna_sequences"

bench_1_files = set([file.split(".multi")[0] for file in os.listdir(path_bench_1)])
bench_2_files = set([file.split("CDS_")[1].split(".fasta")[0] for file in os.listdir(path_bench_2)])

#missing_files = [bench_2_files.symmetric_difference(bench_1_files)]
missing_files = list(bench_2_files - bench_1_files)
missing_files

['NC_011421.1',
 'NC_020873.1',
 'NC_021856.1',
 'NC_024137.1',
 'NC_004167.1',
 'NC_019926.1',
 'NC_041858.1',
 'NC_016767.1',
 'NC_022088.2',
 'NC_022773.1']

In [18]:
path_out = "/media/concha-eloko/Linux/PhageDEPOdetection/data_Benchmarking"
from Bio import SeqIO
from tqdm import tqdm 

prediction_results = {}
for fasta_file in tqdm(os.listdir(path_out)) :
    if fasta_file.count("multi.faa") > 0 : 
        fastas = SeqIO.parse(f"{path_out}/{fasta_file}" , "fasta")
        tmp_results = []
        try : 
            for record in fastas :
                prediction, sequence_outputs = predict_sequence(model_classifier, str(record.seq))
                if prediction[0] == 1 :
                    a = (prediction , record.id)
                    tmp_results.append(a)
                else :
                    pass
            prediction_results[fasta_file] = tmp_results
        except Exception as e : 
            print(fasta_file, e)
        

100%|████████████████████████████████████████████████████████| 16/16 [03:22<00:00, 12.67s/it]


In [2]:
import json
import pprint
pp = pprint.PrettyPrinter(width = 100, sort_dicts = True, compact = True)

#with open(f"/media/concha-eloko/Linux/PhageDEPOdetection/PhageDpo_Bench_final.results.3L.0708.json" , "w") as outfile : 
#    json.dump(prediction_results , outfile)

prediction_results = json.load(open(f"/media/concha-eloko/Linux/PhageDEPOdetection/PhageDpo_Bench_final.results.3L.0708.json"))
pp.pprint(prediction_results)

{'AY349011.3.multi.faa': [[[1, 1.0], 'lcl|AY349011.3_prot_AAQ54995.1_60'],
                          [[1, 0.5708374977111816], 'lcl|AY349011.3_prot_AAQ54997.1_62'],
                          [[1, 0.6505632996559143], 'lcl|AY349011.3_prot_AAQ54998.2_63'],
                          [[1, 0.9349229335784912], 'lcl|AY349011.3_prot_AAQ54999.1_64'],
                          [[1, 1.0], 'lcl|AY349011.3_prot_AAQ55000.1_65']],
 'CP000711.1.multi.faa': [[[1, 1.0], 'lcl|CP000711.1_prot_ABQ88383.1_2']],
 'DQ831957.1.multi.faa': [[[1, 1.0], 'lcl|DQ831957.1_prot_ABI15697.1_20']],
 'DQ834250.1.multi.faa': [[[1, 1.0], 'lcl|DQ834250.1_prot_ABI21803.1_19']],
 'FJ230960.1.multi.faa': [[[1, 1.0], 'lcl|FJ230960.1_prot_ACI90966.1_61'],
                          [[1, 0.995552122592926], 'lcl|FJ230960.1_prot_ACI91000.1_95'],
                          [[1, 1.0], 'lcl|FJ230960.1_prot_ACI91001.1_96'],
                          [[1, 1.0], 'lcl|FJ230960.1_prot_ACI91003.1_98'],
                          [[1, 1.0], '

In [3]:
prediction_results["HQ632825.1.multi.faa"]

[[[1, 0.9918290972709656], 'lcl|HQ632825.1_prot_AGN12239.1_81'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12240.1_82'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12247.1_89'],
 [[1, 0.8116859793663025], 'lcl|HQ632825.1_prot_AGN12296.1_138'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12298.1_140'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12301.1_143'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12302.1_144'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12305.1_147'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12307.1_149'],
 [[1, 1.0], 'lcl|HQ632825.1_prot_AGN12470.1_312']]

> Write the json file

In [10]:
import json
import pprint
pp = pprint.PrettyPrinter(width = 100, sort_dicts = True, compact = True)

#with open(f"/media/concha-eloko/Linux/PhageDEPOdetection/PhageDpo_Bench_final.results.3L.0708.json" , "w") as outfile : 
#    json.dump(prediction_results , outfile)

prediction_results = json.load(open(f"/media/concha-eloko/Linux/PhageDEPOdetection/PhageDpo_Bench_final.results.3L.0708.json"))

#pp.pprint(prediction_results)


In [11]:
class Dpo_model_pred:
    def __init__(self, name, tokens, prediction, sequence):
        self.name = name
        self.tokens = tokens
        self.prediction = prediction
        self.sequence = sequence
        
    def plot_token(self) :
        tokens = np.array(self.tokens)  
        plt.figure(figsize=(10,6))
        for i in range(len(tokens) - 1):
            if tokens[i] == 0:
                color = 'black'
            elif tokens[i] == 1:
                color = 'blue'
            elif tokens[i] == 2:
                color = 'red'
            else:
                color = 'yellow'
            plt.plot([i, i+1], [tokens[i], tokens[i+1]], color=color, marker='o')
        plt.xlabel('Token')
        plt.ylabel('Label')
        plt.title('Label for each token')
        plt.xticks(rotation='vertical')
        plt.yticks(np.arange(2), ['0', '1'])  
        plt.grid(True)
        plt.show()

    def get_prediction(self):
        return self.prediction
    
    def get_seq(self):
        return self.sequence



In [12]:
from tqdm import tqdm
from Bio import SeqIO
path_out = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking"

Dpo_class_objects = {}
for fasta_file in tqdm(prediction_results) :
    dico_tmp = {}
    fastas = SeqIO.parse(f"{path_out}/{fasta_file}" , "fasta")
    proteins_id = [prot_id for (pred , prot_id) in prediction_results[fasta_file]]
    for record in fastas :
        if record.id in proteins_id and len(record.seq)>=200: 
            prediction, sequence_outputs = predict_sequence(model_classifier, str(record.seq))
            a = Dpo_model_pred(record.id, sequence_outputs, prediction , str(record.seq))
            dico_tmp[record.id] = a
    Dpo_class_objects[fasta_file] = dico_tmp

  0%|                                                   | 0/143 [00:00<?, ?it/s]


NameError: name 'predict_sequence' is not defined

In [13]:
Dpo_class_objects

{'NC_024137.1.multi.faa': {'lcl|NC_024137.1_prot_YP_009031349.1_68': <__main__.Dpo_model_pred at 0x7f2db31b91d0>},
 'NC_020873.1.multi.faa': {'lcl|NC_020873.1_prot_YP_007677121.1_223': <__main__.Dpo_model_pred at 0x7f2db2ef41d0>,
  'lcl|NC_020873.1_prot_YP_007677135.1_237': <__main__.Dpo_model_pred at 0x7f2db29f7790>},
 'NC_016767.1.multi.faa': {},
 'NC_021856.1.multi.faa': {'lcl|NC_021856.1_prot_YP_008318293.1_34': <__main__.Dpo_model_pred at 0x7f2db2ef6090>,
  'lcl|NC_021856.1_prot_YP_008318350.1_91': <__main__.Dpo_model_pred at 0x7f2db2efa0d0>,
  'lcl|NC_021856.1_prot_YP_008318429.1_170': <__main__.Dpo_model_pred at 0x7f2db274d7d0>},
 'NC_022088.2.multi.faa': {'lcl|NC_022088.2_prot_YP_008430873.1_89': <__main__.Dpo_model_pred at 0x7f2db2741950>},
 'NC_022773.1.multi.faa': {'lcl|NC_022773.1_prot_YP_008771853.1_30': <__main__.Dpo_model_pred at 0x7f2e90528f50>},
 'NC_019926.1.multi.faa': {'lcl|NC_019926.1_prot_YP_007237561.1_38': <__main__.Dpo_model_pred at 0x7f2db2ef42d0>,
  'lcl|NC_0

> Get the other tools results

In [1]:
import pandas as pd 

path_DePP_results = "/media/concha-eloko/Linux/PhageDEPOdetection/data_Benchmarking"
df_results = pd.read_csv(f"{path_DePP_results}/benchmark_dataframe.csv", header = 0 )

In [16]:
len(df_results[(df_results["label"] == 1)]["protein_seq"].unique())

151

In [3]:
genomes_id = df_results["genome_id"].unique().tolist()

def get_metrics(df, tool_score , threshold) : 
    tp , tn , fp , fn = 0 , 0 , 0 , 0
    for _,row in df.iterrows() : 
        if row["label"] == 0 : 
            if row[tool_score] >= threshold : 
                fp += 1 
            else :
                tn += 1 
        else :
            if row[tool_score] >= threshold : 
                tp += 1 
            else :
                fn += 1 
    return [tp , tn , fp , fn]
    
metrics = {}
for _,genome in enumerate(genomes_id) :  
    df_genome = df_results[(df_results["genome_id"] == genome) & (len(df_results["protein_seq"]) >= 200)]
    metric_phageDPO = get_metrics(df_genome , "scores_phageDPO" , 50)
    metric_DePP = get_metrics(df_genome , "scores_DePP" , 0.5)
    a = {"phageDPO" : metric_phageDPO , "DePP" : metric_DePP}
    metrics[genome] = a

In [8]:
# Create a set of all the positive sequences : 
import os 
from Bio import SeqIO

path_bench_dpo = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking_Dpos"
dpos_reference = {str(record.seq) for file in os.listdir(path_bench_dpo) for record in SeqIO.parse(f"{path_bench_dpo}/{file}", "fasta") if len(str(record.seq))>=200}


weirdos = []
for _,genome in enumerate(genomes_id) : 
    tp , tn , fp , fn = 0 , 0 , 0 , 0
    fastas = SeqIO.parse(f"{path_out}/{genome}.multi.faa" , "fasta")
    n_prot = len([record.id for record in fastas])
    n_dpos = len([record.id for record in fastas if str(record.seq) in dpos_reference])
    hits = Dpo_class_objects[f"{genome}.multi.faa"]
    if len(hits) == 0 : 
        tp = 0
        fn = n_dpos
        fp = 0
        tn = n_prot - n_dpos
    else : 
        for hit in hits : 
            seq = hits[hit].get_seq()
            if seq in dpos_reference : 
                tp += 1 
            else :
                fp += 1 
        fn = n_dpos - tp
        tn = n_prot - tp - fn
    a = {"DepolyX" : [tp , tn , fp , fn]}
    if fn < 0 : 
        weirdos.append(genome)
        print(f"N dpos = {n_dpos}, while M positive = {tp}")
        print(genome , a, alleged_dpos)
        print("\n")
    metrics[genome].update(a)
        
        

NameError: name 'path_out' is not defined

In [67]:
for hit in Dpo_class_objects["KC821630.1.multi.faa"] : 
    seq = Dpo_class_objects["KC821630.1.multi.faa"][hit].get_seq()
    print(seq in dpos_reference , hit)

True lcl|KC821630.1_prot_AGO49328.1_9
False lcl|KC821630.1_prot_AGO49349.1_30


In [4]:
import json
import pprint
pp = pprint.PrettyPrinter(width = 100, sort_dicts = True, compact = True)

pp.pprint(metrics)



{'AY349011.3': {'DePP': [1, 71, 5, 0], 'phageDPO': [1, 74, 2, 0]},
 'CP000711.1': {'DePP': [1, 56, 5, 0], 'phageDPO': [1, 60, 1, 0]},
 'DQ831957.1': {'DePP': [1, 55, 9, 0], 'phageDPO': [1, 58, 6, 0]},
 'DQ834250.1': {'DePP': [1, 59, 8, 0], 'phageDPO': [1, 61, 6, 0]},
 'FQ482084.1': {'DePP': [1, 39, 10, 0], 'phageDPO': [1, 46, 3, 0]},
 'FR671405.1': {'DePP': [1, 51, 5, 0], 'phageDPO': [0, 55, 1, 1]},
 'FR671406.1': {'DePP': [1, 49, 7, 0], 'phageDPO': [0, 55, 1, 1]},
 'FR671407.1': {'DePP': [1, 52, 5, 0], 'phageDPO': [0, 56, 1, 1]},
 'FR671410.1': {'DePP': [1, 47, 5, 0], 'phageDPO': [0, 51, 1, 1]},
 'FR671411.1': {'DePP': [1, 50, 5, 0], 'phageDPO': [0, 54, 1, 1]},
 'GQ413937.1': {'DePP': [1, 35, 8, 0], 'phageDPO': [1, 40, 3, 0]},
 'GU196281.1': {'DePP': [1, 38, 10, 0], 'phageDPO': [1, 47, 1, 0]},
 'GU323708.1': {'DePP': [1, 37, 7, 1], 'phageDPO': [2, 41, 3, 0]},
 'HG799490.1': {'DePP': [1, 43, 5, 0], 'phageDPO': [0, 47, 1, 1]},
 'HG799496.1': {'DePP': [1, 46, 6, 0], 'phageDPO': [0, 51, 1

In [76]:
for _,genome in enumerate(genomes_id) : 
    if genome in weirdos : 
        tp , tn , fp , fn = 0 , 0 , 0 , 0
        fastas = SeqIO.parse(f"{path_out}/{genome}.multi.faa" , "fasta")
        n_prot = len([record.id for record in fastas])
        n_dpos = len([record.id for record in fastas if str(record.seq) in dpos_reference])
        alleged_dpos = [record.id for record in fastas if str(record.seq) in dpos_reference]
        hits = Dpo_class_objects[f"{genome}.multi.faa"]
        for record in fastas : 
            print(record.seq , record.id)


In [81]:
dpos_reference

{'MADEKIRVTELPVKSEITPGGKMLVSQNGIDWQTDVGALMLKANNLSDVDAPKSRGNLNVYSKEEVDDKVSGGGIPDQIQEPDGFKYIGRVPSFAALVSVVPEKAGERVIVSGHVAGNDYGGGVFVARAGSVAINDGGTIMPVNNNFYWQRLVEDPGTLDVTHFGAKRDGVTDCATACLAMWNYTQSLGAGGSMIGIQFPAGEFAVSNIDISANYVGNFRLVGKGVVTTFGYFPATRIKLIGADNQAAFKVQARRSEIANLQIYGQYEVKANTRGFFKNTCVSGQYVHGVNWRSTYTGGPIFDLMDTLDTKFSEFYASYVYGGVIYGVPSGSESGSWDHLTAIELSNFNVQRCYGKQAFDLQKSGQSFIYNGWIEKTDFPGDLSNGQWIIQGLSMEDCVNPLDLTFTRAQLSQINLQGTSALRYDNPDKSRLLSTYEMGRNRVEAYGAQFFGSLSYDYLSSHYRLSNATDKAAWFNLGKMIVTNQNDASRIRFFGANGQASVPSDQGAFDSNNFGGGECLLTLRRVPGTGTRQDCAIEVHGNSPIADIRISRPYENDVEIYVQLKPQCGFVNVSLETSTNSRFDSGTRFLWTYSGAPVTDDAIAGMTLYSPRKTVAFGTFGAGLTIMEDKTLGFTGRDLIDGKMPFMHNGKVYLMPLVVSPDGSDSFARTGEIDGTKIDNLLGGNLNQGWVNSFKSGADAQNGSLNLSVKAGAAISVNTAIADLVSELTIVSGPANTGTAISTSVDFRKPNGGTGQNTLRVVFAGKVDGKNTIRLAKRVSGVTTTISPQDGFINDGQVLKIYTKGNVIRVYADDVLIWDLTDDQLLTGTYFGFGTASTNAGMIVSKLKFYKA',
 'MADKGFGVKKINLIGASGTPTLTSPNNLNLNAVNVAISTDVSIGGTCTAYEFSGATASWMVGNDGTDNYTFIGSGISTQVNDPELNLYKGQKYIFHNRSSGHPFRIQITPNGSVGGQYNTGVTNNDGSAQTDIIFEVPQDA

In [60]:
# Create a set of all the positive sequences : 
path_bench_dpo = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking_Dpos"
dpos_reference = {record.seq for file in os.listdir(path_bench_dpo) for record in SeqIO.parse(f"{path_bench_dpo}/{file}", "fasta") if len(str(record.seq))>200}
dpos_reference_dico = {record.description :record.seq for file in os.listdir(path_bench_dpo) for record in SeqIO.parse(f"{path_bench_dpo}/{file}", "fasta") if len(str(record.seq))>200}

len(dpos_reference)



149

***

# Get the metrics

> Get the N total of candidates 

In [16]:
# N proteins 
from tqdm import tqdm
import os 
 
path_bench = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking"

all_candidates = set()
n= 0
for fasta_file in (os.listdir(path_bench)) :
    fastas = SeqIO.parse(f"{path_bench}/{fasta_file}" , "fasta")
    tmp_results = []
    for record in fastas :
        if len(str(record.seq)) >= 200 :
            all_candidates.add(str(record.seq))
            n+= 1
# N = 4849


> Get the M positive Dpos (dataset)


In [17]:
path_bench_dpo = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking_Dpos"

m = len(os.listdir(path_bench_dpo))
m

# M = 154

154

In [18]:
# Create a set of all the positive sequences : 
path_bench_dpo = "/media/concha-eloko/Linux/PhageDEPOdetection/Benchmarking_Dpos"

dpos_reference = {record.seq for file in os.listdir(path_bench_dpo) for record in SeqIO.parse(f"{path_bench_dpo}/{file}", "fasta") if len(str(record.seq))>200}
len(dpos_reference)

dpos_reference_dico = {record.id :record.seq for file in os.listdir(path_bench_dpo) for record in SeqIO.parse(f"{path_bench_dpo}/{file}", "fasta") if len(str(record.seq))>200}



In [14]:
len(dpos_reference_dico)

154

***
### Get the scores for our model : 

In [19]:
# TP : 
tp = set()
fp = set()
for file_fasta in Dpo_class_objects :
    for prot in Dpo_class_objects[file_fasta] :
        obj = Dpo_class_objects[file_fasta][prot]
        if obj.get_seq() in dpos_reference:
            tp.add(obj.get_seq())
        else :
            fp.add(obj.get_seq())

len(tp) , len(fp)


(124, 123)

(117, 172) #Old 
(118, 118) # 3L
(118, 127) # 4L
==> See if the 5 additional are legit 

In [13]:
predicted_positives = set()
for file_fasta in Dpo_class_objects :
    for prot in Dpo_class_objects[file_fasta] :
        obj = Dpo_class_objects[file_fasta][prot]
        predicted_positives.add(obj.get_seq())
        
len(predicted_positives)

236

***
### Inspect the FN : 

In [14]:
FN_seq = []
for seq in dpos_reference :
    if seq not in predicted_positives :
        FN_seq.append(seq)

In [24]:
with open(f"/media/concha-eloko/Linux/PhageDEPOdetection/Dpos_reference.T12_FN.3L.2107.multi.fasta", "w") as outfile : 
    for fasta_file in tqdm(os.listdir(path_out)) :
        fastas = SeqIO.parse(f"{path_out}/{fasta_file}" , "fasta")
        tmp_results = []
        for record in fastas :
            if str(record.seq) in FN_seq :
                outfile.write(f">{record.description}\n{record.seq}\n\n")
                break
        else :
            print(record.description)

TypeError: 'module' object is not callable

In [23]:
len(set(FN_seq))

31

12 Divide and conquer<br>
14 real FN (mostly triple helix)
> Check the divide and conquer

In [38]:
#fasta_DNC = SeqIO.parse(f"/media/concha-eloko/Linux/PhageDEPOdetection/Divide_Conquer.txt", "fasta")

def divide_and_conquer(fasta, window=1000, slide=250):
    sequences = []
    for i in range(0, len(fasta) - window + 1, slide):
        sequences.append(fasta[i:i+window])
    return sequences

for record in FN_seq :
    aa_seq = str(record)
    aa_dnc = divide_and_conquer(aa_seq)
    for index, aa_div in enumerate(aa_dnc) :
        prediction, sequence_outputs = predict_sequence(model_classifier, aa_div)
        if prediction[0] == 1 :
            print(record.description , f"with a Dpo division : {index} fragment")
    

In [26]:
import pprint
pp = pprint.PrettyPrinter(width = 100, sort_dicts = True, compact = False)
pp.pprint(prediction_results)


{'AY349011.3.multi.faa': [((1, 1.0), 'lcl|AY349011.3_prot_AAQ54995.1_60'),
                          ((1, 1.0), 'lcl|AY349011.3_prot_AAQ55000.1_65')],
 'CP000711.1.multi.faa': [((1, 1.0), 'lcl|CP000711.1_prot_ABQ88383.1_2'),
                          ((1, 0.991202712059021), 'lcl|CP000711.1_prot_ABQ88395.1_14')],
 'DQ831957.1.multi.faa': [((1, 1.0), 'lcl|DQ831957.1_prot_ABI15697.1_20')],
 'DQ834250.1.multi.faa': [((1, 1.0), 'lcl|DQ834250.1_prot_ABI21803.1_19')],
 'FJ230960.1.multi.faa': [((1, 1.0), 'lcl|FJ230960.1_prot_ACI90991.1_86'),
                          ((1, 1.0), 'lcl|FJ230960.1_prot_ACI91003.1_98'),
                          ((1, 1.0), 'lcl|FJ230960.1_prot_ACI91006.1_101')],
 'FQ482084.1.multi.faa': [((1, 1.0), 'lcl|FQ482084.1_prot_CBX44498.1_37'),
                          ((1, 0.9999966621398926), 'lcl|FQ482084.1_prot_CBX44510.1_49')],
 'FR671405.1.multi.faa': [],
 'FR671406.1.multi.faa': [((1, 0.9979360103607178), 'lcl|FR671406.1_prot_CBW39009.1_42')],
 'FR671407.1.multi.f

***
### The alleged FP :
> Switch to the python base environment 

In [20]:
FP_seq = []
for seq in predicted_positives :
    if seq not in dpos_reference :
        FP_seq.append(seq)

In [22]:
p =0
for seq in FP_seq :
    if len(seq) > 400 :
        p += 1
len(FP_seq), p

(122, 83)

In [24]:
with open(f"/media/concha-eloko/Linux/PhageDEPOdetection/Dpos_reference.T12_FP.3L_2007.multi.fasta", "w") as outfile : 
    for fasta_file in tqdm(os.listdir(path_out)) :
        fastas = SeqIO.parse(f"{path_out}/{fasta_file}" , "fasta")
        tmp_results = []
        for record in fastas :
            if str(record.seq) in FP_seq :
                outfile.write(f">{record.description}\n{record.seq}\n\n")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [00:00<00:00, 1510.59it/s]


In [None]:
for seq in fp :
    if len(seq) > 200 : 
        for fasta_file in Dpo_class_objects :
            for prot in Dpo_class_objects[fasta_file] :
                if Dpo_class_objects[fasta_file][prot].get_seq() == str(seq) :
                    print(fasta_file,Dpo_class_objects[fasta_file][prot].name , Dpo_class_objects[fasta_file][prot].prediction ,Dpo_class_objects[fasta_file][prot].get_seq() , sep = "\n")
                    Dpo_class_objects[fasta_file][prot].plot_token()
                    print("\n")



In [None]:
Make ESMfold on FP predictions and FN :

rsync -avzhe ssh \
/media/concha-eloko/Linux/PhageDEPOdetection/Dpos_reference.T12_FP.multi.fasta \
conchae@garnatxa.srv.cpd:/home/conchae/

rsync -avzhe ssh \
/media/concha-eloko/Linux/PhageDEPOdetection/Dpos_reference.T12_FN.3L.2107.multi.fasta \
conchae@garnatxa.srv.cpd:/home/conchae/

rsync -avzhe ssh \
/media/concha-eloko/Linux/PhageDEPOdetection/Dpos_reference.T12_FN.3L.2107.multi.fasta \
conchae@garnatxa.srv.cpd:/home/conchae/
