# The final DpoDetection tool
***

In [14]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from torch import nn 
import torch.nn.functional as F

import os
import numpy as np
import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

# Directory that holds both the fine-tuned ESM-2 checkpoint and the classifier weights.
path_work = "/media/concha-eloko/Linux/depolymerase_building"

# Checkpoint folder read by both `from_pretrained` calls below.
esm2_model_path = path_work + "/esm2_t12_35M_UR50D__fulltrain__finetuneddepolymerase.2103.4_labels/checkpoint-2255"
# File later passed to `torch.load` to restore the Dpo_classifier weights.
DpoDetection_path = path_work + "/Deposcope.esm2_t12_35M_UR50D.2203.full.model"

# Tokenizer and token-classification model come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(esm2_model_path)
esm2_finetuned = AutoModelForTokenClassification.from_pretrained(esm2_model_path)


In [15]:
class Dpo_classifier(nn.Module):
    """Binary classifier stacked on top of a token-classification model.

    Each input sequence is tokenized, passed through ``pretrained_model`` and
    reduced to one predicted label id per token. That label track is padded or
    truncated to ``max_length`` and fed to two 1-D convolutions followed by two
    linear layers, producing a single logit per sequence.

    Args:
        pretrained_model: callable that receives token ids and returns an
            object with a ``logits`` attribute (the notebook passes the
            fine-tuned ESM-2 token classifier).
        tokenizer: optional object exposing
            ``encode(text, truncation=True, return_tensors='pt')``. When left
            to ``None`` the notebook-level ``tokenizer`` global is used, which
            is the original behaviour.
    """

    def __init__(self, pretrained_model, tokenizer=None):
        super().__init__()
        self.max_length = 1024
        self.pretrained_model = pretrained_model
        # Plain attribute (not a parameter/buffer), so state-dict keys are unaffected.
        self.tokenizer = tokenizer
        kernel_size = 5
        self.conv1 = nn.Conv1d(1, 64, kernel_size=kernel_size, stride=1)  # Convolutional layer
        self.conv2 = nn.Conv1d(64, 128, kernel_size=kernel_size, stride=1)  # Convolutional layer
        # Each un-padded convolution shortens the track by (kernel_size - 1) positions.
        conv_out_length = self.max_length - 2 * (kernel_size - 1)
        self.fc1 = nn.Linear(128 * conv_out_length, 32)
        self.classifier = nn.Linear(32, 1)  # Binary classification

    def make_prediction(self, fasta_txt):
        """Return the predicted label id of every token of ``fasta_txt``.

        Args:
            fasta_txt: sequence string handed to the tokenizer.

        Returns:
            Integer tensor of shape ``(1, n_tokens)`` holding, for each token,
            the index of the most probable class. Gradients are disabled, so
            the pretrained model is never updated through this path.
        """
        active_tokenizer = getattr(self, "tokenizer", None)
        if active_tokenizer is None:
            # Backward-compatible fallback to the tokenizer defined at notebook level.
            active_tokenizer = tokenizer
        input_ids = active_tokenizer.encode(fasta_txt, truncation=True, return_tensors='pt')
        # Keep the ids on the same device as the module so the classifier can
        # be moved to an accelerator as a whole (no-op on CPU).
        input_ids = input_ids.to(self.conv1.weight.device)
        with torch.no_grad():
            outputs = self.pretrained_model(input_ids)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            _, token_ids = torch.max(probs, dim=-1)
            return token_ids.view(1, -1)  # ensure 2D shape

    def pad_or_truncate(self, tokens):
        """Force a ``(1, n)`` label track to exactly ``max_length`` columns.

        Shorter tracks are right-padded (``F.pad`` default fill value, i.e. 0);
        longer tracks keep only their first ``max_length`` columns.
        """
        if tokens.size(1) < self.max_length:
            tokens = F.pad(tokens, (0, self.max_length - tokens.size(1)))
        elif tokens.size(1) > self.max_length:
            tokens = tokens[:, :self.max_length]
        return tokens

    def forward(self, sequences):
        """Score a batch of sequences.

        Args:
            sequences: list of sequence strings. A single string is accepted
                and treated as a batch of one (it used to be iterated
                character by character).

        Returns:
            Tuple ``(out, outputs)`` where ``out`` has shape ``(batch, 1)`` and
            holds the raw (pre-sigmoid) logit of each sequence, and ``outputs``
            is the float label track of shape ``(batch, 1, max_length)`` that
            was fed to the convolutions.
        """
        if isinstance(sequences, str):
            sequences = [sequences]
        batch_size = len(sequences)
        tokens_batch = [
            self.pad_or_truncate(self.make_prediction(seq)) for seq in sequences
        ]

        outputs = torch.cat(tokens_batch).view(batch_size, 1, self.max_length)  # ensure 3D shape
        outputs = outputs.float()  # convolutions need a floating-point input

        out = F.relu(self.conv1(outputs))
        out = F.relu(self.conv2(out))
        out = out.view(batch_size, -1)  # Flatten the tensor
        out = F.relu(self.fc1(out))
        out = self.classifier(out)
        return out, outputs

In [16]:
# Wrap the fine-tuned ESM-2 model in the CNN classifier and restore the saved weights.
model_classifier = Dpo_classifier(esm2_finetuned)

# map_location="cpu" lets a checkpoint that was saved from a GPU load on a CPU-only host;
# load_state_dict then copies the values into the model's own parameters.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
saved_state = torch.load(DpoDetection_path, map_location="cpu")

# strict=False is kept because some checkpoint keys do not line up with this class,
# but the mismatch is reported instead of being silently ignored: a missing conv/fc
# key would leave that layer randomly initialised.
load_report = model_classifier.load_state_dict(saved_state, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    warnings.warn(
        "State dict only partially matched Dpo_classifier: "
        f"missing keys = {list(load_report.missing_keys)}, "
        f"unexpected keys = {list(load_report.unexpected_keys)}"
    )

model_classifier.eval() # Set the model to evaluation mode for inference


Dpo_classifier(
  (pretrained_model): EsmForTokenClassification(
    (esm): EsmModel(
      (embeddings): EsmEmbeddings(
        (word_embeddings): Embedding(33, 480, padding_idx=1)
        (dropout): Dropout(p=0.0, inplace=False)
        (position_embeddings): Embedding(1026, 480, padding_idx=1)
      )
      (encoder): EsmEncoder(
        (layer): ModuleList(
          (0-11): 12 x EsmLayer(
            (attention): EsmAttention(
              (self): EsmSelfAttention(
                (query): Linear(in_features=480, out_features=480, bias=True)
                (key): Linear(in_features=480, out_features=480, bias=True)
                (value): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (rotary_embeddings): RotaryEmbedding()
              )
              (output): EsmSelfOutput(
                (dense): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inpla

In [17]:
def predict_sequence(model, sequence, threshold=0.5):
    """Classify one sequence with ``model`` and return its per-token labels.

    Args:
        model: object with an ``eval()`` method that, when called with a list
            of sequences, returns ``(logits, label_tracks)``.
        sequence: the single sequence to score; it is wrapped in a one-item
            list to match the model's batch interface.
        threshold: probability strictly above which the sequence is called
            positive. Defaults to 0.5, the value previously hard-coded.

    Returns:
        ``((prediction, probability), labels)`` where ``prediction`` is 1.0 or
        0.0, ``probability`` is the sigmoid of the model logit, and ``labels``
        is the first label track of the batch as a flat Python list.
    """
    model.eval()
    with torch.no_grad():
        logits, label_tracks = model([sequence])
        probas = torch.sigmoid(logits)  # logit -> probability for binary classification
        predictions = (probas > threshold).float()
        # Only one sequence was submitted, so take track 0 / channel 0.
        sequence_outputs_list = label_tracks[0, 0].cpu().tolist()
        prob_predicted = probas[0].item()
        return (predictions.item(), prob_predicted), sequence_outputs_list


def plot_token(tokens) :
    """Plot the label of every token as a colour-coded line.

    Each segment between positions ``i`` and ``i + 1`` is drawn in the colour
    of the label at ``i``: 0 black, 1 blue, 2 red, anything else green.

    Args:
        tokens: sequence of numeric labels, one per token.
    """
    tokens = np.array(tokens)  # array form gives .max()/.size and uniform indexing
    label_colors = {0: 'black', 1: 'blue', 2: 'red'}
    plt.figure(figsize=(10,6))
    for i in range(len(tokens) - 1):
        color = label_colors.get(float(tokens[i]), 'green')
        plt.plot([i, i+1], [tokens[i], tokens[i+1]], color=color, marker='o')
    plt.xlabel('Token')
    plt.ylabel('Label')
    plt.title('Label for each token')
    plt.xticks(rotation='vertical')
    # Ticks used to stop at 1 although labels 2 and 3 are plotted; cover every
    # label present while always showing at least 0 and 1.
    highest_label = int(tokens.max()) if tokens.size else 1
    tick_positions = np.arange(max(highest_label, 1) + 1)
    plt.yticks(tick_positions, [str(tick) for tick in tick_positions])
    plt.grid(True)
    plt.show()

***
# Predictions Ferriol

> Make predictions 

In [18]:
from Bio import SeqIO
from tqdm import tqdm 
from collections import Counter

path_out = "/media/concha-eloko/Linux/77_strains_phage_project"

prediction_results = {}
path_fasta = f"{path_out}/depolymerase_77.0907.fasta"
fastas = SeqIO.parse(path_fasta, "fasta")
tmp_results = []
# No minimum-length filter in this run: every record of the FASTA file is scored.
for record in tqdm(fastas):
    protein_seq = record.seq
    prediction, sequence_outputs = predict_sequence(model_classifier, str(protein_seq))

    # Descriptions such as "K10PH82C1 cds 49, lytic tail fiber protein" are cut at the
    # first comma and their spaces replaced by underscores; comma-free ones are kept as is.
    description = record.description
    if "," in description:
        prot_id = description.split(",")[0].replace(" ", "_")
    else:
        prot_id = description

    if prediction[0] != 1:
        # Report the records the classifier did not call positive.
        print("Missed : ", description, record.seq, sep = "\n")
        continue

    # Keep the id together with how many tokens received each label.
    tmp_results.append((prot_id, dict(Counter(sequence_outputs))))

2it [00:01,  1.34it/s]

Missed : 
K10PH82C1 cds 49, lytic tail fiber protein
MAKYDKSIPSEYDALFQKAADSHGVSYDLLRKLAFNESSFNPAAKSPTGPLGIMQFTKGTGAGMGLKITGGPDDERLNPALAIDAGARHLSDLVRKYNGDELKAALAYNQGEGPNGAPQIQAYDKGDWASISEEGRNYMRKLLDVAKSPQSGALEAFGGITPKGKGIPSDSAFAGIGTKSKVGDALPESTGFNVKGTAQPEPAVPYAKSFWEATGTTIDEFESRSTFFGFKDAAAAELQNSTLGVAVRAGRADNGFDVFKDTITPTRWNSHSWSPEELERIRNEVKNPNYINVVTGGSPENLDALIKLANENYEADQKASGAGLGAKLSAGVIGAGVDPLSYVPLVGVAGKGLKVVNKAFVVGAQAGALNVVSEALRTSIAGGEAHYADAALGGMLFGSGMSVLTDAVSRGLRKAGGTEIPNEFAGPALRLEARETALNTGGADMTLMPTEGRVFDKEHAGVPYADHPVESGAVILNNGAILSDTNPLNPRTLQEFSEINPERAAPGIKLGGFTEIGLKTLRSESPEIRSLASDLVRSPTGMESGTSGKFGATASDIHERLHSTDQRTYNQLYDAVREAMRDPEFSTGDMKLSREGIRQEIYKRAALAIERPELQASLTKGERKVMDIMKAHFDTKRELMENPGMFGRMDAQSIFPGSRHKGTYVPNVYDRSAKLAMVNKLGNDGLQEAIALSWLTSYRSRPEVKARVDEYLMELNGLKTVQEVTPEMVRKHAMDKAYGISHSDQFTQSSVIDDNITGLVGIENNNFLEARNMFDSDMPITLPDGSTFSVNDLRDFDMFRVMPAYDRRINGDIAIMGGTGKTTKELKDAIMALDKKSEGKGTMKGEVEALKDTVKILTGRARRNQDTVFETALRSLNDLAFFAKNFYMGPQNLTEISGMLAKGNVSAMLHGIPFMNDLATRTAPLKGSELKELHGIIFGKELDQLIR

25it [00:10,  2.50it/s]

Missed : 
K18PH07C1 cds 231, Putative tail spike protein
MADTTQFEQAVDQVIEDSERLHKVVNGSAIDTVIVEDGSTIPTLRKALLDNVYFKTPPQPWVAGTQATVFNQLYSFTNSAGTFWWYAPGASPSTPITLPADPSTSTAWKVYNDSVVVSEKFAPLNTPAFVGSPTAPTPAQGSNSGAIATTAFVNLAVAAAINSLAGSSPSYAALTVVGASTLNTLVVSGTSQFGGTLDASGVLGKFQKISLSGQTATLSFDYTAASNYLKTIISPNSVQTNNLTSAVIVNGTASADNTTMSLTGVGNNTFDYVYIRGNSSKASTEPRLKVTGTTELENVRVTGSLSGVNVGVDGLDILPNSIITTTTAEIGSDLTVNGITTLGSTTIQDLGVVSSLTVNGNSTLTGGFTAGSASSVAGNLSITGLLAVTGEATVSTNLTVTGNANLNNGGTGTTTVNNLNVLGTLTGVSIDVNGKDINPNSVLATTTIEAYGSLKGASLVITGEATAPKVTANFVGTTPGSVTTPSAGTTWTPGGTGSDLTKMNNIYNVDVTNTLTIGPWANLGSAFTATIYLFQDATGGHAVTLDASYKIINGGAISTTANSVTILQVTYCGRGTVYDVAVYQRP


29it [00:12,  2.83it/s]

Missed : 
K19PH14C4P1 cds 48, Putative tail spike protein
MLIYKGGHMATTIKTVMTYPLDGSTTDFNIPFEYLARKFVRVTLIGVDRKELILNQDYRFATKTTISTTRALGPADGYNLIEIRRYTSATERLVDFTDGSILRAYDLNISQVQTLHVAEEARDLTADTIGVNNDGDLDARGRRIVNVADAQDVGDAINLGQIQRWNDSALNSANRAKQEADRATARASEAATSAGNSASSASLSRDWAIKESPVEGTFKSSRSYALDSMAYRDASKSSADASAASAGAAKTSEINAKNSEIATKTSETNAKTSETNAKASEVRAIEEANKLENMNDLAGAIKEVLKPEGDTSGKVVWKYDIDSKVLRAYNGMSIGWDWDGTGSPFMNFYRGQGEPSGSLRLDANKVVRFNGMNSVDFDSTINAQAINGSWIHSTGDLVADATLYVHGGDYIYKDKWNPSGANKVTNLIRGSVGGFACDEFFSEIVGYYIERGWHLVGGDNDTWMRIKGNGNFEIAGNRGARIVINGGAIVEQDGNIRGSIWEGKWLRDFMNDRFLNDVWLGGQEWVFTNGGARIDYALGGGSVCTGYTQDATGGVRNAWLDGMFHRKLWIRYGNGSARVIGSA


34it [00:14,  2.47it/s]

Missed : 
K21lambda1 cds 28, tail spike protein
MADLSISIISDQASESNQAGWWHPLDSFQGVEYYGLCKEYGTVDYHQVEIVRRDADGTLTRGMCKNVDGTVAEFKNDVGHNQPSVVVDGAGYIHVFTSMHVNLLRYFRSARPGDVSQMVDATLDFPDVDWVWTYPITGRGPDGDAYCMMRVASRSTVGENKRAGILYRFDVGSLRWVRYAHVAETADRAIYPDDMAIDEDGVHLLFQWSAYPSSAVRHVGEYGIIGTDGLMRAVNNTPLPMPVAQGQLAYKPLQPGENPAISDGLKIGIQSAKFALNNGGLSHITYRFRTVDDPTGTWFGKFGVYVATWSGSSWNEELIAYVPPEQGNTSAALATTAQGGKRRVYFSVEYTSSGNTVAVIVLAENAGSGWVYSVLGNSAPTLLRLGSAPGNGGDVLYVSAPFEAKVYRYFVPEDYFPAQQFTNFDELLLTLT


39it [00:15,  3.25it/s]

Missed : 
K23PH08C2 cds 220, tail spike protein
MADTTQFEQAVDQVIEDSERLHKVVNGSAIDTVIVEDGSTIPTLRKALLDNVYFKTPPQPWVAGTQATVFNQLYSFTNSAGTFWWYAPGASPSTPITLPADPSTSTAWKVYNDSVVVSEKFAPLNTPAFVGSPTAPTPAQGSNSGAIATTAFVNLAVAAAINSLAGSSPSYAALTVVGASTLNTLVVSGTSQFGGTLDASGVLGKFQKISLSGQTATLSFDYTAASNYLKTIISPNSVQTNNLTSAVIVNGTASADNTTMSLTGVGNNTFDYVYIRGNSSKASTEPRLKVTGTTELENVRVTGSLSGVNVGVDGLDILPNSIITTTTAEIGSDLTVNGITTLGSTTIQDLGVISSLTVDGNSTLTGGFTAGSASSVAGNLSITGLLAVTGEATVSTNLTVTGNANLNNGGTGTTTVNNLNVLGTLTGVSIDVNGKDINPNSVLATTTIEAYGSLKGASLVITGEATAPKVTANFVGTTPGSVTTPSAGTTWTPGGTGSDLTKMNNIYNVDVTNTLTIGPWANLGSAFTATIYLFQDATGGHAVTLDASYKIINGGAISTTANSVTILQVTYCGRGTVYDVAVYQRP
Missed : 
K23PH08C2 cds 225, tail spike protein
MAIPTIPLQIWAESDVVLPNAHTANKISPIADLWDKGWDLGEKPACEELNYVLNMMTWWMSYISTEQIPGMAADYLRKDQNLSDVENKATARTNLEVYSKAEGDNRYVNVEGDTMTGPLTVPRITFPSDASDTAHITTTLGTDQTYLDFVIGDNPGVAGQPNVDIMRFRFVPVNNSATVSPFNMMELNATGTNTALLRVQGNITATGTMTTGTLASTTINNGGNIQTTSLGVSGTATLQNLVVNSNNATVGGRSIVRAVNSTAANANGDLYISIGVSDIRWSGEQNKVNVDFENYGSGGRFARGPDGSVLTGLIDAN

40it [00:15,  3.90it/s]

Missed : 
K23PH08C2 cds 228, tail spike protein
MATIELPVIKIKNLSDKTLVTGSDEIIIQSSTDTEKTSINKFITDIGLLKRSEVTGITGASVIGTHSGATVQQVLDSLSNSWQNLYYFSTTGNEIGTHVDTIALSPVQRTYGVQISTPLASFSPKVDNVLSNGTTSLRWSQVYAVNSVISTSNKKKKTNLRQITPTEAKAFYEIGKLDSVWQWLSKYSSEKGAARLHSGPTVQDAIKVMLKHGLDWTKYSAFCYDSWEANGDTPAGEEFAFRKEELLFWILRATIAVQEDLDKRLSALENSLSGN


56it [00:23,  2.17it/s]

Missed : 
K30lambda2_2 cds 20, Putative tail spike protein
MAITTRIIAQQVTALDGANSRVSKYPKFTVQLGYSVSSLAATELLDAATKSAASAAAAKTSETNAKASETASKNSQTAAKTSETNAAASAQLAQNLAGKTSLVTPLGVMTGSAEAKIASITIAANQSSSVHVLFALYATGNGANRDDIYNMEIVSLALPGPVTSVTADNIGSFLSHRVIGPANTNGFMVGLKSTIEGSNVTYDVYLKSRSSFRDPKMAFLSGSISVTPPTGPLVDGTAPAWKTTGFDTEVIYVNRAQVIDDGISLAKLKALRVTGDSGSEYLTLTARPTGGIISLNNRSIYLRPAGTEDTVASVEIAPNGNLVFPLTATKNHILSWQGGARIRANDTGTMVISGNNGSGQTTGFLAFRPNGDSATTTEIQMRSDGNLKQTAANVDEPNVLTRRDTVINLISSRAPEAGVTSEALADLDVNNTESGNDQWGRGVQLQQAGAGTKNTPGNTTGRLGTIATFRQSQYRLSQIFFDSNTGNGGTSLGVFVRSLRADSGTSPRPWFALYHEGNKPNIQSGIAGITIDGNGFVKKASPIAKLIAEIPSKEDSFFWTGVETVGGYVGCNAEAQGVFAVKTGLGRYTIKGSLGWNTEGWKFELPRDDNGNMLCFVESDWNEEEKELNIQVFTRKFDINTGNIIAGEPMEIPQGRWIDLRLEMPKVEIPEVEFPEDPEVE


62it [00:26,  2.65it/s]

Missed : 
K34PH164 cds 24, Putative tail spike protein
MANISDQLAADIHNAFSKYYTDLANQDQIFFGVGDVQITKQDGTTATIRSWNKVIGSVDTAAQRGQANTFTALQTFSAGINVSVGNINVMNDNSMVILGKNSDLALLKKSGQGGTIAVGSGTPFKIQRTNTATVSPASTVEDILTIGTDKKTTLAGALNTGGDVVANGFLYAQSIELSFGTPYIDFHFNYSTDDFTGRIIATAADQISVQRSHLRVDRDLRVFGMADIKGWAQCGVDLSANRTDFGSPAIGSLVSGGRIRSRMLGRGGNVDPSGAWGGFYVEEYVGTEHRIIMYMDGFGRTDAWSFRSGGTISTPKGDVLTTGSDVRLKTDFTQAPEKACERIERLGVCQYRMKGESRVRRGFIAQQADTVDKVYTYQDVEQEIDGERIKVMNVDYVAIIADLVASVQELRQELKELKGE


74it [00:32,  1.66it/s]

Missed : 
K41P2 cds 11, baseplate wedge subunit depolymerase
MIKAPSITSLRVDKLAANFVYLKWDDVGMDFYYVVEFARSRDMDGTIIPDEELIWTQLGYSYENEWFSDQVAPNTRYKFRIQTTHEGFDPSDWVISDELWTFDENAYAYTTMREFTPADSFINEKFAKNNRDYVDFNDDVIMASLMVEDFVYSPLYSDVSQISDKILKQESYHEIQDHIEHVCNDIDRTFLVYSNGLLYLFERFQNMAKVSNDKGQTWYYYKALNDRVGNPVSRTVSYQSTNTTYVLGYDRIFYGRTSTDIRWSSDEVRFSSDDVTFAKLGNQSGLDFDVDSYNTYARLPGGVSKYAEAIACSNEWLYVAAKNVMRRIALRNTPIDTDPGSPTFGERIFDEVSYTIVPGNDKIVVKKMDVLNDRLYVLVTGEVKKAMMDPTVKANVIPSNDAGVYLWDENAKTFTRVYGKTEDERFYITHEYTNMSTNGDEVYISVGNYKYPGTLPDPDLVEKYPEDVHSAVKYDLVTGYTASISINFATVRANQNDPTVWNFGPQEYYNEANFSWHFRDKVSTWITNDNRPLVVYPETLYTLVTDSASPASTIRVNHEVWDKGTVTIYLNNIKFTGFTKYTNGVLLYRSGGRIIGFYELSYRARDELTIFWKPDNTLMVASLVNQERENPYTPDIEPGLIDPDLSHMITRFAPQSYLDNQQFEKFGEYYLQYISLGSNSYYNKLLNLIRNKYPREKNNVEYLWSEINRRNIYLDKTKREAVVRFFESRASDFYSTKGIEASYTFLFKLLYNEDVSVEIESSNSLEYDILVSSTNISQDIVGRTIYTPTGRANVTYIEREYENGQLRWSMTLHNAQGNFIEGQVVKSEKTNFTGMVIRGVRGKQMANNSIDYINRGRSYYVMKIRSNLPTSRYKDDVLRFVHPVGFGFMGITMLTVFINSGLSMTHNETIIDILRNYRFDSGYPKFWPDRIASMDGNGN

86it [00:36,  3.24it/s]

Missed : 
K45PH128C2 cds 227, Putative tail spike protein
MADTTQFEQAVDQVIEDSERLHKVVNGSAIDTVIVEDGSTIPTLRKALLDNVYFKTPPQPWAAGTQTTVFNQLYSFTSSAGTFWWYAPGASPSTPVTLPADPSTSTAWKVYNDSVVVSEKFAPLNTPAFVGSPTAPTPAQGSNSGAIATTAFVNLAVAAAINSLTGSSPSYAALTVVGASTLNTLVVSGISQFGGTIDASGVLGKFQKISLSGQTATLSFDYTAASNYLKTIISPNSVQTNNLTSAVIVNGTASADNTTMSLTGVGNNTFDYVYIRGNSSKASTEPRLKVTGTTELENVRVTGSLSGVNVGVDGLDILPNSIVTTTTAEIGSDLTVNGVTTLGSATIQDLGVVSSLTVDGNSTLTGGFTAGSASSVAGNLSITGLLAVTGAVTVSTNLTVTGNANLNNGGTGTTTVNNLNVLGTLTGVSVDVNGKNINPNSVLATTTIEAYGSLKGASLVITGEATAPKVTANFVGTIPGSVTTPSAGTTWTPGGTGSDLTKMNNIYNVDVTNTLTIGPWANLGSAFTATIYLFQDATGGHAVTLDASYKIINGGTISTAANSVTILQVTYCGRGTVYDVAVYQRP
Missed : 
K45PH128C2 cds 232, Putative tail spike protein
MAIPTIPLQIWAESDVVLPNAHTANKISPIADLWDKGWDLGEKPACEELNYVLNMMTWWMTYISEEQIPGLSNDYLRKDQNLSDVSDIPTARSTLDVYSKGESDTRYVNVSGDTMTGALTVPRITFPSDSSDTAHITTTTGVDQVYLDFVIGDNVGTAGQPSVDVMRFRFVPVNNSDSVSPFNMMELNATSNGVALLKVQGNITATGTTTTGTVSSTTINNGGNIQTTSLGVGGTATLQNLVVNSNNATIGGRSVVRAVNSTAANANGDVSISIGVSDIRWSGEQNKVQVDFENYGS

87it [00:37,  3.95it/s]

Missed : 
K45PH128C2 cds 235, tail spike protein
MSSDLPVIKIRNLSDKVLVSDTDELIIQSSVDTEKTTISKFISDIGILKKGDIMDVSGASLVGTHSGSTVQQVLDSLSNKWQNLSNGYYFATSGNEVGTYVDTISISPVQRSYGVQISTSLSAFSPKRDNFISNGTTALRWNQVYAVNSVISTSNKKKKTNLRQISTAEIKAFYEIGKLDSVWQWLAKYSSEKGLARLHSGPTVQDAIKIMLKYGLDWTKYSAFCYDKWEADGDNPAGEEYAFRKEELLFWILRATIAVQEDLDKRLSVLEDSLSGN


94it [00:39,  3.27it/s]

Missed : 
K49PH164C2 cds 24, Putative tail spike protein
MATISDQLAADIHNAFSKYYTDLANQDQIFFGVGDVQITKQDGTTATVRSWNKVIGSVDTAAQRGTVNTFTALQTFSAGINVSVGNINVMNDNSMIILGKNSDLALLKKQGQGGTIAVGSGTPFKIQRASTATVAVSSAMEDIFVIGVDKQTTLPGALSAGGNIDNTSKGKVLTQAIELSMSTPYIDFHYNSSSADFTARLIQDQANRLTAQVASFWVQDGRVTASSTAPSNPASGAQLTGNPVRSMLRGRGAYGDVDGAYSQFYIEEQVGTEHRMVLYLDGYGRTDAWLFRAAGTISTPKGDVMTQGSDVRLKEGFTEAPASACERIERLGVCQYRMKGESRVRRGFIAQQADTVDKVYTYQGEEQEIEGEKFRVMNVDYVAIIADLVASVQELRQELKELKGE


95it [00:40,  2.07it/s]

Missed : 
K50PH164C1 cds 28, tail spike protein
MAIYRTGQASMDAQGYITGYGSKWREQLTLIRSGATIIFLSNPLQFGVITEVISDTSMRAVTTNKAVVPKVDYVIFLHDSITVDGLAQDVAETLRYYQGKETEFAHFIEFLKEFDFKKLEDLTNQTKQSAAAAKVSETNAKASEGKAKTSETNAKNSENAALSSKNAAKASQDAAKASETAALASKNAAAASQTAAKTSETNAKASENAAAASKNAAKTSETNAKASEEAAEASKNAAKNSENAAKASQTAAKTSETNAKNSENAAKASQAAAKTSETNAKNSENKAKEYADIAQGISSPMIQYNWPVGTGANERFVKIAKLTDPGSSESHVTLMITNGGNYGARQGSIDFLDASARSLGTTVINASNVRQFMQIRRLGDPSLAEDNQLRYGVVKGDGFFEIWAYQRAFINNVKVAILAKAGRVDLYIPSGYVSQEGAPDGWVKSEAIRVYDEVNKPSRSDLGLANVMEIGAFGLGGNGISYSDITSNADLMQRMKEKGGHFWRASQKSGSTSNIISHGSGVFSRCGDTNSAINIDYNSGKVVILAANDSSLAAGNVKVNTLYGTANKPSKSDVGLSNVTNDAQVKKAGDTMTGDLDISKGTPSIRLKSASGNAHLWFMNADGNERGVIWTPENSDSLGEVHIRAKTKGGTTGGDFIVRHDGRIEARDAKINYKIAARTADFANDDTNTGSTNLRVSGKQHTPVVLVRDADSNLSIGFKLNNMNQKLLGIDVDGDIAFGENADQRQNSKIVTRKMMDAGFSVAGLMNFTNGFAGTWEAENINDRTLDLNSLMIKKSNPGSIYVYQCISQDGGNNITNKPSGVTGNFILYVESIRKVSDTDFTNRQILFGTESNREFTRYCSNGTWSAWRESVVSGMNQDVSVKSMSASGRLSGGELAVGGAGALNGNLGVGGGTASKVPSSDKGIVIGRGAMVREGGEGRLILSASGGTDRQ

101it [00:42,  2.70it/s]

Missed : 
K54lambda1_1_1 cds 225, Putative tail spike protein
MADTTQFEQAVDQVIEDSERLHKVVNGSAIDTVIVEDGSTIPTLRKALLDNVYFKTPPQPWVAGTQATVFNQLYSFTNSAGTFWWYAPGASPSTPITLPADPSTSTAWKVYNDSVVVSEKFAPLNTPAFVGSPTAPTPAQGSNSGAIATTAFVNLAVAAAINSLAGSSPSYAALTVVGASTLNTLVVSGTSQFGGTLDASGVLGKFQKISLSGQTATLSFDYTAASNYLKTIISPNSVQTNNLTSAVIVNGTASADNTTMSLTGVGNNTFDYVYIRGNSSKASTEPRLKVTGTTELENVRVTGSLSGVNVGVDGLDILPNSIITTTTAEIGSDLTVNGITTLGSTTIQDLGVISSLTVDGNSTLTGGFTAGSASSVAGNLSITGLLAVTGEATVSTNLTVTGNANLNNGGTGTTTVNNLNVLGTLTGVSIDVNGKDINPNSVLATTTIEAYGSLKGASLVITGEATAPKVTANFVGTTPGSVTTPSAGTTWTPGGTGSDLTKMNNIYNVDVTNTLTIGPWANLGSAFTATIYLFQDATGGHAVTLDASYKIINGGAISTTANSVTILQVTYCGRGTVYDVAVYQRP
Missed : 
K54lambda1_1_1 cds 230, Putative tail spike protein
MAIPTIPLQIWAESDVVLPNAHTANKISPIADLWDKGWDLGEKPACEELNYVLNMMTWWMSYISTEQIPGMAADYLRKDQNLSDVENKATARTNLEVYSKAEGDNRYVNVEGDTMTGPLTVPRITFPSDASDTAHITTTLGTDQTYLDFVIGDNPGVAGQPNVDIMRFRFVPVNNSATVSPFNMMELNATGTNTALLRVQGNITATGTMTTGTLASTTINNGGNIQTTSLGVSGTATLQNLVVNSNNATVGGRSIVRAVNSTAANANGDLYISIGVSDIRWSGEQNKVN

102it [00:43,  3.39it/s]

Missed : 
K54lambda1_1_1 cds 233, tail spike protein
MATIELPVIKIKNLSDKTLVTGSDEIIIQSSTDTEKTSINKFITDIGLLKRSEVTGITGASVIGTHSGATVQQVLDSLSNSWQNLYYFSTTGNEIGTHVDTIALSPVQRTYGVQISTPLASFSPKVDNVLSNGTTSLRWSQVYAVNSVISTSNKKKKTNLRQITPTEAKAFYEIGKLDSVWQWLSKYSSEKGAARLHSGPTVQDAIKVMLKHGLDWTKYSAFCYDSWEANGDTPAGEEFAFRKEELLFWILRATISVQEDLDKRLSALENSLSGN


125it [00:54,  1.60it/s]

Missed : 
K65PH164 cds 12, baseplate wedge subunit depolymerase
MTIAPFVTSLRIHKLSANQVNIRWDDVGANFYYFVELAETRNRAGEVIPADNLSWSSLGYTADNDWFEQNRIEPLTYYKMRVQTTSAGFEPSEWVETEEFQTFEENAYTFEHMQEFSLVKEFIKQKFSLNNMSYVNFNTSAMMASLMTESFQFSPEYSHLSAIENFVVGESGYHEIQGPIEAVCVDKNRTMLGEIDGILYLFERFQHMVKVSNDKGQNWQYVQLFNDRVGNPVSRVVIYQSKTTSYVLGYDKIFYGRKSSDVRWSSNEVKFSDNEVTFAKLGDQLKLGFEVELFGTYASLPADVTKYAEAFTCNDDYLYVVAKDTVRKVKLKDAPIDTDPLSPTFGEKVFEKEASHITGNPKSVCFKMDSVGGKIFALITGEVKTLGLDPTDPRNVVDSATKGVYVYQEGTNTWKRVFGNTDEEKRRIEHLWTSMSTDGKEIFFSSANFKTTEYAQDIELETKYPELISTAVKNVNPIQYHSDKHYHMMSFRADEFSRWETFVPGPMRFYAEPWFVWMAREGNRCWISTADHAVVIYNDILYQKRVDAAAQGTTERILSEVWDKGDATFYCPPVSFNGFLQYASGIMFHEPDGKLIGYYAFDYRVRDQVTLNWKPTDVMFKAFLQNQTREEDWTPEHTPGLRDPDLRPYLTKMMPDSYLLQDSNFEHFCKYYLQFLSDGNGTHYNSLVNLVKNKYPREENAWEYLWSEVYKRNIYLSKDARDAVVRFFEARKNDFYATKGIEDSYKFLFKLLYNEDVEIDIESKNTTEYDIIVESTNISDDLVGRTIYTASGRSNVTYIEREYRDGHLLWRITIHNLSGRFIEGQEIKSERTDFEGIIVQGVRGKDMLSNNIDYINRSRSYYVMKIKSQLPTSRFRDDVLRFVHPVGFGFIGITLLTMFINSGLNMKHVETIINKLKNYKWDAGLPSVYPDRVAII

127it [00:55,  1.87it/s]

Missed : 
K65PH164 cds 197, Putative tail spike protein
MADQNLKQIQFKRTSTENKAPGADIVARGEIALNTHGRTLAIYTKDEADNVVQLAGKGVPFLDTSGTLSVDGTTTLKDNVTISPNKAINFETTDLSGAIVRHIVGKCATNDGWYIGAGGTSNSGILEIGTIDDGAETIQFVQRGAGNVEARKLVLLDGSGNTTLPGDLRLSTNKTVKINSGSTLVLEMGVGSNDVYIKNRRGVGVLQLTNDSNLTFRNSQVYYAMDGRGPGKSGTLLTNVENNRQAWQYTISAATASTARWVKVATIKHPGMSSSQLDLMISGGIDSGHGRHYVDFITLSGRNLTSWSTNSLDNWVEWRRVGSPSKTNVPEYYVVKNDSATDADASFDFYAKIPRYGNGLYVTVLNTAGYNGQDSGTVIIYETNQDTGDTGPSGSILVSMKQIFDSLAKPDFGDTTGTLPVNRGGTGATNVGDARNNLGLRTAAVRDVGESNGNLMEVGAFGIGGNGKSLVDITSDVDLMTRLKALGGTVFRANTASGYTGAPYYSHGTGFYGRASDTMAALNIDYATGNVRVFAINDSGLASGRVNSNVLYGTANKPSKADVGLGNLTNDTQVKKAGDTMTGDLAAPNLHASGTGTASVYVNAGSGNAHVWFRTDANERGVIWATPNTANLGQINIRAKTTGGTSAGDFSFRSDGRLGVPVAVKVGGAAMLTKDGNITSGSMFGGNLNNYLNSIKNDIVSGDNKQVSKTGDTMTGNLTINANLKVENPNGTMADFGSENSDKYSRITLARKIGSGAAVAMLKITPEGYVQFGYQDAVANPSPTKYIRVKPDGLDVEGDLVFNQTYCGTEEAVDISNKTIDLNNLVIKRTDPGTRQLYKCVSSGGGSKIANKPTSDGNFVLEVLSLRKVSDNDWTCKQTFTTKNNTTVGTYVRYCQNGSWTAWEEVVSGVQPINLGGTGATSVAAARNNLGVGEGQTVTFGNLV

140it [01:01,  2.50it/s]

Missed : 
K7PH164C4 cds 20, Putative tail spike protein
MAITTRIIAQQVTALDGANSRVSKYPKFTVQLGYSVSSLAATELLDAATRSAASAAAAKTSETNAKASETASKNSQTAAKTSETNAAASAQLAQNVAGKASLVTPLGVMTGSAEAKIASITIASNQSSSVHLLFALYATGNGANRDDIYNMEIVSLALPGPVTSVTADNIGSFLSHRVIGPANTNGFMVGLKSTIEGSNVTYDVYLKSRSSFRDPKMAFLSGSISVTPPTGPLVDGTAPAWRTTGFDTDVIYVNRAQVIDDGISLARIKQLAITNGKTDSTILLLSYLNEIGILSTNKKSISLRPGGTSDSSIAATEFLPNGNIILPNGDTGNQTISWLGGPRIRVNSNGSFVLSTNNPSNQTSGFITFRPQGDQVTSTELQIRDDGNIKQTAPQSSAGNALIRQDAAIQHIMDKAPAAGITTNPLSDLNVIPTPEGTDPWGADGVRVFQSGVSTKNTPDGTTGRLGTILNVRHTQYRIMQFFMQSNATAPILHIRSLRADQGNTPPAWFKVYTEYSKPNIQSDIAGITIDGNGFVKKASPIAKLIAEIPSKEDSFFWTGVETVGGYVGCNAEAQGVFAVKTGLGKYTIKGSLGWNTEGWKFELPRDDNGNMLCFVESDWNEEEKELNIQVFTRKFDINTGNIIAGEPMEIPQGRWIDLRLEMPKVEIPEVELPEDPEV


147it [01:04,  2.28it/s]


In [19]:
# Display the (protein id, {token label: count}) tuples kept for positive predictions.
tmp_results

[('K10PH82C1_cds_45', {0.0: 761, 2.0: 263}),
 ('K10PH82C1_cds_50', {0.0: 486, 1.0: 538}),
 ('K10PH82C1_cds_51', {0.0: 681, 1.0: 343}),
 ('K11PH164C1_cds_39', {0.0: 720, 2.0: 304}),
 ('K11PH164C1_cds_45', {0.0: 699, 1.0: 325}),
 ('K11PH164C1_cds_46', {0.0: 602, 1.0: 422}),
 ('K12P1_1_cds_43', {0.0: 770, 2.0: 254}),
 ('K13PH07C1L_cds_10', {0.0: 640, 1.0: 384}),
 ('K13PH07C1L_cds_11', {0.0: 852, 2.0: 72, 1.0: 100}),
 ('K13PH07C1L_cds_12', {1.0: 214, 0.0: 810}),
 ('K13PH07C1L_cds_54', {0.0: 763, 2.0: 261}),
 ('K13PH07C1S_cds_10', {0.0: 640, 1.0: 384}),
 ('K13PH07C1S_cds_11', {0.0: 675, 1.0: 349}),
 ('K13PH07C1S_cds_53', {0.0: 762, 2.0: 262}),
 ('K14PH164C1_cds_24', {0.0: 509, 1.0: 515}),
 ('K15PH90_cds_49', {0.0: 787, 2.0: 237}),
 ('K15PH90_cds_54', {0.0: 906, 2.0: 118}),
 ('K15PH90_cds_55', {1.0: 543, 0.0: 481}),
 ('K16PH164C3_cds_43', {0.0: 703, 2.0: 321}),
 ('K16PH164C3_cds_48', {0.0: 533, 1.0: 491}),
 ('K17alfa61_cds_23', {0.0: 611, 1.0: 413}),
 ('K17alfa62_cds_64', {0.0: 609, 1.0: 415

In [20]:
# Keep only the protein ids, doubling the underscore in front of "cds".
results_77 = [prot_id.replace("_cds", "__cds") for prot_id, _label_counts in tmp_results]

In [21]:
# Display the reformatted protein ids ("_cds" rewritten as "__cds").
results_77

['K10PH82C1__cds_45',
 'K10PH82C1__cds_50',
 'K10PH82C1__cds_51',
 'K11PH164C1__cds_39',
 'K11PH164C1__cds_45',
 'K11PH164C1__cds_46',
 'K12P1_1__cds_43',
 'K13PH07C1L__cds_10',
 'K13PH07C1L__cds_11',
 'K13PH07C1L__cds_12',
 'K13PH07C1L__cds_54',
 'K13PH07C1S__cds_10',
 'K13PH07C1S__cds_11',
 'K13PH07C1S__cds_53',
 'K14PH164C1__cds_24',
 'K15PH90__cds_49',
 'K15PH90__cds_54',
 'K15PH90__cds_55',
 'K16PH164C3__cds_43',
 'K16PH164C3__cds_48',
 'K17alfa61__cds_23',
 'K17alfa62__cds_64',
 'K17alfa62__cds_66',
 'K18PH07C1__cds_243',
 'K18PH07C1__cds_245',
 'K19PH14C4P1__cds_43',
 'K1PH164C1__cds_8',
 'K1PH164C1__cds_53',
 'K2064PH2__cds_25',
 'K2069PH1__cds_25',
 'K22PH164C1__cds_10',
 'K22PH164C1__cds_11',
 'K22PH164C1__cds_50',
 'K23PH08C2__cds_233',
 'K24PH164C1__cds_8',
 'K24PH164C1__cds_55',
 'K25PH129C1__cds_56',
 'K25PH129C1__cds_60',
 'K26PH128C1__cds_44',
 'K26PH128C1__cds_49',
 'K26PH128C1__cds_50',
 'K27PH129C1__cds_43',
 'K27PH129C1__cds_48',
 'K28PH129__cds_24',
 'K29PH164C1__c

In [19]:
from Bio import SeqIO
from tqdm import tqdm 
from collections import Counter

path_out = "/media/concha-eloko/Linux/77_strains_phage_project"

prediction_results = {}
path_fasta = f"{path_out}/all_dpos.77_phages.multi.fasta"
fastas = SeqIO.parse(path_fasta, "fasta")
tmp_results = []
for record in tqdm(fastas):
    # Records shorter than 200 residues are not scored at all.
    if len(record.seq) < 200:
        continue

    protein_seq = record.seq
    prediction, sequence_outputs = predict_sequence(model_classifier, str(protein_seq))

    # Cut the description at the first comma and replace spaces by underscores;
    # comma-free descriptions are used unchanged.
    description = record.description
    if "," in description:
        prot_id = description.split(",")[0].replace(" ", "_")
    else:
        prot_id = description

    # Only positive predictions are kept, with the token count of each label.
    if prediction[0] == 1:
        tmp_results.append((prot_id, dict(Counter(sequence_outputs))))

132it [02:03,  1.07it/s]


In [20]:
# Display the (protein id, {token label: count}) tuples kept for positive predictions.
tmp_results

[('K10PH82C1_cds_50', {0.0: 501, 1.0: 523}),
 ('K10PH82C1_cds_51', {0.0: 685, 1.0: 339}),
 ('K11PH164C1_cds_45', {0.0: 694, 1.0: 330}),
 ('K11PH164C1_cds_46', {0.0: 595, 1.0: 429}),
 ('K13PH07C1L_cds_10', {0.0: 626, 1.0: 398}),
 ('K13PH07C1L_cds_11', {0.0: 917, 1.0: 107}),
 ('K13PH07C1L_cds_12', {0.0: 810, 1.0: 214}),
 ('K13PH07C1S_cds_10', {0.0: 626, 1.0: 398}),
 ('K13PH07C1S_cds_11', {0.0: 648, 1.0: 376}),
 ('K14PH164C1_cds_24', {0.0: 534, 1.0: 490}),
 ('K15PH90_cds_55', {1.0: 543, 0.0: 481}),
 ('K16PH164C3_cds_48', {0.0: 534, 1.0: 490}),
 ('K17alfa61_cds_23', {0.0: 847, 1.0: 177}),
 ('K17alfa62_cds_64', {0.0: 603, 1.0: 421}),
 ('K17alfa62_cds_66', {0.0: 614, 1.0: 410}),
 ('K18PH07C1_cds_243', {0.0: 701, 1.0: 323}),
 ('K18PH07C1_cds_245', {0.0: 716, 1.0: 308}),
 ('K1PH164C1_cds_8', {0.0: 630, 1.0: 394}),
 ('K21lambda1_cds_28', {0.0: 815, 2.0: 209}),
 ('K22PH164C1_cds_10', {0.0: 661, 1.0: 363}),
 ('K22PH164C1_cds_11', {0.0: 688, 1.0: 336}),
 ('K23PH08C2_cds_233', {0.0: 677, 1.0: 347})

In [21]:
# Token label -> fold name. Label 0.0 has no entry and is therefore never reported as a fold.
folds_label = {1.0 : "right-handed beta-helix", 2.0 : "6-bladed beta-propeller", 3.0 : "triple-helix"}
fold_dpoes = {}

for protein_id, label_counts in tmp_results :
    # Restrict to labels that name a fold, then keep the one covering the most tokens.
    # The previous loop took whichever fold label came first in the dict, so a protein
    # with {2.0: 72, 1.0: 100} was reported as label 2.0 despite label 1.0 dominating.
    fold_counts = {label: count for label, count in label_counts.items() if label in folds_label}
    if not fold_counts :
        continue  # no fold label at all: the protein is left out, as before
    # max() returns the first-seen label on ties, matching the old first-match order.
    dominant_label = max(fold_counts, key=fold_counts.get)
    fold_dpoes[protein_id] = folds_label[dominant_label]
fold_dpoes

{'K10PH82C1_cds_50': 'right-handed beta-helix',
 'K10PH82C1_cds_51': 'right-handed beta-helix',
 'K11PH164C1_cds_45': 'right-handed beta-helix',
 'K11PH164C1_cds_46': 'right-handed beta-helix',
 'K13PH07C1L_cds_10': 'right-handed beta-helix',
 'K13PH07C1L_cds_11': 'right-handed beta-helix',
 'K13PH07C1L_cds_12': 'right-handed beta-helix',
 'K13PH07C1S_cds_10': 'right-handed beta-helix',
 'K13PH07C1S_cds_11': 'right-handed beta-helix',
 'K14PH164C1_cds_24': 'right-handed beta-helix',
 'K15PH90_cds_55': 'right-handed beta-helix',
 'K16PH164C3_cds_48': 'right-handed beta-helix',
 'K17alfa61_cds_23': 'right-handed beta-helix',
 'K17alfa62_cds_64': 'right-handed beta-helix',
 'K17alfa62_cds_66': 'right-handed beta-helix',
 'K18PH07C1_cds_243': 'right-handed beta-helix',
 'K18PH07C1_cds_245': 'right-handed beta-helix',
 'K1PH164C1_cds_8': 'right-handed beta-helix',
 'K21lambda1_cds_28': '6-bladed beta-propeller',
 'K22PH164C1_cds_10': 'right-handed beta-helix',
 'K22PH164C1_cds_11': 'right-h

In [22]:
# Write the protein -> fold mapping as a two-column, tab-separated table with a header row.
with open("/media/concha-eloko/Linux/PPT_clean/in_vitro/Celia/dpos_folds.celia.tsv", "w") as outfile :
    outfile.write("protein_id\tFold\n")
    outfile.writelines(
        f"{protein}\t{fold}\n" for protein, fold in fold_dpoes.items()
    )

> Save / Open predictions

In [None]:
import os
from tqdm import tqdm
from Bio import SeqIO

path_bea = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Bea"

# The commented-out lines below show how the predictions file read further down was written.
#dpos = set([prot_id[1] for file in prediction_results for prot_id in prediction_results[file]])

#with open("/media/concha-eloko/Linux/PPT_clean/in_vitro/Bea/DepoScope_predictions.tsv", "w") as outfile : 
#    for dpo in dpos :
#        outfile.write(dpo + "\n")

# Read the saved predictions back, one entry per line. The context manager closes the
# file handle, which the bare open(...).read() never did.
# NOTE: split("\n") yields a final empty string when the file ends with a newline.
with open("/media/concha-eloko/Linux/PPT_clean/in_vitro/Bea/DepoScope_predictions.tsv") as infile :
    dpos = infile.read().split("\n")