In [1]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from torch import nn 
import torch.nn.functional as F

import os
import numpy as np
import tqdm
import matplotlib.pyplot as plt
from DepoScope_functions import Dpo_classifier , find_longest_non_zero_suite_with_n_zeros , predict_sequence, plot_token

import warnings
warnings.filterwarnings("ignore") 

# Root folder that holds both the fine-tuned ESM-2 checkpoint and the classifier weights.
path_work = "/media/concha-eloko/Linux/depolymerase_building"

# Classifier weights file (loaded later with torch.load) and the checkpoint
# directory of the fine-tuned ESM-2 model, both located under path_work.
DpoDetection_path = f"{path_work}/DepoDetection.T12.4Labels.1908.model"
esm2_model_path = f"{path_work}/esm2_t12_35M_UR50D-finetuned-depolymerase.labels_4/checkpoint-6015"

# Load the token-classification model and its tokenizer from the same checkpoint.
esm2_finetuned = AutoModelForTokenClassification.from_pretrained(esm2_model_path)
tokenizer = AutoTokenizer.from_pretrained(esm2_model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Wrap the fine-tuned ESM-2 model in the Dpo_classifier and restore its trained weights.
model_classifier = Dpo_classifier(esm2_finetuned)

# map_location="cpu" lets a checkpoint that was saved from a GPU be read on a
# CPU-only machine; load_state_dict then copies the tensors into the module's
# existing parameters, so the final placement of the model is unchanged.
state_dict = torch.load(DpoDetection_path, map_location="cpu")

# strict=False tolerates key mismatches between checkpoint and model. Report them
# explicitly: a silently skipped key would leave that layer with its initial weights.
load_report = model_classifier.load_state_dict(state_dict, strict = False)
if load_report.missing_keys or load_report.unexpected_keys :
    print(
        "WARNING: checkpoint/model key mismatch - "
        f"missing: {list(load_report.missing_keys)}, "
        f"unexpected: {list(load_report.unexpected_keys)}"
    )

# Switch to inference mode; as the last expression this also displays the model summary.
model_classifier.eval()


Dpo_classifier(
  (pretrained_model): EsmForTokenClassification(
    (esm): EsmModel(
      (embeddings): EsmEmbeddings(
        (word_embeddings): Embedding(33, 480, padding_idx=1)
        (dropout): Dropout(p=0.0, inplace=False)
        (position_embeddings): Embedding(1026, 480, padding_idx=1)
      )
      (encoder): EsmEncoder(
        (layer): ModuleList(
          (0-11): 12 x EsmLayer(
            (attention): EsmAttention(
              (self): EsmSelfAttention(
                (query): Linear(in_features=480, out_features=480, bias=True)
                (key): Linear(in_features=480, out_features=480, bias=True)
                (value): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (rotary_embeddings): RotaryEmbedding()
              )
              (output): EsmSelfOutput(
                (dense): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inpla

In [4]:
path_out = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Others"
from Bio import SeqIO
from tqdm import tqdm


# For every file of path_out whose name contains "sequence", run predict_sequence
# on each protein of at least 200 residues and keep the records for which
# prediction[0] == 1. Results are stored per phage in `predictions` as a list of
# (prediction, record description) tuples.
#with open(f"{path_out}/Others.multi.fasta", "w") as outfile :
predictions = {}
for multi_faa in tqdm(os.listdir(path_out)) :
    if "sequence" not in multi_faa :
        continue
    tmp_results = []
    # Description of the last record read from THIS file. Tracking it explicitly
    # avoids reading a stale `record` left over from a previous file (or an
    # undefined one) when the current file contains no record at all.
    last_description = None
    for record in SeqIO.parse(f"{path_out}/{multi_faa}" , "fasta") :
        last_description = record.description
        protein_seq = str(record.seq)
        if len(protein_seq) < 200 :
            continue
        prediction, sequence_outputs = predict_sequence(model_classifier, protein_seq)
        if prediction[0] == 1 :
            tmp_results.append((prediction , record.description))
            #name = record.description.split("|")[1].split(" [")[0]
            #outfile.write(f">{name}\n{record.seq}\n")
    if last_description is None :
        # Empty file: there is no header to derive a phage name from.
        tqdm.write(f"Skipping {multi_faa}: no FASTA record found")
        continue
    # The phage name is the part of the second "|"-separated header field that precedes "_prot".
    phage = last_description.split("|")[1].split("_prot")[0]
    predictions[phage] = tmp_results


100%|███████████████████████████████████████████| 32/32 [03:19<00:00,  6.23s/it]


In [8]:
path_out = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Others_2nd"
from Bio import SeqIO
from tqdm import tqdm


# For every file of path_out, run predict_sequence on each protein of at least
# 200 residues and keep the records for which prediction[0] == 1, stored in
# `predictions` as a list of (prediction, record description) tuples.
# NOTE(review): the rsync cells reference Others_2nd.multi.fasta inside this same
# directory; if present it is parsed as an input too - confirm that is intended.
#with open(f"{path_out}/Others_2nd.raw.multi.fasta", "w") as outfile :
predictions = {}
for multi_faa in tqdm(os.listdir(path_out)) :
    fasta_path = f"{path_out}/{multi_faa}"
    # os.listdir also returns sub-directories (e.g. a folder synced back into
    # path_out); SeqIO.parse cannot open those, so only regular files are read.
    if not os.path.isfile(fasta_path) :
        continue
    tmp_results = []
    # Description of the last record read from THIS file. Tracking it explicitly
    # avoids reading a stale `record` left over from a previous file (or an
    # undefined one) when the current file contains no record at all.
    last_description = None
    for record in SeqIO.parse(fasta_path , "fasta") :
        last_description = record.description
        protein_seq = str(record.seq)
        if len(protein_seq) < 200 :
            continue
        prediction, sequence_outputs = predict_sequence(model_classifier, protein_seq)
        if prediction[0] == 1 :
            tmp_results.append((prediction , record.description))
            #name = record.description#.split("|")[1].split(" [")[0]
            #outfile.write(f">{name}\n{record.seq}\n")
    if last_description is None :
        # Empty file: there is no header to use as a key.
        tqdm.write(f"Skipping {multi_faa}: no FASTA record found")
        continue
    # NOTE(review): the key is the full description of the LAST record of the file,
    # not a parsed phage identifier - confirm this is the intended grouping.
    phage = last_description#.split("|")[1].split("_prot")[0]
    predictions[phage] = tmp_results

100%|███████████████████████████████████████████| 13/13 [05:17<00:00, 24.45s/it]


In [9]:
import pprint

def clean_print(dico) :
    """
    Pretty-print a dictionary on standard output.

    Inputs : dico, the dictionary to display
    Outputs : None ; the formatted text (200 characters wide, keys sorted,
              compact layout) is written to stdout as a side effect
    """
    pprint.pprint(dico, width = 200, sort_dicts = True, compact = True)
    return None
    
# Show the collected predictions, pretty-printed with their keys sorted.
clean_print(predictions)

{'MT197175.1_prot_QIW86415.1_45': [((1.0, 1.0), 'MZ322895.1_prot_QWY13631.1_35'), ((1.0, 0.9999998807907104), 'ON146449.1_prot_UPW35138.1_1'), ((1.0, 1.0), 'ON146449.1_prot_UPW35150.1_13'),
                                   ((1.0, 1.0), 'MZ571831.1_prot_UEW68236.1_80'), ((1.0, 1.0), 'MZ612130.1_prot_QYC51043.1_10'), ((1.0, 1.0), 'MT197175.1_prot_QIW86415.1_45'),
                                   ((1.0, 1.0), 'MT197175.1_prot_QIW86419.1_49'), ((1.0, 1.0), 'MT197175.1_prot_QIW86428.1_58'), ((1.0, 1.0), 'MZ571832.1_prot_UEP19662.1_17'),
                                   ((1.0, 1.0), 'MZ571832.1_prot_UEP19667.1_22'), ((1.0, 1.0), 'MZ571834.1_prot_UEP19705.1_4'), ((1.0, 0.9998761415481567), 'ON881905.1_prot_UTN90143.1_15'),
                                   ((1.0, 1.0), 'MT197176.1_prot_QJI52618.1_44'), ((1.0, 1.0), 'MT197176.1_prot_QJI52623.1_49'), ((1.0, 1.0), 'MT197176.1_prot_QJI52632.1_58'),
                                   ((1.0, 1.0), 'MZ571827.1_prot_UEW68146.1_56'), ((1.0, 1.0

In [5]:
# Number of keys currently stored in the predictions dictionary.
len(predictions)

12

In [None]:
# Shell commands (not Python). NOTE(review): presumably run by hand from a terminal,
# one at a time - confirm.
# Upload the local Others.multi.fasta to the others_esmfold directory on the remote host.
rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/in_vitro/Others/Others.multi.fasta \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/others_esmfold


# Copy the remote others_esmfold directory back into the local Others folder.
# The source path has no trailing slash, so the directory itself is created inside the destination.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/others_esmfold \
/media/concha-eloko/Linux/PPT_clean/in_vitro/Others 

In [None]:
# Shell commands (not Python). NOTE(review): presumably run by hand from a terminal,
# one at a time - confirm.
# Upload the local Others_2nd.multi.fasta to the others_esmfold directory on the remote host.
# NOTE(review): the commented-out writer in the Others_2nd prediction cell names the
# file Others_2nd.raw.multi.fasta - confirm which file is the one uploaded here.
rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/in_vitro/Others_2nd/Others_2nd.multi.fasta \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/others_esmfold


# Copy the remote others_esmfold directory back into the local Others_2nd folder.
# The source path has no trailing slash, so the directory itself is created inside the destination.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/others_esmfold \
/media/concha-eloko/Linux/PPT_clean/in_vitro/Others_2nd 