In [2]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from torch import nn 
import torch.nn.functional as F

import os
import numpy as np
import tqdm
import matplotlib.pyplot as plt
from DepoScope_functions import Dpo_classifier , find_longest_non_zero_suite_with_n_zeros , predict_sequence, plot_token

import warnings
warnings.filterwarnings("ignore") 

# Root directory that holds the model artefacts used by this notebook.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path_work = "/media/concha-eloko/Linux/depolymerase_building"

# Directory passed to `from_pretrained` below for both the tokenizer and the
# token-classification model.
esm2_model_path = f"{path_work}/esm2_t12_35M_UR50D-finetuned-depolymerase.labels_4/checkpoint-6015"
# File later read with `torch.load` and applied to the `Dpo_classifier` instance.
DpoDetection_path = f"{path_work}/DepoDetection.T12.4Labels.1908.model"

# Both objects are restored from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(esm2_model_path)
esm2_finetuned = AutoModelForTokenClassification.from_pretrained(esm2_model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the classifier around the ESM-2 token-classification model loaded above.
model_classifier = Dpo_classifier(esm2_finetuned)

# map_location="cpu" lets a checkpoint that was saved from a CUDA session be read
# on a CPU-only machine; load_state_dict then copies the values into the model's
# existing parameters, wherever those live.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
load_report = model_classifier.load_state_dict(
    torch.load(DpoDetection_path, map_location="cpu"),
    strict=False,
)

# strict=False tolerates key mismatches without raising, so surface them here:
# a missing key means that parameter kept its initial (unloaded) value.
# print() is used because this notebook silences warnings globally.
if load_report.missing_keys or load_report.unexpected_keys:
    print(f"State dict mismatch - missing keys: {load_report.missing_keys}")
    print(f"State dict mismatch - unexpected keys: {load_report.unexpected_keys}")

# Switch to inference mode; as the last expression this also displays the model.
model_classifier.eval()


Dpo_classifier(
  (pretrained_model): EsmForTokenClassification(
    (esm): EsmModel(
      (embeddings): EsmEmbeddings(
        (word_embeddings): Embedding(33, 480, padding_idx=1)
        (dropout): Dropout(p=0.0, inplace=False)
        (position_embeddings): Embedding(1026, 480, padding_idx=1)
      )
      (encoder): EsmEncoder(
        (layer): ModuleList(
          (0-11): 12 x EsmLayer(
            (attention): EsmAttention(
              (self): EsmSelfAttention(
                (query): Linear(in_features=480, out_features=480, bias=True)
                (key): Linear(in_features=480, out_features=480, bias=True)
                (value): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
                (rotary_embeddings): RotaryEmbedding()
              )
              (output): EsmSelfOutput(
                (dense): Linear(in_features=480, out_features=480, bias=True)
                (dropout): Dropout(p=0.0, inpla

In [4]:
path_out = "/media/concha-eloko/Linux/PPT_clean/in_vitro/Others"
from Bio import SeqIO
from tqdm import tqdm

# Proteins with fewer residues than this are skipped without being scored.
MIN_PROTEIN_LENGTH = 200

# Maps a phage identifier (parsed from the FASTA headers of one file) to the list
# of (prediction, record.description) tuples for the proteins of that file whose
# prediction[0] equals 1 (presumably the positive class — confirm against
# predict_sequence). A file without any such protein still gets an empty list.
predictions = {}

# Export of the retained proteins is currently disabled. To re-enable it, wrap the
# loop in
#     with open(f"{path_out}/Others.multi.fasta", "w") as outfile :
# and, for each retained record, write
#     name = record.description.split("|")[1].split(" [")[0]
#     outfile.write(f">{name}\n{record.seq}\n")

# os.listdir returns entries in arbitrary order; sorting makes the run reproducible.
for multi_faa in tqdm(sorted(os.listdir(path_out))):
    # Only files whose name contains "sequence" are treated as multi-FASTA inputs.
    if "sequence" not in multi_faa:
        continue

    tmp_results = []
    # Description of the last record read from THIS file. It stays None for an
    # empty file, so a record left over from a previous file is never reused.
    last_description = None

    for record in SeqIO.parse(f"{path_out}/{multi_faa}", "fasta"):
        last_description = record.description
        protein_seq = str(record.seq)
        if len(protein_seq) < MIN_PROTEIN_LENGTH:
            continue

        prediction, sequence_outputs = predict_sequence(model_classifier, protein_seq)
        if prediction[0] == 1:
            tmp_results.append((prediction, record.description))

    if last_description is None:
        # No record in this file: there is no header to derive a phage name from.
        continue

    # The phage identifier is the text between the first "|" and "_prot" in the
    # header, e.g. "lcl|AB716666.1_prot_BAP15736.1_24 ..." -> "AB716666.1".
    phage = last_description.split("|")[1].split("_prot")[0]
    predictions[phage] = tmp_results


100%|███████████████████████████████████████████| 32/32 [03:19<00:00,  6.23s/it]


In [12]:
import pprint

def clean_print(dico) :
    """Pretty-print an object (typically a dictionary) to standard output.

    The output is laid out on lines of up to 200 columns, dictionary keys are
    sorted, and sequences are packed compactly onto each line.

    Parameters
    ----------
    dico : object
        The value to display.

    Returns
    -------
    None
        The formatted text is printed, not returned.
    """
    pprint.pprint(dico, width=200, compact=True, sort_dicts=True)
    return None
    
# Display the hits collected per phage; clean_print prints to stdout and returns None.
clean_print(predictions)

{'AB716666.1': [((1.0, 1.0), 'lcl|AB716666.1_prot_BAP15736.1_24 [protein=putative tail tubular protein B] [protein_id=BAP15736.1] [location=26714..29086] [gbkey=CDS]'),
                ((1.0, 1.0), 'lcl|AB716666.1_prot_BAP15746.1_34 [protein=hypothetical protein] [protein_id=BAP15746.1] [location=41063..43018] [gbkey=CDS]')],
 'AB897757.1': [((1.0, 1.0), 'lcl|AB897757.1_prot_BAQ02789.1_9 [protein=hypothetical protein] [protein_id=BAQ02789.1] [location=complement(40780..41895)] [gbkey=CDS]'),
                ((1.0, 1.0), 'lcl|AB897757.1_prot_BAQ02835.1_55 [protein=tail spike protein] [protein_id=BAQ02835.1] [location=complement(321593..323803)] [gbkey=CDS]'),
                ((1.0, 1.0), 'lcl|AB897757.1_prot_BAQ02836.1_56 [protein=tailspike protein] [protein_id=BAQ02836.1] [location=complement(323855..325810)] [gbkey=CDS]'),
                ((1.0, 1.0), 'lcl|AB897757.1_prot_BAQ02837.1_57 [protein=putative tail fiber protein] [protein_id=BAQ02837.1] [location=complement(325887..327995)] 

In [11]:
# Number of phage keys in `predictions`. A key is stored for every processed file,
# including files whose hit list is empty, so this is not a count of phages with hits.
len(predictions)

15

In [None]:
# NOTE(review): these are shell commands, not Python — run them in a terminal
# (or a shell cell); executing this cell as Python would fail.
# rsync flags: -a archive, -v verbose, -z compress, -h human-readable, -e ssh transport.

# Upload the multi-FASTA file to the remote `others_esmfold` location.
# NOTE(review): Others.multi.fasta must already exist; the code that would write it
# appears only as commented-out lines in the prediction cell — confirm its origin.
rsync -avzhe ssh \
/media/concha-eloko/Linux/PPT_clean/in_vitro/Others/Others.multi.fasta \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/others_esmfold


# Download the remote `others_esmfold` path back into the local Others directory.
# Presumably run only after the remote processing has finished — TODO confirm.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/others_esmfold \
/media/concha-eloko/Linux/PPT_clean/in_vitro/Others 