# Identify the proteins with a β-helix that were missed by our methods
***

In [None]:
import os
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor
from multiprocessing.pool import ThreadPool

import pandas as pd
import torch
from Bio import SeqIO
from tqdm import tqdm

path_fasta = f"/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/part_III_ptA/input_db/all_prophage_proteins.db.fasta"
path_current = f"/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_model = f"/home/conchae/PhageDepo_pdb/script_files/esm2_t30_150M_UR50D-finetuned-depolymerase/checkpoint-198"
path_work = f"/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_task = f"{path_work}/Rafa_task"
path_labels = f"/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"


df_labels = pd.read_csv(f"{path_labels}/prophage_data.clusters_80.phageboost_70.2504.tsv", sep = "\t" , skiprows=1)
df_labels.columns = ["Prophage_name","KL_type","Infected_ancestor","n_clades","siblings","n_ancestors","n_KL_swaps","old_KL_types","all_old_KL_types"]

df_current = pd.read_csv(f"{path_current}/DF_Dpo.final.1005.tsv", sep = "\t", header = 0)
fasta_seqs = SeqIO.parse(path_fasta , "fasta")

dico_seq = defaultdict(list)
for record in fasta_seqs:
    tmp_prot_name = record.id
    sequence = str(record.seq)
    dico_seq[sequence].append(tmp_prot_name)
        
seq_set = set(df_current["seq"])

# Load the model : 
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(path_model)
model = AutoModelForTokenClassification.from_pretrained(path_model)

def model_out(sequence) :
    input_ids = tokenizer.encode(sequence, return_tensors='pt', truncation= True, max_length = 1024)
    outputs = model(input_ids)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    labels = model.config.id2label
    tokens = []
    for token_id, token_probs in zip(input_ids[0], probs[0]):
        top_label_id = token_probs.argmax().item()
        tokens.append(int(labels[top_label_id].split("_")[1]))
    return tokens

def longest_run_of_ones(tokens):
    str_lst = ''.join(map(str, tokens))
    runs = list(map(len, str_lst.split('0')))
    longest_run = max(runs)
    start_pos = runs.index(longest_run)
    end_pos = start_pos + longest_run - 1
    return longest_run, start_pos, end_pos


def beta_helix_assess(sequence):
    tokens = model_out(sequence)
    longest_run, start_pos, end_pos = longest_run_of_ones(tokens)
    if int(longest_run) > 180 :
        if sequence not in seq_set:
            protein_names = dico_seq[sequence]
            with open(f"{path_work}/Dpo_from_the_dead.tsv" , "a+") as outfile:
                for protein_name in protein_names:
                    outfile.write(f"{protein_name}\t{start_pos}\t{end_pos}\t{sequence}\n")

                    
                    
if __name__ == '__main__':
    results = map(beta_helix_assess, list(dico_seq.keys()))
    # If you want to force computation and get a list of results:
    results = list(results)
    
            
if __name__ == '__main__':
    with ThreadPool(20) as p:
        p.map(beta_helix_assess, list(dico_seq.keys()))
        

In [None]:
#!/bin/bash
#BATCH --job-name=Anubis__
#SBATCH --qos=short 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=100gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=Anubis__%j.log 

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate embeddings

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/script_files/anubis

In [None]:
import os 
import pandas as pd 

path_fasta = f"/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/part_III_ptA/input_db/all_prophage_proteins.db.fasta"
path_current = f"/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_model = f"/home/conchae/PhageDepo_pdb/script_files/esm2_t30_150M_UR50D-finetuned-depolymerase/checkpoint-198"
path_work = f"/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"

df_anubis = pd.read_csv(f"{path_work}/Dpo_from_the_dead.tsv", sep = "\t", names = ["prot_name", "start", "end","sequence"])

df_seq = df_anubis.drop_duplicates(subset = ["sequence"], keep = "first")
df_seq.to_csv(f"{path_work}/Anubis_Dpo.index.csv" , sep = "\t", index = False)

df_seq = pd.read_csv(f"{path_work}/Anubis_Dpo.index.csv" , sep = "\t", header = 0)

with open(f"{path_work}/Anubis_Dpo.fasta", "w") as outfile :
    n = 0
    for _,row in df_seq.iterrows() :
        outfile.write(f">{n}\n{row['sequence']}\n")
        n += 1 
        


In [None]:
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Anubis_Dpo.fasta \
/media/concha-eloko/Linux/PPT_clean/ 