# Decipher the proteins with a β-helix that were missed by our methods
***

In [None]:
import os 
import pandas as pd 
from tqdm import tqdm
from Bio import SeqIO
from collections import Counter, defaultdict
from multiprocessing.pool import ThreadPool
from concurrent.futures import ProcessPoolExecutor

# --- Input/output locations on the cluster ---
path_fasta = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/part_III_ptA/input_db/all_prophage_proteins.db.fasta"
path_current = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_model = "/home/conchae/PhageDepo_pdb/script_files/esm2_t30_150M_UR50D-finetuned-depolymerase/checkpoint-198"
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_task = f"{path_work}/Rafa_task"
path_labels = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"

# --- Prophage label table ---
# NOTE(review): with skiprows=1 and the default header handling, pandas uses the
# first remaining line as the header, and that header is then overwritten below.
# If the file has a single header line, its first data row is lost — confirm.
label_columns = [
    "Prophage_name",
    "KL_type",
    "Infected_ancestor",
    "n_clades",
    "siblings",
    "n_ancestors",
    "n_KL_swaps",
    "old_KL_types",
    "all_old_KL_types",
]
df_labels = pd.read_csv(
    f"{path_labels}/prophage_data.clusters_80.phageboost_70.2504.tsv",
    sep="\t",
    skiprows=1,
)
df_labels.columns = label_columns

# --- Depolymerases already in the current table, and all prophage proteins ---
df_current = pd.read_csv(f"{path_current}/DF_Dpo.final.1005.tsv", sep="\t", header=0)
fasta_seqs = SeqIO.parse(path_fasta, "fasta")

# sequence -> list of every protein id carrying that exact sequence
dico_seq = defaultdict(list)
for record in fasta_seqs:
    dico_seq[str(record.seq)].append(record.id)

# sequence -> "index" value of the current depolymerase table
seq_set = {seq: idx for seq, idx in zip(df_current["seq"], df_current["index"])}

# Load the model :
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(path_model)
model = AutoModelForTokenClassification.from_pretrained(path_model)

def model_out(sequence) :
    """Return the per-token class predicted by the fine-tuned model.

    The sequence is tokenized (truncated to 1024 tokens), passed through the
    token-classification model, and each token is mapped to the integer that
    follows the underscore in its predicted label name from
    ``model.config.id2label``.

    Parameters
    ----------
    sequence : str
        Protein sequence to tag.

    Returns
    -------
    list[int]
        One integer label per token produced by the tokenizer.

    NOTE(review): ``tokenizer.encode`` adds special tokens by default, so the
    list presumably also carries a label for those tokens and positions may be
    shifted relative to residue indices — confirm before slicing sequences
    with positions derived from this output.
    """
    # `torch` is not imported at the top of this cell; import it here so the
    # function does not fail with NameError.
    import torch

    input_ids = tokenizer.encode(sequence, return_tensors='pt', truncation= True, max_length = 1024)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(input_ids)
    # softmax is monotonic, so the argmax of the logits is the argmax of the
    # probabilities; no need to materialise the probabilities.
    predicted_ids = outputs.logits[0].argmax(dim=-1).tolist()
    labels = model.config.id2label
    return [int(labels[label_id].split("_")[1]) for label_id in predicted_ids]

def longest_run_of_ones(tokens):
    """Locate the longest uninterrupted stretch of non-zero labels.

    Parameters
    ----------
    tokens : iterable
        Per-token labels; every element is rendered with ``str`` and the
        character ``'0'`` acts as the separator between runs.

    Returns
    -------
    tuple[int, int, int]
        ``(longest_run, start_pos, end_pos)`` where ``start_pos`` and
        ``end_pos`` are 0-based, inclusive token positions of the first
        longest run. When there is no run at all the result is ``(0, 0, -1)``.
    """
    str_lst = ''.join(map(str, tokens))
    runs = [len(chunk) for chunk in str_lst.split('0')]
    longest_run = max(runs)
    run_index = runs.index(longest_run)
    # `run_index` is the position inside the list of chunks, not inside the
    # token string. The real offset is the length of every earlier chunk plus
    # one separator ('0') per earlier chunk.
    start_pos = sum(runs[:run_index]) + run_index
    end_pos = start_pos + longest_run - 1
    return longest_run, start_pos, end_pos


def beta_helix_assess(sequence):
    """Record sequences whose predicted positive stretch exceeds 180 tokens.

    The sequence is tagged with ``model_out`` and the longest run is measured
    with ``longest_run_of_ones``. When the run is longer than 180 tokens and
    the sequence is already a key of ``seq_set``, the matching ``seq_set``
    value is appended, one per line, to ``Double_caught_Dpos.LLM.tsv`` in
    ``path_work``.

    Parameters
    ----------
    sequence : str
        Protein sequence to assess.

    Returns
    -------
    None
    """
    tokens = model_out(sequence)
    longest_run, start_pos, end_pos = longest_run_of_ones(tokens)
    if int(longest_run) <= 180 :
        return
    if sequence in seq_set:
        with open(f"{path_work}/Double_caught_Dpos.LLM.tsv" , "a+") as outfile:
            # One entry per line: without the newline every index is glued
            # into a single unreadable token.
            outfile.write(f"{seq_set[sequence]}\n")
    # Sequences that are not in `seq_set` are currently ignored. The disabled
    # code below is what writes Dpo_from_the_dead.tsv:
    #else :
    #    protein_names = dico_seq[sequence]
    #    with open(f"{path_work}/Dpo_from_the_dead.tsv" , "a+") as outfile:
    #        for protein_name in protein_names:
    #            outfile.write(f"{protein_name}\t{start_pos}\t{end_pos}\t{sequence}\n")

                   
                    
if __name__ == '__main__':
    # Assess every distinct sequence exactly once. The previous version had
    # two guards (a serial pass followed by a threaded pass), so each sequence
    # was processed twice and every hit was appended twice to the output file.
    with ThreadPool(20) as p:
        results = p.map(beta_helix_assess, list(dico_seq))
        

In [None]:
#!/bin/bash
#SBATCH --job-name=Anubis_caught_
#SBATCH --qos=short 
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=80gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=Anubis_caught_%j.log 

# The job-name directive was spelled "#BATCH", which the scheduler treats as a
# plain comment, so the job ran without the intended name.

# Activate the conda environment, then run the tagging script.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate embeddings

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/script_files/catch_hands.py

In [None]:
import os 
import pandas as pd 

# Locations on the cluster (only path_work is used in this cell).
path_fasta = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/part_III_ptA/input_db/all_prophage_proteins.db.fasta"
path_current = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_model = "/home/conchae/PhageDepo_pdb/script_files/esm2_t30_150M_UR50D-finetuned-depolymerase/checkpoint-198"
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"

anubis_columns = ["prot_name", "start", "end", "sequence"]
df_anubis = pd.read_csv(f"{path_work}/Dpo_from_the_dead.tsv", sep="\t", names=anubis_columns)

# Keep one row per distinct sequence and store it as the index table.
df_seq = df_anubis.drop_duplicates(subset=["sequence"], keep="first")
df_seq.to_csv(f"{path_work}/Anubis_Dpo.index.csv", sep="\t", index=False)

# Re-read the table that was just written.
df_seq = pd.read_csv(f"{path_work}/Anubis_Dpo.index.csv", sep="\t", header=0)

# FASTA whose record ids are the 0-based row positions of the index table.
with open(f"{path_work}/Anubis_Dpo.fasta", "w") as outfile:
    for n, sequence in enumerate(df_seq["sequence"]):
        outfile.write(f">{n}\n{sequence}\n")
        


In [None]:
# Copy Anubis_Dpo.fasta from the cluster account into the local PPT_clean directory.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Anubis_Dpo.fasta \
/media/concha-eloko/Linux/PPT_clean/ 

***
# ESM2 embeddings on the dpo domains of Anubis

In [None]:
import os 
import pandas as pd 
from Bio import SeqIO

path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"

# sequence -> FASTA record id
dico = {}
for dpo in SeqIO.parse(f"{path_work}/Anubis_Dpo.fasta", "fasta"):
    dico[str(dpo.seq)] = dpo.id

df_info_anubis = pd.read_csv(f"{path_work}/Anubis_Dpo.index.csv", sep="\t", header=0)

# Write, for each sequence, the stretch between its recorded start and end.
# NOTE(review): `end` looks like an inclusive position (start + run length - 1
# in the tagging step), whereas a Python slice excludes its upper bound, so
# the last residue may be dropped — confirm.
with open(f"{path_work}/anubis_Dpo_domains.multi.fasta", "w") as outfile:
    for seq, index in dico.items():
        hit = df_info_anubis[df_info_anubis["sequence"] == seq]
        start = hit["start"].values[0]
        end = hit["end"].values[0]
        outfile.write(f">anubis__{index}\n{seq[start:end]}\n")

In [None]:
#!/bin/bash
#SBATCH --job-name=ESM_2__
#SBATCH --qos=short
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=40 
#SBATCH --mem=100gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=ESM_2__%j.log 

# The job-name directive was spelled "#BATCH", which the scheduler treats as a
# plain comment, so the job ran without the intended name.

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate embeddings

# Arguments: model name, input FASTA, output directory; layer 33 is
# requested with both mean and per-token representations.
python /home/conchae/software/esm/scripts/extract.py \
esm2_t33_650M_UR50D \
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/anubis_Dpo_domains.multi.fasta \
/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/anubis_Dpo_domains.multi.fasta.esm_out \
--repr_layers 33 \
--include mean per_tok

In [None]:
import torch
import os 
import pandas as pd
from tqdm import tqdm 


path_esm = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/anubis_Dpo_domains.multi.fasta.esm_out"

# Collect the layer-33 mean representation of every file in `path_esm`,
# keyed by the file name up to its first dot.
embeddings_esm = {}
for file in tqdm(os.listdir(path_esm)) :
    path_file = f"{path_esm}/{file}"
    # Sub-directories are skipped. The previous loop walked their content but
    # then hit `continue`, so nothing inside them was ever loaded.
    if os.path.isdir(path_file) :
        continue
    index = file.split(".")[0]
    # NOTE: torch.load unpickles the file; only use it on files you produced.
    embeddings_esm[index] = torch.load(path_file)["mean_representations"][33].tolist()

# One line per entry: the key followed by every value, each terminated by a
# comma (the line therefore ends with a trailing comma, as before).
with open("/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/anubis.esm2.embedding.csv" , "w") as outfile :
    for index in tqdm(embeddings_esm) :
        values = "".join(f"{emb}," for emb in embeddings_esm[index])
        outfile.write(f"{index},{values}\n")

In [None]:
# Copy anubis.esm2.embedding.csv from the cluster account into the local PPT_clean directory.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/anubis.esm2.embedding.csv  \
/media/concha-eloko/Linux/PPT_clean/ 

***
# Append the Anubis depolymerases to DF_Dpo.final.1005.tsv to produce DF_Dpo.final.2605.tsv

In [None]:
import os 
import pandas as pd 
from Bio import SeqIO

path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_labels = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"

# sequence -> FASTA record id of the Anubis set
dico_anubis = {str(dpo.seq):dpo.id for dpo in SeqIO.parse(f"{path_work}/Anubis_Dpo.fasta" , "fasta")}
df_info_anubis = pd.read_csv(f"{path_work}/Anubis_Dpo.index.csv", sep = "\t", header = 0)
# NOTE(review): skiprows=1 plus the default header means the first remaining
# line is consumed as a header before the names are overwritten — confirm the
# file really has two leading non-data lines.
df_labels = pd.read_csv(f"{path_labels}/prophage_data.clusters_80.phageboost_70.2504.tsv", sep = "\t", skiprows=1)
df_labels.columns = ["Prophage_name","KL_type","Infected_ancestor","n_clades","siblings","n_ancestors","n_KL_swaps","old_KL_types","all_old_KL_types"]
df_anubis = pd.read_csv(f"{path_work}/Dpo_from_the_dead.tsv", sep = "\t", names = ["prot_name", "start", "end","sequence"])

df_info = pd.read_csv(f"{path_work}/DF_Dpo.final.1005.tsv" , sep = "\t", header =0)

# Prophage name = protein name without its last "__"-separated field, with the
# ".fasta" suffix used as the join key in the label table.
df_anubis["Prophage_name"] = df_anubis["prot_name"].apply(lambda x : "__".join(x.split("__")[0:-1]) + ".fasta")
df_anubis["index"] = df_anubis["sequence"].apply(lambda x : f"anubis__{dico_anubis[x]}")
df_anubis["Dataset"] = "anubis"

df_labels_matters = df_labels[["Prophage_name","KL_type","Infected_ancestor"]]
# Join on the column only. Combining `on` with `left_index=True` is
# contradictory and is rejected with a MergeError by current pandas. The only
# visible effect of dropping it is on the row labels of the result.
merged = pd.merge(df_anubis ,df_labels_matters, on ="Prophage_name")
merged = merged.drop(["start","end"], axis = 1)

# .copy() so the column assignment below acts on an independent frame instead
# of a slice of `merged` (avoids SettingWithCopyWarning).
clean_anubis = merged[["Prophage_name","KL_type","Infected_ancestor","prot_name","Dataset","index","sequence"]].copy()
clean_anubis["Prophage_name"] = clean_anubis["Prophage_name"].apply(lambda x : x.split(".fasta")[0])
clean_anubis.columns = ['Phage', 'KL_type_LCA', 'Infected_ancestor', 'Protein_name', 'Dataset', 'index', 'seq']

DF = pd.concat([df_info, clean_anubis] , axis = 0)

# Written without a header line (the previous `header=0` was only a falsy
# flag); the row labels are still written as the first column.
DF.to_csv(f"{path_work}/DF_Dpo.final.2605.tsv" , sep = "\t", header = False)

> The Venn diagram first step : 

In [None]:
import os 
import pandas as pd 

path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_labels = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"

# Indices written by the LLM pass (file has no header line).
df_caught = pd.read_csv(f"{path_work}/Double_caught_Dpos.LLM.tsv", header=None)

# The merged table is stored without a header; its first column holds the row
# labels, the remaining seven are named here.
penultimate_columns = ['Phage', 'KL_type_LCA', 'Infected_ancestor', 'Protein_name', 'Dataset', 'index', 'seq']
DF_penultimate = pd.read_csv(
    f"{path_work}/DF_Dpo.final.2605.tsv",
    sep="\t",
    header=None,
    index_col=0,
)
DF_penultimate.columns = penultimate_columns
# len = 978

In [None]:
# Copy DF_Dpo.final.2605.tsv from the cluster account into the local PPT_clean directory.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/DF_Dpo.final.2605.tsv  \
/media/concha-eloko/Linux/PPT_clean/ 
    

> Local 

In [5]:
import os 
import pandas as pd 

path_work = "/media/concha-eloko/Linux/PPT_clean"

# Local copy of the merged table: no header line, first column = row labels.
penultimate_columns = ['Phage', 'KL_type_LCA', 'Infected_ancestor', 'Protein_name', 'Dataset', 'index', 'seq']
DF_penultimate = pd.read_csv(
    f"{path_work}/DF_Dpo.final.2605.tsv",
    sep="\t",
    header=None,
    index_col=0,
)
DF_penultimate.columns = penultimate_columns
# len = 978

In [8]:
# Number of distinct values in the "Phage" column (a missing value counts once).
DF_penultimate["Phage"].nunique(dropna=False)

38419

In [None]:
# Count the depolymerases per phage and inspect the distribution.
# NOTE(review): `DF` is not defined in the local cells shown here (only
# `DF_penultimate` is) — confirm which table this cell is meant to use.
from collections import Counter

dico_count = dict(Counter(DF["Phage"]))
Dpos = set(dico_count.values())
# {1, 2, 3, 4, 5, 6, 7, 11, 13}

# Print every phage that carries exactly 11 depolymerases.
phages_with_11 = [phage for phage, dpos in dico_count.items() if dpos == 11]
for phage in phages_with_11 :
    print(phage)

# GCF_003037395.1__phage28

DF.loc[DF["Phage"] == "GCF_002850635.1__phage1"]
DF.loc[DF["Infected_ancestor"] == "n2542"]


***
# Verify the presence of the Dpo domain with foldseek

In [1]:
import subprocess
import os

# Local layout for the Foldseek check: the listed input structures, the target
# database, and the temporary directory handed to foldseek.
path_project = "/media/concha-eloko/Linux/PPT_clean/ficheros_28032023"
path_pdb = path_project + "/Anubis_out"
path_db = "/media/concha-eloko/Linux/depolymerase_building/RefDepo_domains/RefDepo_domains_db"
path_tmp = path_project + "/tmp_anubis"

def seek_beta_helix(path_in) :
    """Run ``foldseek easy-search`` for one structure unless its result exists.

    The result is written to ``<path_project>/seekfold_anubis/<protein_id>.out``
    where ``protein_id`` is the file name of ``path_in`` up to ``.pdb``. The
    search is skipped when that file is already present, so the function can
    be re-run to resume an interrupted batch.

    Parameters
    ----------
    path_in : str
        Path of the query structure file.

    Returns
    -------
    None
        The captured foldseek output (stdout and stderr merged) is printed.
    """
    dir_out = f"{path_project}/seekfold_anubis"
    protein_id = path_in.split("/")[-1].split(".pdb")[0]
    path_out = f"{dir_out}/{protein_id}.out"
    if os.path.isfile(path_out) :
        return
    # Make sure the output directory exists before foldseek tries to write.
    os.makedirs(dir_out, exist_ok=True)
    output_frmt = "query,target,pident,alnlen,gapopen,qstart,qend,tstart,tend,bits,prob"
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through untouched.
    seek = [
        "foldseek", "easy-search",
        path_in, path_db, path_out, path_tmp,
        "--format-output", output_frmt,
    ]
    seek_process = subprocess.run(seek, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # stderr is merged into stdout above, so the second value is always None.
    print (seek_process.stdout, seek_process.stderr)

# Full path of every entry in the structure directory, searched one by one.
paths = list(map(lambda name: f"{path_pdb}/{name}", os.listdir(f"{path_pdb}")))

for path in paths :
    seek_beta_helix(path)

In [10]:
import pandas as pd
import os 

path_project = "/media/concha-eloko/Linux/PPT_clean/ficheros_28032023"
dir_out = f"{path_project}/seekfold_anubis"

# Every foldseek result file (listed once; the list was previously built twice).
outputs = [f"{dir_out}/{file}" for file in os.listdir(dir_out) if file[-3:]=="out"]

path_info = "/media/concha-eloko/Linux/depolymerase_building/depolymerase_fold.csv"
info_df = pd.read_csv(path_info , sep = "\t", header = 0)
# target ID -> fold name, built once instead of filtering the table for every
# hit; the first occurrence of an ID wins, as with the previous `.values[0]`.
fold_by_target = info_df.drop_duplicates(subset = "ID").set_index("ID")["X_Group_Name"].to_dict()

dico_folds_ppt = {"jelly-roll" : [],
              "alpha/alpha toroid" : [],
              "right-handed beta-helix" : [] ,
              "TIM beta/alpha-barrel" : [],
              "6-bladed beta-propeller" : [] ,
              "Flavodoxin-like" : [] ,
              "Alpha/Beta hydrolase fold" : [] ,
              "Other" : [],
             }

header_seekfold = ["query","target","pident","alnlen","gapopen","qstart","qend","tstart","tend","bits","prob"]
depo_results = {}
no_fold = []
for results in outputs :
    try :
        results_df = pd.read_csv(f"{results}", sep = "\t" , names = header_seekfold)
    except pd.errors.EmptyDataError :
        # A search without any hit leaves an empty file: treat it as "no fold".
        results_df = pd.DataFrame(columns = header_seekfold)
    # Fallback label for files without rows; presumably identical to the query
    # label foldseek reports (the .pdb file name) — confirm.
    query = os.path.basename(results)[:-len(".out")] + ".pdb"
    assigned = False
    for _,row in results_df.iterrows() :
        query = row["query"]
        fold = fold_by_target[row["target"]]
        # Hits against jelly-roll targets never assign a fold.
        if fold == "jelly-roll" :
            continue
        # Accept any fold at prob >= 0.5; the right-handed beta-helix is also
        # accepted at the lower 0.2 threshold.
        confident = row["prob"] >= 0.5 or (fold == "right-handed beta-helix" and row["prob"] >= 0.2)
        if confident and query not in dico_folds_ppt[fold] :
            dico_folds_ppt[fold].append(query)
            assigned = True
            break
    if not assigned :
        no_fold.append(query)
                
for fold in dico_folds_ppt : 
    print(f"The {fold} presented {len(dico_folds_ppt[fold])} depolymerases.\n")

The jelly-roll presented 0 depolymerases.

The alpha/alpha toroid presented 0 depolymerases.

The right-handed beta-helix presented 767 depolymerases.

The TIM beta/alpha-barrel presented 0 depolymerases.

The 6-bladed beta-propeller presented 230 depolymerases.

The Flavodoxin-like presented 0 depolymerases.

The Alpha/Beta hydrolase fold presented 0 depolymerases.

The Other presented 0 depolymerases.



In [3]:
# Number of result files for which no hit passed the fold thresholds.
len(no_fold)

639

In [9]:
import os 
import pandas as pd 
from Bio import SeqIO

# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# Anubis_Dpo.fasta under `path_work`; the cell points at the cluster paths —
# confirm where the files live before re-running.
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_labels = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/prophage_labeling/phageboost/info"

# sequence -> FASTA record id of the Anubis set
dico_anubis = {}
for dpo in SeqIO.parse(f"{path_work}/Anubis_Dpo.fasta", "fasta"):
    dico_anubis[str(dpo.seq)] = dpo.id

df_info_anubis = pd.read_csv(f"{path_work}/Anubis_Dpo.index.csv", sep="\t", header=0)
df_labels = pd.read_csv(
    f"{path_labels}/prophage_data.clusters_80.phageboost_70.2504.tsv",
    sep="\t",
    skiprows=1,
)
df_labels.columns = [
    "Prophage_name",
    "KL_type",
    "Infected_ancestor",
    "n_clades",
    "siblings",
    "n_ancestors",
    "n_KL_swaps",
    "old_KL_types",
    "all_old_KL_types",
]
df_anubis = pd.read_csv(
    f"{path_work}/Dpo_from_the_dead.tsv",
    sep="\t",
    names=["prot_name", "start", "end", "sequence"],
)

df_info = pd.read_csv(f"{path_work}/DF_Dpo.final.1005.tsv", sep="\t", header=0)

# Prophage name = protein name minus its last "__" field, plus ".fasta".
df_anubis["Prophage_name"] = [
    "__".join(name.split("__")[0:-1]) + ".fasta" for name in df_anubis["prot_name"]
]
df_anubis["index"] = [f"anubis__{dico_anubis[seq]}" for seq in df_anubis["sequence"]]
df_anubis["Dataset"] = "anubis"

df_labels_matters = df_labels[["Prophage_name", "KL_type", "Infected_ancestor"]]

# Rows of the Anubis table that carry the index "anubis__80".
df_anubis.loc[df_anubis["index"] == "anubis__80"]



FileNotFoundError: [Errno 2] No such file or directory: '/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/Anubis_Dpo.fasta'

In [4]:
import pprint

# Print the list wrapped at 100 columns, packing several items per line.
pp = pprint.PrettyPrinter(compact=True, width=100)
pp.pprint(no_fold)

['80.pdb', '921.pdb', '877.pdb', '1183.pdb', '1405.pdb', '644.pdb', '738.pdb', '1342.pdb', '98.pdb',
 '838.pdb', '843.pdb', '1320.pdb', '1343.pdb', '59.pdb', '224.pdb', '202.pdb', '1258.pdb',
 '1149.pdb', '12.pdb', '1398.pdb', '352.pdb', '1171.pdb', '581.pdb', '1030.pdb', '284.pdb',
 '326.pdb', '1325.pdb', '1168.pdb', '1093.pdb', '397.pdb', '1122.pdb', '321.pdb', '765.pdb',
 '1182.pdb', '629.pdb', '613.pdb', '518.pdb', '285.pdb', '949.pdb', '1593.pdb', '1153.pdb',
 '68.pdb', '1220.pdb', '364.pdb', '1316.pdb', '1371.pdb', '1104.pdb', '118.pdb', '881.pdb',
 '157.pdb', '1194.pdb', '505.pdb', '57.pdb', '1625.pdb', '45.pdb', '960.pdb', '955.pdb', '969.pdb',
 '988.pdb', '305.pdb', '1062.pdb', '571.pdb', '194.pdb', '1086.pdb', '1378.pdb', '545.pdb',
 '241.pdb', '658.pdb', '850.pdb', '643.pdb', '354.pdb', '1524.pdb', '852.pdb', '859.pdb', '440.pdb',
 '1424.pdb', '1139.pdb', '1613.pdb', '288.pdb', '544.pdb', '53.pdb', '54.pdb', '543.pdb', '496.pdb',
 '856.pdb', '593.pdb', '366.pdb', '1092.pdb',