# GOAL: generate ESMFold predictions for the proteins selected after V3
### I. Generate the sequence file
### II. The embedding script
***

I.

In [None]:
# Build the sequence index file:
import os 
from Bio import SeqIO
import pandas as pd 

path_millard = "/home/conchae/ML_depolymerase/get_candidates/millard"

# A record is kept when its description contains at least one of these
# (case-sensitive) keywords.
good = ["tail","EPS","receptor","baseplate","collar","lipase","depolymerase","cps","lyase"] 
# NOTE(review): `bad` is defined but never applied in this cell - confirm
# whether endolysin annotations were meant to be filtered out.
bad = ["endolysin"]

# Unique amino-acid sequence (plain string) -> description of the first kept
# record carrying it. Plain strings are used as keys so that de-duplication
# does not depend on how the Seq object implements hashing.
data = {}
for rec in SeqIO.parse(f"{path_millard}/millard_depo.v3.fasta" , "fasta") :
    sequence = str(rec.seq)
    if sequence in data :
        continue
    if any(des in rec.description for des in good) :
        data[sequence] = rec.description

# FASTA given to ESMFold: tape measure proteins and lysozymes are left out.
# Only FASTA records are written here; the index rows go to their own file.
with open(f"{path_millard}/df_sequences.esmfold.v3.fasta" ,"w") as outfile :
    for seq, description in data.items() :
        if "tail length tape measure protein" in description or "lysozyme" in description :
            continue
        outfile.write(f">{description}\n{seq}\n")

# Index file: one row per unique sequence (index, description, sequence).
# The index covers every entry of `data`, including the ones excluded from
# the ESMFold FASTA above.
with open(f"{path_millard}/df_sequences.index.v2.csv" ,"w") as outfile :
    for index_seq, (seq, description) in enumerate(data.items()) :
        outfile.write(f"{index_seq}\t{description}\t{seq}\n")

# Clean index: keep a single row per index and only the index/sequence columns.
df = pd.read_csv(f"{path_millard}/df_sequences.index.v2.csv", sep="\t", names = ["index","id","sequence"])
df = df.drop_duplicates(subset=["index"], keep="first")
df.to_csv(f"{path_millard}/df_sequences.index.clean.v2.csv", sep="\t", columns = ["index","sequence"], index=False)

# Multifasta whose headers are the numeric indexes of the clean index file.
df = pd.read_csv(f"{path_millard}/df_sequences.index.clean.v2.csv", sep="\t")
with open(f"{path_millard}/millard_depo.indexed.v2.fasta" , "w") as outfile :
    dico_interest = df.to_dict("records")
    for row in dico_interest :
        outfile.write(f">{row['index']}\n{row['sequence']}\n")

***
II. 

In [None]:
#from multiprocessing.pool import ThreadPool
#from multiprocessing import Pool
import os 
import time

from transformers import AutoTokenizer, EsmForProteinFolding
import torch

from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

# Let CUDA matrix multiplications use TF32 (only relevant when running on a GPU).
torch.backends.cuda.matmul.allow_tf32 = True
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")
# Alternative loader kept for reference:
# model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)
path_proteins = "/home/conchae/ML_depolymerase/get_candidates/millard"

# Ids that already have a PDB file; tokenize_fasta reads `done` to skip them.
# The directory is created first so that the very first run does not fail.
path_pdb = f"{path_proteins}/millard_pdb"
os.makedirs(path_pdb, exist_ok=True)
done = {prot.split(".pdb")[0] for prot in os.listdir(path_pdb)}


def tokenize_fasta(sequences, fasta_csv) :
    """
    Tokenize protein sequences with the module-level ESMFold tokenizer.

    Parameters :
    - 'sequences' : either the path of a multifasta file (fasta_csv == "fasta")
      or a dataframe (fasta_csv == "csv") with two columns :
        - 'id', which corresponds to the protein name
        - 'sequence', which corresponds to the aa sequence
    - 'fasta_csv' : "fasta" or "csv", the kind of input given in 'sequences'

    In fasta mode, records whose id is listed in the module-level `done`
    collection (when it is defined) are skipped.

    Returns a list of tuples (a,b) with a as the id and b as the tokenized
    inputs (the 'input_ids' returned by the tokenizer).

    Raises ValueError when 'fasta_csv' is neither "csv" nor "fasta".
    """
    if fasta_csv == "csv" :
        # to_dict('records') yields one dict per row : map id -> sequence
        dico_seq = {row["id"] : row["sequence"] for row in sequences.to_dict('records')}
    elif fasta_csv == "fasta" :
        from Bio import SeqIO
        # `done` is optional : it may not be defined in this script
        try :
            already_done = set(done)
        except NameError :
            already_done = set()
        dico_seq = {
            record.id : str(record.seq)
            for record in SeqIO.parse(sequences, "fasta")
            if record.id not in already_done
        }
    else :
        raise ValueError(f"fasta_csv must be 'csv' or 'fasta', got {fasta_csv!r}")
    return [
        (idd , tokenizer(seq, return_tensors="pt", add_special_tokens=False)['input_ids'])
        for idd, seq in dico_seq.items()
    ]

def convert_outputs_to_pdb(outputs):
    """
    Turn the outputs of the folding model into PDB-formatted strings.

    'outputs' is the mapping returned by the model; its values are tensors.
    Returns a list with one PDB string per entry along the first axis of
    outputs["aatype"].
    """
    # The atom14 -> atom37 conversion is done on the tensors, before the
    # outputs are turned into numpy arrays.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    atom37_positions = atom37_positions.cpu().numpy()
    arrays = {key: tensor.to("cpu").numpy() for key, tensor in outputs.items()}
    has_chain_index = "chain_index" in arrays

    pdb_strings = []
    for idx, aatype in enumerate(arrays["aatype"]):
        structure = OFProtein(
            aatype=aatype,
            atom_positions=atom37_positions[idx],
            atom_mask=arrays["atom37_atom_exists"][idx],
            # residue numbering is shifted by one before being written
            residue_index=arrays["residue_index"][idx] + 1,
            # the plddt values are stored in the B-factor field
            b_factors=arrays["plddt"][idx],
            chain_index=arrays["chain_index"][idx] if has_chain_index else None,
        )
        pdb_strings.append(to_pdb(structure))
    return pdb_strings


def esmfold_prediction(proteins) :
    """
    Fold a collection of proteins one after the other.

    The function takes as an input :
    - 'proteins', an iterable of tuples (id, tokenized input) such as the
      output of the function tokenize_fasta

    For each protein, a file <id>.pdb is written in {path_proteins}/millard_pdb
    and the progress is appended to {path_proteins}/control_esm_fold.single.
    The function returns nothing.
    """
    path_out = f"{path_proteins}/millard_pdb"
    # Make sure the output directory exists before any PDB is written
    os.makedirs(path_out, exist_ok=True)
    with open(f"{path_proteins}/control_esm_fold.single","a+") as outfile_control :
        for protein in proteins :
            protein_id, tokens = protein[0], protein[1]
            outfile_control.write(f"prediction of {protein_id}.\nTensor : {tokens}__ {time.ctime(time.time())}\n")
            # Flush so the control file can be followed while the model runs
            outfile_control.flush()
            with torch.no_grad():
                output = model(tokens)
            outfile_control.write(f"model done with {protein_id}__{time.ctime(time.time())}.\n")
            outfile_control.flush()
            pdb_txt = convert_outputs_to_pdb(output)
            # A single sequence is given to the model : only the first PDB is kept
            with open(f"{path_out}/{protein_id}.pdb" ,"w") as outfile :
                outfile.write(pdb_txt[0])

                
# The final function :           
def esmfold_prediction_parralel(protein) :
    """
    Fold a single protein; meant to be mapped over a pool of workers.

    The function takes as an input :
    - 'protein', a tuple (id, tokenized input), i.e. one element of the
      output of the function tokenize_fasta

    A file <id>.pdb is written in {path_proteins}/millard_pdb and the progress
    is appended to {path_proteins}/control_esm_fold.parra.
    The function returns nothing.
    """
    path_out = f"{path_proteins}/millard_pdb"
    # Make sure the output directory exists before the PDB is written
    os.makedirs(path_out, exist_ok=True)
    protein_id, tokens = protein[0], protein[1]
    with open(f"{path_proteins}/control_esm_fold.parra","a+") as outfile_control :
        outfile_control.write(f"prediction of {protein_id}.\nTensor : {tokens}__ {time.ctime(time.time())}\n")
        # Flush so the start of the prediction is visible while the model runs
        outfile_control.flush()
        with torch.no_grad():
            output = model(tokens)
        outfile_control.write(f"model done with {protein_id}__{time.ctime(time.time())}.\n")
        pdb_txt = convert_outputs_to_pdb(output)
        # A single sequence is given to the model : only the first PDB is kept
        with open(f"{path_out}/{protein_id}.pdb" ,"w") as outfile :
            outfile.write(pdb_txt[0])
                
                
# The predictions
if __name__ == '__main__':
    # Imported here because the module-level ThreadPool import of this script
    # is commented out.
    from multiprocessing.pool import ThreadPool

    # NOTE(review): the input path is the one of the previously commented-out
    # call; confirm it is the intended multifasta (the record ids are used as
    # PDB file names).
    __try__ = tokenize_fasta(f"{path_proteins}/millard_depo.v3.fasta" , "fasta")
    with open(f"{path_proteins}/control_esm_fold.parrallel","a+") as outfile :
        outfile.write(f"Main is strating.\nWe are dealing with {len(__try__)} seqeunces.\n")
    # One protein per task, at most 5 predictions running at the same time
    with ThreadPool(5) as p:
        p.map(esmfold_prediction_parralel, __try__)

In [None]:
#!/bin/bash
# SLURM submission script for the ESMFold prediction script.
# Every directive must start with "#SBATCH"; a line spelled differently is
# treated as a plain comment and silently ignored.
#SBATCH --job-name=ESM_fold_millard_
#SBATCH --qos=long-mem
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=60
#SBATCH --mem=200gb
#SBATCH --time=10-00:00:00
#SBATCH --output=ESM_fold__%j.log

# Make `conda activate` available in this non-interactive shell, then switch
# to the environment used for the predictions.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate embeddings

python /home/conchae/ML_depolymerase/scripts/esmfold_pred.py


In [None]:
# Copy the result files and directories from the cluster to the local disk,
# one rsync call per item, in the same order as they are listed.
remote="conchae@garnatxa.srv.cpd:/home/conchae/ML_depolymerase/get_candidates/millard"
local_dir="/media/concha-eloko/Linux/depolymerase_building"

for item in \
    millard_depo.indexed.v3.fasta \
    embeddings.proteins.v3.csv \
    df_sequences.index.v3.csv \
    proteinID_annotation.v3.json \
    df_sequences.esmfold.v3.fasta \
    sword_out \
    millard_pdb \
    DBsuite_depo3_outputs
do
    rsync -avzhe ssh "${remote}/${item}" "${local_dir}"
done

    

In [None]:
import os 
import shutil
import time

try :
    from tqdm import tqdm
except ImportError :
    # The progress bar is only cosmetic : fall back to a plain iteration
    def tqdm(iterable, *args, **kwargs) :
        return iterable


path_proteins = "/home/conchae/ML_depolymerase/get_candidates/millard"
path_dbsuite = f"{path_proteins}/DBsuite_depo3"
path_dbsuite_out = f"{path_proteins}/DBsuite_depo3_outputs"

# Ids of the proteins that already have a predicted structure
done = [prot.split(".pdb")[0] for prot in os.listdir(f"{path_proteins}/millard_pdb")]

# Without the directory, the copy would create a regular file with that name
os.makedirs(path_dbsuite_out, exist_ok=True)

# Copy every DBsuite file whose name contains the id of a folded protein.
# NOTE(review): this is a substring match, so a short id also matches longer
# file names that contain it - confirm this is acceptable for the ids in use.
for file in tqdm(os.listdir(path_dbsuite)) :
    source = f"{path_dbsuite}/{file}"
    # Only regular files are copied (a plain `cp` skips directories as well)
    if not os.path.isfile(source) :
        continue
    if any(prot in file for prot in done) :
        shutil.copy(source, path_dbsuite_out)


> The threadpool version 

In [None]:
from multiprocessing.pool import ThreadPool
import os 
import time

from transformers import AutoTokenizer, EsmForProteinFolding
import torch

from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

# Let CUDA matrix multiplications use TF32 (only relevant when running on a GPU).
torch.backends.cuda.matmul.allow_tf32 = True
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")
# Alternative loader kept for reference:
# model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)
path_proteins = "/home/conchae/ML_depolymerase/get_candidates/millard"

# Ids that already have a PDB file, so a restarted job skips them.
# The directory is created first so that the very first run does not fail,
# and a set is used because `done` is only used for membership tests.
path_pdb = f"{path_proteins}/millard_pdb"
os.makedirs(path_pdb, exist_ok=True)
done = {prot.split(".pdb")[0] for prot in os.listdir(path_pdb)}


def tokenize_fasta(sequences, fasta_csv) :
    """
    Tokenize protein sequences with the module-level ESMFold tokenizer.

    Parameters :
    - 'sequences' : either the path of a multifasta file (fasta_csv == "fasta")
      or a dataframe (fasta_csv == "csv") with two columns :
        - 'id', which corresponds to the protein name
        - 'sequence', which corresponds to the aa sequence
    - 'fasta_csv' : "fasta" or "csv", the kind of input given in 'sequences'

    In fasta mode, records whose id is listed in the module-level `done`
    collection are skipped.

    Returns a list of tuples (a,b) with a as the id and b as the tokenized
    inputs (the 'input_ids' returned by the tokenizer).

    Raises ValueError when 'fasta_csv' is neither "csv" nor "fasta".
    """
    if fasta_csv == "csv" :
        # to_dict('records') yields one dict per row : map id -> sequence
        dico_seq = {row["id"] : row["sequence"] for row in sequences.to_dict('records')}
    elif fasta_csv == "fasta" :
        from Bio import SeqIO
        # Set built once : constant-time membership test for every record
        already_done = set(done)
        dico_seq = {
            record.id : str(record.seq)
            for record in SeqIO.parse(sequences, "fasta")
            if record.id not in already_done
        }
    else :
        raise ValueError(f"fasta_csv must be 'csv' or 'fasta', got {fasta_csv!r}")
    return [
        (idd , tokenizer(seq, return_tensors="pt", add_special_tokens=False)['input_ids'])
        for idd, seq in dico_seq.items()
    ]

def convert_outputs_to_pdb(outputs):
    """
    Turn the outputs of the folding model into PDB-formatted strings.

    'outputs' is the mapping returned by the model; its values are tensors.
    Returns a list with one PDB string per entry along the first axis of
    outputs["aatype"].
    """
    # The atom14 -> atom37 conversion is done on the tensors, before the
    # outputs are turned into numpy arrays.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    atom37_positions = atom37_positions.cpu().numpy()
    arrays = {key: tensor.to("cpu").numpy() for key, tensor in outputs.items()}
    has_chain_index = "chain_index" in arrays

    pdb_strings = []
    for idx, aatype in enumerate(arrays["aatype"]):
        structure = OFProtein(
            aatype=aatype,
            atom_positions=atom37_positions[idx],
            atom_mask=arrays["atom37_atom_exists"][idx],
            # residue numbering is shifted by one before being written
            residue_index=arrays["residue_index"][idx] + 1,
            # the plddt values are stored in the B-factor field
            b_factors=arrays["plddt"][idx],
            chain_index=arrays["chain_index"][idx] if has_chain_index else None,
        )
        pdb_strings.append(to_pdb(structure))
    return pdb_strings


def esmfold_prediction(proteins) :
    """
    Fold a collection of proteins one after the other.

    The function takes as an input :
    - 'proteins', an iterable of tuples (id, tokenized input) such as the
      output of the function tokenize_fasta

    For each protein, a file <id>.pdb is written in {path_proteins}/millard_pdb
    and the progress is appended to {path_proteins}/control_esm_fold.single.
    The function returns nothing.
    """
    path_out = f"{path_proteins}/millard_pdb"
    # Make sure the output directory exists before any PDB is written
    os.makedirs(path_out, exist_ok=True)
    with open(f"{path_proteins}/control_esm_fold.single","a+") as outfile_control :
        # Iterate over the argument : the body works on one protein at a time
        for protein in proteins :
            protein_id, tokens = protein[0], protein[1]
            outfile_control.write(f"prediction of {protein_id}.\nTensor : {tokens}__ {time.ctime(time.time())}\n")
            # Flush so the control file can be followed while the model runs
            outfile_control.flush()
            with torch.no_grad():
                output = model(tokens)
            outfile_control.write(f"model done with {protein_id}__{time.ctime(time.time())}.\n")
            outfile_control.flush()
            pdb_txt = convert_outputs_to_pdb(output)
            # A single sequence is given to the model : only the first PDB is kept
            with open(f"{path_out}/{protein_id}.pdb" ,"w") as outfile :
                outfile.write(pdb_txt[0])

                
                
def esmfold_prediction_parralel(protein) :
    """
    Fold a single protein; meant to be mapped over a pool of workers.

    The function takes as an input :
    - 'protein', a tuple (id, tokenized input), i.e. one element of the
      output of the function tokenize_fasta

    A file <id>.pdb is written in {path_proteins}/millard_pdb and the progress
    is appended to {path_proteins}/control_esm_fold.parra.
    The function returns nothing.
    """
    path_out = f"{path_proteins}/millard_pdb"
    # Make sure the output directory exists before the PDB is written
    os.makedirs(path_out, exist_ok=True)
    protein_id, tokens = protein[0], protein[1]
    with open(f"{path_proteins}/control_esm_fold.parra","a+") as outfile_control :
        outfile_control.write(f"prediction of {protein_id}.\nTensor : {tokens}__ {time.ctime(time.time())}\n")
        # Flush so the start of the prediction is visible while the model runs
        outfile_control.flush()
        with torch.no_grad():
            output = model(tokens)
        outfile_control.write(f"model done with {protein_id}__{time.ctime(time.time())}.\n")
        pdb_txt = convert_outputs_to_pdb(output)
        # A single sequence is given to the model : only the first PDB is kept
        with open(f"{path_out}/{protein_id}.pdb" ,"w") as outfile :
            outfile.write(pdb_txt[0])
                
                
# The predictions
if __name__ == '__main__':
    # NOTE(review): the input path is the one of the previously commented-out
    # call; confirm it is the intended multifasta (the record ids are used as
    # PDB file names).
    __try__ = tokenize_fasta(f"{path_proteins}/millard_depo.v3.fasta" , "fasta")
    with open(f"{path_proteins}/control_esm_fold.parrallel","a+") as outfile :
        outfile.write(f"Main is strating.\nWe are dealing with {len(__try__)} seqeunces.\n")
    # One protein per task, at most 5 predictions running at the same time
    with ThreadPool(5) as p:
        p.map(esmfold_prediction_parralel, __try__)