In [None]:
from transformers import AutoTokenizer, EsmForProteinFolding
import torch
import pandas as pd
import os

# gc is used below to free host memory before the model is moved to the GPU.
import gc

#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_memory_allocated=12884901888'
# Allow TF32 matmul kernels on GPUs that support them.
torch.backends.cuda.matmul.allow_tf32 = True

from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

# Release unreachable Python objects and cached CUDA blocks before moving the model.
gc.collect()
torch.cuda.empty_cache()

# Cast the ESM sub-module to half precision, then move the whole model to the GPU.
model.esm = model.esm.half()
model = model.cuda()

#model.trunk.set_chunk_size(64)

In [None]:
def tokenized_sequences(sequences, fasta_or_csv, starting=0) :
    """
    Tokenize protein sequences with the module-level `tokenizer`.

    The function takes as an input either a multifasta file or a dataframe with two columns.
    If the input is a dataframe, it must contain the two columns :
    - 'id', which corresponds to the protein name
    - 'sequence', which corresponds to the aa sequence

    Parameters :
    - 'sequences', the dataframe (fasta_or_csv == "csv") or the multifasta file
      handed to Bio.SeqIO.parse (fasta_or_csv == "fasta")
    - 'fasta_or_csv', either "csv" or "fasta"
    - 'starting', lower bound applied to the 'id' column in "csv" mode : only rows
      with id >= starting are kept (default 0, the historical behaviour).
      Pass None to keep every row, which is required when the ids cannot be
      compared with a number (e.g. protein names).

    The function returns a list of tuples (a,b) with a as the id and b as the tokenized inputs.
    If an id appears several times, the last sequence wins.

    Raises ValueError if fasta_or_csv is neither "csv" nor "fasta".
    """
    if fasta_or_csv == "csv" :
        dico_seq = {}
        id_and_sequence = zip(sequences["id"].tolist(), sequences["sequence"].tolist())
        for seq_id, sequence in id_and_sequence :
            if starting is None or seq_id >= starting :
                dico_seq[seq_id] = sequence
    elif fasta_or_csv == "fasta" :
        from Bio import SeqIO
        dico_seq = {record.id : str(record.seq) for record in SeqIO.parse(sequences, "fasta")}
    else :
        # Previously an unknown format crashed later with an UnboundLocalError.
        raise ValueError(f"fasta_or_csv must be 'csv' or 'fasta', got {fasta_or_csv!r}")
    return [
        (seq_id, tokenizer(sequence, return_tensors="pt", add_special_tokens=False)['input_ids'])
        for seq_id, sequence in dico_seq.items()
    ]

def convert_outputs_to_pdb(outputs):
    """
    Convert the outputs of the folding model into PDB entries, one per batch element.

    The function takes as an input :
    - 'outputs', a mapping of tensors with at least the keys "positions", "aatype",
      "atom37_atom_exists", "residue_index" and "plddt" ("chain_index" is optional)
    The function returns a list with the result of `to_pdb` for each element of the batch.
    """
    # The atom14 -> atom37 conversion works on the tensor outputs, so it must run
    # before the outputs are turned into numpy arrays.
    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    # .numpy() only works on CPU tensors: always bring the tensors back to the host
    # (sending them to "cuda" first made the conversion fail on GPU machines).
    outputs = {k: v.cpu().numpy() for k, v in outputs.items()}
    final_atom_positions = final_atom_positions.cpu().numpy()
    final_atom_mask = outputs["atom37_atom_exists"]
    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        pred = OFProtein(
            aatype=outputs["aatype"][i],
            atom_positions=final_atom_positions[i],
            atom_mask=final_atom_mask[i],
            # Shifted by one; presumably to get 1-based residue numbers (TODO confirm).
            residue_index=outputs["residue_index"][i] + 1,
            # The per-atom pLDDT is stored in the B-factor field.
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs

def esmfold_prediction(tokenized_sequences , path_out) :
    """
    Predict one structure per tokenized sequence and write it as a pdb file.

    The function takes as an input :
    - 'tokenized_sequences', a list of tuples (id, tokenized inputs), i.e. the output
      of the tokenization function defined above
    - 'path_out', the path of the directory where the pdb files are to be written
      (created if it does not exist yet)
    The function generates one file '{id}.pdb' per protein in the path_out.
    """
    # Create the target directory up front so a missing folder does not waste a prediction.
    os.makedirs(path_out, exist_ok=True)
    for protein_id, input_ids in tokenized_sequences :
        with torch.no_grad():
            output = model(input_ids.cuda())
        pdb_txt = convert_outputs_to_pdb(output)
        # One sequence per forward pass, so the batch holds a single pdb entry.
        with open(f"{path_out}/{protein_id}.pdb" ,"w") as outfile :
            outfile.write(pdb_txt[0])
        # Give the cached GPU blocks back between proteins.
        torch.cuda.empty_cache()

In [None]:
# Directory that holds the input table.
path_data = "/home/jupyter/data"

# Tab-separated table; the two column names are supplied here.
df = pd.read_csv(
    f"{path_data}/Results_III_sequences.v3.csv",
    sep="\t",
    names=["id", "sequence"],
)

In [None]:
# Tokenize every sequence of the dataframe loaded above.
eg_tokenized = tokenized_sequences(sequences=df, fasta_or_csv="csv")


In [None]:
# Fold every tokenized sequence and write the pdb files to the output directory.
esmfold_prediction(tokenized_sequences=eg_tokenized, path_out="/home/jupyter/output")


In [None]:
from transformers import AutoTokenizer, EsmForProteinFolding
import torch

from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

# Allow TF32 matmul kernels on GPUs that support them (set once; the flag is global).
torch.backends.cuda.matmul.allow_tf32 = True
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

# GPU : move the model to the GPU and cast the ESM sub-module to half precision.
model = model.cuda()
model.esm = model.esm.half()
# Chunk the folding trunk computation; presumably trades speed for memory (TODO confirm).
model.trunk.set_chunk_size(64)

def tokenize_fasta(sequences, fasta_csv) :
    """
    Tokenize protein sequences with the module-level `tokenizer`.

    The function takes as an input either a multifasta file or a dataframe with two columns.
    If the input is a dataframe, it must contain the two columns :
    - 'id', which corresponds to the protein name
    - 'sequence', which corresponds to the aa sequence

    Parameters :
    - 'sequences', the dataframe (fasta_csv == "csv") or the multifasta file
      handed to Bio.SeqIO.parse (fasta_csv == "fasta")
    - 'fasta_csv', either "csv" or "fasta"

    The function returns a list of tuples (a,b) with a as the id and b as the tokenized inputs.
    If an id appears several times, the last sequence wins.

    Raises ValueError if fasta_csv is neither "csv" nor "fasta".
    """
    if fasta_csv == "csv" :
        # Iterate over the dataframe records (the previous code looped over the
        # still-empty result dict, so no sequence was ever tokenized).
        dico_seq = {row["id"] : row["sequence"] for row in sequences.to_dict('records')}
    elif fasta_csv == "fasta" :
        from Bio import SeqIO
        dico_seq = {record.id : str(record.seq) for record in SeqIO.parse(sequences, "fasta")}
    else :
        # Previously an unknown format crashed later with an UnboundLocalError.
        raise ValueError(f"fasta_csv must be 'csv' or 'fasta', got {fasta_csv!r}")
    return [
        (idd, tokenizer(sequence, return_tensors="pt", add_special_tokens=False)['input_ids'])
        for idd, sequence in dico_seq.items()
    ]

def convert_outputs_to_pdb(outputs):
    """
    Turn the outputs of the folding model into PDB entries, one per batch element.

    'outputs' is a mapping of tensors; the keys read here are "positions", "aatype",
    "atom37_atom_exists", "residue_index", "plddt" and, when present, "chain_index".
    Returns the list of `to_pdb` results, in batch order.
    """
    # The atom14 -> atom37 conversion needs the tensor outputs, so it runs first.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {key: tensor.to("cpu").numpy() for key, tensor in outputs.items()}
    atom37_positions = atom37_positions.cpu().numpy()
    atom37_mask = outputs["atom37_atom_exists"]

    def _protein_at(idx):
        # Assemble the protein record of one batch element.
        return OFProtein(
            aatype=outputs["aatype"][idx],
            atom_positions=atom37_positions[idx],
            atom_mask=atom37_mask[idx],
            residue_index=outputs["residue_index"][idx] + 1,
            b_factors=outputs["plddt"][idx],
            chain_index=outputs["chain_index"][idx] if "chain_index" in outputs else None,
        )

    batch_size = outputs["aatype"].shape[0]
    return [to_pdb(_protein_at(idx)) for idx in range(batch_size)]


def esmfold_prediction(tokenized_sequences, path_out) :
    """
    Predict one structure per tokenized sequence and write it as a pdb file.

    The function takes as an input :
    - 'tokenized_sequences', the output of the function tokenize_fasta, i.e. a list
      of tuples (id, tokenized inputs)
    - 'path_out', the path of the directory where the pdb files are to be written
      (created, with its parents, if it does not exist yet)
    The function generates one file '{id}.pdb' per protein in the path_out.
    """
    # exist_ok keeps the "directory already there" case silent without hiding
    # real failures (permissions, missing mount, ...) the way `except: pass` did.
    os.makedirs(path_out, exist_ok=True)
    with torch.no_grad():
        for protein_id, input_ids in tokenized_sequences :
            # The tokenizer returns CPU tensors; the model may live on the GPU.
            output = model(input_ids.to(model.device))
            pdb_txt = convert_outputs_to_pdb(output)
            # convert_outputs_to_pdb returns a list; one sequence per forward pass
            # means the single entry is at index 0.
            with open(f"{path_out}/{protein_id}.pdb" ,"w") as outfile :
                outfile.write(pdb_txt[0])

# The predictions
path_ppt = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"
df = pd.read_csv(f"{path_ppt}/Results_III_sequences.v3.csv", sep = "\t", names = ["id","sequence"])

# Record view of the table; not used by the code visible in this cell.
dico_seq = df.to_dict('records')

# The guard must compare against "__main__" and needs an indented body:
# the previous form was a syntax error and could never have matched.
if __name__ == "__main__" :
    ppt_seq = tokenize_fasta(df , "csv")
    esmfold_prediction(ppt_seq, f"{path_ppt}/ppt_pdb")



In [None]:
from multiprocessing.pool import ThreadPool
import pandas as pd 
from transformers import AutoTokenizer, EsmForProteinFolding
import torch

import time

from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

# Single place for the checkpoint name used by both the tokenizer and the model.
_ESMFOLD_CHECKPOINT = "facebook/esmfold_v1"

torch.backends.cuda.matmul.allow_tf32 = True
tokenizer = AutoTokenizer.from_pretrained(_ESMFOLD_CHECKPOINT)
model = EsmForProteinFolding.from_pretrained(_ESMFOLD_CHECKPOINT)
# Alternative loader kept for reference:
# model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

def tokenize_fasta(sequences, fasta_csv) :
    """
    Tokenize protein sequences with the module-level `tokenizer`.

    'sequences' is either a dataframe (fasta_csv == "csv") holding the columns
    - 'id', which corresponds to the protein name
    - 'sequence', which corresponds to the aa sequence
    or a multifasta file handed to Bio.SeqIO.parse (fasta_csv == "fasta").
    Any other value of 'fasta_csv' leaves the id -> sequence mapping undefined and
    the function fails with an UnboundLocalError.

    Returns a list of tuples (a,b) with a as the id and b as the tokenized inputs.
    When an id occurs several times, the last sequence is the one kept.
    """
    if fasta_csv == "csv" :
        dico_seq = {rec["id"] : rec["sequence"] for rec in sequences.to_dict('records')}
    elif fasta_csv == "fasta" :
        from Bio import SeqIO
        dico_seq = {}
        for record in SeqIO.parse(sequences, "fasta") :
            dico_seq[record.id] = str(record.seq)

    encoded = []
    for name, aa_seq in dico_seq.items() :
        input_ids = tokenizer(aa_seq, return_tensors="pt", add_special_tokens=False)['input_ids']
        encoded.append((name, input_ids))
    return encoded

def convert_outputs_to_pdb(outputs):
    """
    Build one PDB entry per batch element from the outputs of the folding model.

    'outputs' is a mapping of tensors; this function reads the keys "positions",
    "aatype", "atom37_atom_exists", "residue_index", "plddt" and the optional
    "chain_index". The returned list holds the `to_pdb` result of each element.
    """
    # Convert atom14 -> atom37 while the outputs are still tensors.
    positions_37 = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {name: value.to("cpu").numpy() for name, value in outputs.items()}
    positions_37 = positions_37.cpu().numpy()
    exists_mask = outputs["atom37_atom_exists"]

    pdb_entries = []
    batch_size = outputs["aatype"].shape[0]
    for idx in range(batch_size):
        if "chain_index" in outputs:
            chain_index = outputs["chain_index"][idx]
        else:
            chain_index = None
        protein = OFProtein(
            aatype=outputs["aatype"][idx],
            atom_positions=positions_37[idx],
            atom_mask=exists_mask[idx],
            residue_index=outputs["residue_index"][idx] + 1,
            b_factors=outputs["plddt"][idx],
            chain_index=chain_index,
        )
        pdb_entries.append(to_pdb(protein))
    return pdb_entries


def esmfold_prediction(protein) :
    """
    Predict the structure of a single protein and write it as a pdb file.

    The function takes as an input :
    - 'protein', a tuple (id, tokenized inputs), i.e. one element of the list
      returned by tokenize_fasta
    The single-argument form lets the function be used directly with a pool `map`.

    Side effects :
    - appends two timestamped lines (start / end of the prediction) to
      'control_pdb.txt' in path_ppt
    - writes the predicted structure to '{path_out}/{id}.pdb'
    """
    import os

    path_ppt = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"
    path_out = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/ppt_v3_pdb"
    # Make sure the output directory exists before the (expensive) prediction;
    # exist_ok makes the call safe when several workers reach it at once.
    os.makedirs(path_out, exist_ok=True)
    with open(f"{path_ppt}/control_pdb.txt","a+") as outfile_control :
        outfile_control.write(f"Prediction for {protein[0]}__ {time.ctime(time.time())}\n")
        with torch.no_grad():
            output = model(protein[1])
        outfile_control.write(f"Prediciton done for the protein __{time.ctime(time.time())}\n")
        pdb_txt = convert_outputs_to_pdb(output)
        # One sequence per forward pass, so the batch holds a single pdb entry.
        with open(f"{path_out}/{protein[0]}.pdb" ,"w") as outfile :
            outfile.write(pdb_txt[0])

              
# The predictions : load the (id, sequence) table and tokenize every sequence up front.
path_ppt = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session"
df = pd.read_csv(
    f"{path_ppt}/Results_III_sequences.v3.csv",
    sep="\t",
    names=["id", "sequence"],
)
__try__ = tokenize_fasta(sequences=df, fasta_csv="csv")


if __name__ == '__main__':
    # Fold the proteins with a pool of 60 worker threads, one protein per task.
    with ThreadPool(processes=60) as pool:
        pool.map(esmfold_prediction, __try__)

In [None]:
#!/bin/bash
#SBATCH --job-name=ESM_fold_PPT_
#SBATCH --qos=long
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=60
#SBATCH --mem=200gb
#SBATCH --time=10-00:00:00
#SBATCH --output=ESM_fold_PPT_%j.log

# Note: the job-name directive previously read "#BATCH", which the scheduler
# treats as a plain comment, so the job name was never applied.

# Activate the `embeddings` conda environment before launching the script.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate embeddings

# Run the ESMFold prediction script.
python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/15122022_session/script_files/part_III/esmfold_ppt_v3.py