In [None]:
#!/bin/bash
#SBATCH --job-name=esmfold_ppt_
#SBATCH --partition=gpu
#SBATCH --time=7:00:00
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=8
#SBATCH --mem=40G
#SBATCH --output=ppt_out_%j.out
#SBATCH --error=ppt_out_err__%j.err

# SLURM batch job: requests one GPU, 8 CPUs and 40G of memory on the "gpu"
# partition for up to 7 hours, then runs test.py inside a conda environment.
# stdout/stderr go to ppt_out_<jobid>.out / ppt_out_err__<jobid>.err.

# Make the `conda` shell function (needed for `conda activate`) available in
# this non-interactive shell, then load the matching Anaconda module.
source /opt/sci-soft/software/Anaconda3/5.3.0/etc/profile.d/conda.sh
module load Anaconda3/5.3.0

# Activate the environment the script runs in (assumes `transformers` already
# exists). The former `conda create -n transformers_2` line was removed: it
# created a different environment that was never activated, and without `-y`
# it prompts for confirmation, which is unsuitable inside a batch job.
conda activate transformers

python /home/rsanjuan/Robby/test.py

In [None]:
# Load the user's shell startup file through its absolute path.
source /clinicfs/userhomes/home/rsanjuan/.bashrc
# NOTE(review): the two lines below use a relative path (no leading "/"), so
# they resolve against the current working directory, and they are identical
# to each other. Confirm whether "/home/rsanjuan/.bashrc" was intended and
# whether the repetition is needed.
source home/rsanjuan/.bashrc
source home/rsanjuan/.bashrc

In [None]:
# --- Imports (grouped: standard library, then third-party) -----------------
import gc
import os

import pandas as pd
import torch
from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
from transformers.models.esm.openfold_utils.protein import Protein as OFProtein, to_pdb

# --- Configuration ---------------------------------------------------------
#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_memory_allocated=12884901888'

# Permit TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True

# Single source for the checkpoint name used by both tokenizer and model.
ESMFOLD_CHECKPOINT = "facebook/esmfold_v1"

# --- Tokenizer and model ---------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(ESMFOLD_CHECKPOINT)
model = EsmForProteinFolding.from_pretrained(ESMFOLD_CHECKPOINT)

# Run a garbage-collection pass and release cached CUDA blocks before the
# model is moved onto the GPU.
gc.collect()
torch.cuda.empty_cache()

# Put the whole model on the GPU, then cast only its `esm` submodule to half
# precision.
model = model.cuda()
model.esm = model.esm.half()

# Set the trunk's chunk size to 64 (presumably to lower peak memory during
# folding -- see the Transformers ESMFold documentation to confirm).
model.trunk.set_chunk_size(64)

In [None]:
def tokenized_sequences(sequences, fasta_or_csv, starting=0, not_processed=None):
    """
    Tokenize protein sequences coming from a dataframe or a multi-FASTA file.

    Parameters
    ----------
    sequences : pandas.DataFrame or FASTA source
        With ``fasta_or_csv == "csv"``: a dataframe with an 'id' column
        (protein name) and a 'sequence' column (amino-acid sequence).
        With ``fasta_or_csv == "fasta"``: anything ``Bio.SeqIO.parse`` accepts
        as its first argument (the notebook passes a file path).
    fasta_or_csv : str
        Either "csv" or "fasta"; selects how `sequences` is read.
    starting : int, optional
        Entries whose id, converted with ``int()``, is below this value are
        skipped. Defaults to 0, the value that used to be hard-coded.
    not_processed : container, optional
        Ids to leave out. When omitted, the csv branch falls back to a
        notebook-level ``not_processed`` if one is defined (what the original
        code read) and otherwise excludes nothing; the fasta branch excludes
        nothing.

    Returns
    -------
    list of tuple
        ``(id, input_ids)`` pairs, where ``input_ids`` is the 'input_ids'
        entry returned by the module-level ``tokenizer`` called with
        ``return_tensors="pt"`` and ``add_special_tokens=False``.

    Raises
    ------
    ValueError
        If `fasta_or_csv` is neither "csv" nor "fasta", or if an id cannot be
        converted with ``int()``.
    """
    if fasta_or_csv == "csv":
        if not_processed is None:
            # The original body referenced a global `not_processed` that may
            # not exist (NameError on a fresh kernel). Honour it when it is
            # defined, otherwise skip nothing.
            not_processed = globals().get("not_processed", ())
        dico_seq = {}
        for _, row in sequences.iterrows():
            if row["id"] >= starting and row["id"] not in not_processed:
                dico_seq[row["id"]] = row["sequence"]
    elif fasta_or_csv == "fasta":
        # Imported lazily so Biopython is only required for FASTA input.
        from Bio import SeqIO

        excluded = () if not_processed is None else not_processed
        dico_seq = {}
        for record in SeqIO.parse(sequences, "fasta"):
            # The id is the record description with "," replaced by "__".
            seq_id = "__".join(record.description.split(","))
            if seq_id not in excluded:
                dico_seq[seq_id] = str(record.seq)
    else:
        # Previously this fell through to an UnboundLocalError on `dico_seq`.
        raise ValueError(
            f"fasta_or_csv must be 'csv' or 'fasta', got {fasta_or_csv!r}"
        )

    # Named `tokenized` rather than `tokenized_sequences` so the local no
    # longer shadows this function.
    tokenized = []
    for idd, sequence in dico_seq.items():
        # Ids are expected to be integer-like; int() raises otherwise.
        if int(idd) >= starting:
            input_ids = tokenizer(
                sequence, return_tensors="pt", add_special_tokens=False
            )["input_ids"]
            tokenized.append((idd, input_ids))
    return tokenized

def convert_outputs_to_pdb(outputs):
    """
    Convert a batch of ESMFold model outputs into PDB-formatted strings.

    Parameters
    ----------
    outputs : mapping of str -> torch.Tensor
        Model output; this function reads the "positions", "aatype",
        "atom37_atom_exists", "residue_index", "plddt" and, when present,
        "chain_index" entries.

    Returns
    -------
    list of str
        One ``to_pdb`` result per element along the first axis of
        ``outputs["aatype"]``.
    """
    # Use the last entry of "positions" and expand it from the atom14 to the
    # atom37 layout while the data are still tensors.
    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)

    # Tensor.numpy() only works on CPU tensors. The previous code moved the
    # tensors to "cuda" whenever a GPU was available, which made .numpy()
    # raise on exactly the machines this runs on; always copy to the host.
    outputs = {k: v.to("cpu").numpy() for k, v in outputs.items()}
    final_atom_positions = final_atom_positions.cpu().numpy()
    final_atom_mask = outputs["atom37_atom_exists"]

    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        pred = OFProtein(
            aatype=outputs["aatype"][i],
            atom_positions=final_atom_positions[i],
            atom_mask=final_atom_mask[i],
            # Shift residue indices by one (1-based numbering in the output).
            residue_index=outputs["residue_index"][i] + 1,
            # pLDDT values are stored in the B-factor field.
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs

def esmfold_prediction(tokenized_sequences, path_out, log_path="/home/rsanjuan/Robby/anubis.log"):
    """
    Fold every tokenized sequence and write one PDB file per protein.

    Parameters
    ----------
    tokenized_sequences : iterable of (id, input_ids)
        Pairs such as those returned by the ``tokenized_sequences`` function;
        ``input_ids`` must provide a ``.cuda()`` method.
    path_out : str
        Directory in which ``<id>.pdb`` files are written.
    log_path : str, optional
        File to which failures are appended, one ``"<id> ; <error>"`` line per
        failed protein. Defaults to the path that used to be hard-coded.

    Returns
    -------
    None
        Results are written to disk. A failure on one protein is logged and
        the loop moves on to the next one.
    """
    for protein in tokenized_sequences:
        protein_id = protein[0]
        output = None
        try:
            with torch.no_grad():
                output = model(protein[1].cuda())
            pdb_txt = convert_outputs_to_pdb(output)
            with open(f"{path_out}/{protein_id}.pdb", "w") as outfile:
                outfile.write(pdb_txt[0])
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            with open(log_path, "a+") as outfile:
                outfile.write(f"{protein_id} ; {e}\n")
        finally:
            # Release GPU memory after failures too (e.g. out-of-memory), not
            # only after a success. Drop the reference to the model output
            # first so its tensors can actually be freed.
            output = None
            gc.collect()
            torch.cuda.empty_cache()

In [None]:
# Tokenize the sequences of the anubis FASTA file, then fold each one and
# write the resulting PDB files to the output directory.
path_data = "/home/rsanjuan/Robby/PhageDEPOdetection/data"
fasta_file = f"{path_data}/anubis.indices.fasta"
eg_tokenized = tokenized_sequences(sequences=fasta_file, fasta_or_csv="fasta")
esmfold_prediction(
    tokenized_sequences=eg_tokenized,
    path_out="/home/rsanjuan/Robby/phagedepo_out",
)
