# The goal here is to make ESMFold predictions on the FP, then screen them with Foldseek
***
# I. ESMfold script
***
# II. Foldseek script
***
# III. Foldseek scan
***

I.
> Move the files around

In [None]:
# Copy the FP multi-FASTA from the local workstation to the benchmarking
# directory on the remote host, using ssh as the transport.
rsync -avzhe ssh \
/media/concha-eloko/Linux/PhageDEPOdetection/Dpos_reference.T12_FP.multi.fasta \
conchae@garnatxa.srv.cpd:/home/conchae/PhageDepo_pdb/benchmarking


> ESMfold script

In [None]:
from multiprocessing.pool import ThreadPool
import pandas as pd 
from transformers import AutoTokenizer, EsmForProteinFolding
import torch

from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

# Single checkpoint identifier shared by the tokenizer and the folding model.
_ESMFOLD_CHECKPOINT = "facebook/esmfold_v1"

# Allow TF32 in CUDA matmuls before the model is loaded and used.
torch.backends.cuda.matmul.allow_tf32 = True
tokenizer = AutoTokenizer.from_pretrained(_ESMFOLD_CHECKPOINT)
model = EsmForProteinFolding.from_pretrained(_ESMFOLD_CHECKPOINT)

def tokenized_sequences(sequences, fasta_or_csv, not_processed=None,
                        path_out="/home/conchae/PhageDepo_pdb/benchmarking"):
    """
    Tokenize the protein sequences that do not have a predicted pdb file yet.

    Parameters
    ----------
    sequences :
        Either the path to a multifasta file (``fasta_or_csv == "fasta"``) or a
        dataframe (``fasta_or_csv == "csv"``) with two columns:
        - 'id', which corresponds to the protein name
        - 'sequence', which corresponds to the aa sequence
    fasta_or_csv :
        ``"fasta"`` or ``"csv"``; selects how `sequences` is read.
    not_processed :
        Optional collection of ids to exclude (csv input only). When omitted,
        a module-level ``not_processed`` is used if one exists, otherwise no
        id is excluded.
    path_out :
        Directory searched for existing ``<id>.pdb`` files; ids that already
        have a file there are skipped.

    Returns
    -------
    list of tuples (a, b) with a as the id and b as the tokenized input ids.

    Raises
    ------
    ValueError
        If `fasta_or_csv` is neither ``"csv"`` nor ``"fasta"``.
    """
    # Local import: `os` is needed below and is not imported at file level.
    import os

    if not_processed is None:
        # Keep working with a module-level exclusion list when one is defined.
        not_processed = globals().get("not_processed", ())

    starting = 0
    if fasta_or_csv == "csv":
        dico_seq = {
            row["id"]: row["sequence"]
            for _, row in sequences.iterrows()
            if row["id"] >= starting and row["id"] not in not_processed
        }
    elif fasta_or_csv == "fasta":
        from Bio import SeqIO
        # Only sequences longer than 400 residues are kept.
        dico_seq = {
            record.description.split()[0]: str(record.seq)
            for record in SeqIO.parse(sequences, "fasta")
            if len(str(record.seq)) > 400
        }
    else:
        raise ValueError(
            f"fasta_or_csv must be 'csv' or 'fasta', got {fasta_or_csv!r}"
        )

    tokenized = []
    for idd, sequence in dico_seq.items():
        # Skip proteins whose structure has already been written.
        if os.path.isfile(f"{path_out}/{idd}.pdb"):
            continue
        print(idd)
        input_ids = tokenizer(sequence, return_tensors="pt", add_special_tokens=False)['input_ids']
        tokenized.append((idd, input_ids))
    return tokenized

def convert_outputs_to_pdb(outputs):
    """
    Convert the folding model outputs into PDB-formatted strings.

    One string is produced per entry along the first axis of
    ``outputs["aatype"]``; the per-residue ``plddt`` values are stored as
    B-factors and residue indices are shifted by one.
    Returns the list of PDB strings.
    """
    # The atom37 conversion needs the raw model outputs, so it runs before
    # the outputs are turned into numpy arrays.
    atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs).cpu().numpy()
    arrays = {key: tensor.to("cpu").numpy() for key, tensor in outputs.items()}
    atom_mask = arrays["atom37_atom_exists"]
    has_chain_index = "chain_index" in arrays

    def _to_protein(idx):
        # Assemble the structure of a single batch entry.
        return OFProtein(
            aatype=arrays["aatype"][idx],
            atom_positions=atom_positions[idx],
            atom_mask=atom_mask[idx],
            residue_index=arrays["residue_index"][idx] + 1,
            b_factors=arrays["plddt"][idx],
            chain_index=arrays["chain_index"][idx] if has_chain_index else None,
        )

    n_entries = arrays["aatype"].shape[0]
    return [to_pdb(_to_protein(idx)) for idx in range(n_entries)]


def esmfold_prediction(protein, path_out="/home/conchae/PhageDepo_pdb/benchmarking"):
    """
    Predict the structure of one protein and write it as a pdb file.

    The function takes as an input :
    - 'protein', a tuple (id, tokenized input ids), i.e. one element of the
      list returned by the function tokenized_sequences
    - 'path_out', the path of the directory where the pdb file is written
      (defaults to the benchmarking directory)
    The function writes '<path_out>/<id>.pdb' and returns None.
    It takes a single required argument so that it can be used with map().
    """
    protein_id = protein[0]
    input_ids = protein[1]
    with torch.no_grad():
        output = model(input_ids)
        pdb_txt = convert_outputs_to_pdb(output)
        # One protein is folded per call, so only the first entry is written.
        with open(f"{path_out}/{protein_id}.pdb", "w") as outfile:
            outfile.write(pdb_txt[0])

# The predictions
path_data = "/home/conchae/PhageDepo_pdb/benchmarking"
FP_tokenized = tokenized_sequences(f"{path_data}/Dpos_reference.T12_FP.multi.fasta" , "fasta")
# list.reverse() reverses in place and returns None, so its result must not be
# assigned: reverse first, then alias the (now reversed) list.
FP_tokenized.reverse()
FP_tokenized_r = FP_tokenized
# esmfold_prediction returns None; the list only forces the lazy map to run.
results = list(map(esmfold_prediction, FP_tokenized_r))



In [None]:
#!/bin/bash
# SLURM batch script: activates the conda environment and runs the ESMFold
# prediction script on the FP sequences.
# Every directive must start with "#SBATCH"; any other "#..." prefix is treated
# as a plain comment and the option is ignored.
#SBATCH --job-name=ESM_fold_millard_
#SBATCH --qos=long-mem
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=60
#SBATCH --mem=200gb
#SBATCH --time=10-00:00:00
#SBATCH --output=ESM_fold__%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate embeddings

python /home/conchae/PhageDepo_pdb/script_files/esmfold_FP.py
