# Goal : Scan the 3D predcitions with the PD fold database (12/09/2023). 

### A. Make the DB
### B. Run Foldseek
### C. Get the hits with good proba
### D. Report the folds
***

# A

In [None]:
# Generates the database :
# mkdir /media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database_foldseek
foldseek createdb \
/media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database \
/media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database_foldseek/PD_fold_database_foldseek

# Generate index files : 
foldseek createindex \
/media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database_foldseek/PD_fold_database_foldseek \ 
/media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database

# Move to server :
rsync -avzhe ssh \
/media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database \
conchae@garnatxa.srv.cpd:/home/conchae/depolymerase
    
rsync -avzhe ssh \
/media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database_foldseek \
conchae@garnatxa.srv.cpd:/home/conchae/depolymerase

***
## B  

In [None]:
import subprocess
import os

path_project = "/media/concha-eloko/Linux/PhageDEPOdetection"
path_pdb = f"{path_project}/phagedepo_out"
path_db = f"/media/concha-eloko/Linux/PhageDEPOdetection/PD_fold_database_foldseek/PD_fold_database_foldseek"
path_tmp = f"{path_project}/tmp"

def seek_beta_helix(path_in) :
    dir_out = f"{path_project}/PD_fold_seekfold"
    protein_id = path_in.split("/")[-1].split(".pdb")[0]
    path_out = f"{dir_out}/{protein_id}.out"
    output_frmt = f"query,target,pident,alnlen,gapopen,qstart,qend,tstart,tend,bits,prob"
    seek = f"foldseek easy-search {path_in} {path_db} {path_out} {path_tmp} --format-output {output_frmt}"
    #seek = f"foldseek easy-search {path_in} {path_db} {path_out}.html {path_tmp} --format-mode 3"
    seek_process = subprocess.Popen(seek, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    press_out, press_err = seek_process.communicate()
    print (press_out, press_err)

paths = [f"{path_pdb}/{file}" for file in os.listdir(f"{path_pdb}")]


for path in paths :
    seek_beta_helix(path)

***
## C.

In [1]:
import pandas as pd
import os 

path_project = "/media/concha-eloko/Linux/PhageDEPOdetection"
dir_out = f"{path_project}/PD_fold_seekfold"
outputs = [f"{dir_out}/{file}" for file in os.listdir(dir_out) if file[-3:]=="out"]

header_seekfold = ["query","target","pident","alnlen","gapopen","qstart","qend","tstart","tend","bits","prob"]

depo_ppt = {}
very_good = 0
good = 0
decent = 0 
passable = 0
for results in outputs :
    results_df = pd.read_csv(f"{results}", sep = "\t" , names = header_seekfold)
    try : 
        if results_df["prob"][0] > 0.1 :
            depo_ppt[results_df["query"][0]] = (results_df["target"][0] , results_df["prob"][0])
            if results_df["prob"][0] > 0.9 :
                very_good += 1
            elif results_df["prob"][0] > 0.5 :
                good += 1
            elif results_df["prob"][0] > 0.25 :
                decent += 1
            else :
                passable += 1
                
    except IndexError :
        pass

import pprint
#pp = pprint.PrettyPrinter(width = 100, sort_dicts = True, compact = True)


print(f"So far, out of {len((outputs))} proteins searched, {len(depo_ppt)} seemed to be positive in the phagedepo dataset ({len(depo_ppt)/len(outputs)} %).\nThe proportion of very good is {very_good/len(depo_ppt)}\nThe proportion of good is {good/len(depo_ppt)}\nThe proportion of decent is {decent/len(depo_ppt)}\nThe rest is {passable/len(depo_ppt)}")


So far, out of 680 proteins searched, 398 seemed to be positive in the phagedepo dataset (0.5852941176470589 %).
The proportion of very good is 0.7889447236180904
The proportion of good is 0.10301507537688442
The proportion of decent is 0.052763819095477386
The rest is 0.05527638190954774


In [6]:
path_info = f"/media/concha-eloko/Linux/depolymerase_building/depolymerase_fold.csv"
info_df = pd.read_csv(path_info , sep = "\t", header = 0)

info_df["Folds"].unique()
info_df

Unnamed: 0,Activity,ID,Folds
0,xylanase,2d97.pdb,jelly-roll
1,Polysaccharide lyase 8,7fhv.pdb_A,alpha/alpha toroid
2,Polysaccharide lyase 8,7fhv.pdb_B,alpha/alpha toroid
3,Polysaccharide lyase 8,7fhy.pdb_A,alpha/alpha toroid
4,Polysaccharide lyase 8,7fhy.pdb_B,alpha/alpha toroid
...,...,...,...
61,GH5_4,4YZPA_1_1_313.pdb,TIM beta/alpha-barrel
62,GH5_8,1WKYA_1_1_302.pdb,TIM beta/alpha-barrel
63,GH5_34,2Y8KA_1_1_313.pdb,TIM beta/alpha-barrel
64,GH15,1UG9A_3_270_687.pdb,alpha/alpha toroid


***
## C.

In [9]:
import pandas as pd
import os 
from tqdm import tqdm

path_project = "/media/concha-eloko/Linux/PhageDEPOdetection"
dir_out = f"{path_project}/PD_fold_seekfold"

path_info = f"/media/concha-eloko/Linux/depolymerase_building/depolymerase_fold.csv"
info_df = pd.read_csv(path_info , sep = "\t", header = 0)

dico_folds = {"jelly-roll" : [],
              "alpha/alpha toroid" : [],
              "right-handed beta-helix" : [] ,
              "TIM beta/alpha-barrel" : [],
              "6-bladed beta-propeller" : [] ,
              "Flavodoxin-like" : [] ,
              "Alpha/Beta hydrolase fold" : [] ,
              "Other" : [],
             }

outputs = [f"{dir_out}/{file}" for file in os.listdir(dir_out) if file[-3:]=="out"]
header_seekfold = ["query","target","pident","alnlen","gapopen","qstart","qend","tstart","tend","bits","prob"]
depo_results = {}
for results in tqdm(outputs) :
    results_df = pd.read_csv(f"{results}", sep = "\t" , names = header_seekfold)
    for _,row in results_df.iterrows() :
        fold = info_df[info_df["ID"] == row["target"]]["Folds"].values[0]
        if row["prob"] >= 0.5 :
            if row["query"] not in dico_folds[fold] :
                dico_folds[fold].append(row["query"])
        elif fold == "right-handed beta-helix" and row["prob"] >= 0.2 :
            if row["query"] not in dico_folds[fold] :
                dico_folds[fold].append(row["query"])
                
for fold in dico_folds : 
    print(f"The {fold} presented {len(dico_folds[fold])} depolymerases.\n")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1926/1926 [00:20<00:00, 94.25it/s]

The jelly-roll presented 101 depolymerases.

The alpha/alpha toroid presented 11 depolymerases.

The right-handed beta-helix presented 844 depolymerases.

The TIM beta/alpha-barrel presented 10 depolymerases.

The 6-bladed beta-propeller presented 119 depolymerases.

The Flavodoxin-like presented 0 depolymerases.

The Alpha/Beta hydrolase fold presented 0 depolymerases.

The Other presented 4 depolymerases.






In [16]:
dico_folds["TIM beta/alpha-barrel"]

['4206.pdb',
 '914.pdb',
 '1043.pdb',
 '5668.pdb',
 '4208.pdb',
 '1903.pdb',
 '6513.pdb',
 '3091.pdb',
 '4171.pdb',
 '5113.pdb']

> Make files :

In [17]:
from Bio import SeqIO

def extract_sequence_from_pdb(pdb_path):
    for record in SeqIO.parse(pdb_path, "pdb-atom"):
        return str(record.seq)

In [18]:
path_project = "/media/concha-eloko/Linux/PhageDEPOdetection"
path_pdb = f"{path_project}/phagedepo_out"

with open(f"{path_project}/rare_folds.tsv" , "w") as outfile :    
    outfile.write(f"Fold\tprotein_id\taa_sequence\n")
    for fold in dico_folds :
        for file in dico_folds[fold] : 
            try : 
                if fold in ["TIM beta/alpha-barrel" , "alpha/alpha toroid"] : 
                    seq = extract_sequence_from_pdb(f"{path_pdb}/{file}")
                    os.system(f"cp {path_pdb}/{file} /media/concha-eloko/Linux/PhageDEPOdetection/rare_folds_pdb")
                    outfile.write(f"{fold}\t{file}\t{seq}\n")
            except Exception as e :
                pass



In [None]:
pp = pprint.PrettyPrinter(width = 100, sort_dicts = True, compact = True)

pp.pprint(decipher_phagedepo)