# Running ScaleAP on the depolymerase dataset 
***

### 1. Importing the modules 

In [3]:
import pandas as pd
import numpy as np
import os
import subprocess
from scipy.spatial import distance
from tqdm import tqdm

***
### 2. Loading the Dataframes 

In [4]:
# Directory the embedding table is read from (and the distance/similarity files are written to).
path_DeepPredic = "/media/concha-eloko/Linux/prediction_depolymerase_tropism/DeepDom/predictions_PPT"

# The first line of the CSV is used as the header row.
embedding_csv = f"{path_DeepPredic}/ProtTransBertBFDEmbedder.Deepdom_domain.embedding.csv"
embedding_df = pd.read_csv(embedding_csv, header=0)

In [6]:
# Peek at the first ten identifiers to see how the "ID" strings are laid out.
first_ids = embedding_df["ID"].iloc[:10]
for domain_id in first_ids:
    print(domain_id)

GCF_021530775.1__HHBOICAB_25__HHBOICAB_25_801__444_domain__22-466
GCF_021521005.1__LCHDJNDO_42__LCHDJNDO_42_5035__444_domain__22-466
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2553__136_domain__140-276
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2813__255_domain__1-256
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2796__275_domain__126-401
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2702__84_domain__10-94
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2819__240_domain__11-251
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2693__584_domain__10-594
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2572__203_domain__618-821
GCF_900494015.1__PDBOAHND_54__PDBOAHND_54_2731__449_domain__1-450


In [18]:
# Show the column labels of the embedding table.
embedding_df.columns

Index(['ID', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '1014', '1015', '1016', '1017', '1018', '1019', '1020', '1021', '1022',
       '1023'],
      dtype='object', length=1025)

In [22]:
# Keep only the embedding columns: the "ID" label column is left out of the computations.
embedding_df_comp = embedding_df.drop(columns=["ID"]).copy()


***
## 3.a Preparing the Input 


<div class="alert alert-block alert-info"> 
Finally going for the similarity computed from the standardized Euclidean distance.
</div>

> Standardized Euclidean 

In [29]:
%%timeit
from sklearn.preprocessing import StandardScaler

    # First step : Standardize the DF :
scaler = StandardScaler()
embedding_df_comp_scaled = scaler.fit_transform(embedding_df_comp)


with open(f"{path_DeepPredic}/seuclid_sim.txt","w") as outfile :
    outfile.write(f"{len(embedding_df)-1}\n")
    for i_object_n, df_n in tqdm(enumerate(embedding_df_comp_scaled)):
        for i_object_m, df_m in enumerate(embedding_df_comp_scaled): 
            if i_object_m > i_object_n :
                dist = distance.euclidean(df_n, df_m)
                sim=(1/(1+dist))
                outfile.write(f"{sim}\n")

1000it [00:09, 103.72it/s]
1000it [00:11, 84.42it/s]
1000it [00:12, 81.35it/s]
1000it [00:11, 83.83it/s]
1000it [00:12, 80.53it/s]
1000it [00:12, 81.67it/s]
1000it [00:11, 83.94it/s]
1000it [00:11, 84.56it/s]

12.1 s ± 224 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)





> Euclidean distance 

In [20]:
%%timeit

with open(f"{path_DeepPredic}/euclid_dist.txt","w") as outfile :
    outfile.write(f"{len(embedding_df)-1}\n")
    for i_object_n, df_n in tqdm(enumerate(embedding_df_comp.values)):
        for i_object_m, df_m in enumerate(embedding_df_comp.values): 
            if i_object_m > i_object_n :
                sim = distance.euclidean(df_n, df_m)
                outfile.write(f"{sim}\n")



1000it [00:10, 93.97it/s]
1000it [00:11, 85.48it/s]
1000it [00:13, 75.29it/s]
1000it [00:12, 77.64it/s]
1000it [00:12, 82.09it/s]
1000it [00:12, 79.67it/s]
1000it [00:11, 83.94it/s]
1000it [00:12, 82.46it/s]

12.4 s ± 511 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)





In [6]:
# Raw (unscaled) embedding values as a NumPy array.
embedding_df_comp.values

array([[ 0.01094471, -0.02100378,  0.00441607, ...,  0.0033936 ,
        -0.02176292, -0.01044327],
       [ 0.01094471, -0.02100378,  0.00441607, ...,  0.0033936 ,
        -0.02176292, -0.01044327],
       [ 0.02202449, -0.03982536, -0.0075569 , ..., -0.00263371,
         0.00755109,  0.00217497],
       ...,
       [-0.02490843, -0.06961409,  0.0001383 , ...,  0.05758946,
        -0.01806621,  0.02298909],
       [-0.05795157, -0.0385082 ,  0.01458298, ..., -0.01609902,
        -0.00114209, -0.02109662],
       [-0.02262551, -0.0148143 ,  0.01336187, ..., -0.00515807,
        -0.0069646 ,  0.03947451]])

In [7]:
# The table gets its own name: binding it to `distance` would shadow the
# scipy.spatial.distance module that other cells call as distance.euclidean(...).
# read_csv takes the file's first line (the object count) as the column header.
distance_df = pd.read_csv(f"{path_DeepPredic}/euclid_dist.txt")
distance_df

Unnamed: 0,999
0,0.000000
1,2.058916
2,1.823655
3,1.587911
4,1.642102
...,...
499495,2.323570
499496,1.482692
499497,2.469739
499498,1.551706


In [30]:
# Read the similarity file back; read_csv takes its first line (the object
# count) as the column header.
similarity_path = f"{path_DeepPredic}/seuclid_sim.txt"
similarity = pd.read_csv(similarity_path)
similarity

Unnamed: 0,999
0,1.000000
1,0.019628
2,0.021240
3,0.019342
4,0.020404
...,...
499495,0.017503
499496,0.023469
499497,0.015411
499498,0.019880


> Using the Jaccard similarity

In [9]:
from math import*
  
def jaccard_similarity(x,y):
    """Return the Jaccard index of the distinct values found in ``x`` and ``y``.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is the number of shared values divided by the number
    of values present in either argument.

    Raises:
        ZeroDivisionError: if both ``x`` and ``y`` are empty.
    """
    left, right = set(x), set(y)
    shared = left & right
    combined = left | right
    return len(shared) / float(len(combined))



In [10]:
%%timeit
with open(f"{path_DeepPredic}/jaccard_similarity.txt","w") as outfile :
    outfile.write(f"{len(embedding_df)-1}\n")
    for i_object_n, df_n in tqdm(enumerate(embedding_df_comp.values)):
        for i_object_m, df_m in enumerate(embedding_df_comp.values): 
            if i_object_m > i_object_n :
                sim = jaccard_similarity(df_n, df_m)
                outfile.write(f"{sim}\n")

1000it [04:42,  3.54it/s]
1000it [04:41,  3.55it/s]
1000it [04:36,  3.61it/s]
1000it [04:51,  3.43it/s]
1000it [04:43,  3.53it/s]
1000it [04:43,  3.53it/s]
1000it [04:54,  3.40it/s]
1000it [04:40,  3.57it/s]

4min 44s ± 5.84 s per loop (mean ± std. dev. of 7 runs, 1 loop each)





> Iterating the DataFrame with to_dict 

In [None]:
#%%timeit

# to_dict must be called: enumerating the bound method itself raises a TypeError.
# With orient="records" every row becomes a {column_name: value} dict.
records = embedding_df_comp.to_dict("records")
embedding_rows = embedding_df_comp.values

with open(f"{path_DeepPredic}/euclid_dist.to_dict.txt","w") as outfile :
    outfile.write(f"{len(embedding_df)-1}\n")
    for i_object_n, record_n in tqdm(enumerate(records)):
        # Turn the record back into a numeric vector (dict values keep the column order).
        vec_n = np.fromiter(record_n.values(), dtype=float)
        # Only pairs (n, m) with m > n are written, in row-major order.
        for df_m in embedding_rows[i_object_n + 1:]:
            sim = distance.euclidean(vec_n, df_m)
            outfile.write(f"{sim}\n")


In [15]:
# Inspect what to_dict("records") yields: show the first record and its type only.
df_dict = embedding_df_comp.to_dict("records")
for row in df_dict[:1]:
    print(row)
    print(type(row))

{'0': 0.010944708999999999, '1': -0.021003777, '2': 0.0044160679999999996, '3': 0.025020916, '4': 0.029725766, '5': -0.011487311000000002, '6': 0.0050324066999999995, '7': -0.007687006999999999, '8': -0.008533104, '9': 0.0031625396, '10': 0.021872709, '11': -0.037935946, '12': 0.0072103604, '13': 0.055501506, '14': 0.018329808, '15': -0.024559965, '16': 0.01403402, '17': 0.042735767, '18': -0.009278702, '19': 0.01919487, '20': 0.010608316999999999, '21': 0.012370038, '22': -0.01688994, '23': -0.006845807, '24': 0.0042248929999999995, '25': 0.031367723, '26': 0.004645036, '27': 0.024645282, '28': 0.018317653, '29': 0.023295613, '30': -0.0050248583, '31': -0.018076232, '32': -0.033455, '33': -0.03189947, '34': 0.0069305026, '35': 0.024884889, '36': 0.28984222, '37': 0.003781024, '38': -0.04863037, '39': 0.015170323999999999, '40': -0.008154415, '41': 0.03256594, '42': 0.020230057, '43': 0.014607469, '44': -0.06774522, '45': -0.034952376, '46': -0.0051975218, '47': -0.008625318, '48': 0.0

***
## 3.b  On the server 

In [None]:
import pandas as pd
import numpy as np
import os
import subprocess
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Defining the path
path_DeepPredic = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP"

# Opening the relevant DataFrame
embedding_df = pd.read_csv(f"{path_DeepPredic}/ProtTransBertBFDEmbedder.Deepdom_domain.embedding.csv", header=0)
embedding_df_comp = embedding_df.drop("ID",axis=1).copy()

# First step : standardize every embedding column
scaler = StandardScaler()
embedding_df_comp_scaled = scaler.fit_transform(embedding_df_comp)

# Second step : writing the similarity file.
# Layout: the object count on the first line, then the full n x n similarity
# matrix (self-similarities included), one value per line in row-major order.
with open(f"{path_DeepPredic}/Deepdom.seuclid_sim.medium.txt","w") as outfile :
    outfile.write(f"{len(embedding_df)}\n")
    for i_object_n in tqdm(range(len(embedding_df_comp_scaled))):
        # One vectorized call per row instead of one Python-level call per pair;
        # only a single row of distances is held in memory at a time.
        row_dist = distance.cdist(
            embedding_df_comp_scaled[i_object_n:i_object_n + 1],
            embedding_df_comp_scaled,
            metric="euclidean",
        )[0]
        # Distance -> similarity: 1 for identical vectors, tending to 0 as the distance grows.
        outfile.writelines(f"{sim}\n" for sim in 1 / (1 + row_dist))
                
# ********************************************************************************************************************************************************************        
#!/bin/bash
# Runs script_file/similarity_file.py inside the ScaleAP conda environment.
# The job-name line previously read "#BATCH", which Slurm treats as an ordinary comment.
#SBATCH --job-name=ScaleAP_pt1_
#SBATCH --partition=short
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=20gb
#SBATCH --time=1-00:00:00
#SBATCH --output=ScaleAP_pt1_%j.log

module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate ScaleAP

python3 /home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP/script_file/similarity_file.py

In [None]:
import pandas as pd
import numpy as np
import os
import subprocess
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Defining the path
path_DeepPredic = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP"

# Opening the relevant DataFrame
embedding_df = pd.read_csv(f"{path_DeepPredic}/ProtTransBertBFDEmbedder.Deepdom_domain.embedding.csv", header=0)
embedding_df_comp = embedding_df.drop("ID",axis=1).copy()

# First step : standardize every embedding column
scaler = StandardScaler()
embedding_df_comp_scaled = scaler.fit_transform(embedding_df_comp)

# Second step : writing the similarity file.
# The scaled array is iterated directly: wrapping it in a DataFrame and calling
# iterrows() rebuilt the frame on every outer iteration and boxed each row in a Series.

#with open(f"{path_DeepPredic}/Deepdom.seuclid_sim.txt","w") as outfile :
with open(f"{path_DeepPredic}/Deepdom.seuclid_sim.test","w") as outfile :
    # NOTE(review): the header is the object count minus one although the full
    # n x n matrix follows — confirm which count ScaleAP expects.
    outfile.write(f"{len(embedding_df)-1}\n")
    for i_object_n in tqdm(range(len(embedding_df_comp_scaled))):
        # One vectorized call per row; only a single row of distances is held in memory.
        row_dist = distance.cdist(
            embedding_df_comp_scaled[i_object_n:i_object_n + 1],
            embedding_df_comp_scaled,
            metric="euclidean",
        )[0]
        # Distance -> similarity: 1 for identical vectors, tending to 0 as the distance grows.
        outfile.writelines(f"{sim}\n" for sim in 1 / (1 + row_dist))
                

> Check the integrity of the file : 

In [None]:
import pandas as pd
import numpy as np
import os
import subprocess
from tqdm import tqdm

# Defining the path
path_DeepPredic = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP"

# Read the similarity file back; header=0 makes read_csv use its first line as the column label.
similarities_path = f"{path_DeepPredic}/Deepdom.seuclid_sim.medium.txt"
similarities_df = pd.read_csv(similarities_path, header = 0)

***
### 4. Converting the similarity file

In [None]:
# Generic form of the converter call: -i takes the text file, -o the .bin file to produce.
/home/conchae/software/ScaleAP/convert -i sample_graph.txt -o sample_graph.bin

# ********************************************************************************************************************************************************************        
#!/bin/bash
# Converts the text similarity file into the .bin file with ScaleAP's convert tool.
# The job-name line previously read "#BATCH", which Slurm treats as an ordinary comment.
#SBATCH --job-name=ScaleAP_pt2_
#SBATCH --partition=medium
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=150gb
#SBATCH --time=1-00:00:00
#SBATCH --output=ScaleAP_pt2_%j.log

module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate ScaleAP

/home/conchae/software/ScaleAP/convert -i /home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP/Deepdom.seuclid_sim.medium.txt -o /home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP/Deepdom.seuclid_sim.bin


***
## 5. Running the algorithm

In [None]:
# ********************************************************************************************************************************************************************        
#!/bin/bash
# Runs ScaleAP on the converted .bin similarity file.
# The job-name line previously read "#BATCH", which Slurm treats as an ordinary comment.
# NOTE(review): job name and log prefix reuse "ScaleAP_pt1_" from the similarity-file job — confirm this is intended.
#SBATCH --job-name=ScaleAP_pt1_
#SBATCH --partition=short
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=20gb
#SBATCH --time=1-00:00:00
#SBATCH --output=ScaleAP_pt1_%j.log

module restore la_base
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate ScaleAP

# NOTE(review): meaning of -l / -t / -p / -D / -R is not shown here — check the ScaleAP usage text before changing them.
/home/conchae/software/ScaleAP/ScaleAP -i /home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP/Deepdom.seuclid_sim.bin -l 0.95 -t 1000 -p -0.20 -L /home/conchae/prediction_depolymerase_tropism/prophage_prediction/ML_work/ScaleAP/Deepdom.seuclid_sim.log -D -R



***
## 6. Scanning the results 