# Build the best-parameters dictionary (JSON) file:

In [None]:
import os 
import json
from tqdm import tqdm

path_work = "/media/concha-eloko/Linux/PPT_clean/trainer_best_parameters"

In [None]:
def make_json(OPTUNA_PATH) : 
    """Collect every JSON file of a directory into one dictionary.

    Args:
        OPTUNA_PATH: directory to scan (not recursive).

    Returns:
        dict: maps the part of each file name before the first ``_`` to the
        parsed content of that file. Only names ending in ``json`` are read.
    """
    params_by_prefix = {}
    json_names = (name for name in os.listdir(OPTUNA_PATH) if name.endswith("json"))
    for name in json_names:
        with open(f"{OPTUNA_PATH}/{name}", "r") as handle:
            # The file-name prefix (text before the first underscore) is the key.
            params_by_prefix[name.split("_")[0]] = json.load(handle)
    return params_by_prefix

In [None]:
# Maps each Optuna log directory name to the short model label used as key in
# the merged dictionary.
model_name = { 
"ensemble_20112024_log_optimized_SAGE_ultraF" : "SAGE_uf",
"ensemble_20112024_log_optimized_TropiGAT" : "TropiGAT",
"ensemble_20112024_log_optimized_TropiGAT_ultraF" : "TropiGAT_uf",
"ensemble_20112024_log_optimized_SAGE" : "SAGE"
}

# Merge the per-directory dictionaries into one: model label -> file prefix -> parameters.
best_para_dico = {}
for rep in tqdm(os.listdir(path_work)) : 
    rep_path = f"{path_work}/{rep}"
    # path_work also receives the output file written below, so a re-run lists
    # it too. Skip anything that is not a known model directory instead of
    # failing with KeyError / NotADirectoryError.
    if rep not in model_name or not os.path.isdir(rep_path):
        continue
    dico_rep = make_json(rep_path)
    best_para_dico[model_name[rep]] = dico_rep
    
with open(f"{path_work}/DAG_models_best_para.json", "w") as fp:
    json.dump(best_para_dico, fp)

In [None]:
best_para_dico["TropiGAT"]

# Fixing the ultrafiltration process: 

In [3]:
import os
import json
import random
import warnings
from collections import Counter
from itertools import product
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, label_binarize
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch_geometric.data import DataLoader, HeteroData
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import GATv2Conv, HeteroConv, to_hetero
from torch_geometric.utils import negative_sampling
from tqdm import tqdm

import TropiGAT_graph
import TropiGAT_models

warnings.filterwarnings("ignore")

# Constants
# **************************************************
# Switch for the ultra-filtration step.
# NOTE(review): no code visible in this notebook section reads this flag — confirm where it is used.
ultrafiltration = True
# **************************************************

# Root folder containing the input tables read below.
path_work = "/media/concha-eloko/Linux/PPT_clean"

# Both tables are tab-separated with a header row.
bacteria_data_df = pd.read_csv(f"{path_work}/results_kleborate_count.tsv", sep = "\t", header = 0)
# Same file that load_and_preprocess_data() reads again on its own.
DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep = "\t" ,  header = 0)


def load_and_preprocess_data():
    """Read the TropiGAT table and index its prophages.

    Returns:
        tuple: ``(df_info, dico_prophage_info)``. ``df_info`` is the table
        with one row per ``Protein_name``. ``dico_prophage_info`` maps each
        ``Phage`` to ``{"prophage_strain": ..., "ancestor": ...}``, taken from
        the first row of that phage.
    """
    table_path = f"{path_work}/TropiGATv2.final_df_v2.tsv"
    df_info = pd.read_csv(table_path, sep="\t", header=0).drop_duplicates(subset=["Protein_name"])

    # One representative row per phage: the first one encountered.
    first_rows = df_info.drop_duplicates(subset=["Phage"], keep="first")
    dico_prophage_info = {}
    for _, record in first_rows.iterrows():
        dico_prophage_info[record["Phage"]] = {
            "prophage_strain": record["prophage_id"],
            "ancestor": record["Infected_ancestor"],
        }

    return df_info, dico_prophage_info

def filter_prophages(df_info, dico_prophage_info):
    """Drop redundant prophages and rows with an ambiguous KL type.

    Prophages are grouped by ``(prophage_id, Infected_ancestor)``. Inside a
    group, phages are compared through their set of ``domain_seq`` values and
    only the first phage met for each distinct set is kept.

    Args:
        df_info: DataFrame with at least the columns ``Phage``,
            ``prophage_id``, ``Infected_ancestor``, ``domain_seq`` and
            ``KL_type_LCA``.
        dico_prophage_info: mapping ``Phage -> {"prophage_strain": ...,
            "ancestor": ...}``; its iteration order decides which phage of a
            group is used as the reference.

    Returns:
        The rows of ``df_info`` whose ``Phage`` was kept and whose
        ``KL_type_LCA`` does not contain a ``|`` character.
    """
    def get_filtered_prophages(prophage):
        """Classify the phages that share a group with ``prophage``.

        Returns the group rows, the set of phages to exclude and the set of
        phages to keep (which always contains ``prophage`` itself).
        """
        # Distinct domain_seq sets already met among the other phages of the group.
        combinations = []
        to_exclude = set()
        to_keep = set()
        to_keep.add(prophage)
        # All rows with the same prophage_id and the same ancestor as the reference phage.
        df_prophage_group = df_info[
            (df_info["prophage_id"] == dico_prophage_info[prophage]["prophage_strain"]) & 
            (df_info["Infected_ancestor"] == dico_prophage_info[prophage]["ancestor"])
        ]
        # Note: this counts rows, not phages; a single-row group has nothing to compare.
        if len(df_prophage_group) == 1:
            return df_prophage_group, to_exclude, to_keep
        
        depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage]["domain_seq"].values)
        for prophage_tmp in df_prophage_group["Phage"].unique():
            if prophage_tmp != prophage:
                tmp_depo_set = set(df_prophage_group[df_prophage_group["Phage"] == prophage_tmp]["domain_seq"].values)
                if depo_set == tmp_depo_set:
                    # Same domain_seq set as the reference phage: redundant.
                    to_exclude.add(prophage_tmp)
                elif tmp_depo_set not in combinations:
                    # First phage carrying this new set: keep it and remember the set.
                    to_keep.add(prophage_tmp)
                    combinations.append(tmp_depo_set)
                else:
                    # Set already represented by a previously kept phage.
                    to_exclude.add(prophage_tmp)
        return df_prophage_group, to_exclude, to_keep

    good_prophages = set()
    excluded_prophages = set()

    for prophage in tqdm(dico_prophage_info.keys()):
        # Phages already classified through an earlier reference phage are not re-evaluated.
        if prophage not in excluded_prophages and prophage not in good_prophages:
            _, excluded_members, kept_members = get_filtered_prophages(prophage)
            good_prophages.update(kept_members)
            excluded_prophages.update(excluded_members)

    df_info_filtered = df_info[df_info["Phage"].isin(good_prophages)]
    # "\\|" is a regex for a literal pipe: rows whose KL type contains "|" are dropped.
    df_info_final = df_info_filtered[~df_info_filtered["KL_type_LCA"].str.contains("\\|")]

    return df_info_final


def ultrafilter_prophages(df_info):
    """Remove prophages whose domain set duplicates another one in the same KL type.

    Within each ``KL_type_LCA`` value, phages are visited in order of first
    appearance. A phage whose set of ``domain_seq`` values equals the set of an
    earlier phage of that KL type is flagged as a duplicate. Every row of a
    flagged phage is removed from the result, whatever its KL type.

    Args:
        df_info: DataFrame with at least the columns ``Phage``,
            ``KL_type_LCA`` and ``domain_seq``. It is not modified.

    Returns:
        The rows of ``df_info`` whose ``Phage`` was not flagged as duplicate.
    """
    duplicate_prophages = set()

    # sort=False keeps groups in order of first appearance, so the first phage
    # carrying a given domain set is the one that is kept.
    for _, df_kl in df_info.groupby("KL_type_LCA", sort=False):
        # frozensets are hashable, so "already seen" is a set lookup rather
        # than a linear scan over a list of sets.
        seen_depo_sets = set()
        for prophage, df_phage in df_kl.groupby("Phage", sort=False):
            depo_set = frozenset(df_phage["domain_seq"].values)
            if depo_set in seen_depo_sets:
                duplicate_prophages.add(prophage)
            else:
                seen_depo_sets.add(depo_set)

    df_info_ultrafiltered = df_info[~df_info["Phage"].isin(duplicate_prophages)]
    return df_info_ultrafiltered


def prepare_kltypes(df_info, min_count=10):
    """Select the KL types that have enough distinct prophages.

    Args:
        df_info: DataFrame with at least the columns ``Phage`` and
            ``KL_type_LCA``.
        min_count: minimum number of distinct prophages a KL type needs to be
            selected. Defaults to 10, the previously hard-coded threshold.

    Returns:
        tuple: ``(kltypes, dico_prophage_count)``. ``kltypes`` is the list of
        KL types reaching ``min_count``. ``dico_prophage_count`` maps every KL
        type to its prophage count.
    """
    # Count each phage once, using the KL type of its first row.
    df_prophages = df_info.drop_duplicates(subset=["Phage"])
    dico_prophage_count = dict(Counter(df_prophages["KL_type_LCA"]))
    kltypes = [kltype for kltype, count in dico_prophage_count.items() if count >= min_count]
    return kltypes, dico_prophage_count



In [4]:
df_info, dico_prophage_info = load_and_preprocess_data()

In [8]:
df_info_final = filter_prophages(df_info, dico_prophage_info)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15981/15981 [00:17<00:00, 894.50it/s]


In [9]:
len(df_info_final["Phage"].unique())

8871

In [11]:
# NOTE(review): this ultrafilters the unfiltered `df_info`, not the `df_info_final` produced by filter_prophages above — confirm which input is intended.
df_info_final_uf = ultrafilter_prophages(df_info)

In [12]:
len(df_info_final_uf["Phage"].unique())

4271

# Get the log files with the best parameters : 


In [None]:
# Copy the four log directories (TropiGAT and SAGE, each with and without the
# "ultraF" suffix) from the remote host into the local trainer_best_parameters folder.
# NOTE(review): these are shell commands; inside a notebook cell they presumably
# need a leading "!" or must be run from a terminal — confirm.
rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/ensemble_20112024_log_optimized_TropiGAT \
/media/concha-eloko/Linux/PPT_clean/trainer_best_parameters


rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/ensemble_20112024_log_optimized_TropiGAT_ultraF \
/media/concha-eloko/Linux/PPT_clean/trainer_best_parameters


rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/ensemble_20112024_log_optimized_SAGE \
/media/concha-eloko/Linux/PPT_clean/trainer_best_parameters


rsync -avzhe ssh \
conchae@garnatxa.srv.cpd:/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/ensemble_20112024_log_optimized_SAGE_ultraF \
/media/concha-eloko/Linux/PPT_clean/trainer_best_parameters


In [None]:
import os
import json

# Local copy of the TropiGAT log directory; OPTUNA_PATH is an alias used by the next cell.
path_json = "/media/concha-eloko/Linux/PPT_clean/trainer_best_parameters/ensemble_20112024_log_optimized_TropiGAT"
OPTUNA_PATH = path_json


# Load a single file (KL136) to inspect its content.
with open(f"{path_json}/KL136_optuna_best_params.json", "r") as f:  # Use context manager to open the file
    best_parameters = json.load(f)

In [None]:
# Read every JSON file of OPTUNA_PATH into one dictionary keyed by the part of
# the file name before the first underscore (e.g. "KL136").
# Note: `best_parameters` is overwritten on each iteration and ends up holding
# the content of the last file read.
DICO_OPTUNA = {}
for file in os.listdir(OPTUNA_PATH):
    if file.endswith("json"):
        kl_type = file.split("_")[0]
        with open(f"{OPTUNA_PATH}/{file}", "r") as f:
            best_parameters = json.load(f)
        DICO_OPTUNA[kl_type] = best_parameters


# Other

In [None]:
import os 
import pandas as pd
from collections import Counter


# Root folder containing the two tab-separated input tables.
path_work = "/media/concha-eloko/Linux/PPT_clean"

bacteria_data_df = pd.read_csv(f"{path_work}/results_kleborate_count.tsv", sep = "\t", header = 0)

DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep = "\t" ,  header = 0)

# Split the rows on whether KL_type_LCA contains a literal "|" ("\\|" is the
# escaped regex): rows without it go to DF_info_lvl_0, rows with it to DF_info_amb.
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
DF_info_amb = DF_info[DF_info["KL_type_LCA"].str.contains("\\|")]

#DF_info_lvl_0_filter1 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

In [None]:
import os
import random
import warnings
from collections import Counter
from multiprocessing.pool import ThreadPool
import json
import math
import logging

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score, roc_auc_score)
import optuna
from sklearn.model_selection import StratifiedKFold
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch_geometric.nn import GATv2Conv, HeteroConv
from tqdm import tqdm

In [None]:
import TropiGAT_graph
import TropiGAT_models
# NOTE(review): a dangling `import` with no module name followed here and made
# the cell a SyntaxError; it was removed. Re-add the intended module if needed.

In [None]:
! python -m pip install optuna


# Not sure what happens here:

In [None]:
import os 
import pandas as pd
from collections import Counter


# Root folder containing the two tab-separated input tables.
path_work = "/media/concha-eloko/Linux/PPT_clean"

bacteria_data_df = pd.read_csv(f"{path_work}/results_kleborate_count.tsv", sep = "\t", header = 0)

DF_info = pd.read_csv(f"{path_work}/TropiGATv2.final_df_v2.tsv", sep = "\t" ,  header = 0)

# Split the rows on whether KL_type_LCA contains a literal "|" ("\\|" is the
# escaped regex): rows without it go to DF_info_lvl_0, rows with it to DF_info_amb.
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
DF_info_amb = DF_info[DF_info["KL_type_LCA"].str.contains("\\|")]

#DF_info_lvl_0_filter1 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

In [None]:
# Folder of the first prophage table; the second table is read from path_work (set in an earlier cell).
path_prophage = "/media/concha-eloko/Linux/prediction_depolymerase_tropism/prophage_work/prophage_prediction"
# Column names imposed on both tables.
f_labels=["Prophage name","K-serotype monophyletic group","Id monophyletic group","Number of clades","Number of leafs","Number of new ancestors","Number of k-type swap","Nodes k-types","Nodes k-types all"]

# NOTE(review): the first file is read without skiprows while the second skips
# its first line — presumably only the second file has a header row; confirm.
df_prophages_1 = pd.read_csv(f"{path_prophage}/prophage_data.clusters_80.phageboost_70.final.tsv", sep="\t", names =f_labels) 
df_prophages_2 = pd.read_csv(f"{path_work}/prophage_data.clusters_80.phageboost_70.2504.tsv", sep="\t", names =f_labels, skiprows=1) 



In [None]:
df_prophages_1

In [None]:
# Rows of the second table whose "K-serotype monophyletic group" contains a literal "|".
df_prophages_2_amb = df_prophages_2[df_prophages_2["K-serotype monophyletic group"].str.contains("\\|")]
df_prophages_2_amb

In [None]:
df_prophages_2

In [None]:
# Series.max() skips missing values, whereas the builtin max() compares the
# elements one by one and gives order-dependent results when NaN is present.
df_prophages_2["Number of new ancestors"].max()