***
### Import modules : 

In [22]:
import os 
import pandas as pd 
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef


# Measure the performances formally :


## Opening the DF

> Open the results files : 

In [46]:
path_project = "/media/concha-eloko/Linux/PPT_clean"

# Classic version : 
# PPT_results.matrices.tailored.tsv : Tailored version
# PPT_results.classic_1112.tsv : Classic version 
# PPT_results.matrices.tailored_bit50.tsv : tailored and bit50
# PPT_results.matrices.1512.tsv
# PPT_results.classic_0101.bit50.tsv
tropigat_results = pd.read_csv(f"{path_project}/PPT_results.classic_1001.bit75.tsv", header = 0, sep = "\t")
df_folds = pd.read_csv(f"{path_project}/in_vitro/dpos_folds.all_matrices.tsv", header = 0, sep = "\t")
#tropigat_results.protein_id.tolist()

In [4]:
tropigat_results[tropigat_results["phage"] == "K17alfa62"]

Unnamed: 0,phage,protein_id,predictions_seqbased,predictions_tropigat
89,K17alfa62,K17alfa62__cds_66,No hits,KL62:1.0 ; KL43:0.999 ; KL29:0.986 ; KL52:0.93...
90,K17alfa62,K17alfa62__cds_64,KL17: 0.611,KL17:1.0 ; KL128:0.998 ; KL29:0.912 ; KL70:0.8...


***
### Read the matrices :

In [18]:
path_finetuning = "/media/concha-eloko/Linux/PPT_clean/in_vitro/fine_tuning"

bea_df = pd.read_csv(f"{path_finetuning}/bea_fine_tuning.df", sep = "\t", header = 0)
bea_df["Protein"] = bea_df["Protein"].apply(lambda x : x.replace("_", "__"))
pool_bea = set([kltype.strip() for kltypes in bea_df["Target"] for kltype in kltypes.split(",") if kltype.count("wzi") == 0 if kltype.count("pass") == 0])

ferriol_df = pd.read_csv(f"{path_finetuning}/ferriol_fine_tuning.df", sep = "\t", header = 0)
ferriol_df["Target"] = ferriol_df["Target"].apply(lambda x : x.replace("K", "KL"))
pool_ferriol = set([kltype.strip() for kltypes in ferriol_df["Target"] for kltype in kltypes.split(",") if kltype.count("wzi") == 0 if kltype.count("pass") == 0])

towndsend_df = pd.read_csv(f"{path_finetuning}/towndsend_fine_tuning.df", sep = "\t", header = 0)
towndsend_df["Protein"] = towndsend_df["Protein"].apply(lambda x : x.replace("_", "__"))
pool_towndsend = set([kltype.strip() for kltypes in towndsend_df["Target"] for kltype in kltypes.split(",") if kltype.count("wzi") == 0 if kltype.count("pass") == 0])

dico_matrices = {"ferriol" : {"matrix" : ferriol_df, "pool" : pool_ferriol}, 
                 "bea" : {"matrix": bea_df, "pool" : pool_bea}, 
                 "towndsend" : {"matrix" : towndsend_df, "pool" : pool_towndsend}}


# targets dico : 
dico_hits = {}
for author in dico_matrices :
    matrix = dico_matrices[author]["matrix"]
    for _, row in matrix.iterrows() : 
        for phage in matrix["Phages"].unique() : 
            all_targets = set()
            targets = matrix[matrix["Phages"] == phage]["Target"].values
            for calls in targets : 
                actual_targets = [x.strip() for x in calls.split(",")]
                all_targets.update(actual_targets)
            dico_hits[phage] = all_targets



> TropiGATv2 DF : 

In [7]:
from collections import Counter
DF_info = pd.read_csv(f"{path_project}/TropiGATv2.final_df_v2.filtered.tsv", sep = "\t" ,  header = 0)
#DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
#DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

DF_info_lvl_0 = DF_info.copy()
df_prophages = DF_info_lvl_0.drop_duplicates(subset = ["Phage"])
dico_prophage_count = dict(Counter(df_prophages["KL_type_LCA"]))

KLtypes = [kltype for kltype in dico_prophage_count if dico_prophage_count[kltype] >= 20]



In [17]:
# targets dico : 
dico_hits = {}
for author in dico_matrices :
    matrix = dico_matrices[author]["matrix"]
    for _, row in matrix.iterrows() : 
        for phage in matrix["Phages"].unique() : 
            all_targets = set()
            targets = matrix[matrix["Phages"] == phage]["Target"].values
            for calls in targets : 
                actual_targets = [x.strip() for x in calls.split(",")]
                all_targets.update(actual_targets)
            dico_hits[phage] = all_targets

***
### Make the raw results file : 

> Old 

In [None]:
top_n = 15

path_project = "/media/concha-eloko/Linux/PPT_clean"

# Classic version : 
# PPT_results.matrices.tailored.tsv : Tailored version
# PPT_results.classic_1112.tsv : Classic version 
# PPT_results.matrices.tailored_bit50.tsv : tailored and bit50
# classic_0101
# SAGE_0201
# PPT_results.classic_0101.bit50.tsv
tropigat_results = pd.read_csv(f"{path_project}/PPT_results.classic_1001.bit50.tsv", header = 0, sep = "\t")

with open(f"{path_project}/raw_metrics.classic_1001.bit50.tsv", "w") as outfile :
    outfile.write(f"Phage\tProtein\tTropiGAT_predictions\tTropiGAT_good_calls\tTropiSeq_predictions\tTropiSeq_good_calls\tTargets\n")
    for _, row in tropigat_results.iterrows() : 
        targets = dico_hits[row["phage"]]
        outfile.write(f"{row['phage']}\t{row['protein_id']}\t")
        # TropiGAT part : 
        # write the pred
        top_n_predictions = ";".join([x for x in row["predictions_tropigat"].split(";")][0:top_n-1])
        outfile.write(top_n_predictions + "\t")
        # check the calls
        tropigat_pred = [x.split(":")[0].strip() for x in row["predictions_tropigat"].split(";")]
        top_KLtypes_pred = set(tropigat_pred[0: top_n-1])
        good_calls = top_KLtypes_pred.intersection(targets)
        if len(good_calls) > 0 : 
            outfile.write(",".join(list(good_calls)) + "\t")
        else : 
            outfile.write("0" + "\t")
        # TropiSeq part : 
        # write the pred
        outfile.write(row["predictions_seqbased"] + "\t")
        if row["predictions_seqbased"] != "No hits" : 
            tropiseq_pred = [x.split(":")[0].strip() for x in row["predictions_seqbased"].split(";")]
            top_predictions = set(tropiseq_pred[0: top_n-1])
            good_calls = top_predictions.intersection(targets)
            if len(good_calls) > 0 : 
                outfile.write(",".join(list(good_calls)) + "\t")
            else :
                outfile.write("0" + "\t")
        else :
            outfile.write("0\t")
        target_clean = ",".join(list(targets))
        outfile.write(target_clean + "\n")

    

In [22]:
top_n = 15

path_project = "/media/concha-eloko/Linux/PPT_clean"

# Classic version : 
# PPT_results.matrices.tailored.tsv : Tailored version
# PPT_results.classic_1112.tsv : Classic version 
# PPT_results.matrices.tailored_bit50.tsv : tailored and bit50
# classic_0101
# SAGE_0201
# PPT_results.classic_0101.bit50.tsv
tropigat_results = pd.read_csv(f"{path_project}/PPT_results.classic_1001.bit75.tsv", header = 0, sep = "\t")

with open(f"{path_project}/raw_metrics.classic_1001.bit75.detailed.tsv", "w") as outfile :
    outfile.write(f"Phage\tProtein\tFolds\tTropiGAT_predictions\tTropiGAT_good_calls\tTropiSeq_predictions\tTropiSeq_good_calls\tTargets\n")
    for _, row in tropigat_results.iterrows() : 
        if row["phage"] in dico_hits :
            targets = dico_hits[row["phage"]]
            prot_id = row['protein_id'].replace("__cds", "_cds")
            try : 
                #targets = dico_hits[row["phage"]]
                #prot_id = row['protein_id'].replace("__cds", "_cds")
                fold = df_folds[df_folds["protein_id"] == prot_id]["Fold"].values[0]
            except Exception as e :
                fold = "unknown"
            outfile.write(f"{row['phage']}\t{row['protein_id']}\t{fold}\t")
            # TropiGAT part : 
            # write the pred
            top_n_predictions = ";".join([x for x in row["predictions_tropigat"].split(";")][0:top_n-1])
            outfile.write(top_n_predictions + "\t")
            # check the calls
            tropigat_pred = [x.split(":")[0].strip() for x in row["predictions_tropigat"].split(";")]
            top_KLtypes_pred = set(tropigat_pred[0: top_n-1])
            good_calls = top_KLtypes_pred.intersection(targets)
            if len(good_calls) > 0 : 
                outfile.write(",".join(list(good_calls)) + "\t")
            else : 
                outfile.write("0" + "\t")
            # TropiSeq part : 
            # write the pred
            outfile.write(row["predictions_seqbased"] + "\t")
            if row["predictions_seqbased"] != "No hits" : 
                tropiseq_pred = [x.split(":")[0].strip() for x in row["predictions_seqbased"].split(";")]
                top_predictions = set(tropiseq_pred[0: top_n-1])
                good_calls = top_predictions.intersection(targets)
                if len(good_calls) > 0 : 
                    outfile.write(",".join(list(good_calls)) + "\t")
                else :
                    outfile.write("0" + "\t")
            else :
                outfile.write("0\t")
            target_clean = ",".join(list(targets))
            outfile.write(target_clean + "\n")


    

***
## Working on the final matrices file :

In [10]:
import os 
import pandas as pd 

path_project = "/media/concha-eloko/Linux/PPT_clean"


raw_df = pd.read_csv(f"{path_project}/raw_metrics.classic_1001.bit75.detailed.tsv" , sep = "\t", header = 0)
# raw_df[raw_df["Phage"] == "K17alfa62"]
phages_to_ignore = ["K2064PH2","K2069PH1"]

tropigat_results = raw_df.copy()
#tropigat_results = tropigat_results[~tropigat_results["Phage"].isin(phages_to_ignore)]
#tropigat_results

In [11]:
from collections import Counter
DF_info = pd.read_csv(f"{path_project}/TropiGATv2.final_df_v2.filtered.tsv", sep = "\t" ,  header = 0)
#DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
#DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

DF_info_lvl_0 = DF_info.copy()
df_prophages = DF_info_lvl_0.drop_duplicates(subset = ["Phage"])
dico_prophage_count = dict(Counter(df_prophages["KL_type_LCA"]))

KLtypes = [kltype for kltype in dico_prophage_count if dico_prophage_count[kltype] >= 20]


In [12]:
none_beta_helix = tropigat_results[tropigat_results["Folds"] != "right-handed beta-helix"]["Protein"].values



> check the folds :

In [13]:
from collections import Counter 

folds = dict(Counter(raw_df["Folds"]))
folds



{'6-bladed beta-propeller': 74,
 'right-handed beta-helix': 137,
 'triple-helix': 8,
 'unknown': 11}

raw_df[raw_df["Folds"] == "unknown"]

The 11 unknown are actually triple helix depolymerase

> Check what is the label when they predict the same thing : 

In [38]:
good_inter = 0
bad_inter = 0
inter_count = 0

for _, row in raw_df.iterrows() :
    phage = raw_df["Phage"].values[0]
    tropigat_pred = set([pred.split(":")[0].strip() for pred in row["TropiGAT_predictions"].split(";")])
    tropiseq_pred = set([pred.split(":")[0].strip() for pred in row["TropiSeq_predictions"].split(";") if row["TropiSeq_predictions"] != "No hits"])
    inter = tropigat_pred.intersection(tropiseq_pred)
    if len(inter) > 0 :
        #inter_count += 1
        checkable_targets = set()
        targets = set(row["Targets"].split(","))
        for tar in inter :
            for author in dico_matrices :
                matrice = dico_matrices[author]["matrix"]
                if phage in matrice["Phages"].values :
                    checkable_targets = dico_matrices[author]["pool"]
                    break
                else :
                    continue
        real_inter = checkable_targets.intersection(inter)
        if len(real_inter) > 0 :
            #print(real_inter)
            inter_tar = real_inter.intersection(targets)
            if len(inter_tar) > 0 and real_inter != {'KL102'}:
                good_inter += 1
            else :
                bad_inter += 1 


In [54]:
#good_inter , bad_inter , inter_count

In [53]:
#23/94*100

***
## Make the matrices files : 

In [19]:
from tqdm import tqdm

top_n = 15
labels_tropigat = {}
count_kltypes = {}

for kltype in tqdm(dico_prophage_count) : 
    n = 0
    pred_labels , real_labels = [] , []
    for author in dico_matrices :
        if kltype in dico_matrices[author]["pool"] : 
            matrix = dico_matrices[author]["matrix"]
            for phage in matrix["Phages"].unique() :
                top_predictions = set()
                predictions = tropigat_results[tropigat_results["Phage"] == phage]["TropiGAT_predictions"].values
                for calls in predictions : 
                    predicted_kltypes = [x.split(":")[0].strip() for x in calls.split(";")]
                    top_predictions.update(predicted_kltypes[0:top_n-1])
                if kltype in top_predictions : 
                    pred_labels.append(1)
                else : 
                    pred_labels.append(0)
                if kltype in dico_hits[phage] : 
                    real_labels.append(1)
                    n += 1 
                else :
                    real_labels.append(0)
    labels_tropigat[kltype] = {"y_pred" : pred_labels, "real_labels" : real_labels}
    count_kltypes[kltype] = n 

labels_tropigat
sorted_dict = dict(sorted(labels_tropigat.items(), key=lambda item: int(item[0].split("KL")[1])))


100%|█████████████████████████████████████████| 128/128 [00:01<00:00, 82.25it/s]


In [20]:
def make_labels(pred_df, predictor = "tropigat" , top_n = 15, to_ignore = []) :
    dico_pred = {"tropigat" : "TropiGAT_predictions" ,
                 "tropiseq" : "TropiSeq_predictions"}
    col = dico_pred[predictor]
    labels_tropigat = {}
    count_kltypes = {}
    for kltype in tqdm(dico_prophage_count) : 
        n = 0
        pred_labels , real_labels = [] , []
        for author in dico_matrices :
            if kltype in dico_matrices[author]["pool"] : 
                matrix = dico_matrices[author]["matrix"]
                for phage in matrix["Phages"].unique() :
                    #if phage not in to_ignore : 
                    if phage not in to_ignore and phage in pred_df["Phage"].tolist(): 
                        top_predictions = set()
                        predictions = pred_df[pred_df["Phage"] == phage][col].values
                        for calls in predictions : 
                            predicted_kltypes = [x.split(":")[0].strip() for x in calls.split(";")]
                            top_predictions.update(predicted_kltypes[0:top_n-1])
                        if kltype in top_predictions : 
                            pred_labels.append(1)
                        else : 
                            pred_labels.append(0)
                        if kltype in dico_hits[phage] : 
                            real_labels.append(1)
                            n += 1 
                        else :
                            real_labels.append(0)
        labels_tropigat[kltype] = {"y_pred" : pred_labels, "real_labels" : real_labels}
        sorted_dict = dict(sorted(labels_tropigat.items(), key=lambda item: int(item[0].split("KL")[1])))
        count_kltypes[kltype] = n 
    return sorted_dict , labels_tropigat, count_kltypes


def decript_dic (sorted_dict , labels_tropigat, count_kltypes) :
    for kltype in sorted_dict : 
        try : 
            if len(labels_tropigat[kltype]["real_labels"]) > 1 : 
                if Counter(labels_tropigat[kltype]["y_pred"])[1] > 0 :
                    f1 = f1_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"], average='binary')
                    precision = precision_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"], average='binary')
                    recall = recall_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"], average='binary')
                    mcc = matthews_corrcoef(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"])
                    accuracy = accuracy_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"])
                    auc = roc_auc_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"])
                    print(kltype ,count_kltypes[kltype], round(f1,5),round(accuracy,5), round(recall,5),round(precision,5),round(auc,5), round(mcc,5), sep = "\t")
                #aucs.append(auc)
        except ValueError : 
            print(kltype , "error", sep = "\t")



### Work on TropiGAT : 

> Full predictions : 

In [23]:
tropigat_results_gat = tropigat_results[(tropigat_results["Targets"].str.count("pass") == 0)]

sorted_dic_tropigat, labels_tropigat, count_kltypes = make_labels(tropigat_results_gat, to_ignore = phages_to_ignore)
decript_dic(sorted_dic_tropigat, labels_tropigat, count_kltypes)

100%|█████████████████████████████████████████| 128/128 [00:01<00:00, 90.53it/s]


KL2	16	0.37037	0.85714	0.3125	0.45455	0.62712	0.29945
KL3	12	0.33333	0.4	0.25	0.5	0.4375	-0.13363
KL5	1	0.0	0.96875	0.0	0.0	0.49206	-0.01587
KL7	2	0.0	0.45312	0.0	0.0	0.23387	-0.18531
KL8	1	0.0	0.6875	0.0	0.0	0.34921	-0.08187
KL9	1	0.0	0.75	0.0	0.0	0.38095	-0.06971
KL10	1	0.22222	0.89062	1.0	0.125	0.94444	0.33333
KL12	1	0.0	0.40625	0.0	0.0	0.20635	-0.14749
KL13	3	0.0	0.85859	0.0	0.0	0.44271	-0.0625
KL14	10	0.06667	0.29412	0.3	0.0375	0.29679	-0.24022
KL15	1	0.04651	0.35938	1.0	0.02381	0.6746	0.09118
KL16	4	0.22222	0.88235	0.5	0.14286	0.69783	0.22133
KL17	2	0.10526	0.46875	1.0	0.05556	0.72581	0.1584
KL18	1	0.11111	0.75	1.0	0.05882	0.87302	0.20949
KL19	1	0.125	0.78125	1.0	0.06667	0.88889	0.22771
KL21	1	0.18182	0.85938	1.0	0.1	0.92857	0.29277
KL22	10	0.22951	0.52525	0.7	0.13725	0.60281	0.12398
KL23	1	0.04651	0.35938	1.0	0.02381	0.6746	0.09118
KL24	5	0.08955	0.38384	0.6	0.04839	0.48617	-0.01252
KL25	3	0.26087	0.82828	1.0	0.15	0.91146	0.35134
KL27	2	0.09524	0.40625	1.0	0.05	0.69355	0.13912
K

> with right-handed beta-helix : 

In [24]:
tropigat_results_helix = tropigat_results.copy()
tropigat_results_helix = tropigat_results_helix[(tropigat_results_helix["Folds"].isin(["right-handed beta-helix"])) & (tropigat_results_helix["Targets"].str.count("pass") == 0)]
#tropigat_results_helix.reset_index(drop=True, inplace=True)

sorted_dic_tropigat_helix, labels_tropigat_helix, count_kltypes = make_labels(tropigat_results_helix, to_ignore = phages_to_ignore)
decript_dic(sorted_dic_tropigat_helix, labels_tropigat_helix, count_kltypes)


100%|████████████████████████████████████████| 128/128 [00:01<00:00, 120.92it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KL2	8	0.52632	0.89888	0.625	0.45455	0.77546	0.47878
KL3	3	0.66667	0.66667	1.0	0.5	0.75	0.5
KL5	1	0.0	0.95745	0.0	0.0	0.48913	-0.02174
KL8	1	0.0	0.78723	0.0	0.0	0.40217	-0.07175
KL9	error
KL10	1	0.25	0.87234	1.0	0.14286	0.93478	0.35245
KL12	error
KL13	2	0.0	0.95	0.0	0.0	0.48718	-0.02564
KL14	2	0.06452	0.34831	1.0	0.03333	0.66667	0.10541
KL15	1	0.11765	0.68085	1.0	0.0625	0.83696	0.20523
KL16	2	0.28571	0.88764	1.0	0.16667	0.94253	0.38407
KL17	2	0.23529	0.7234	1.0	0.13333	0.85556	0.30792
KL18	1	0.15385	0.76596	1.0	0.08333	0.88043	0.25181
KL19	error
KL21	error
KL22	8	0.30435	0.6	0.875	0.18421	0.72222	0.267
KL23	1	0.18182	0.80851	1.0	0.1	0.90217	0.28361
KL24	4	0.23529	0.8375	0.5	0.15385	0.67763	0.20988
KL25	3	0.35294	0.8625	1.0	0.21429	0.92857	0.42857
KL27	1	0.1	0.61702	1.0	0.05263	0.80435	0.17899
KL28	1	0.08696	0.76404	1.0	0.04545	0.88068	0.18603
KL29	error
KL30	2	0.0	0.875	0.0	0.0	0.44872	-0.05338
KL36	4	0.0	0.80851	0.0	0.0	0.44186	-0.10523
KL38	1	0.08696	0.625	1.0	0.04545	0.80909	0.16763


  _warn_prf(average, modifier, msg_start, len(result))


KL48	1	0.09524	0.59574	1.0	0.05	0.79348	0.17131
KL52	error
KL53	1	0.125	0.75	1.0	0.06667	0.87273	0.22293
KL57	2	0.4	0.89286	1.0	0.25	0.94444	0.4714
KL60	1	0.125	0.70213	1.0	0.06667	0.84783	0.21535
KL62	1	0.16667	0.78723	1.0	0.09091	0.8913	0.26673
KL63	2	0.0	0.825	0.0	0.0	0.42308	-0.06727
KL64	1	0.4	0.94643	1.0	0.25	0.97273	0.48617
KL70	error
KL74	2	0.13333	0.53571	1.0	0.07143	0.75926	0.19245
KL110	1	0.5	0.77778	1.0	0.33333	0.875	0.5
KL136	1	0.25	0.81818	1.0	0.14286	0.90625	0.34069
KL149	2	0.0	0.81818	0.0	0.0	0.43548	-0.09433
KL151	1	0.0	0.93939	0.0	0.0	0.48438	-0.03125


  _warn_prf(average, modifier, msg_start, len(result))


***
> Work on TropiSeq :

In [25]:
sorted_dic_tropiseq, labels_tropiseq, count_kltypes = make_labels(tropigat_results, predictor = "tropiseq",  to_ignore = phages_to_ignore)
decript_dic(sorted_dic_tropiseq, labels_tropiseq, count_kltypes)

100%|█████████████████████████████████████████| 128/128 [00:01<00:00, 76.04it/s]


KL2	16	0.28571	0.87903	0.1875	0.6	0.58449	0.28797
KL3	12	0.4	0.59091	0.25	1.0	0.625	0.36274
KL4	7	0.25	0.93023	0.14286	1.0	0.57143	0.36438
KL9	1	0.66667	0.98438	1.0	0.5	0.99206	0.70147
KL13	3	0.5	0.98039	0.33333	1.0	0.66667	0.57161
KL14	10	0.16667	0.91935	0.1	0.5	0.54561	0.19719
KL15	1	1.0	1.0	1.0	1.0	1.0	1.0
KL16	4	0.57143	0.97581	0.5	0.66667	0.74583	0.56538
KL17	2	0.66667	0.98438	0.5	1.0	0.75	0.70147
KL18	1	0.0	0.95312	0.0	0.0	0.48413	-0.02263
KL20	1	0.0	0.98039	0.0	0.0	0.49505	-0.0099
KL21	1	0.5	0.96875	1.0	0.33333	0.98413	0.56811
KL23	1	0.66667	0.98438	1.0	0.5	0.99206	0.70147
KL24	5	0.4	0.94118	0.4	0.4	0.68454	0.36907
KL25	3	0.85714	0.9902	1.0	0.75	0.99495	0.86164
KL27	2	0.66667	0.98438	0.5	1.0	0.75	0.70147
KL28	14	0.1	0.85484	0.07143	0.16667	0.51299	0.03831
KL30	3	0.33333	0.96078	0.33333	0.33333	0.65657	0.31313
KL31	1	0.0	0.96875	0.0	0.0	0.49206	-0.01587
KL35	7	0.57143	0.93023	0.57143	0.57143	0.76673	0.53345
KL39	3	0.0	0.92188	0.0	0.0	0.48361	-0.03983
KL43	2	0.0	0.95312	0.0	0.0	0.

> Only right-handed beta-helix 

In [26]:
tropigat_results_helix = tropigat_results.copy()
tropigat_results_helix = tropigat_results_helix[(tropigat_results_helix["Folds"].isin(["right-handed beta-helix"])) & (tropigat_results_helix["Targets"].str.count("pass") == 0)]
#tropigat_results_helix.reset_index(drop=True, inplace=True)

sorted_dic_tropiseq_helix, labels_tropiseq_helix, count_kltypes = make_labels(tropigat_results_helix,predictor = "tropiseq" ,to_ignore = phages_to_ignore)
decript_dic(sorted_dic_tropiseq_helix, labels_tropiseq_helix, count_kltypes)



100%|████████████████████████████████████████| 128/128 [00:01<00:00, 125.38it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KL2	8	0.46154	0.92135	0.375	0.6	0.67515	0.43512
KL3	3	1.0	1.0	1.0	1.0	1.0	1.0
KL9	error
KL13	2	0.66667	0.9875	0.5	1.0	0.75	0.70262
KL14	2	0.5	0.97753	0.5	0.5	0.74425	0.48851
KL15	1	1.0	1.0	1.0	1.0	1.0	1.0
KL16	2	0.8	0.98876	1.0	0.66667	0.99425	0.81179
KL17	2	0.66667	0.97872	0.5	1.0	0.75	0.69938
KL18	1	0.0	0.93617	0.0	0.0	0.47826	-0.03108
KL20	1	0.0	0.975	0.0	0.0	0.49367	-0.01266
KL21	error
KL23	1	1.0	1.0	1.0	1.0	1.0	1.0
KL24	4	0.57143	0.9625	0.5	0.66667	0.74342	0.55849
KL25	3	0.85714	0.9875	1.0	0.75	0.99351	0.86038
KL27	1	1.0	1.0	1.0	1.0	1.0	1.0
KL28	1	0.4	0.96629	1.0	0.25	0.98295	0.4914
KL30	2	0.5	0.975	0.5	0.5	0.74359	0.48718
KL35	4	0.72727	0.94643	1.0	0.57143	0.97115	0.7338
KL39	1	0.0	0.93617	0.0	0.0	0.47826	-0.03108
KL43	1	0.0	0.95745	0.0	0.0	0.48913	-0.02174
KL48	1	0.0	0.93617	0.0	0.0	0.47826	-0.03108
KL51	1	1.0	1.0	1.0	1.0	1.0	1.0
KL57	2	1.0	1.0	1.0	1.0	1.0	1.0
KL58	3	0.5	0.95745	0.33333	1.0	0.66667	0.56466
KL63	2	0.8	0.9875	1.0	0.66667	0.99359	0.81125
KL64	1	0.5	0.96429	1.0	0.33

***
# Good calls : 

In [77]:

total_calls = 0
TropiGAT_good_calls = []
TropiSeq_good_calls = []
top_n = 15

for phage in raw_df["Phage"].unique() : 
    tmp_df = raw_df[raw_df["Phage"] == phage]
    targets_set = set(tmp_df["Targets"].tolist()[0].split(","))
    n_targets = len(targets_set)
    if "".join((list(targets_set)[0])).count("pass") == 0 : 
        total_calls += n_targets
        # TropiGAT calls : 
        call_tropigat = set()
        for calls in tmp_df["TropiGAT_predictions"] :
            top_n_predictions = [x.split(":")[0] for x in calls.split(";")][0:top_n-1]
            call_tropigat.update(top_n_predictions)
        good_calls_tropigat = call_tropigat.intersection(targets_set)
        if len(good_calls_tropigat) > 0 :
            for kltype in good_calls_tropigat : 
                a = (phage , kltype)
                TropiGAT_good_calls.append(a)
        # TropiSeq calls : 
        call_tropiseq = set()
        for calls in tmp_df["TropiSeq_predictions"] :
            top_n_predictions = [x.split(":")[0] for x in calls.split(";")][0:top_n-1]
            call_tropiseq.update(top_n_predictions)
        good_calls_tropiseq = call_tropiseq.intersection(targets_set)
        if len(good_calls_tropiseq) > 0 :
            for kltype in good_calls_tropiseq : 
                a = (phage , kltype)
                TropiSeq_good_calls.append(a)
        




In [133]:
# tailored_bit50
# 1512
import os 
import pandas as pd 

path_project = "/media/concha-eloko/Linux/PPT_clean"


raw_df = pd.read_csv(f"{path_project}/raw_metrics.classic_1001.bit75.detailed.tsv" , sep = "\t", header = 0)

#raw_df_all = raw_df[(~raw_df["Folds"].isin(["6-bladed beta-propeller"])) & (raw_df["Targets"].str.count("pass") == 0)]
#raw_df_helix = tropigat_results_helix[(~tropigat_results_helix["Folds"].isin(["6-bladed beta-propeller"])) & (tropigat_results_helix["Targets"].str.count("pass") == 0)]
# ["right-handed beta-helix", "triple-helix","unknown"]
raw_df_helix = tropigat_results_helix[(tropigat_results_helix["Folds"].isin(["triple-helix","unknown"])) & (tropigat_results_helix["Targets"].str.count("pass") == 0)]



In [134]:
raw_df_helix

Unnamed: 0,Phage,Protein,Folds,TropiGAT_predictions,TropiGAT_good_calls,TropiSeq_predictions,TropiSeq_good_calls,Targets


> Functions : 

In [69]:
def debugg_get_calls_metrics(raw_df , top_n = 15, to_ignore = []) :
    total_calls = 0
    TropiGAT_good_calls = []
    TropiSeq_good_calls = []
    for phage in raw_df["Phage"].unique() : 
        if phage not in to_ignore and phage == "K64PH164C4":
            tmp_df = raw_df[raw_df["Phage"] == phage]
            print(tmp_df)
            targets_set = set(tmp_df["Targets"].tolist()[0].split(","))
            n_targets = len(targets_set)
            if "".join((list(targets_set)[0])).count("pass") == 0 : 
                total_calls += n_targets
                # TropiGAT calls : 
                call_tropigat = set()
                for calls in tmp_df["TropiGAT_predictions"] :
                    top_n_predictions = [x.split(":")[0].strip() for x in calls.split(";")][0:top_n-1]
                    call_tropigat.update(top_n_predictions)
                print(call_tropigat)
                good_calls_tropigat = call_tropigat.intersection(targets_set)
                if len(good_calls_tropigat) > 0 :
                    for kltype in good_calls_tropigat : 
                        a = (phage , kltype)
                        TropiGAT_good_calls.append(a)
                # TropiSeq calls : 
                call_tropiseq = set()
                for calls in tmp_df["TropiSeq_predictions"] :
                    top_n_predictions = [x.split(":")[0] for x in calls.split(";")][0:top_n-1]
                    call_tropiseq.update(top_n_predictions)
                good_calls_tropiseq = call_tropiseq.intersection(targets_set)
                if len(good_calls_tropiseq) > 0 :
                    for kltype in good_calls_tropiseq : 
                        a = (phage , kltype)
                        TropiSeq_good_calls.append(a)
    all_calls = set(TropiGAT_good_calls).union(set(TropiSeq_good_calls))

    return total_calls, all_calls , TropiGAT_good_calls , TropiSeq_good_calls

In [70]:
total_calls, all_calls , TropiGAT_good_calls , TropiSeq_good_calls = debugg_get_calls_metrics(raw_df, to_ignore = phages_to_ignore)


          Phage             Protein                    Folds  \
168  K64PH164C4  K64PH164C4__cds_24  right-handed beta-helix   

                                  TropiGAT_predictions TropiGAT_good_calls  \
168  KL14:0.925 ; KL81:0.918 ; KL45:0.868 ; KL128:0...                KL64   

    TropiSeq_predictions TropiSeq_good_calls Targets  
168          KL64: 0.932                KL64    KL64  
{'KL70', 'KL128', 'KL14', 'KL45', 'KL22', 'KL64', 'KL123', 'KL46', 'KL15', 'KL29', 'KL81', 'KL127', 'KL23'}


In [71]:
TropiGAT_good_calls

[('K64PH164C4', 'KL64')]

In [72]:
def get_calls_metrics(raw_df , top_n = 15, to_ignore = []) :
    total_calls = 0
    TropiGAT_good_calls = []
    TropiSeq_good_calls = []
    
    for phage in raw_df["Phage"].unique() : 
        if phage not in to_ignore :
            tmp_df = raw_df[raw_df["Phage"] == phage]
            targets_set = set(tmp_df["Targets"].tolist()[0].split(","))
            n_targets = len(targets_set)
            if "".join((list(targets_set)[0])).count("pass") == 0 : 
                total_calls += n_targets
                # TropiGAT calls : 
                call_tropigat = set()
                for calls in tmp_df["TropiGAT_predictions"] :
                    top_n_predictions = [x.split(":")[0].strip() for x in calls.split(";")][0:top_n-1]
                    call_tropigat.update(top_n_predictions)
                good_calls_tropigat = call_tropigat.intersection(targets_set)
                if len(good_calls_tropigat) > 0 :
                    for kltype in good_calls_tropigat : 
                        a = (phage , kltype)
                        TropiGAT_good_calls.append(a)
                # TropiSeq calls : 
                call_tropiseq = set()
                for calls in tmp_df["TropiSeq_predictions"] :
                    top_n_predictions = [x.split(":")[0].strip() for x in calls.split(";")][0:top_n-1]
                    call_tropiseq.update(top_n_predictions)
                good_calls_tropiseq = call_tropiseq.intersection(targets_set)
                if len(good_calls_tropiseq) > 0 :
                    for kltype in good_calls_tropiseq : 
                        a = (phage , kltype)
                        TropiSeq_good_calls.append(a)
    all_calls = set(TropiGAT_good_calls).union(set(TropiSeq_good_calls))

    return total_calls, all_calls , TropiGAT_good_calls , TropiSeq_good_calls


In [73]:
def get_bad_calls_metrics(raw_df , top_n = 15, to_ignore = []) :
    total_calls = 0
    TropiGAT_bad_calls = []
    TropiSeq_bad_calls = []
    for phage in raw_df["Phage"].unique() : 
        if phage not in to_ignore :
            tmp_df = raw_df[raw_df["Phage"] == phage]
            targets_set = set(tmp_df["Targets"].tolist()[0].split(","))
            n_targets = len(targets_set)
            if "".join((list(targets_set)[0])).count("pass") == 0 : 
                total_calls += n_targets
                # TropiGAT calls : 
                top_n_predictions_all = []
                for calls in tmp_df["TropiGAT_predictions"] :
                    top_n_predictions = [x.split(":")[0].strip() for x in calls.split(";")][0:top_n-1]
                    top_n_predictions_all = top_n_predictions_all + top_n_predictions
                bad_call_tropigat = targets_set - set(top_n_predictions_all)
                if len(bad_call_tropigat) > 0 :
                    for kltype in bad_call_tropigat : 
                        a = (phage , kltype)
                        TropiGAT_bad_calls.append(a)                        
                # TropiSeq calls : 
                top_n_predictions_all = []
                for calls in tmp_df["TropiSeq_predictions"] :
                    top_n_predictions = [x.split(":")[0].strip() for x in calls.split(";")][0:top_n-1]
                    top_n_predictions_all = top_n_predictions_all + top_n_predictions
                bad_call_tropiseq = targets_set - set(top_n_predictions_all)
                if len(bad_call_tropiseq) > 0 :
                    for kltype in bad_call_tropiseq : 
                        a = (phage , kltype)
                        TropiSeq_bad_calls.append(a)
    all_calls = set(TropiGAT_bad_calls).union(set(TropiSeq_bad_calls))

    return total_calls, all_calls , TropiGAT_bad_calls , TropiSeq_bad_calls

> All predictions : 

In [114]:
total_calls, all_calls , TropiGAT_good_calls , TropiSeq_good_calls = get_calls_metrics(raw_df, to_ignore = phages_to_ignore)


In [115]:
total_calls , len(all_calls) , len(TropiGAT_good_calls), len(TropiSeq_good_calls)

(273, 95, 79, 40)

In [78]:
len(set([hit[0] for hit in all_calls]))

73

In [76]:
common_calls = set(TropiGAT_good_calls) & set(TropiSeq_good_calls)
print(len(common_calls))

print(common_calls)
Counter([call[1] for call in common_calls])

24
{('A1h', 'KL2'), ('K11PH164C1', 'KL57'), ('K64PH164C4', 'KL64'), ('PFOEGONH', 'KL3'), ('S11a', 'KL25'), ('OPBIHMGG', 'KL3'), ('A1g', 'KL16'), ('K21lambda1', 'KL21'), ('S13d', 'KL14'), ('K24PH164C1', 'KL24'), ('LCALLHIG', 'KL3'), ('D7c', 'KL28'), ('K23PH08C2', 'KL23'), ('K27PH129C1', 'KL27'), ('S10a', 'KL25'), ('K15PH90', 'KL15'), ('K54lambda1_1_1', 'KL24'), ('K2PH164C2', 'KL2'), ('K57lambda1_2', 'KL57'), ('NBNDMPCG', 'KL2'), ('K25PH129C1', 'KL25'), ('A3b', 'KL30'), ('K17alfa62', 'KL17'), ('K16PH164C3', 'KL16')}


Counter({'KL2': 3,
         'KL3': 3,
         'KL25': 3,
         'KL57': 2,
         'KL16': 2,
         'KL24': 2,
         'KL64': 1,
         'KL21': 1,
         'KL14': 1,
         'KL28': 1,
         'KL23': 1,
         'KL27': 1,
         'KL15': 1,
         'KL30': 1,
         'KL17': 1})

> Helix predictions : 

Good calls :

In [130]:
total_calls_helix, all_calls_helix , TropiGAT_good_calls_helix , TropiSeq_good_calls_helix = get_calls_metrics(raw_df_helix, to_ignore = phages_to_ignore)


In [117]:
TropiGAT_good_calls_helix

[('A1g', 'KL16'),
 ('A1h', 'KL2'),
 ('A1k', 'KL22'),
 ('A1m', 'KL22'),
 ('A1o', 'KL22'),
 ('A1q', 'KL22'),
 ('A1r', 'KL22'),
 ('D7c', 'KL28'),
 ('HIIECEMK', 'KL2'),
 ('K10PH82C1', 'KL10'),
 ('K11PH164C1', 'KL57'),
 ('K14PH164C1', 'KL14'),
 ('K15PH90', 'KL15'),
 ('K16PH164C3', 'KL16'),
 ('K17alfa61', 'KL17'),
 ('K17alfa62', 'KL17'),
 ('K17alfa62', 'KL62'),
 ('K22PH164C1', 'KL22'),
 ('K23PH08C2', 'KL23'),
 ('K24PH164C1', 'KL24'),
 ('K25PH129C1', 'KL25'),
 ('K26PH128C1', 'KL74'),
 ('K27PH129C1', 'KL27'),
 ('K2PH164C2', 'KL2'),
 ('K37PH164C1', 'KL2'),
 ('K38PH09C2', 'KL38'),
 ('K43PH164C1', 'KL43'),
 ('K45PH128C2', 'KL45'),
 ('K46PH129', 'KL46'),
 ('K48PH164C1', 'KL48'),
 ('K53PH164C2', 'KL53'),
 ('K54lambda1_1_1', 'KL24'),
 ('K57lambda1_2', 'KL57'),
 ('K60PH164C1', 'KL60'),
 ('K60PH164C1', 'KL18'),
 ('K64PH164C4', 'KL64'),
 ('K74PH129C2', 'KL74'),
 ('LCALLHIG', 'KL3'),
 ('M5b', 'KL22'),
 ('NBNDMPCG', 'KL2'),
 ('OPBIHMGG', 'KL3'),
 ('PFOEGONH', 'KL3'),
 ('PP187', 'KL110'),
 ('S10a', 'KL25'

In [131]:
total_calls_helix , len(all_calls_helix) , len(TropiGAT_good_calls_helix), len(TropiSeq_good_calls_helix)

(126, 62, 47, 37)

In [123]:
62/137*100

45.25547445255474

Bad calls :

In [121]:
total_bad_calls_helix, all_bad_calls_helix , TropiGAT_bad_calls_helix , TropiSeq_bad_calls_helix = get_bad_calls_metrics(raw_df_helix, to_ignore = phages_to_ignore)


In [122]:
total_bad_calls_helix , len(all_bad_calls_helix) , len(TropiGAT_bad_calls_helix), len(TropiSeq_bad_calls_helix)

(126, 104, 79, 89)

In [89]:
TropiGAT_bad_calls_helix

[('A1a', 'KL151'),
 ('A1b', 'KL157'),
 ('A1c', 'KL1'),
 ('A1d', 'KL20'),
 ('A1h', 'KL13'),
 ('A1i', 'KL126'),
 ('A1j', 'KL126'),
 ('A1j', 'KL61'),
 ('A1k', 'KL37'),
 ('A1l', 'KL37'),
 ('A1l', 'KL22'),
 ('A1m', 'KL37'),
 ('A1o', 'KL37'),
 ('A1q', 'KL37'),
 ('A1r', 'KL37'),
 ('A2a', 'KL126'),
 ('A2b', 'KL126'),
 ('A3b', 'KL30'),
 ('A3c', 'KL61'),
 ('A3d', 'KL61'),
 ('A3d', 'KL134'),
 ('D7b', 'KL140'),
 ('K11PH164C1', 'KL11'),
 ('K13PH07C1S', 'KL13'),
 ('K1PH164C1', 'KL1'),
 ('K22PH164C1', 'KL37'),
 ('K23PH08C2', 'KL58'),
 ('K23PH08C2', 'KL54'),
 ('K26PH128C1', 'KL26'),
 ('K2PH164C1', 'KL2'),
 ('K2alfa62', 'KL2'),
 ('K35PH164C3', 'KL30'),
 ('K35PH164C3', 'KL35'),
 ('K35PH164C3', 'KL36'),
 ('K37PH164C1', 'KL37'),
 ('K39PH122C2', 'KL39'),
 ('K39PH122C2', 'KL36'),
 ('K40PH129C1', 'KL40'),
 ('K42PH8', 'KL42'),
 ('K42PH8', 'KL36'),
 ('K43PH164C1', 'KL36'),
 ('K44PH129C1', 'KL37'),
 ('K44PH129C1', 'KL44'),
 ('K45PH128C2', 'KL71'),
 ('K51PH129C1', 'KL51'),
 ('K54lambda1_1_1', 'KL58'),
 ('K54lamb

In [54]:
Counter([call[1] for call in TropiGAT_bad_calls_helix])

Counter({'KL37': 10,
         'KL22': 8,
         'KL2': 7,
         'KL126': 4,
         'KL61': 4,
         'KL35': 4,
         'KL36': 4,
         'KL1': 3,
         'KL140': 3,
         'KL58': 3,
         'KL54': 3,
         'KL16': 2,
         'KL13': 2,
         'KL30': 2,
         'KL25': 2,
         'KL71': 2,
         'KL63': 2,
         'KL24': 2,
         'KL149': 2,
         'KL102': 2,
         'KL151': 1,
         'KL157': 1,
         'KL20': 1,
         'KL134': 1,
         'KL10': 1,
         'KL57': 1,
         'KL11': 1,
         'KL14': 1,
         'KL23': 1,
         'KL26': 1,
         'KL39': 1,
         'KL40': 1,
         'KL42': 1,
         'KL44': 1,
         'KL45': 1,
         'KL46': 1,
         'KL48': 1,
         'KL51': 1,
         'KL53': 1,
         'KL56': 1,
         'KL68': 1,
         'KL5': 1,
         'KL67': 1,
         'KL64': 1,
         'KL66': 1,
         'KL7': 1,
         'KL6': 1,
         'KL82': 1,
         'KL8': 1,
         'KL139': 

In [55]:
Counter([call[1] for call in TropiSeq_bad_calls_helix])

Counter({'KL37': 10,
         'KL22': 8,
         'KL2': 5,
         'KL126': 4,
         'KL61': 4,
         'KL36': 4,
         'KL1': 3,
         'KL54': 3,
         'KL58': 2,
         'KL74': 2,
         'KL71': 2,
         'KL24': 2,
         'KL149': 2,
         'KL102': 2,
         'KL20': 1,
         'KL134': 1,
         'KL10': 1,
         'KL11': 1,
         'KL13': 1,
         'KL14': 1,
         'KL17': 1,
         'KL62': 1,
         'KL26': 1,
         'KL30': 1,
         'KL38': 1,
         'KL39': 1,
         'KL40': 1,
         'KL42': 1,
         'KL43': 1,
         'KL44': 1,
         'KL45': 1,
         'KL46': 1,
         'KL48': 1,
         'KL53': 1,
         'KL56': 1,
         'KL68': 1,
         'KL5': 1,
         'KL60': 1,
         'KL18': 1,
         'KL67': 1,
         'KL66': 1,
         'KL7': 1,
         'KL6': 1,
         'KL82': 1,
         'KL8': 1,
         'KL139': 1,
         'KL55': 1,
         'KL103': 1,
         'KL110': 1,
         'KL136': 

In [76]:
TropiSeq_good_calls_helix

[('A1a', 'KL151'),
 ('A1b', 'KL157'),
 ('A1g', 'KL16'),
 ('A1h', 'KL2'),
 ('A1h', 'KL13'),
 ('A3b', 'KL30'),
 ('D7b', 'KL140'),
 ('D7c', 'KL28'),
 ('K11PH164C1', 'KL57'),
 ('K15PH90', 'KL15'),
 ('K16PH164C3', 'KL16'),
 ('K17alfa62', 'KL17'),
 ('K23PH08C2', 'KL23'),
 ('K24PH164C1', 'KL24'),
 ('K25PH129C1', 'KL25'),
 ('K27PH129C1', 'KL27'),
 ('K2PH164C2', 'KL2'),
 ('K35PH164C3', 'KL35'),
 ('K51PH129C1', 'KL51'),
 ('K54lambda1_1_1', 'KL24'),
 ('K57lambda1_2', 'KL57'),
 ('K58PH129C2', 'KL58'),
 ('K63PH128', 'KL63'),
 ('K64PH164C4', 'KL64'),
 ('KBDEFBCI', 'KL35'),
 ('LCALLHIG', 'KL3'),
 ('NBNDMPCG', 'KL2'),
 ('NBNDMPCG', 'KL35'),
 ('NJHLHPIG', 'KL35'),
 ('OPBIHMGG', 'KL3'),
 ('P4a', 'KL140'),
 ('P4b', 'KL140'),
 ('PFOEGONH', 'KL3'),
 ('S10a', 'KL25'),
 ('S11a', 'KL25'),
 ('S13b', 'KL63'),
 ('S13d', 'KL14')]

In [77]:
Counter([call[1] for call in TropiSeq_good_calls_helix])

Counter({'KL35': 4,
         'KL2': 3,
         'KL140': 3,
         'KL25': 3,
         'KL3': 3,
         'KL16': 2,
         'KL57': 2,
         'KL24': 2,
         'KL63': 2,
         'KL151': 1,
         'KL157': 1,
         'KL13': 1,
         'KL30': 1,
         'KL28': 1,
         'KL15': 1,
         'KL17': 1,
         'KL23': 1,
         'KL27': 1,
         'KL51': 1,
         'KL58': 1,
         'KL64': 1,
         'KL14': 1})

In [78]:
TropiGAT_good_calls_helix

[('D7c', 'KL28'),
 ('K15PH90', 'KL15'),
 ('K17alfa61', 'KL17'),
 ('K17alfa62', 'KL62'),
 ('K17alfa62', 'KL17'),
 ('K24PH164C1', 'KL24'),
 ('K26PH128C1', 'KL74'),
 ('K27PH129C1', 'KL27'),
 ('K38PH09C2', 'KL38'),
 ('K43PH164C1', 'KL43'),
 ('K54lambda1_1_1', 'KL24'),
 ('K57lambda1_2', 'KL57'),
 ('K60PH164C1', 'KL18'),
 ('K60PH164C1', 'KL60'),
 ('K74PH129C2', 'KL74'),
 ('LCALLHIG', 'KL3'),
 ('NBNDMPCG', 'KL2'),
 ('OPBIHMGG', 'KL3'),
 ('PFOEGONH', 'KL3'),
 ('PP187', 'KL110'),
 ('S11a', 'KL25'),
 ('S13d', 'KL14')]

In [79]:
Counter([call[1] for call in TropiGAT_good_calls_helix])

Counter({'KL3': 3,
         'KL17': 2,
         'KL24': 2,
         'KL74': 2,
         'KL28': 1,
         'KL15': 1,
         'KL62': 1,
         'KL27': 1,
         'KL38': 1,
         'KL43': 1,
         'KL57': 1,
         'KL18': 1,
         'KL60': 1,
         'KL2': 1,
         'KL110': 1,
         'KL25': 1,
         'KL14': 1})

In [90]:
# Number of phage with at least one good call : 
all_phages_called = [call[0] for call in TropiGAT_good_calls+TropiSeq_good_calls]
Counter(all_phages_called)

len(set(all_phages_called))

73

> Differences :

In [110]:
tropigat_only_beta = set(TropiGAT_good_calls_helix) - set(TropiSeq_good_calls_helix)
tropiseq_only_beta = set(TropiSeq_good_calls_helix) - set(TropiGAT_good_calls_helix)

In [111]:
tropigat_only_beta

{('A1k', 'KL22'),
 ('A1m', 'KL22'),
 ('A1o', 'KL22'),
 ('A1q', 'KL22'),
 ('A1r', 'KL22'),
 ('HIIECEMK', 'KL2'),
 ('K10PH82C1', 'KL10'),
 ('K14PH164C1', 'KL14'),
 ('K17alfa61', 'KL17'),
 ('K17alfa62', 'KL62'),
 ('K22PH164C1', 'KL22'),
 ('K26PH128C1', 'KL74'),
 ('K37PH164C1', 'KL2'),
 ('K38PH09C2', 'KL38'),
 ('K43PH164C1', 'KL43'),
 ('K45PH128C2', 'KL45'),
 ('K46PH129', 'KL46'),
 ('K48PH164C1', 'KL48'),
 ('K53PH164C2', 'KL53'),
 ('K60PH164C1', 'KL18'),
 ('K60PH164C1', 'KL60'),
 ('K74PH129C2', 'KL74'),
 ('M5b', 'KL22'),
 ('PP187', 'KL110'),
 ('S13e', 'KL136')}

In [112]:
tropiseq_only_beta

{('A1a', 'KL151'),
 ('A1b', 'KL157'),
 ('A1h', 'KL13'),
 ('A3b', 'KL30'),
 ('D7b', 'KL140'),
 ('K35PH164C3', 'KL35'),
 ('K51PH129C1', 'KL51'),
 ('K58PH129C2', 'KL58'),
 ('K63PH128', 'KL63'),
 ('KBDEFBCI', 'KL35'),
 ('NBNDMPCG', 'KL35'),
 ('NJHLHPIG', 'KL35'),
 ('P4a', 'KL140'),
 ('P4b', 'KL140'),
 ('S13b', 'KL63')}

In [106]:
tropigat_only = set(TropiGAT_good_calls) - set(TropiSeq_good_calls)
tropiseq_only = set(TropiSeq_good_calls) - set(TropiGAT_good_calls)

In [107]:
tropigat_only

{('A1k', 'KL22'),
 ('A1m', 'KL22'),
 ('A1o', 'KL22'),
 ('A1q', 'KL22'),
 ('A1r', 'KL22'),
 ('DJLANJJD', 'KL14'),
 ('DJLANJJD', 'KL38'),
 ('DJLANJJD', 'KL57'),
 ('EHPPICDA', 'KL38'),
 ('EHPPICDA', 'KL57'),
 ('EKPIEFBL', 'KL29'),
 ('EKPIEFBL', 'KL38'),
 ('EKPIEFBL', 'KL53'),
 ('EKPIEFBL', 'KL57'),
 ('EKPIEFBL', 'KL74'),
 ('FADJDIKG', 'KL57'),
 ('FADJDIKG', 'KL74'),
 ('GCLPFEGH', 'KL38'),
 ('GCLPFEGH', 'KL57'),
 ('HIIECEMK', 'KL2'),
 ('INMBNBLA', 'KL29'),
 ('INMBNBLA', 'KL38'),
 ('INMBNBLA', 'KL53'),
 ('INMBNBLA', 'KL57'),
 ('IPGJFKPH', 'KL38'),
 ('IPGJFKPH', 'KL57'),
 ('IPGJFKPH', 'KL74'),
 ('K10PH82C1', 'KL10'),
 ('K14PH164C1', 'KL14'),
 ('K17alfa61', 'KL17'),
 ('K17alfa62', 'KL62'),
 ('K19PH14C4P1', 'KL19'),
 ('K22PH164C1', 'KL22'),
 ('K26PH128C1', 'KL74'),
 ('K29PH164C1', 'KL24'),
 ('K35PH164C3', 'KL30'),
 ('K35PH164C3', 'KL36'),
 ('K37PH164C1', 'KL2'),
 ('K38PH09C2', 'KL38'),
 ('K43PH164C1', 'KL36'),
 ('K43PH164C1', 'KL43'),
 ('K45PH128C2', 'KL45'),
 ('K46PH129', 'KL46'),
 ('K48PH164

In [108]:
tropiseq_only

{('A1a', 'KL151'),
 ('A1b', 'KL157'),
 ('A1h', 'KL13'),
 ('D7b', 'KL140'),
 ('K35PH164C3', 'KL35'),
 ('K4PH164', 'KL4'),
 ('K51PH129C1', 'KL51'),
 ('K58PH129C2', 'KL58'),
 ('K63PH128', 'KL63'),
 ('K9PH25C2', 'KL9'),
 ('KBDEFBCI', 'KL35'),
 ('NBNDMPCG', 'KL35'),
 ('NJHLHPIG', 'KL35'),
 ('P4a', 'KL140'),
 ('P4b', 'KL140'),
 ('S13b', 'KL63')}

In [94]:
kl_tropiseq_only = [kl[1] for kl in tropiseq_only]
Counter(kl_tropiseq_only)

Counter({'KL35': 4,
         'KL140': 3,
         'KL63': 2,
         'KL51': 1,
         'KL9': 1,
         'KL4': 1,
         'KL151': 1,
         'KL157': 1,
         'KL58': 1,
         'KL13': 1})

In [109]:
kl_tropigat_only = [kl[1] for kl in tropigat_only]
Counter(kl_tropigat_only)

Counter({'KL38': 8,
         'KL57': 8,
         'KL22': 7,
         'KL74': 5,
         'KL53': 3,
         'KL14': 2,
         'KL30': 2,
         'KL36': 2,
         'KL2': 2,
         'KL29': 2,
         'KL19': 1,
         'KL27': 1,
         'KL18': 1,
         'KL62': 1,
         'KL10': 1,
         'KL17': 1,
         'KL46': 1,
         'KL45': 1,
         'KL136': 1,
         'KL48': 1,
         'KL43': 1,
         'KL24': 1,
         'KL110': 1,
         'KL60': 1})

In [99]:
df_propeller = raw_df[raw_df["Folds"] == "6-bladed beta-propeller"]
df_propeller

Unnamed: 0,Phage,Protein,Folds,TropiGAT_predictions,TropiGAT_good_calls,TropiSeq_predictions,TropiSeq_good_calls,Targets
0,A1a,A1a_00002,6-bladed beta-propeller,KL111:0.983 ; KL123:0.982 ; KL45:0.973 ; KL24:...,0,KL102: 0.691,0,KL151
3,A1b,A1b_00036,6-bladed beta-propeller,KL123:0.998 ; KL111:0.983 ; KL128:0.982 ; KL45...,0,KL102: 0.691,0,KL157
4,A1c,A1c_00046,6-bladed beta-propeller,KL123:0.994 ; KL24:0.982 ; KL45:0.953 ; KL111:...,0,KL102: 0.691,0,KL1
6,A1d,A1d_00013,6-bladed beta-propeller,KL123:0.989 ; KL45:0.959 ; KL111:0.954 ; KL14:...,0,KL102: 0.691,0,KL20
8,A1e,A1e_00024,6-bladed beta-propeller,KL123:0.991 ; KL111:0.962 ; KL45:0.949 ; KL14:...,0,KL102: 0.691,0,pass (wzi-K?)
...,...,...,...,...,...,...,...,...
178,K74PH129C2,K74PH129C2__cds_46,6-bladed beta-propeller,KL24:0.999 ; KL123:0.994 ; KL102:0.976 ; KL15:...,0,KL102: 0.691,0,KL74
182,K82P1,K82P1__cds_40,6-bladed beta-propeller,KL24:0.999 ; KL123:0.994 ; KL102:0.971 ; KL23:...,0,KL102: 0.691,0,KL82
184,K8PH128,K8PH128__cds_41,6-bladed beta-propeller,KL24:0.999 ; KL123:0.994 ; KL102:0.971 ; KL23:...,0,KL102: 0.691,0,"KL1,KL8"
202,NEBGPKAL,NEBGPKAL_00013,6-bladed beta-propeller,KL123:0.99 ; KL111:0.988 ; KL45:0.962 ; KL128:...,0,KL102: 0.691,0,KL2


In [100]:
total_calls_propeller, all_calls_propeller , TropiGAT_good_calls_propeller , TropiSeq_good_calls_propeller = get_calls_metrics(df_propeller, to_ignore = phages_to_ignore)


In [102]:
total_calls_propeller , len(all_calls_propeller) , len(TropiGAT_good_calls_propeller), len(TropiSeq_good_calls_propeller)

(193, 30, 29, 2)