***
### Import modules : 

In [2]:
import os 
import pandas as pd 
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef


# Measure the performances formally :


> Open the results files : 

In [60]:
path_project = "/media/concha-eloko/Linux/PPT_clean"

# Classic version : 
# PPT_results.matrices.tailored.tsv : Tailored version
# PPT_results.classic_1112.tsv : Classic version 
# PPT_results.matrices.tailored_bit50.tsv : tailored and bit50
# PPT_results.matrices.1512.tsv
tropigat_results = pd.read_csv(f"{path_project}/PPT_results.matrices.1512.tsv", header = 0, sep = "\t")
tropigat_results

Unnamed: 0,phage,protein_id,predictions_seqbased,predictions_tropigat
0,A1a,A1a_00002,KL7:0.989;KL102:0.569,KL123:0.996 ; KL7:0.981 ; KL9:0.974 ; KL110:0....
1,A1a,A1a_00014,KL151:0.708,KL74:0.974 ; KL70:0.907 ; KL29:0.823 ; KL110:0...
2,A1b,A1b_00048,KL157:0.57,KL53:0.989 ; KL60:0.989 ; KL34:0.865 ; KL13:0....
3,A1b,A1b_00036,KL7:0.989;KL102:0.569,KL123:0.999 ; KL7:0.998 ; KL110:0.981 ; KL9:0....
4,A1c,A1c_00046,KL7:0.989;KL102:0.569,KL123:0.996 ; KL7:0.992 ; KL110:0.967 ; KL9:0....
...,...,...,...,...
180,S13a,S13a_00036,KL38:0.699,KL70:0.965 ; KL136:0.934 ; KL123:0.921 ; KL12:...
181,S13b,S13b_00058,KL63:0.893,KL47:0.992 ; KL64:0.913 ; KL28:0.882 ; KL34:0....
182,S13c,S13c_00055,KL38:0.9,KL123:0.976 ; KL145:0.972 ; KL136:0.902 ; KL70...
183,S13d,S13d_00057,KL14:0.976,KL14:0.99 ; KL21:0.975 ; KL13:0.917 ; KL62:0.8...


***
### Read the matrices :

In [61]:
path_finetuning = "/media/concha-eloko/Linux/PPT_clean/in_vitro/fine_tuning"

bea_df = pd.read_csv(f"{path_finetuning}/bea_fine_tuning.df", sep = "\t", header = 0)
bea_df["Protein"] = bea_df["Protein"].apply(lambda x : x.replace("_", "__"))
pool_bea = set([kltype.strip() for kltypes in bea_df["Target"] for kltype in kltypes.split(",") if kltype.count("wzi") == 0 if kltype.count("pass") == 0])

ferriol_df = pd.read_csv(f"{path_finetuning}/ferriol_fine_tuning.df", sep = "\t", header = 0)
ferriol_df["Target"] = ferriol_df["Target"].apply(lambda x : x.replace("K", "KL"))
pool_ferriol = set([kltype.strip() for kltypes in ferriol_df["Target"] for kltype in kltypes.split(",") if kltype.count("wzi") == 0 if kltype.count("pass") == 0])

towndsend_df = pd.read_csv(f"{path_finetuning}/towndsend_fine_tuning.df", sep = "\t", header = 0)
towndsend_df["Protein"] = towndsend_df["Protein"].apply(lambda x : x.replace("_", "__"))
pool_towndsend = set([kltype.strip() for kltypes in towndsend_df["Target"] for kltype in kltypes.split(",") if kltype.count("wzi") == 0 if kltype.count("pass") == 0])

dico_matrices = {"ferriol" : {"matrix" : ferriol_df, "pool" : pool_ferriol}, 
                 "bea" : {"matrix": bea_df, "pool" : pool_bea}, 
                 "towndsend" : {"matrix" : towndsend_df, "pool" : pool_towndsend}}



> TropiGATv2 DF : 

In [62]:
DF_info = pd.read_csv(f"{path_project}/TropiGATv2.final_df.tsv", sep = "\t" ,  header = 0)
DF_info_lvl_0 = DF_info[~DF_info["KL_type_LCA"].str.contains("\\|")]
DF_info_lvl_0 = DF_info_lvl_0.drop_duplicates(subset = ["Infected_ancestor","index","prophage_id"] , keep = "first").reset_index(drop=True)

df_prophages = DF_info_lvl_0.drop_duplicates(subset = ["Phage"])
dico_prophage_count = dict(Counter(df_prophages["KL_type_LCA"]))

KLtypes = [kltype for kltype in dico_prophage_count if dico_prophage_count[kltype] >= 20]



In [63]:
# targets dico : 
dico_hits = {}
for author in dico_matrices :
    matrix = dico_matrices[author]["matrix"]
    for _, row in matrix.iterrows() : 
        for phage in matrix["Phages"].unique() : 
            all_targets = set()
            targets = matrix[matrix["Phages"] == phage]["Target"].values
            for calls in targets : 
                actual_targets = [x.strip() for x in calls.split(",")]
                all_targets.update(actual_targets)
            dico_hits[phage] = all_targets

***
### Make the raw results file : 

In [7]:
top_n = 15

path_project = "/media/concha-eloko/Linux/PPT_clean"

# Classic version : 
# PPT_results.matrices.tailored.tsv : Tailored version
# PPT_results.classic_1112.tsv : Classic version 
# PPT_results.matrices.tailored_bit50.tsv : tailored and bit50
tropigat_results = pd.read_csv(f"{path_project}/PPT_results.matrices.1512.tsv", header = 0, sep = "\t")

with open(f"{path_project}/raw_metrics.1512.tsv", "w") as outfile :
    outfile.write(f"Phage\tProtein\tTropiGAT_predictions\tTropiGAT_good_calls\tTropiSeq_predictions\tTropiSeq_good_calls\tTargets\n")
    for _, row in tropigat_results.iterrows() : 
        targets = dico_hits[row["phage"]]
        outfile.write(f"{row['phage']}\t{row['protein_id']}\t")
        # TropiGAT part : 
        # write the pred
        top_n_predictions = ";".join([x for x in row["predictions_tropigat"].split(";")][0:top_n-1])
        outfile.write(top_n_predictions + "\t")
        # check the calls
        tropigat_pred = [x.split(":")[0].strip() for x in row["predictions_tropigat"].split(";")]
        top_KLtypes_pred = set(tropigat_pred[0: top_n-1])
        good_calls = top_KLtypes_pred.intersection(targets)
        if len(good_calls) > 0 : 
            outfile.write(",".join(list(good_calls)) + "\t")
        else : 
            outfile.write("0" + "\t")
        # TropiSeq part : 
        # write the pred
        outfile.write(row["predictions_seqbased"] + "\t")
        if row["predictions_seqbased"] != "No hits" : 
            tropiseq_pred = [x.split(":")[0].strip() for x in row["predictions_seqbased"].split(";")]
            top_predictions = set(tropiseq_pred[0: top_n-1])
            good_calls = top_predictions.intersection(targets)
            if len(good_calls) > 0 : 
                outfile.write(",".join(list(good_calls)) + "\t")
            else :
                outfile.write("0" + "\t")
        else :
            outfile.write("0\t")
        target_clean = ",".join(list(targets))
        outfile.write(target_clean + "\n")

    

In [8]:
raw_df = pd.read_csv(f"{path_project}/raw_metrics.1512.tsv" , sep = "\t", header = 0)
raw_df

Unnamed: 0,Phage,Protein,TropiGAT_predictions,TropiGAT_good_calls,TropiSeq_predictions,TropiSeq_good_calls,Targets
0,A1a,A1a_00002,KL123:0.996 ; KL7:0.981 ; KL9:0.974 ; KL110:0....,0,KL7:0.989;KL102:0.569,0,KL151
1,A1a,A1a_00014,KL74:0.974 ; KL70:0.907 ; KL29:0.823 ; KL110:0...,0,KL151:0.708,KL151,KL151
2,A1b,A1b_00048,KL53:0.989 ; KL60:0.989 ; KL34:0.865 ; KL13:0....,0,KL157:0.57,KL157,KL157
3,A1b,A1b_00036,KL123:0.999 ; KL7:0.998 ; KL110:0.981 ; KL9:0....,0,KL7:0.989;KL102:0.569,0,KL157
4,A1c,A1c_00046,KL123:0.996 ; KL7:0.992 ; KL110:0.967 ; KL9:0....,0,KL7:0.989;KL102:0.569,0,KL1
...,...,...,...,...,...,...,...
180,S13a,S13a_00036,KL70:0.965 ; KL136:0.934 ; KL123:0.921 ; KL12:...,0,KL38:0.699,0,"KL149,KL102"
181,S13b,S13b_00058,KL47:0.992 ; KL64:0.913 ; KL28:0.882 ; KL34:0....,0,KL63:0.893,KL63,KL63
182,S13c,S13c_00055,KL123:0.976 ; KL145:0.972 ; KL136:0.902 ; KL70...,0,KL38:0.9,0,"KL149,KL102"
183,S13d,S13d_00057,KL14:0.99 ; KL21:0.975 ; KL13:0.917 ; KL62:0.8...,KL14,KL14:0.976,KL14,KL14


***
### Make the matrices files : 

In [7]:
top_n = 15
labels_tropigat = {}
for kltype in tqdm(dico_prophage_count) : 
    pred_labels , real_labels = [] , []
    for author in dico_matrices :
        if kltype in dico_matrices[author]["pool"] : 
            matrix = dico_matrices[author]["matrix"]
            for phage in matrix["Phages"].unique() :
                top_predictions = set()
                predictions = tropigat_results[tropigat_results["phage"] == phage]["predictions_tropigat"].values
                for calls in predictions : 
                    predicted_kltypes = [x.split(":")[0].strip() for x in calls.split(";")]
                    top_predictions.update(predicted_kltypes[0:top_n-1])
                if kltype in top_predictions : 
                    pred_labels.append(1)
                else : 
                    pred_labels.append(0)
                if kltype in dico_hits[phage] : 
                    real_labels.append(1)
                else :
                    real_labels.append(0)
    labels_tropigat[kltype] = {"y_pred" : pred_labels, "real_labels" : real_labels}

100%|████████████████████████████████████████| 129/129 [00:00<00:00, 164.07it/s]


In [8]:
labels_tropigat
sorted_dict = dict(sorted(labels_tropigat.items(), key=lambda item: int(item[0].split("KL")[1])))


In [9]:
aucs = []

for kltype in sorted_dict : 
    if len(labels_tropigat[kltype]["real_labels"]) > 1 : 
        if Counter(labels_tropigat[kltype]["y_pred"])[1] > 0 :
            f1 = f1_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"], average='binary')
            precision = precision_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"], average='binary')
            recall = recall_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"], average='binary')
            mcc = matthews_corrcoef(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"])
            accuracy = accuracy_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"])
            auc = roc_auc_score(labels_tropigat[kltype]["real_labels"], labels_tropigat[kltype]["y_pred"])
            
            print(kltype , round(f1,5), round(precision,5), round(recall,5), round(accuracy,5), round(mcc,5), round(auc,5), sep = "\t")
            aucs.append(auc)

KL1	0.0	0.0	0.0	0.94318	-0.02865	0.48824
KL2	0.4	0.36364	0.44444	0.89091	0.34274	0.68757
KL3	0.57143	0.5	0.66667	0.86364	0.49951	0.7807
KL5	0.0	0.0	0.0	0.84	-0.05764	0.42857
KL7	0.0	0.0	0.0	0.76	-0.10206	0.39583
KL8	0.0	0.0	0.0	0.64	-0.10253	0.32653
KL10	0.28571	0.16667	1.0	0.9	0.38686	0.94898
KL13	0.0	0.0	0.0	0.73864	-0.08538	0.37791
KL14	0.14286	0.07692	1.0	0.72727	0.23549	0.86047
KL16	0.17647	0.09677	1.0	0.74545	0.2673	0.86916
KL17	0.4	0.33333	0.5	0.94	0.37819	0.72917
KL18	0.16667	0.09091	1.0	0.8	0.26899	0.89796
KL20	0.66667	0.5	1.0	0.97368	0.69749	0.98649
KL22	0.0	0.0	0.0	0.86364	-0.0546	0.48718
KL23	1.0	1.0	1.0	1.0	1.0	1.0
KL24	0.33333	0.25	0.5	0.90909	0.31053	0.71429
KL25	0.2069	0.11538	1.0	0.73864	0.29011	0.86471
KL27	0.28571	0.2	0.5	0.9	0.27217	0.70833
KL28	0.14815	0.08	1.0	0.61667	0.21972	0.80172
KL29	0.0	0.0	0.0	0.7	-0.08909	0.35714
KL36	0.0	0.0	0.0	0.88	-0.06019	0.47826
KL38	0.66667	0.5	1.0	0.98	0.69985	0.9898
KL39	0.14286	0.08333	0.5	0.76	0.12427	0.63542
KL43	0.125	0.06667	

In [10]:
top_n = 15
labels_tropiseq = {}
for kltype in tqdm(dico_prophage_count) : 
    pred_labels , real_labels = [] , []
    for author in dico_matrices :
        if kltype in dico_matrices[author]["pool"] : 
            matrix = dico_matrices[author]["matrix"]
            for phage in matrix["Phages"].unique() :
                top_predictions = set()
                predictions = tropigat_results[tropigat_results["phage"] == phage]["predictions_seqbased"].values
                for calls in predictions : 
                    predicted_kltypes = [x.split(":")[0].strip() for x in calls.split(";") if x != "No hits"]
                    top_predictions.update(predicted_kltypes[0:top_n-1])
                if kltype in top_predictions : 
                    pred_labels.append(1)
                else : 
                    pred_labels.append(0)
                if kltype in dico_hits[phage] : 
                    real_labels.append(1)
                else :
                    real_labels.append(0)
    labels_tropiseq[kltype] = {"y_pred" : pred_labels, "real_labels" : real_labels}

100%|████████████████████████████████████████| 129/129 [00:00<00:00, 166.38it/s]


In [11]:
aucs_tropiseq = []
for kltype in sorted_dict : 
    if len(labels_tropiseq[kltype]["real_labels"]) > 1 : 
        if Counter(labels_tropiseq[kltype]["y_pred"])[1] > 0 :
            f1 = f1_score(labels_tropiseq[kltype]["real_labels"], labels_tropiseq[kltype]["y_pred"], average='binary')
            precision = precision_score(labels_tropiseq[kltype]["real_labels"], labels_tropiseq[kltype]["y_pred"], average='binary')
            recall = recall_score(labels_tropiseq[kltype]["real_labels"], labels_tropiseq[kltype]["y_pred"], average='binary')
            mcc = matthews_corrcoef(labels_tropiseq[kltype]["real_labels"], labels_tropiseq[kltype]["y_pred"])
            accuracy = accuracy_score(labels_tropiseq[kltype]["real_labels"], labels_tropiseq[kltype]["y_pred"])
            auc = roc_auc_score(labels_tropiseq[kltype]["real_labels"], labels_tropiseq[kltype]["y_pred"])
            print(kltype , round(f1,5), round(precision,5), round(recall,5), round(accuracy,5), round(mcc,5), round(auc,5), sep = "\t")
            aucs_tropiseq.append(auc)

KL2	0.66667	0.83333	0.55556	0.95455	0.65858	0.77283
KL3	0.66667	0.66667	0.66667	0.90909	0.61404	0.80702
KL13	0.66667	1.0	0.5	0.98864	0.70303	0.75
KL14	0.66667	1.0	0.5	0.98864	0.70303	0.75
KL16	1.0	1.0	1.0	1.0	1.0	1.0
KL17	0.66667	1.0	0.5	0.98	0.69985	0.75
KL18	0.5	0.33333	1.0	0.96	0.56544	0.97959
KL22	0.16667	0.5	0.1	0.88636	0.18565	0.54359
KL23	1.0	1.0	1.0	1.0	1.0	1.0
KL24	0.44444	0.4	0.5	0.94318	0.41776	0.73214
KL25	0.85714	0.75	1.0	0.98864	0.86092	0.99412
KL27	0.5	0.5	0.5	0.96	0.47917	0.73958
KL28	0.4	0.33333	0.5	0.95	0.38341	0.73276
KL30	0.66667	1.0	0.5	0.98864	0.70303	0.75
KL38	1.0	1.0	1.0	1.0	1.0	1.0
KL39	0.0	0.0	0.0	0.94	-0.02916	0.48958
KL43	0.0	0.0	0.0	0.96	-0.02041	0.4898
KL45	0.66667	0.5	1.0	0.98	0.69985	0.9898
KL51	1.0	1.0	1.0	1.0	1.0	1.0
KL56	1.0	1.0	1.0	1.0	1.0	1.0
KL60	0.0	0.0	0.0	0.94	-0.02916	0.47959
KL63	0.8	0.66667	1.0	0.98864	0.81174	0.99419
KL64	1.0	1.0	1.0	1.0	1.0	1.0
KL66	1.0	1.0	1.0	1.0	1.0	1.0
KL70	0.0	0.0	0.0	0.96	-0.02041	0.4898
KL71	0.0	0.0	0.0	0.94	-0.02916

In [95]:
kltype_see = "KL43"
labels_zip_seq = tuple(zip(labels_tropiseq[kltype_see]["y_pred"], labels_tropiseq[kltype_see]["real_labels"]))
labels_zip_gat = tuple(zip(labels_tropigat[kltype_see]["y_pred"], labels_tropigat[kltype_see]["real_labels"]))

# tuple(zip(labels_zip_seq , labels_zip_gat))

In [12]:
import statistics

mean = statistics.mean(aucs_tropiseq)
print(mean)

0.7992843781041178


In [13]:
tropigat_results

Unnamed: 0,phage,protein_id,predictions_seqbased,predictions_tropigat
0,A1a,A1a_00002,KL7:0.989;KL102:0.569,KL123:0.996 ; KL7:0.981 ; KL9:0.974 ; KL110:0....
1,A1a,A1a_00014,KL151:0.708,KL74:0.974 ; KL70:0.907 ; KL29:0.823 ; KL110:0...
2,A1b,A1b_00048,KL157:0.57,KL53:0.989 ; KL60:0.989 ; KL34:0.865 ; KL13:0....
3,A1b,A1b_00036,KL7:0.989;KL102:0.569,KL123:0.999 ; KL7:0.998 ; KL110:0.981 ; KL9:0....
4,A1c,A1c_00046,KL7:0.989;KL102:0.569,KL123:0.996 ; KL7:0.992 ; KL110:0.967 ; KL9:0....
...,...,...,...,...
180,S13a,S13a_00036,KL38:0.699,KL70:0.965 ; KL136:0.934 ; KL123:0.921 ; KL12:...
181,S13b,S13b_00058,KL63:0.893,KL47:0.992 ; KL64:0.913 ; KL28:0.882 ; KL34:0....
182,S13c,S13c_00055,KL38:0.9,KL123:0.976 ; KL145:0.972 ; KL136:0.902 ; KL70...
183,S13d,S13d_00057,KL14:0.976,KL14:0.99 ; KL21:0.975 ; KL13:0.917 ; KL62:0.8...


***
# Good calls : 

In [25]:
# tailored_bit50
# 1512
raw_df = pd.read_csv(f"{path_project}/raw_metrics.1512.tsv" , sep = "\t", header = 0)
raw_df

Unnamed: 0,Phage,Protein,TropiGAT_predictions,TropiGAT_good_calls,TropiSeq_predictions,TropiSeq_good_calls,Targets
0,A1a,A1a_00002,KL123:0.996 ; KL7:0.981 ; KL9:0.974 ; KL110:0....,0,KL7:0.989;KL102:0.569,0,KL151
1,A1a,A1a_00014,KL74:0.974 ; KL70:0.907 ; KL29:0.823 ; KL110:0...,0,KL151:0.708,KL151,KL151
2,A1b,A1b_00048,KL53:0.989 ; KL60:0.989 ; KL34:0.865 ; KL13:0....,0,KL157:0.57,KL157,KL157
3,A1b,A1b_00036,KL123:0.999 ; KL7:0.998 ; KL110:0.981 ; KL9:0....,0,KL7:0.989;KL102:0.569,0,KL157
4,A1c,A1c_00046,KL123:0.996 ; KL7:0.992 ; KL110:0.967 ; KL9:0....,0,KL7:0.989;KL102:0.569,0,KL1
...,...,...,...,...,...,...,...
180,S13a,S13a_00036,KL70:0.965 ; KL136:0.934 ; KL123:0.921 ; KL12:...,0,KL38:0.699,0,"KL149,KL102"
181,S13b,S13b_00058,KL47:0.992 ; KL64:0.913 ; KL28:0.882 ; KL34:0....,0,KL63:0.893,KL63,KL63
182,S13c,S13c_00055,KL123:0.976 ; KL145:0.972 ; KL136:0.902 ; KL70...,0,KL38:0.9,0,"KL149,KL102"
183,S13d,S13d_00057,KL14:0.99 ; KL21:0.975 ; KL13:0.917 ; KL62:0.8...,KL14,KL14:0.976,KL14,KL14


In [26]:

total_calls = 0
TropiGAT_good_calls = []
TropiSeq_good_calls = []
top_n = 15

for phage in raw_df["Phage"].unique() : 
    tmp_df = raw_df[raw_df["Phage"] == phage]
    targets_set = set(tmp_df["Targets"].tolist()[0].split(","))
    n_targets = len(targets_set)
    if "".join((list(targets_set)[0])).count("pass") == 0 : 
        total_calls += n_targets
        # TropiGAT calls : 
        call_tropigat = set()
        for calls in tmp_df["TropiGAT_predictions"] :
            top_n_predictions = [x.split(":")[0] for x in calls.split(";")][0:top_n-1]
            call_tropigat.update(top_n_predictions)
        good_calls_tropigat = call_tropigat.intersection(targets_set)
        if len(good_calls_tropigat) > 0 :
            for kltype in good_calls_tropigat : 
                a = (phage , kltype)
                TropiGAT_good_calls.append(a)
        # TropiSeq calls : 
        call_tropiseq = set()
        for calls in tmp_df["TropiSeq_predictions"] :
            top_n_predictions = [x.split(":")[0] for x in calls.split(";")][0:top_n-1]
            call_tropiseq.update(top_n_predictions)
        good_calls_tropiseq = call_tropiseq.intersection(targets_set)
        if len(good_calls_tropiseq) > 0 :
            for kltype in good_calls_tropiseq : 
                a = (phage , kltype)
                TropiSeq_good_calls.append(a)
        




In [27]:
len(TropiGAT_good_calls) , len(TropiSeq_good_calls)

(18, 40)

In [28]:
all_calls = set(TropiGAT_good_calls).union(set(TropiSeq_good_calls))
len(all_calls)

43

In [18]:
total_calls

138

In [31]:
all_calls

{('A1a', 'KL151'),
 ('A1b', 'KL157'),
 ('A1g', 'KL16'),
 ('A1h', 'KL13'),
 ('A1h', 'KL2'),
 ('A1l', 'KL22'),
 ('A3b', 'KL30'),
 ('D7b', 'KL140'),
 ('D7c', 'KL28'),
 ('HIIECEMK', 'KL2'),
 ('K16PH164C3', 'KL16'),
 ('K17alfa62', 'KL17'),
 ('K23PH08C2', 'KL23'),
 ('K24PH164C1', 'KL24'),
 ('K25PH129C1', 'KL25'),
 ('K26PH128C1', 'KL74'),
 ('K27PH129C1', 'KL27'),
 ('K2PH164C2', 'KL2'),
 ('K37PH164C1', 'KL2'),
 ('K38PH09C2', 'KL38'),
 ('K43PH164C1', 'KL43'),
 ('K45PH128C2', 'KL45'),
 ('K51PH129C1', 'KL51'),
 ('K53PH164C2', 'KL53'),
 ('K54lambda1_1_1', 'KL24'),
 ('K56PH164C1', 'KL56'),
 ('K60PH164C1', 'KL18'),
 ('K60PH164C1', 'KL60'),
 ('K63PH128', 'KL63'),
 ('K64PH164C4', 'KL64'),
 ('K66PH128C1', 'KL66'),
 ('K74PH129C2', 'KL74'),
 ('M12a', 'KL139'),
 ('NBNDMPCG', 'KL2'),
 ('OPBIHMGG', 'KL3'),
 ('PFOEGONH', 'KL3'),
 ('PP187', 'KL110'),
 ('PP187', 'KL16'),
 ('S10a', 'KL25'),
 ('S11a', 'KL25'),
 ('S13b', 'KL63'),
 ('S13d', 'KL14'),
 ('S13e', 'KL136')}

In [19]:
43/138*100

31.15942028985507

In [None]:
# Adjust the number of correct calls : remove the KLtypes for which there are no models ?
# Do the analysis for the new dpos 
# Check the folds
# Repeat the predictions with the final tailored

# Fine tune the final tailored
# Repeat the predictions 


In [29]:
TropiGAT_good_calls

[('D7c', 'KL28'),
 ('K17alfa62', 'KL17'),
 ('K24PH164C1', 'KL24'),
 ('K25PH129C1', 'KL25'),
 ('K26PH128C1', 'KL74'),
 ('K27PH129C1', 'KL27'),
 ('K43PH164C1', 'KL43'),
 ('K45PH128C2', 'KL45'),
 ('K53PH164C2', 'KL53'),
 ('K54lambda1_1_1', 'KL24'),
 ('K60PH164C1', 'KL60'),
 ('K74PH129C2', 'KL74'),
 ('OPBIHMGG', 'KL3'),
 ('PFOEGONH', 'KL3'),
 ('PP187', 'KL110'),
 ('S10a', 'KL25'),
 ('S11a', 'KL25'),
 ('S13d', 'KL14')]

> At least one positive prediction for each phage : 

In [43]:
towndsend_df

Unnamed: 0,Phages,Protein,Target
0,BLCJPOBP,BLCJPOBP__00041,KL2
1,DIMCIIMF,DIMCIIMF__00240,KL28
2,DIMCIIMF,DIMCIIMF__00039,KL28
3,DJLANJJD,DJLANJJD__00238,pass (baseplate)
4,EHPPICDA,EHPPICDA__00095,pass (baseplate)
5,EKPIEFBL,EKPIEFBL__00177,pass (baseplate)
6,EKPIEFBL,EKPIEFBL__00113,pass (baseplate)
7,EONHMLJF,EONHMLJF__00087,pass (KL107 no confidence)
8,FADJDIKG,FADJDIKG__00083,pass (baseplate)
9,FADJDIKG,FADJDIKG__00019,pass (baseplate)


In [48]:
raw_df

Unnamed: 0,Phage,Protein,TropiGAT_predictions,TropiGAT_good_calls,TropiSeq_predictions,TropiSeq_good_calls,Targets
0,A1a,A1a_00002,KL123:0.996 ; KL7:0.981 ; KL9:0.974 ; KL110:0....,0,KL7:0.989;KL102:0.569,0,KL151
1,A1a,A1a_00014,KL74:0.974 ; KL70:0.907 ; KL29:0.823 ; KL110:0...,0,KL151:0.708,KL151,KL151
2,A1b,A1b_00048,KL53:0.989 ; KL60:0.989 ; KL34:0.865 ; KL13:0....,0,KL157:0.57,KL157,KL157
3,A1b,A1b_00036,KL123:0.999 ; KL7:0.998 ; KL110:0.981 ; KL9:0....,0,KL7:0.989;KL102:0.569,0,KL157
4,A1c,A1c_00046,KL123:0.996 ; KL7:0.992 ; KL110:0.967 ; KL9:0....,0,KL7:0.989;KL102:0.569,0,KL1
...,...,...,...,...,...,...,...
180,S13a,S13a_00036,KL70:0.965 ; KL136:0.934 ; KL123:0.921 ; KL12:...,0,KL38:0.699,0,"KL149,KL102"
181,S13b,S13b_00058,KL47:0.992 ; KL64:0.913 ; KL28:0.882 ; KL34:0....,0,KL63:0.893,KL63,KL63
182,S13c,S13c_00055,KL123:0.976 ; KL145:0.972 ; KL136:0.902 ; KL70...,0,KL38:0.9,0,"KL149,KL102"
183,S13d,S13d_00057,KL14:0.99 ; KL21:0.975 ; KL13:0.917 ; KL62:0.8...,KL14,KL14:0.976,KL14,KL14


In [55]:
total_calls = 0
TropiGAT_good_calls = []
TropiSeq_good_calls = []
top_n = 15

for phage in raw_df["Phage"].unique() : 
    if phage in ferriol_df["Phages"].tolist() or towndsend_df["Phages"].tolist():
        kltype = "ok"
        tmp_df = raw_df[raw_df["Phage"] == phage]
        targets_set = set(tmp_df["Targets"].tolist()[0].split(","))
        n_targets = len(targets_set)
        if "".join((list(targets_set)[0])).count("pass") == 0 : 
            total_calls += 1
            # TropiGAT calls : 
            call_tropigat = set()
            for calls in tmp_df["TropiGAT_predictions"] :
                top_n_predictions = [x.split(":")[0] for x in calls.split(";")][0:top_n-1]
                call_tropigat.update(top_n_predictions)
            good_calls_tropigat = call_tropigat.intersection(targets_set)
            if len(good_calls_tropigat) > 0 :
                a = (phage , kltype)
                TropiGAT_good_calls.append(a)
            # TropiSeq calls : 
            call_tropiseq = set()
            for calls in tmp_df["TropiSeq_predictions"] :
                top_n_predictions = [x.split(":")[0] for x in calls.split(";")][0:top_n-1]
                call_tropiseq.update(top_n_predictions)
            good_calls_tropiseq = call_tropiseq.intersection(targets_set)
            if len(good_calls_tropiseq) > 0 :
                a = (phage , kltype)
                TropiSeq_good_calls.append(a)

In [56]:
len(TropiGAT_good_calls) , len(TropiSeq_good_calls)

(18, 38)

In [57]:
total_calls

95

In [58]:
all_calls = set(TropiGAT_good_calls).union(set(TropiSeq_good_calls))
len(all_calls)

40

In [59]:
TropiSeq_good_calls

[('A1a', 'ok'),
 ('A1b', 'ok'),
 ('A1g', 'ok'),
 ('A1h', 'ok'),
 ('A1l', 'ok'),
 ('A3b', 'ok'),
 ('D7b', 'ok'),
 ('D7c', 'ok'),
 ('HIIECEMK', 'ok'),
 ('K16PH164C3', 'ok'),
 ('K17alfa62', 'ok'),
 ('K23PH08C2', 'ok'),
 ('K24PH164C1', 'ok'),
 ('K25PH129C1', 'ok'),
 ('K26PH128C1', 'ok'),
 ('K27PH129C1', 'ok'),
 ('K2PH164C2', 'ok'),
 ('K37PH164C1', 'ok'),
 ('K38PH09C2', 'ok'),
 ('K45PH128C2', 'ok'),
 ('K51PH129C1', 'ok'),
 ('K54lambda1_1_1', 'ok'),
 ('K56PH164C1', 'ok'),
 ('K60PH164C1', 'ok'),
 ('K63PH128', 'ok'),
 ('K64PH164C4', 'ok'),
 ('K66PH128C1', 'ok'),
 ('K74PH129C2', 'ok'),
 ('M12a', 'ok'),
 ('NBNDMPCG', 'ok'),
 ('OPBIHMGG', 'ok'),
 ('PFOEGONH', 'ok'),
 ('PP187', 'ok'),
 ('S10a', 'ok'),
 ('S11a', 'ok'),
 ('S13b', 'ok'),
 ('S13d', 'ok'),
 ('S13e', 'ok')]