In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities.transform_dfs import long_to_wide

In [2]:
# Probability grid from 0 to 0.5 (steps of 0.05 up to 0.3, then 0.4 and 0.5).
# NOTE(review): not referenced by any cell visible in this notebook -- presumably
# consumed elsewhere; confirm before removing.
probabilities = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]

In [5]:
# Each entry is a (normalisation, imputation) pair of method descriptors.
# The "method" and "parameters" keys are read by the cells below to derive the
# run/plot identifiers; "fc_threshold" is not read by any cell visible here.
method_combinations = [
    (
        {"method": "z_score", "parameters": {}, "fc_threshold": 0.3157},
        {"method": "knn", "parameters": {"number_of_neighbours": 5}},
    ),
    (
        {"method": "z_score", "parameters": {}, "fc_threshold": 0.3157},
        {"method": "simple_imputation_per_protein", "parameters": {"strategy": "median"}},
    ),
    (
        {"method": "totalsum", "parameters": {}, "fc_threshold": 0.0000638564},
        {"method": "knn", "parameters": {"number_of_neighbours": 5}},
    ),
    (
        {"method": "median", "parameters": {"percentile": 0.5}, "fc_threshold": 0.0309},
        {"method": "knn", "parameters": {"number_of_neighbours": 5}},
    ),
]

In [6]:
def extract_method_info(norm_method, imp_method):
    """Extract the naming components of a normalisation/imputation pair.

    Parameters
    ----------
    norm_method : mapping
        Normalisation descriptor; must contain the key ``"method"``.
    imp_method : mapping
        Imputation descriptor; must contain the key ``"method"`` and may
        contain ``"parameters"`` with an optional ``"strategy"`` entry.

    Returns
    -------
    tuple
        ``(norm_name, imp_name, add_info)`` where ``add_info`` is the
        imputation strategy, or ``""`` when no strategy is configured.

    Raises
    ------
    KeyError
        If either descriptor lacks the ``"method"`` key.
    """
    norm_name = norm_method["method"]
    imp_name = imp_method["method"]
    try:
        add_info = imp_method["parameters"]["strategy"]
    except (KeyError, TypeError):
        # Only "no strategy configured" (missing key, or parameters that are
        # None / not subscriptable by key) falls back to an empty suffix;
        # anything else, including KeyboardInterrupt, now propagates instead
        # of being silently swallowed by a bare ``except``.
        add_info = ""

    return norm_name, imp_name, add_info

In [7]:
def classify(proteins, pred_positives):
    """Label every protein as positive (1) or negative (0).

    Parameters
    ----------
    proteins : iterable
        Protein identifiers to label, in the order the labels are returned.
    pred_positives : container
        Anything supporting ``in``; identifiers found in it are labelled 1.

    Returns
    -------
    list of int
        One 0/1 entry per element of ``proteins``.
    """
    return [1 if candidate in pred_positives else 0 for candidate in proteins]

### Calculating ROC curves

In [8]:
# Load the ground truth (run "ba_39_gt").
# The ground truth was calculated using the PROTzilla Runner command-line
# interface with workflow ba_39_gt_wf.
# Paths are assembled with pathlib instead of hard-coded "\\" separators so the
# notebook also runs on non-Windows systems.
gt_history_dir = Path(PROJECT_PATH) / "user_data" / "runs" / "ba_39_gt" / "history_dfs"

true_positives_df = pd.read_csv(
    gt_history_dir / "6-data_analysis-differential_expression-t_test-de_proteins_df.csv"
)
# The columns of the wide-format frame are used as the set of true-positive
# proteins (presumably one column per protein -- confirm against long_to_wide).
true_positives_protein_list = long_to_wide(true_positives_df).columns

all_proteins_df = pd.read_csv(
    gt_history_dir / "6-data_analysis-differential_expression-t_test-corrected_p_values_df.csv"
)

# roc_df holds one row per entry of all_proteins_df["Protein ID"] together with
# its ground-truth label (1 = listed among the true positives, 0 = not listed).
roc_df = pd.DataFrame()
roc_df["Protein ID"] = all_proteins_df["Protein ID"]
classification = classify(all_proteins_df["Protein ID"], true_positives_protein_list)
roc_df["classification"] = classification

In [9]:
# For every normalisation/imputation combination: load its differential
# expression result, label each protein as predicted positive/negative, compare
# against the ground-truth labels in roc_df, store the AUC and save the ROC plot.
auc_dict = {}

# Paths are assembled with pathlib so they are not tied to Windows separators.
runs_dir = Path(PROJECT_PATH) / "user_data" / "runs"
plots_dir = Path(PROJECT_PATH) / "user_data" / "plots"
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
plots_dir.mkdir(parents=True, exist_ok=True)

for norm_method, imp_method in method_combinations:
    norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)
    # Named method_id (not `id`) to avoid shadowing the builtin in the kernel namespace.
    method_id = f"{norm_name}_{imp_name}{add_info}"

    # Load differential expression results of this combination's run.
    pred_positives_df = pd.read_csv(
        runs_dir
        / f"ba_39_{method_id}"
        / "history_dfs"
        / "7-data_analysis-differential_expression-t_test-de_proteins_df.csv"
    )
    pred_positives_protein_list = long_to_wide(pred_positives_df).columns

    # The labels are produced in the row order of roc_df, so they can be passed
    # to roc_curve directly. roc_df is never modified here, which keeps the cell
    # re-runnable even if an earlier run aborted half-way through the loop.
    pred_classification = classify(roc_df["Protein ID"], pred_positives_protein_list)

    # Create the ROC curve. The predictions are hard 0/1 labels, not scores.
    fpr, tpr, _ = metrics.roc_curve(
        roc_df["classification"], pred_classification, pos_label=1
    )
    roc_auc = metrics.auc(fpr, tpr)
    auc_dict[method_id] = roc_auc

    # Label the curve with the method combination rather than a placeholder, and
    # avoid the name `display`, which would shadow IPython's display function.
    roc_display = metrics.RocCurveDisplay(
        fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=method_id
    )
    roc_display.plot()
    roc_display.figure_.savefig(plots_dir / f"{method_id}.svg", format="svg")
    # Close the figure explicitly so figures do not accumulate across iterations.
    plt.close(roc_display.figure_)


### Create method ranking

In [10]:
# Collect the AUC of every configured method combination and rank them.
# A separate dict is used for the intermediate mapping so that `ranked_methods`
# is only ever the final sorted list, and the loop variable is called method_id
# to avoid shadowing the builtin `id` in the kernel namespace.
auc_by_method = {}
for norm_method, imp_method in method_combinations:
    norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)
    method_id = f"{norm_name}_{imp_name}{add_info}"
    auc_by_method[method_id] = auc_dict[method_id]

# List of (method id, AUC) tuples, highest AUC first.
ranked_methods = sorted(auc_by_method.items(), key=lambda item: item[1], reverse=True)


In [11]:
# Show the ranking built in the previous cell: (method id, AUC) pairs, highest AUC first.
ranked_methods

[('median_knn', 0.8604157008190388),
 ('z_score_knn', 0.8326379230412609),
 ('z_score_simple_imputation_per_proteinmedian', 0.75),
 ('totalsum_knn', 0.6759259259259259)]