In [4]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
import pandas as pd
from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities.transform_dfs import long_to_wide

In [5]:
norm_methods = [
    {"method":"z_score","parameters":{}},
    {"method":"median","parameters":{"percentile":0.5}},
    {"method":"totalsum","parameters":{}}
]

In [6]:
imp_methods = [
    {"method":"min_value_per_sample","parameters":{"shrinking_value":1}},
    {"method":"simple_imputation_per_protein","parameters":{"strategy":"median"}},
    {"method":"simple_imputation_per_protein","parameters":{"strategy":"most_frequent"}},
    {"method":"knn","parameters":{"number_of_neighbours":5}}
    ]

In [7]:
probabilities = [0,0.05,0.1,0.15,0.2,0.25,0.3,0.4,0.5]

In [5]:
# helper method to extract method names from the dictionaries
def extract_method_info(norm_method,imp_method):
    norm_name = norm_method["method"]
    imp_name = imp_method["method"]
    try:
        add_info = imp_method["parameters"]["strategy"]
    except:
        add_info = ""

    return norm_name, imp_name, add_info

In [6]:
# returns a list containing 1 or 0 for each protein in proteins, 
# depending on it being differentially expressed or not
def classify(proteins, pred_positives):
    classification = []
    for protein in proteins:
        if protein in pred_positives:
            classification.append(1)
        else:
            classification.append(0)
    return classification

### Calculating ROC curves

In [9]:
# load ground truth
true_positives_df = pd.read_csv(f"{PROJECT_PATH}\\user_data\\runs\\ba_39_small_gt\\history_dfs\\5-data_analysis-differential_expression-t_test-de_proteins_df.csv")
true_positives_protein_list = long_to_wide(true_positives_df).columns
all_proteins_df = pd.read_csv(f"{PROJECT_PATH}\\user_data\\runs\\ba_39_small_gt\\history_dfs\\5-data_analysis-differential_expression-t_test-corrected_p_values_df.csv")
roc_df = pd.DataFrame()
roc_df["Protein ID"] = all_proteins_df["Protein ID"]
classification = classify(all_proteins_df["Protein ID"],true_positives_protein_list)
roc_df["classification"] = classification

In [8]:
auc_dict = {}
for p in probabilities:
    for norm_method in norm_methods:
        for imp_method in imp_methods:
            norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)
            id = f"{p}_{norm_name}_{imp_name}{add_info}"

            # load predicted proteins         
            pred_positives_df = pd.read_csv(f"{PROJECT_PATH}\\user_data\\runs\\{p}\\ba_39_{p}_{norm_name}_{imp_name}{add_info}\\history_dfs\\5-data_analysis-differential_expression-t_test-de_proteins_df.csv")
            pred_positives_protein_list = pred_positives_df["Protein ID"].unique().tolist()
            classification = classify(roc_df["Protein ID"],pred_positives_protein_list)

            pred_classification_df = pd.DataFrame()
            pred_classification_df["Protein ID"] = roc_df["Protein ID"]
            pred_classification_df["pred classification"] = classification
            
            roc_df = roc_df.join(pred_classification_df.set_index('Protein ID'), on="Protein ID")

            # calculate ROC curve and AUC
            fpr, tpr, thresholds = metrics.roc_curve(roc_df["classification"], roc_df["pred classification"],pos_label=1)
            roc_auc = metrics.auc(fpr, tpr)
            auc_dict[id] = roc_auc
            # add ROC curve to plot
            display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='example estimator')
            display.plot()
            plt.savefig(f"{PROJECT_PATH}\\user_data\\plots\\{id}.png",format="png")
            plt.close()
            roc_df.drop(columns=["pred classification"],inplace=True)


### Create ranking per dataset

In [9]:
ranked_methods = pd.DataFrame()
auc_p_dict = {}
for p in probabilities:
    for norm_method in norm_methods:
        for imp_method in imp_methods:
            norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)
            id = f"{p}_{norm_name}_{imp_name}{add_info}"
            auc_p_dict[id] = auc_dict[id]
    ranked_methods[f"{p}"] = sorted(auc_p_dict.items(), key=lambda x:x[1],reverse=True)
    auc_p_dict = {}
ranked_methods.to_csv(f"{PROJECT_PATH}\\user_data\\ba39_small_method_ranking.csv")


In [10]:
ranked_methods

Unnamed: 0,0,0.05,0.1,0.15,0.2,0.25,0.3,0.4,0.5
0,"(0_z_score_min_value_per_sample, 0.73798131692...","(0.05_z_score_knn, 0.7288486367433736)","(0.1_z_score_knn, 0.7379813169286853)","(0.15_z_score_knn, 0.7471139971139972)","(0.2_z_score_knn, 0.7153679653679654)","(0.25_z_score_knn, 0.7074314574314574)","(0.3_z_score_knn, 0.7018872939925571)","(0.4_z_score_knn, 0.7054758107389687)","(0.5_z_score_knn, 0.6455722639933166)"
1,(0_z_score_simple_imputation_per_proteinmedian...,"(0.05_z_score_min_value_per_sample, 0.71253892...",(0.1_z_score_simple_imputation_per_proteinmost...,(0.15_z_score_simple_imputation_per_proteinmed...,"(0.2_median_knn, 0.6547049441786283)","(0.25_median_knn, 0.6535087719298246)","(0.3_totalsum_knn, 0.6712235133287765)",(0.4_totalsum_simple_imputation_per_proteinmed...,"(0.5_median_knn, 0.5971937419305839)"
2,(0_z_score_simple_imputation_per_proteinmost_f...,"(0.05_totalsum_knn, 0.7122161464266727)","(0.1_totalsum_knn, 0.6860142781195412)","(0.15_totalsum_knn, 0.6697045644414065)","(0.2_totalsum_knn, 0.6486101617680565)","(0.25_totalsum_knn, 0.652958152958153)","(0.3_median_knn, 0.6376357560568087)","(0.4_median_knn, 0.619807093491304)",(0.5_z_score_simple_imputation_per_proteinmedi...
3,"(0_z_score_knn, 0.7379813169286853)",(0.05_z_score_simple_imputation_per_proteinmos...,(0.1_z_score_simple_imputation_per_proteinmedi...,"(0.15_median_knn, 0.6693817878028405)",(0.2_z_score_simple_imputation_per_proteinmedi...,(0.25_z_score_simple_imputation_per_proteinmed...,(0.3_totalsum_simple_imputation_per_proteinmed...,"(0.4_totalsum_knn, 0.6130667578036)",(0.5_totalsum_simple_imputation_per_proteinmed...
4,"(0_totalsum_min_value_per_sample, 0.6590529353...",(0.05_z_score_simple_imputation_per_proteinmed...,"(0.1_median_knn, 0.6693817878028405)",(0.15_z_score_simple_imputation_per_proteinmos...,(0.2_totalsum_simple_imputation_per_proteinmed...,(0.25_z_score_simple_imputation_per_proteinmos...,(0.3_z_score_simple_imputation_per_proteinmedi...,(0.4_z_score_simple_imputation_per_proteinmedi...,"(0.5_totalsum_knn, 0.5396825396825397)"
5,(0_totalsum_simple_imputation_per_proteinmedia...,"(0.05_totalsum_min_value_per_sample, 0.7038429...",(0.1_totalsum_simple_imputation_per_proteinmed...,(0.15_totalsum_simple_imputation_per_proteinme...,(0.2_z_score_simple_imputation_per_proteinmost...,(0.25_median_simple_imputation_per_proteinmedi...,(0.3_median_simple_imputation_per_proteinmedia...,(0.4_median_simple_imputation_per_proteinmedia...,(0.5_median_simple_imputation_per_proteinmedia...
6,(0_totalsum_simple_imputation_per_proteinmost_...,"(0.05_median_min_value_per_sample, 0.691558441...","(0.1_median_min_value_per_sample, 0.6467684362...",(0.15_median_simple_imputation_per_proteinmedi...,(0.2_median_simple_imputation_per_proteinmedia...,(0.25_totalsum_simple_imputation_per_proteinme...,(0.3_median_simple_imputation_per_proteinmost_...,(0.4_median_simple_imputation_per_proteinmost_...,(0.5_z_score_simple_imputation_per_proteinmost...
7,"(0_totalsum_knn, 0.6590529353687249)","(0.05_median_knn, 0.6749259512417407)",(0.1_median_simple_imputation_per_proteinmedia...,(0.15_median_simple_imputation_per_proteinmost...,(0.2_median_simple_imputation_per_proteinmost_...,(0.25_median_simple_imputation_per_proteinmost...,(0.3_z_score_simple_imputation_per_proteinmost...,(0.4_z_score_simple_imputation_per_proteinmost...,(0.5_median_simple_imputation_per_proteinmost_...
8,"(0_median_min_value_per_sample, 0.649920255183...",(0.05_median_simple_imputation_per_proteinmost...,(0.1_median_simple_imputation_per_proteinmost_...,"(0.15_median_min_value_per_sample, 0.595997569...",(0.2_totalsum_simple_imputation_per_proteinmos...,(0.25_totalsum_simple_imputation_per_proteinmo...,(0.3_totalsum_simple_imputation_per_proteinmos...,(0.4_totalsum_simple_imputation_per_proteinmos...,"(0.5_z_score_min_value_per_sample, 0.5)"
9,"(0_median_simple_imputation_per_proteinmedian,...",(0.05_totalsum_simple_imputation_per_proteinme...,"(0.1_z_score_min_value_per_sample, 0.621762740...",(0.15_totalsum_simple_imputation_per_proteinmo...,"(0.2_z_score_min_value_per_sample, 0.565447710...","(0.25_z_score_min_value_per_sample, 0.50793650...","(0.3_z_score_min_value_per_sample, 0.514676843...","(0.4_z_score_min_value_per_sample, 0.5)","(0.5_median_min_value_per_sample, 0.5)"


### Calculate overall ranking by adding ranks across all datasets

In [14]:
probabilities = [0.05,0.1,0.15,0.2,0.25,0.3,0.4,0.5]

In [15]:
scores = {}
for norm_method in norm_methods:
    for imp_method in imp_methods:
        score = 0
        for p in probabilities:
            norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)
            id = f"{p}_{norm_name}_{imp_name}{add_info}"
            ranking = ranked_methods[str(p)]
            for method in ranking:
                if id in method:
                    score += ranking.array.tolist().index(method)
        scores.update({id:score})
                    

In [16]:
overall_ranking = sorted(scores.items(), key=lambda x:x[1])
overall_ranking

[('0.5_z_score_knn', 0),
 ('0.5_totalsum_knn', 18),
 ('0.5_median_knn', 21),
 ('0.5_z_score_simple_imputation_per_proteinmedian', 24),
 ('0.5_totalsum_simple_imputation_per_proteinmedian', 36),
 ('0.5_z_score_simple_imputation_per_proteinmost_frequent', 37),
 ('0.5_median_simple_imputation_per_proteinmedian', 50),
 ('0.5_median_simple_imputation_per_proteinmost_frequent', 56),
 ('0.5_z_score_min_value_per_sample', 65),
 ('0.5_median_min_value_per_sample', 70),
 ('0.5_totalsum_simple_imputation_per_proteinmost_frequent', 72),
 ('0.5_totalsum_min_value_per_sample', 79)]