In [24]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities.transform_dfs import long_to_wide

In [25]:
# Grid of probability values between 0 and 0.5.
# NOTE(review): not referenced by any cell shown in this part of the
# notebook - confirm it is still needed.
probabilities = [
    0, 0.05, 0.1,
    0.15, 0.2, 0.25,
    0.3, 0.4, 0.5,
]

In [26]:
# Method configurations (presumably the normalisation candidates - confirm).
# Every entry has the shape {"method": <name>, "parameters": <dict>}.
norm_methods = [
    {"method": "z_score", "parameters": {}},
    {"method": "median", "parameters": {"percentile": 0.5}},
    {"method": "totalsum", "parameters": {}},
]

In [27]:
# Method configurations (presumably the imputation candidates - confirm).
# Every entry has the shape {"method": <name>, "parameters": <dict>}; the
# "simple_imputation_per_protein" method appears once per strategy.
imp_methods = [
    {"method": "min_value_per_sample", "parameters": {"shrinking_value": 1}},
    {"method": "simple_imputation_per_protein", "parameters": {"strategy": "median"}},
    {
        "method": "simple_imputation_per_protein",
        "parameters": {"strategy": "most_frequent"},
    },
    {"method": "knn", "parameters": {"number_of_neighbours": 5}},
]

In [28]:
# The evaluated pairs: each tuple is (first config, second config), passed to
# extract_method_info as (norm_method, imp_method).
# NOTE(review): "fc_threshold" is not read by any cell shown here - confirm
# where it is consumed.
method_combinations = [
    (
        {"method": "z_score", "parameters": {}, "fc_threshold": 0.4237},
        {"method": "knn", "parameters": {"number_of_neighbours": 5}},
    ),
    (
        {"method": "z_score", "parameters": {}, "fc_threshold": 0.4237},
        {"method": "simple_imputation_per_protein", "parameters": {"strategy": "median"}},
    ),
    (
        {"method": "totalsum", "parameters": {}, "fc_threshold": 0.00004422884},
        {"method": "knn", "parameters": {"number_of_neighbours": 5}},
    ),
    (
        {"method": "median", "parameters": {"percentile": 0.5}, "fc_threshold": 0.0474},
        {"method": "knn", "parameters": {"number_of_neighbours": 5}},
    ),
]

In [29]:
def extract_method_info(norm_method,imp_method):
    """Return the name parts used to identify a method combination.

    Parameters
    ----------
    norm_method : mapping
        Must contain a "method" entry.
    imp_method : mapping
        Must contain a "method" entry; may contain a "parameters" mapping
        with an optional "strategy" entry.

    Returns
    -------
    tuple
        ``(norm_name, imp_name, add_info)`` where ``add_info`` is
        ``imp_method["parameters"]["strategy"]`` or ``""`` when that entry
        is not available.

    Raises
    ------
    KeyError
        If either mapping has no "method" entry.
    """
    norm_name = norm_method["method"]
    imp_name = imp_method["method"]
    try:
        add_info = imp_method["parameters"]["strategy"]
    except (KeyError, TypeError):
        # Only "entry missing" (KeyError) and "parameters is not subscriptable,
        # e.g. None" (TypeError) mean "no strategy"; anything else propagates
        # instead of being silently turned into an empty string.
        add_info = ""

    return norm_name, imp_name, add_info

In [30]:
def classify(proteins, pred_positives):
    """Build a 0/1 label list for ``proteins``.

    Each element of ``proteins`` is labelled 1 when ``protein in
    pred_positives`` is true and 0 otherwise. The returned list has one
    entry per element of ``proteins``, in iteration order.
    """
    return [1 if protein in pred_positives else 0 for protein in proteins]

### 0,1 classification: ground-truth labels and ROC AUC per method combination

In [31]:
# Reference labels: every protein listed in the corrected-p-values table of
# the "ba_39_gt_new_fc" run gets label 1 if it also appears among that run's
# differentially expressed proteins, otherwise 0.
# Paths are built with pathlib so the cell does not depend on Windows "\\"
# separators.
ground_truth_history_dir = (
    Path(PROJECT_PATH) / "user_data" / "runs" / "ba_39_gt_new_fc" / "history_dfs"
)

true_positives_df = pd.read_csv(
    ground_truth_history_dir
    / "6-data_analysis-differential_expression-t_test-de_proteins_df.csv"
)
# NOTE(review): assumes long_to_wide returns a frame whose columns are the
# protein IDs - confirm against protzilla.utilities.transform_dfs.
true_positives_protein_list = long_to_wide(true_positives_df).columns

all_proteins_df = pd.read_csv(
    ground_truth_history_dir
    / "6-data_analysis-differential_expression-t_test-corrected_p_values_df.csv"
)

classification = classify(all_proteins_df["Protein ID"], true_positives_protein_list)
roc_df = pd.DataFrame(
    {
        "Protein ID": all_proteins_df["Protein ID"],
        "classification": classification,
    }
)

In [32]:
# For every method combination: label the proteins its run reports as
# differentially expressed, score those 0/1 labels against the reference
# labels in roc_df, store the AUC and save one ROC figure per combination.
runs_dir = Path(PROJECT_PATH) / "user_data" / "runs"
plots_dir = Path(PROJECT_PATH) / "user_data" / "plots"
# savefig does not create missing directories.
plots_dir.mkdir(parents=True, exist_ok=True)

auc_dict = {}
for combination in method_combinations:
    norm_name, imp_name, add_info = extract_method_info(combination[0], combination[1])
    combination_id = f"{norm_name}_{imp_name}{add_info}"

    pred_positives_df = pd.read_csv(
        runs_dir
        / f"ba_39_{combination_id}_new_fc"
        / "history_dfs"
        / "7-data_analysis-differential_expression-t_test-de_proteins_df.csv"
    )
    pred_positives_protein_list = long_to_wide(pred_positives_df).columns

    # The labels are produced in the row order of roc_df, so they can be
    # scored directly. roc_df itself is never modified, which keeps the cell
    # re-runnable even after a failure in the middle of the loop.
    pred_classification = classify(roc_df["Protein ID"], pred_positives_protein_list)

    fpr, tpr, thresholds = metrics.roc_curve(
        roc_df["classification"], pred_classification, pos_label=1
    )
    roc_auc = metrics.auc(fpr, tpr)
    auc_dict[combination_id] = roc_auc

    # Named roc_display (not display) so IPython's display() stays usable;
    # the legend carries the combination id instead of a placeholder label.
    roc_display = metrics.RocCurveDisplay(
        fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=combination_id
    )
    roc_display.plot()
    roc_display.figure_.savefig(plots_dir / f"{combination_id}_new_fc.svg", format="svg")
    plt.close(roc_display.figure_)


In [33]:
# Rank the method combinations by their AUC, highest first. The result is a
# list of (combination id, AUC) tuples.
auc_by_combination = {}
for combination in method_combinations:
    norm_name, imp_name, add_info = extract_method_info(combination[0], combination[1])
    # Same id format that was used as the key when auc_dict was filled.
    combination_id = f"{norm_name}_{imp_name}{add_info}"
    auc_by_combination[combination_id] = auc_dict[combination_id]

ranked_methods = sorted(
    auc_by_combination.items(), key=lambda item: item[1], reverse=True
)


In [34]:
# Show the (combination id, AUC) pairs, highest AUC first.
ranked_methods

[('median_knn', 0.8604157008190388),
 ('z_score_knn', 0.8326379230412609),
 ('z_score_simple_imputation_per_proteinmedian', 0.75),
 ('totalsum_knn', 0.6759259259259259)]

### Plots: ROC curves of all method combinations in one figure

In [35]:
def get_fpr_tpr(fpr,tpr):
    """Reduce two ROC coordinate sequences to one value each.

    For each sequence independently: a sequence of length 2 yields 0,
    any other length yields the element at index 1.

    Returns the pair ``(fpr_value, tpr_value)``.
    """
    def _second_or_zero(values):
        # Length-2 input has no point between its two entries.
        if len(values) == 2:
            return 0
        return values[1]

    return _second_or_zero(fpr), _second_or_zero(tpr)

In [36]:
# Draw the ROC curve of every method combination into one shared figure and
# record, per combination, the single (fpr, tpr) value returned by
# get_fpr_tpr.
runs_dir = Path(PROJECT_PATH) / "user_data" / "runs"
roc_plot_path = (
    Path(PROJECT_PATH) / "user_data" / "plots" / "roc_curves" / "ba_39_roc_new_fc.png"
)
# savefig does not create missing directories.
roc_plot_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): auc_dict is reset here but never filled in this cell, so any
# AUC values stored in it before are discarded - confirm this is intended.
auc_dict = {}
tpr_dict = {}
fpr_dict = {}

fig, ax = plt.subplots()
for combination in method_combinations:
    norm_method = combination[0]
    imp_method = combination[1]
    norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)
    combination_id = f"{norm_name}_{imp_name}{add_info}"

    # NOTE(review): these run directories carry no "_new_fc" suffix although
    # the saved figure name does - confirm which runs are meant.
    pred_positives_df = pd.read_csv(
        runs_dir
        / f"ba_39_{combination_id}"
        / "history_dfs"
        / "7-data_analysis-differential_expression-t_test-de_proteins_df.csv"
    )
    pred_positives_protein_list = pred_positives_df["Protein ID"].unique().tolist()

    # The labels follow the row order of roc_df and are scored directly;
    # roc_df is left unmodified so the cell can be re-run safely.
    pred_classification = classify(roc_df["Protein ID"], pred_positives_protein_list)

    fpr, tpr, thresholds = metrics.roc_curve(
        roc_df["classification"], pred_classification, pos_label=1
    )

    # Legend label: use the imputation strategy when one is configured,
    # otherwise the imputation method name.
    if add_info != "":
        name = f"{norm_name}_{add_info}"
    else:
        name = f"{norm_name}_{imp_name}"
    ax.plot(fpr, tpr, label=name)

    # Separate names keep the fpr/tpr arrays from being overwritten by scalars.
    point_fpr, point_tpr = get_fpr_tpr(fpr, tpr)
    tpr_dict[combination_id] = point_tpr
    fpr_dict[combination_id] = point_fpr

ax.legend()
ax.set_xlabel("fpr")
ax.set_ylabel("tpr")
ax.set_title("ROC curves for ba_39 dataset")
fig.savefig(roc_plot_path, format="png")
plt.close(fig)
            

In [14]:
# Print the recorded values per combination: the tpr entries first, then
# the fpr entries.
for heading, rates in (("tpr", tpr_dict), ("fpr", fpr_dict)):
    print(heading)
    print(rates.items())

tpr
dict_items([('z_score_knn', 0.35), ('z_score_simple_imputation_per_proteinmedian', 0.28125), ('totalsum_knn', 0.0125), ('median_knn', 0.4625)])
fpr
dict_items([('z_score_knn', 0.008532423208191127), ('z_score_simple_imputation_per_proteinmedian', 0.0008532423208191126), ('totalsum_knn', 0.0), ('median_knn', 0.016211604095563138)])
