In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities.transform_dfs import long_to_wide

In [2]:
# Values iterated in the line-plot cell: each one selects a run folder and
# labels one plotted line. The leading 0 is deliberately an int, because the
# value is interpolated into folder names and must render as "0", not "0.0".
probabilities = [
    0,
    0.05, 0.1, 0.15, 0.2,
    0.25, 0.3, 0.4, 0.5,
]

In [3]:
# Normalisation settings: each entry holds a method name and its parameters.
norm_methods = [
    dict(method="z_score", parameters={}),
    dict(method="median", parameters=dict(percentile=0.5)),
    dict(method="totalsum", parameters={}),
]

In [4]:
# Imputation settings: each entry holds a method name and its parameters.
# The two "simple_imputation_per_protein" entries differ only in "strategy".
imp_methods = [
    dict(method="min_value_per_sample", parameters=dict(shrinking_value=1)),
    dict(method="simple_imputation_per_protein", parameters=dict(strategy="median")),
    dict(method="simple_imputation_per_protein", parameters=dict(strategy="most_frequent")),
    dict(method="knn", parameters=dict(number_of_neighbours=5)),
]

In [2]:
def extract_method_info(norm_method, imp_method):
    """Return the names needed to identify a normalisation/imputation pair.

    Parameters
    ----------
    norm_method : dict
        Normalisation description; must contain the key ``"method"``.
    imp_method : dict
        Imputation description; must contain the key ``"method"`` and may
        contain ``"parameters"`` with an optional ``"strategy"`` entry.

    Returns
    -------
    tuple
        ``(norm_name, imp_name, add_info)`` where ``add_info`` is the
        imputation ``"strategy"`` or ``""`` when no strategy is configured.

    Raises
    ------
    KeyError
        If either dict lacks the ``"method"`` key.
    """
    norm_name = norm_method["method"]
    imp_name = imp_method["method"]

    # Explicit optional lookup instead of a bare ``except``: a missing or
    # non-dict "parameters" entry still yields "", but unrelated errors
    # (e.g. KeyboardInterrupt) are no longer swallowed.
    parameters = imp_method.get("parameters")
    add_info = parameters.get("strategy", "") if isinstance(parameters, dict) else ""

    return norm_name, imp_name, add_info

### small ba39

In [9]:
def create_line_plot(title, xlabel, ylabel, gt=None):
    """Style the current pyplot figure for a line plot and return ``plt``.

    Parameters
    ----------
    title, xlabel, ylabel : str
        Texts for the figure title and the two axis labels (font size 14).
    gt : optional
        Accepted for compatibility with existing calls but not used by this
        function. It now defaults to ``None`` so callers need not supply it.
        NOTE(review): presumably a ground-truth reference value that was
        meant to be drawn -- confirm before wiring it in.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module itself, so callers can chain
        ``.plot``, ``.legend``, ``.savefig`` and ``.close`` on the result.
    """
    plt.xticks(rotation=90)
    plt.title(title, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.grid(True)

    return plt

In [11]:
# Load the differentially expressed proteins of the "ba_39_small_gt" run
# (presumably the ground truth for the small data set -- confirm) and show
# how many distinct protein IDs it contains.
# The path is built with pathlib so it does not depend on Windows separators.
small_gt_csv = (
    Path(PROJECT_PATH)
    / "user_data"
    / "runs"
    / "ba_39_small_gt"
    / "history_dfs"
    / "5-data_analysis-differential_expression-t_test-de_proteins_df.csv"
)
pred_positives_df = pd.read_csv(small_gt_csv)
pred_positives_protein_list = pred_positives_df["Protein ID"].unique().tolist()
len(pred_positives_protein_list)

63

In [13]:
# For every probability, count the distinct differentially expressed proteins
# produced by each normalisation/imputation combination, draw one line per
# probability and save the figure.
runs_dir = Path(PROJECT_PATH) / "user_data" / "runs"
plots_dir = Path(PROJECT_PATH) / "user_data" / "plots" / "nr_de_proteins"
de_csv_name = "5-data_analysis-differential_expression-t_test-de_proteins_df.csv"

# `nr_de_gt` is not assigned in any visible cell, so a fresh kernel raised a
# NameError here. It is presumably the number of distinct proteins in the
# "ba_39_small_gt" run (confirm); read it here so the cell is self-contained.
gt_df = pd.read_csv(runs_dir / "ba_39_small_gt" / "history_dfs" / de_csv_name)
nr_de_gt = len(gt_df["Protein ID"].unique())

line_graph = create_line_plot(
    "Number of Differentially Expressed Proteins",
    "method combinations",
    "number of proteins",
    nr_de_gt,
)

for p in probabilities:
    # Fresh dict per probability so a line can never show counts carried
    # over from a previous probability.
    nr_de_proteins = {}
    for norm_method in norm_methods:
        for imp_method in imp_methods:
            norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)

            run_dir = runs_dir / str(p) / f"ba_39_{p}_{norm_name}_{imp_name}{add_info}"
            pred_positives_df = pd.read_csv(run_dir / "history_dfs" / de_csv_name)
            pred_positives_protein_list = pred_positives_df["Protein ID"].unique().tolist()

            # Short x-axis label: the strategy replaces the imputation
            # method name when one is configured.
            if add_info != "":
                name = f"{norm_name}_{add_info}"
            else:
                name = f"{norm_name}_{imp_name}"
            nr_de_proteins[name] = len(pred_positives_protein_list)
    line_graph.plot(list(nr_de_proteins.keys()), list(nr_de_proteins.values()), label=str(p))

line_graph.legend(loc="upper center", ncols=3)
# savefig does not create missing folders.
plots_dir.mkdir(parents=True, exist_ok=True)
line_graph.savefig(plots_dir / "nr_de_proteins.png", format="png", bbox_inches='tight')
line_graph.close()



### ba39

In [3]:
# (normalisation, imputation) pairs evaluated on the full data set. Each
# normalisation entry also carries an "fc_threshold" value.
# Every dict is written out separately so no two pairs share an object.
method_combinations = [
    (
        dict(method="z_score", parameters={}, fc_threshold=0.3157),
        dict(method="knn", parameters=dict(number_of_neighbours=5)),
    ),
    (
        dict(method="z_score", parameters={}, fc_threshold=0.3157),
        dict(method="simple_imputation_per_protein", parameters=dict(strategy="median")),
    ),
    (
        dict(method="totalsum", parameters={}, fc_threshold=0.0000638564),
        dict(method="knn", parameters=dict(number_of_neighbours=5)),
    ),
    (
        dict(method="median", parameters=dict(percentile=0.5), fc_threshold=0.0309),
        dict(method="knn", parameters=dict(number_of_neighbours=5)),
    ),
]

In [6]:
# Load the differentially expressed proteins of the "ba_39_gt" run
# (presumably the ground truth for the full data set -- confirm) and show
# how many distinct protein IDs it contains.
# The path is built with pathlib so it does not depend on Windows separators.
gt_csv = (
    Path(PROJECT_PATH)
    / "user_data"
    / "runs"
    / "ba_39_gt"
    / "history_dfs"
    / "6-data_analysis-differential_expression-t_test-de_proteins_df.csv"
)
pred_positives_df = pd.read_csv(gt_csv)
pred_positives_protein_list = pred_positives_df["Protein ID"].unique().tolist()
len(pred_positives_protein_list)

174

In [11]:
# Count the distinct differentially expressed proteins for each selected
# method combination, then save the counts as a bar chart.
runs_dir = Path(PROJECT_PATH) / "user_data" / "runs"
plots_dir = Path(PROJECT_PATH) / "user_data" / "plots" / "nr_de_proteins"
de_csv_name = "7-data_analysis-differential_expression-t_test-de_proteins_df.csv"

nr_de_proteins = {}
for norm_method, imp_method in method_combinations:
    norm_name, imp_name, add_info = extract_method_info(norm_method, imp_method)
    # `run_id` rather than `id`, to avoid shadowing the builtin.
    run_id = f"{norm_name}_{imp_name}{add_info}"

    pred_positives_df = pd.read_csv(runs_dir / f"ba_39_{run_id}" / "history_dfs" / de_csv_name)
    pred_positives_protein_list = pred_positives_df["Protein ID"].unique().tolist()

    # Short x-axis label: the strategy replaces the imputation method name
    # when one is configured.
    if add_info != "":
        name = f"{norm_name}_{add_info}"
    else:
        name = f"{norm_name}_{imp_name}"
    nr_de_proteins[name] = len(pred_positives_protein_list)

plt.bar(list(nr_de_proteins.keys()), list(nr_de_proteins.values()))
plt.xticks(rotation=90)
plt.title("Number of Differentially Expressed Proteins", fontsize=14)
plt.xlabel("method combination", fontsize=14)
plt.ylabel("number of proteins", fontsize=14)
# savefig does not create missing folders.
plots_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(plots_dir / "nr_de_proteins_ba39.png", format="png", bbox_inches='tight')
plt.close()

In [12]:
# Display the label -> protein-count mapping filled by the bar-chart cell.
nr_de_proteins

{'z_score_knn': 122,
 'z_score_median': 91,
 'totalsum_knn': 4,
 'median_knn': 167}

In [19]:
# Compare the wide-format shape and the share of missing values before
# (df_0) and after (df_3) the steps applied in the "ba_39_gt" run.
dataframes_dir = Path(PROJECT_PATH) / "user_data" / "runs" / "ba_39_gt" / "dataframes"
orig_df = pd.read_csv(dataframes_dir / "df_0.csv")
filtered_df = pd.read_csv(dataframes_dir / "df_3.csv")

for long_df in (orig_df, filtered_df):
    # The denominator comes from the wide shape itself, not from hard-coded
    # row/column counts that go stale when the input data changes.
    n_rows, n_cols = long_to_wide(long_df).shape
    print((n_rows, n_cols))
    # NOTE(review): NaNs are counted over every column of the long-format
    # frame, not only the intensity column -- confirm this is intended.
    print(long_df.isna().sum().sum() / (n_rows * n_cols))

(143, 2766)
0.5016964236053173
(105, 1492)
0.12785012128175666
