# Description

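This notebook loads the drug-disease prediction files generated for each tissue and each `n_top_genes` threshold (gene-based and module-based methods), keeps the drug-disease pairs present in the PharmacotherapyDB gold standard, and saves the combined table. It then aggregates the ranked scores of each drug-disease pair per method (mean within each tissue, then maximum across tissues) and reports ROC AUC and average precision for both the individual and the aggregated predictions.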

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import conf

# Settings

In [3]:
# Expected dimensions of the prediction results; the assertions in this
# notebook are written in terms of these values.
# Number of tissues with prediction files
N_TISSUES = 49
# N_TISSUES = 1
# Number of n_top_genes thresholds evaluated for each method and tissue
N_THRESHOLDS = 5
# Number of drug-disease pairs expected in each prediction table once it has
# been merged with the gold standard
N_PREDICTIONS = 646

In [4]:
# Base directory of the drug-disease analyses, taken from the project
# configuration; it is created if it does not exist yet.
OUTPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"]
display(OUTPUT_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses')

In [5]:
# Directory with the prediction files (*.h5) that are loaded below; it is
# created if it does not exist yet.
OUTPUT_PREDICTIONS_DIR = Path(OUTPUT_DIR, "predictions", "dotprod_neg")
display(OUTPUT_PREDICTIONS_DIR)
OUTPUT_PREDICTIONS_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg')

# Load PharmacotherapyDB gold standard

In [6]:
gold_standard = pd.read_pickle(
    Path(OUTPUT_DIR, "gold_standard.pkl"),
)

In [7]:
gold_standard.shape

(998, 3)

In [8]:
gold_standard.head()

Unnamed: 0,trait,drug,true_class
0,DOID:10652,DB00843,1
1,DOID:10652,DB00674,1
2,DOID:10652,DB01043,1
3,DOID:10652,DB00989,1
4,DOID:10652,DB00810,0


In [9]:
gold_standard["true_class"].value_counts()

1    755
0    243
Name: true_class, dtype: int64

In [10]:
gold_standard["true_class"].value_counts(normalize=True)

1    0.756513
0    0.243487
Name: true_class, dtype: float64

In [11]:
# doids_in_gold_standard = set(gold_standard["trait"])

# Load drug-disease predictions

In [12]:
from collections import defaultdict

In [13]:
# Collect every prediction file. The files are sorted because globbing gives
# no guarantee about the order of its results; a fixed order keeps the row
# order of the combined (and saved) predictions table identical across runs.
current_prediction_files = sorted(OUTPUT_PREDICTIONS_DIR.glob("*.h5"))
display(len(current_prediction_files))

# two methods (single-gene and module-based), each with one file per
# tissue/threshold combination
assert len(current_prediction_files) == 2 * (N_TISSUES * N_THRESHOLDS)

490

In [14]:
current_prediction_files[:5]

[PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-data-all_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-data-top_100_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-data-top_250_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-data-top_500_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-data-top_50_genes-prediction_scores.h5')]

In [15]:
predictions = []

for f in tqdm(current_prediction_files, ncols=100):
    # FIXME: it shouldn't be necessary to include this anymore
    # S-MultiXcan results have no direction of effect, so those files are skipped
    if f.name.startswith("smultixcan-"):
        continue

    metadata = pd.read_hdf(f, key="metadata")

    # keep only the drug-disease pairs that are present in the gold standard
    prediction_data = pd.read_hdf(f, key="prediction").merge(
        gold_standard, on=["trait", "drug"], how="inner"
    )

    # replace the raw scores by their ranks within this file, and attach the
    # file-level metadata (first row of the metadata table) to every row;
    # repeated string columns are stored as categories
    prediction_data = prediction_data.assign(
        score=prediction_data["score"].rank(),
        trait=prediction_data["trait"].astype("category"),
        drug=prediction_data["drug"].astype("category"),
        method=metadata.method.values[0],
        n_top_genes=metadata.n_top_genes.values[0],
        data=metadata.data.values[0],
    ).astype({"method": "category", "data": "category"})

    predictions.append(prediction_data)

100%|█████████████████████████████████████████████████████████████| 490/490 [02:42<00:00,  3.02it/s]


In [16]:
# Every loaded table must contain exactly N_PREDICTIONS drug-disease pairs.
# The built-in `all` is required here: `np.all` does not iterate a generator
# (it only sees one truthy object), so with it this check could never fail.
assert all(pred.shape[0] == N_PREDICTIONS for pred in predictions)

In [17]:
# Stack the per-file tables into a single data frame with a fresh index.
# NOTE: this rebinds `predictions` from a list of data frames to a DataFrame,
# so re-running this cell on its own fails (`pd.concat` expects a collection
# of pandas objects); re-run the loading loop first.
predictions = pd.concat(predictions, ignore_index=True)

In [18]:
display(predictions.shape)

assert predictions.shape[0] == 2 * (N_TISSUES * N_THRESHOLDS) * N_PREDICTIONS

(316540, 7)

In [19]:
predictions.head()

Unnamed: 0,trait,drug,score,true_class,method,n_top_genes,data
0,DOID:0050741,DB00215,146.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
1,DOID:0050741,DB00704,465.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
2,DOID:0050741,DB00822,530.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
3,DOID:10283,DB00014,124.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
4,DOID:10283,DB00175,309.0,0,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...


In [20]:
assert not predictions.isna().any().any()

In [21]:
# number of rows contributed by each method
_tmp = predictions["method"].value_counts()
display(_tmp)

# each method should have one table of N_PREDICTIONS rows for every
# tissue/threshold combination
assert _tmp.loc["Gene-based"] == N_TISSUES * N_THRESHOLDS * N_PREDICTIONS
assert _tmp.loc["Module-based"] == N_TISSUES * N_THRESHOLDS * N_PREDICTIONS

Gene-based      158270
Module-based    158270
Name: method, dtype: int64

In [22]:
# number of non-missing values per column for each method/n_top_genes combination
_tmp = predictions.groupby(["method", "n_top_genes"]).count()
display(_tmp)

# each combination should have one table of N_PREDICTIONS rows per tissue
assert np.all(_tmp == N_TISSUES * N_PREDICTIONS)

Unnamed: 0_level_0,Unnamed: 1_level_0,trait,drug,score,true_class,data
method,n_top_genes,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gene-based,-1.0,31654,31654,31654,31654,31654
Gene-based,50.0,31654,31654,31654,31654,31654
Gene-based,100.0,31654,31654,31654,31654,31654
Gene-based,250.0,31654,31654,31654,31654,31654
Gene-based,500.0,31654,31654,31654,31654,31654
Module-based,-1.0,31654,31654,31654,31654,31654
Module-based,5.0,31654,31654,31654,31654,31654
Module-based,10.0,31654,31654,31654,31654,31654
Module-based,25.0,31654,31654,31654,31654,31654
Module-based,50.0,31654,31654,31654,31654,31654


In [23]:
# FIXME: add this to the 011 notebooks... or maybe it's fine here (after submitting draft)
def _get_tissue(x):
    if x.endswith("-projection"):
        return x.split("spredixcan-mashr-zscores-")[1].split("-projection")[0]
    else:
        return x.split("spredixcan-mashr-zscores-")[1].split("-data")[0]


predictions = predictions.assign(tissue=predictions["data"].apply(_get_tissue))

# # FIXME: remove or better document; here just for the most_signif version
# predictions = predictions.assign(tissue="most_signif")

In [24]:
predictions.head()

Unnamed: 0,trait,drug,score,true_class,method,n_top_genes,data,tissue
0,DOID:0050741,DB00215,146.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...,Adipose_Subcutaneous
1,DOID:0050741,DB00704,465.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...,Adipose_Subcutaneous
2,DOID:0050741,DB00822,530.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...,Adipose_Subcutaneous
3,DOID:10283,DB00014,124.0,1,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...,Adipose_Subcutaneous
4,DOID:10283,DB00175,309.0,0,Gene-based,-1.0,spredixcan-mashr-zscores-Adipose_Subcutaneous-...,Adipose_Subcutaneous


In [25]:
# number of non-missing values per column for each method/tissue combination
_tmp = predictions.groupby(["method", "tissue"]).count()
display(_tmp)

# each method/tissue combination should have one table of N_PREDICTIONS rows
# per threshold
assert np.all(_tmp.loc["Gene-based"] == (N_PREDICTIONS * N_THRESHOLDS))
assert np.all(_tmp.loc["Module-based"] == (N_PREDICTIONS * N_THRESHOLDS))

Unnamed: 0_level_0,Unnamed: 1_level_0,trait,drug,score,true_class,n_top_genes,data
method,tissue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Gene-based,Adipose_Subcutaneous,3230,3230,3230,3230,3230,3230
Gene-based,Adipose_Visceral_Omentum,3230,3230,3230,3230,3230,3230
Gene-based,Adrenal_Gland,3230,3230,3230,3230,3230,3230
Gene-based,Artery_Aorta,3230,3230,3230,3230,3230,3230
Gene-based,Artery_Coronary,3230,3230,3230,3230,3230,3230
...,...,...,...,...,...,...,...
Module-based,Testis,3230,3230,3230,3230,3230,3230
Module-based,Thyroid,3230,3230,3230,3230,3230,3230
Module-based,Uterus,3230,3230,3230,3230,3230,3230
Module-based,Vagina,3230,3230,3230,3230,3230,3230


## Testing

In [26]:
# all prediction tables (one per method/n_top_genes/tissue combination)
# should have the same shape; `predictions_shape` holds the distinct shapes
# found and is reused later to check the size of the aggregated table
predictions_shape = (
    predictions.groupby(["method", "n_top_genes", "tissue"])
    .apply(lambda x: x.shape)
    .unique()
)
display(predictions_shape)

# a single distinct shape is expected, with N_PREDICTIONS rows
assert predictions_shape.shape[0] == 1
assert predictions_shape[0][0] == N_PREDICTIONS

array([(646, 8)], dtype=object)

## Save

In [27]:
output_file = Path(OUTPUT_DIR, "predictions", "predictions_results.pkl").resolve()
display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions_using_each_tissue_bak/predictions_results.pkl')

In [28]:
predictions.to_pickle(output_file)

# Aggregate predictions

In [150]:
def _reduce_mean(x):
    return pd.Series(
        {
            "score": x["score"].mean(),
            "true_class": x["true_class"].unique()[0]
            #             if x["true_class"].unique().shape[0] == 1
            #             else None,
        }
    )


def _reduce_max(x):
    return pd.Series(
        {
            "score": x["score"].max(),
            "true_class": x["true_class"].unique()[0]
            #             if x["true_class"].unique().shape[0] == 1
            #             else None,
        }
    )


# def _reduce_best(x):
# #     assert x["true_class"].unique() == FINISH
#     x_stand = (x["score"] - x["score"].mean()) / x["score"].std()

#     x_max_score = x_stand.max()
#     x_min_score = x_stand.min()

#     # select best score
#     x_selected = x["score"].max()
#     if abs(x_min_score) > abs(x_max_score):
#         x_selected = x["score"].min()

#     return pd.Series(
#         {
#             "score": x_selected,
#             "true_class": x["true_class"].unique()[0]
#             #             if x["true_class"].unique().shape[0] == 1
#             #             else None,
#         }
#     )

In [151]:
# Aggregate the ranked scores of each drug-disease pair and method in two steps:
#  1) within each tissue, average the scores of the rows in the group
#     (the rows of a trait/drug/method/tissue group differ in n_top_genes);
#  2) across tissues, keep the maximum of those per-tissue averages.
# The result has one row per trait/drug/method.
predictions_avg = (
    predictions.groupby(["trait", "drug", "method", "tissue"])
    #     predictions.groupby(["trait", "drug", "method"])
    .apply(_reduce_mean)
    # NOTE(review): presumably removes rows for key combinations without data
    # (e.g. produced by categorical group keys) -- confirm
    .dropna()
    .groupby(["trait", "drug", "method"])
    .apply(_reduce_max)
    .dropna()
    .sort_index()
    .reset_index()
)

In [152]:
# predictions_avg should have twice as many rows as a single prediction table,
# since it contains one row per drug-disease pair for each of the two methods
display(predictions_avg.shape)
assert predictions_avg.shape[0] == int(predictions_shape[0][0] * 2)

(1292, 5)

In [153]:
assert predictions_avg.dropna().shape == predictions_avg.shape

In [154]:
predictions_avg.head()

Unnamed: 0,trait,drug,method,score,true_class
0,DOID:0050741,DB00215,Gene-based,460.0,1.0
1,DOID:0050741,DB00215,Module-based,437.8,1.0
2,DOID:0050741,DB00704,Gene-based,544.4,1.0
3,DOID:0050741,DB00704,Module-based,562.8,1.0
4,DOID:0050741,DB00822,Gene-based,598.2,1.0


## Save

In [155]:
output_file = Path(
    OUTPUT_DIR, "predictions", "predictions_results_aggregated.pkl"
).resolve()
display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions_using_each_tissue_bak/predictions_results_aggregated.pkl')

In [156]:
predictions_avg.to_pickle(output_file)

# ROC

In [157]:
from sklearn.metrics import roc_auc_score

## Predictions

In [158]:
# by method/n_top_genes: ROC AUC of every individual prediction table
# (one per method/tissue/n_top_genes), then summary statistics across tissues
predictions.groupby(["method", "tissue", "n_top_genes"]).apply(
    lambda x: roc_auc_score(x["true_class"], x["score"])
).groupby(["method", "n_top_genes"]).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
method,n_top_genes,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gene-based,-1.0,49.0,0.532496,0.017163,0.492209,0.518996,0.53334,0.542742,0.573357
Gene-based,50.0,49.0,0.53165,0.021763,0.486716,0.514991,0.530862,0.546968,0.570521
Gene-based,100.0,49.0,0.538694,0.021833,0.471134,0.525866,0.540072,0.552516,0.577018
Gene-based,250.0,49.0,0.544592,0.020689,0.500647,0.534772,0.542894,0.557347,0.611515
Gene-based,500.0,49.0,0.541715,0.018178,0.500922,0.529472,0.538929,0.554759,0.593922
Module-based,-1.0,49.0,0.55097,0.021875,0.488396,0.53685,0.552254,0.565001,0.599869
Module-based,5.0,49.0,0.546616,0.022894,0.483261,0.528784,0.549019,0.561422,0.605431
Module-based,10.0,49.0,0.549093,0.023742,0.478016,0.536479,0.546349,0.560335,0.612864
Module-based,25.0,49.0,0.546519,0.02897,0.490034,0.523759,0.544105,0.567382,0.611074
Module-based,50.0,49.0,0.546046,0.026593,0.485353,0.531936,0.545826,0.562413,0.596345


In [159]:
# by method/tissue: ROC AUC of every individual prediction table
# (one per method/tissue/n_top_genes), then summary statistics across the
# n_top_genes thresholds
predictions.groupby(["method", "tissue", "n_top_genes"]).apply(
    lambda x: roc_auc_score(x["true_class"], x["score"])
).groupby(["method", "tissue"]).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
method,tissue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gene-based,Adipose_Subcutaneous,5.0,0.517983,0.017567,0.501542,0.504625,0.510957,0.531000,0.541792
Gene-based,Adipose_Visceral_Omentum,5.0,0.516053,0.010469,0.504873,0.506332,0.518501,0.520662,0.529899
Gene-based,Adrenal_Gland,5.0,0.507062,0.007235,0.500647,0.500922,0.505740,0.509966,0.518033
Gene-based,Artery_Aorta,5.0,0.530601,0.022785,0.492209,0.528743,0.539631,0.542315,0.550107
Gene-based,Artery_Coronary,5.0,0.550239,0.019450,0.534194,0.536437,0.538667,0.564877,0.577018
...,...,...,...,...,...,...,...,...,...
Module-based,Testis,5.0,0.552882,0.016990,0.534703,0.541737,0.548937,0.561422,0.577610
Module-based,Thyroid,5.0,0.559365,0.008815,0.550933,0.551979,0.557334,0.565001,0.571581
Module-based,Uterus,5.0,0.537946,0.017693,0.516464,0.523759,0.539342,0.554856,0.555310
Module-based,Vagina,5.0,0.542282,0.017486,0.514660,0.540980,0.541779,0.553452,0.560541


## Predictions avg

In [160]:
predictions_avg.head()

Unnamed: 0,trait,drug,method,score,true_class
0,DOID:0050741,DB00215,Gene-based,460.0,1.0
1,DOID:0050741,DB00215,Module-based,437.8,1.0
2,DOID:0050741,DB00704,Gene-based,544.4,1.0
3,DOID:0050741,DB00704,Module-based,562.8,1.0
4,DOID:0050741,DB00822,Gene-based,598.2,1.0


In [161]:
# ROC AUC of the aggregated scores, computed separately for each method
predictions_avg.groupby(["method"]).apply(
    lambda x: roc_auc_score(x["true_class"], x["score"])
)

method
Gene-based      0.577831
Module-based    0.632101
dtype: float64

# PR

In [162]:
from sklearn.metrics import average_precision_score

## Predictions

In [163]:
# by method/n_top_genes: average precision of every individual prediction
# table (one per method/tissue/n_top_genes), then summary statistics across
# tissues
predictions.groupby(["method", "tissue", "n_top_genes"]).apply(
    lambda x: average_precision_score(x["true_class"], x["score"])
).groupby(["method", "n_top_genes"]).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
method,n_top_genes,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gene-based,-1.0,49.0,0.819652,0.00944,0.799782,0.813899,0.820518,0.826926,0.837497
Gene-based,50.0,49.0,0.818973,0.011333,0.797444,0.812942,0.818782,0.825572,0.846052
Gene-based,100.0,49.0,0.822515,0.010255,0.795834,0.81696,0.82132,0.829157,0.843993
Gene-based,250.0,49.0,0.826079,0.010757,0.805038,0.820633,0.8258,0.832956,0.858472
Gene-based,500.0,49.0,0.824255,0.008704,0.80921,0.817748,0.825342,0.829961,0.851972
Module-based,-1.0,49.0,0.825578,0.010326,0.803155,0.818189,0.825569,0.832833,0.850523
Module-based,5.0,49.0,0.824127,0.011438,0.79578,0.815947,0.827801,0.8319,0.851023
Module-based,10.0,49.0,0.824383,0.0125,0.795728,0.817049,0.823271,0.831864,0.849924
Module-based,25.0,49.0,0.821462,0.01499,0.794916,0.810213,0.820894,0.830868,0.853334
Module-based,50.0,49.0,0.822552,0.015475,0.778559,0.815136,0.824726,0.831564,0.851906


In [164]:
# by method/tissue: average precision of every individual prediction table
# (one per method/tissue/n_top_genes), then summary statistics across the
# n_top_genes thresholds
predictions.groupby(["method", "tissue", "n_top_genes"]).apply(
    lambda x: average_precision_score(x["true_class"], x["score"])
).groupby(["method", "tissue"]).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
method,tissue,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gene-based,Adipose_Subcutaneous,5.0,0.811499,0.009896,0.797558,0.807947,0.810087,0.819524,0.822379
Gene-based,Adipose_Visceral_Omentum,5.0,0.811665,0.005740,0.804858,0.805990,0.814885,0.816269,0.816321
Gene-based,Adrenal_Gland,5.0,0.813936,0.005540,0.809210,0.810372,0.810965,0.816784,0.822350
Gene-based,Artery_Aorta,5.0,0.822988,0.011146,0.803923,0.824227,0.825382,0.828809,0.832597
Gene-based,Artery_Coronary,5.0,0.829851,0.009367,0.821312,0.822853,0.825572,0.837073,0.842447
...,...,...,...,...,...,...,...,...,...
Module-based,Testis,5.0,0.829219,0.009344,0.816903,0.825178,0.828267,0.833967,0.841779
Module-based,Thyroid,5.0,0.829913,0.004428,0.822062,0.831268,0.831579,0.831826,0.832833
Module-based,Uterus,5.0,0.812736,0.012707,0.798563,0.800676,0.814974,0.822915,0.826553
Module-based,Vagina,5.0,0.811183,0.013788,0.792978,0.805208,0.810304,0.817441,0.829982


## Predictions avg

In [165]:
# average precision of the aggregated scores, computed separately for each method
predictions_avg.groupby(["method"]).apply(
    lambda x: average_precision_score(x["true_class"], x["score"])
)

method
Gene-based      0.846631
Module-based    0.857572
dtype: float64