# Description

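This notebook loads the drug-disease prediction scores stored under `predictions/dotprod_neg` (S-MultiXcan files are excluded because they have no direction of effect), keeps the trait/drug pairs that are present in the PharmacotherapyDB gold standard, and converts the scores of each file into ranks. It then saves the combined predictions, aggregates them per trait/drug/method, and evaluates both the per-file and the aggregated predictions with ROC AUC and average precision.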

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import conf

# Settings

In [3]:
# Output folder taken from the project configuration (key "DRUG_DISEASE_ANALYSES")
OUTPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"]
display(OUTPUT_DIR)
# create the folder, including missing parents; do nothing if it already exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses')

In [4]:
# Folder the prediction files (*.h5) are read from below
OUTPUT_PREDICTIONS_DIR = OUTPUT_DIR / "predictions" / "dotprod_neg"
display(OUTPUT_PREDICTIONS_DIR)
OUTPUT_PREDICTIONS_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg')

# Load PharmacotherapyDB gold standard

In [5]:
# Read the gold-standard table from the pickle stored in OUTPUT_DIR
gold_standard = pd.read_pickle(OUTPUT_DIR / "gold_standard.pkl")

In [6]:
gold_standard.shape

(998, 3)

In [7]:
gold_standard.head()

Unnamed: 0,trait,drug,true_class
0,DOID:10652,DB00843,1
1,DOID:10652,DB00674,1
2,DOID:10652,DB01043,1
3,DOID:10652,DB00989,1
4,DOID:10652,DB00810,0


In [8]:
gold_standard["true_class"].value_counts()

1    755
0    243
Name: true_class, dtype: int64

In [9]:
gold_standard["true_class"].value_counts(normalize=True)

1    0.756513
0    0.243487
Name: true_class, dtype: float64

In [10]:
# Distinct values of the "trait" column of the gold standard
# NOTE(review): not referenced by any later cell visible here - confirm it is still needed
doids_in_gold_standard = set(gold_standard["trait"])

# Load drug-disease predictions

In [11]:
from collections import defaultdict

In [12]:
# Path.glob yields entries in an arbitrary, filesystem-dependent order; sort them
# so the order in which files are loaded (and their rows concatenated) is the same
# on every run and every machine
current_prediction_files = sorted(OUTPUT_PREDICTIONS_DIR.glob("*.h5"))
display(len(current_prediction_files))

102

In [13]:
current_prediction_files[:5]

[PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/smultixcan-mashr-zscores-data-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/smultixcan-mashr-zscores-projection-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-data-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Visceral_Omentum-data-prediction_scores.h5')]

In [14]:
# One table per prediction file; they are concatenated in a later cell
predictions = []

for f in tqdm(current_prediction_files, ncols=100):
    # exclude S-MultiXcan results, since they have no direction of effect
    if f.name.startswith("smultixcan-"):
        continue

    # keep only the trait/drug pairs that are also in the gold standard; the
    # inner merge attaches the remaining gold-standard columns to each row
    prediction_data = pd.read_hdf(f, key="prediction")
    prediction_data = pd.merge(
        prediction_data, gold_standard, on=["trait", "drug"], how="inner"
    )

    metadata = pd.read_hdf(f, key="metadata")

    # replace the raw scores with their ranks among the rows kept for this file
    prediction_data["score"] = prediction_data["score"].rank()
    prediction_data["trait"] = prediction_data["trait"].astype("category")
    prediction_data["drug"] = prediction_data["drug"].astype("category")
    # use bracket access for the metadata entries: attribute access
    # (metadata.data) can be shadowed by pandas' own attributes of the same name
    prediction_data = prediction_data.assign(method=metadata["method"])
    prediction_data = prediction_data.assign(data=metadata["data"])

    predictions.append(prediction_data)

100%|█████████████████████████████████████████████████████████████| 102/102 [00:14<00:00,  7.11it/s]


In [15]:
# Stack the per-file tables into a single DataFrame with a fresh 0..n-1 index.
# NOTE(review): this rebinds `predictions` from a list to a DataFrame, so the cell
# cannot be re-run on its own without re-running the loading loop first
predictions = pd.concat(predictions, ignore_index=True)

In [16]:
predictions.shape

(64600, 6)

In [17]:
predictions.head()

Unnamed: 0,trait,drug,score,true_class,method,data
0,DOID:0050741,DB00215,251.0,1,Gene-based,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
1,DOID:0050741,DB00704,307.0,1,Gene-based,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
2,DOID:0050741,DB00822,94.0,1,Gene-based,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
3,DOID:10283,DB00014,69.0,1,Gene-based,spredixcan-mashr-zscores-Adipose_Subcutaneous-...
4,DOID:10283,DB00175,254.0,0,Gene-based,spredixcan-mashr-zscores-Adipose_Subcutaneous-...


## Testing

In [18]:
# all prediction tables should have the same shape
predictions_shape = (
    predictions.groupby(["method", "data"]).apply(lambda x: x.shape).unique()
)
display(predictions_shape)
assert predictions_shape.shape[0] == 1

array([(646, 6)], dtype=object)

## Save

In [19]:
# Destination of the full (non-aggregated) predictions table
output_file = (OUTPUT_DIR / "predictions" / "predictions_results.pkl").resolve()
display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/predictions_results.pkl')

In [20]:
predictions.to_pickle(output_file)

# Aggregate predictions

In [21]:
def _reduce(x):
    return pd.Series(
        {
            "score": x["score"].max(),
            "true_class": x["true_class"].unique()[0]
            if x["true_class"].unique().shape[0] == 1
            else None,
        }
    )

In [22]:
# Collapse the per-file predictions into one row per (trait, drug, method).
# Despite the variable name, _reduce keeps the maximum score of each group.
# "trait" and "drug" are categorical, so observed=True restricts the groupby to
# the combinations that actually occur instead of every category combination;
# dropna() then removes any group whose true_class could not be determined.
predictions_avg = (
    predictions.groupby(["trait", "drug", "method"], observed=True)
    .apply(_reduce)
    .dropna()
    .sort_index()
    .reset_index()
)

In [23]:
predictions_avg.shape

(1292, 5)

In [24]:
# predictions_avg has one row per (trait, drug, method), so its number of rows
# should be the number of rows of a single prediction table times the number of
# methods (instead of assuming there are exactly two methods)
assert predictions_avg.shape[0] == (
    predictions_shape[0][0] * predictions["method"].nunique()
)

In [25]:
# the aggregated table must not contain any missing value
assert not predictions_avg.isna().values.any()

In [26]:
predictions_avg.head()

Unnamed: 0,trait,drug,method,score,true_class
0,DOID:0050741,DB00215,Gene-based,379.0,1.0
1,DOID:0050741,DB00215,Module-based,541.0,1.0
2,DOID:0050741,DB00704,Gene-based,483.0,1.0
3,DOID:0050741,DB00704,Module-based,562.0,1.0
4,DOID:0050741,DB00822,Gene-based,579.0,1.0


## Save

In [27]:
# Destination of the aggregated predictions table
output_file = (
    OUTPUT_DIR / "predictions" / "predictions_results_aggregated.pkl"
).resolve()
display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/predictions/predictions_results_aggregated.pkl')

In [28]:
predictions_avg.to_pickle(output_file)

# ROC

In [29]:
from sklearn.metrics import roc_auc_score

In [30]:
# ROC AUC of every (method, data) prediction table, summarized per method
(
    predictions
    .groupby(["method", "data"])
    .apply(lambda group: roc_auc_score(group["true_class"], group["score"]))
    .groupby("method")
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gene-based,50.0,0.537465,0.021894,0.488657,0.525222,0.539817,0.549346,0.591238
Module-based,50.0,0.550209,0.022309,0.488396,0.536499,0.552103,0.564925,0.599869


In [31]:
# ROC AUC of the aggregated predictions: a single value per method
(
    predictions_avg
    .groupby(["method"])
    .apply(lambda group: roc_auc_score(group["true_class"], group["score"]))
    .groupby("method")
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gene-based,1.0,0.573384,,0.573384,0.573384,0.573384,0.573384,0.573384
Module-based,1.0,0.61486,,0.61486,0.61486,0.61486,0.61486,0.61486


# PR

In [32]:
from sklearn.metrics import average_precision_score

In [33]:
# Average precision of every (method, data) prediction table, summarized per method
(
    predictions
    .groupby(["method", "data"])
    .apply(lambda group: average_precision_score(group["true_class"], group["score"]))
    .groupby("method")
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gene-based,50.0,0.821235,0.012008,0.791883,0.815912,0.823289,0.826459,0.845993
Module-based,50.0,0.825101,0.010762,0.801752,0.817628,0.825254,0.832694,0.850523


In [34]:
# Average precision of the aggregated predictions: a single value per method
(
    predictions_avg
    .groupby(["method"])
    .apply(lambda group: average_precision_score(group["true_class"], group["score"]))
    .groupby("method")
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gene-based,1.0,0.841137,,0.841137,0.841137,0.841137,0.841137,0.841137
Module-based,1.0,0.851117,,0.851117,0.851117,0.851117,0.851117,0.851117
