# Description

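This notebook reads the *full predictions* results (for all traits in PhenomeXcan, regardless of whether they have a DOID mapping) generated by the `011-prediction-*` notebooks, and saves them for later use. For each tissue, the prediction scores of each file are converted to ranks, and the ranks are averaged across the gene thresholds before saving.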

# Modules loading

In [1]:
# Enable IPython's autoreload extension (mode 2) so that changes to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from data.hdf5 import simplify_trait_fullcode
import conf

# Settings

In [3]:
# Expected number of tissues and of gene thresholds. These values are used in
# the sanity-check assertions of this notebook (for instance, the number of
# prediction files must be N_TISSUES * N_THRESHOLDS).
N_TISSUES = 49
N_THRESHOLDS = 5

In [4]:
# Base folder with the LINCS drug-disease results; it must already exist.
INPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"] / "lincs"
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs')

In [5]:
# Folder with the prediction files read by this notebook. It is an input, so
# it must already exist: fail here instead of silently creating an empty
# folder (which would only postpone the error to the file-count check).
INPUT_PREDICTIONS_DIR = Path(INPUT_DIR, "predictions", "dotprod_neg")
display(INPUT_PREDICTIONS_DIR)
assert INPUT_PREDICTIONS_DIR.exists()

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions/dotprod_neg')

In [6]:
# Folder where the combined predictions are written; it is created if missing.
OUTPUT_DIR = Path(INPUT_DIR, "predictions")
display(OUTPUT_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions')

In [6]:
# Path of the HDF5 store written below (one key per tissue).
# NOTE(review): the file name has no extension (e.g. ".h5") — confirm this is
# intended and matches what downstream readers expect.
OUTPUT_FILENAME = Path(OUTPUT_DIR, "predictions")
display(OUTPUT_FILENAME)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions')

# Load drug-disease predictions

In [7]:
from collections import defaultdict

In [8]:
# Collect, in sorted order, every HDF5 file of the predictions folder whose
# name contains "-projection-".

current_prediction_files = sorted(
    h5_file
    for h5_file in INPUT_PREDICTIONS_DIR.glob("*.h5")
    if "-projection-" in h5_file.name
)
display(len(current_prediction_files))

# one file is expected for each (tissue, threshold) combination
assert (N_TISSUES * N_THRESHOLDS) == len(current_prediction_files)

245

In [9]:
# peek at the first prediction file paths
current_prediction_files[:10]

[PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-all_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-top_10_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-top_25_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-top_50_genes-prediction_scores.h5'),
 PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subc

In [10]:
def _get_tissue(x):
    """
    It extracts the tissue name from a filename.
    """
    if x.endswith("-projection"):
        return x.split("spredixcan-mashr-zscores-")[1].split("-projection")[0]
    else:
        return x.split("spredixcan-mashr-zscores-")[1].split("-data")[0]

In [11]:
# Scan the metadata of every prediction file to collect the distinct tissue
# names and the distinct `n_top_genes` values.

all_tissues, all_methods = set(), set()

for f in tqdm(current_prediction_files, ncols=100):
    metadata = pd.read_hdf(f, key="metadata")

    # tissue name, parsed from the `data` field of the metadata
    _data = metadata.data.values[0]
    _tissue = _get_tissue(_data)
    all_tissues |= {_tissue}

    # `n_top_genes` value recorded for this file
    _n_top_genes = metadata.n_top_genes.values[0]
    all_methods |= {_n_top_genes}

100%|█████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 17.71it/s]


In [12]:
# one distinct `n_top_genes` value is expected per threshold
assert len(all_methods) == N_THRESHOLDS
display(all_methods)

{-1.0, 5, 10, 25, 50}

In [13]:
# from here on, tissues are kept as an alphabetically sorted list
all_tissues = sorted(all_tissues)

In [14]:
# every expected tissue must be present
assert len(all_tissues) == N_TISSUES

In [15]:
# Use the first prediction file as the reference for the lists of traits and
# drugs (unique values, in order of first appearance).
_tmp_df = pd.read_hdf(current_prediction_files[0], key="full_prediction")
all_traits = _tmp_df["trait"].unique().tolist()
all_drugs = _tmp_df["drug"].unique().tolist()

In [16]:
# long format: one row per trait/drug pair with its prediction score
_tmp_df.head()

Unnamed: 0,trait,drug,score
0,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00014,-0.015198
1,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00091,0.052318
2,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00121,0.00714
3,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00130,-0.019694
4,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00131,0.018786


In [17]:
# expected number of traits (hard-coded sanity check)
assert len(all_traits) == 4091

In [18]:
# expected number of drugs (hard-coded sanity check)
assert len(all_drugs) == 1170

## Create predictions dataframe

In [28]:
# Iterate over each tissue and combine its prediction files into a single
# trait-by-drug matrix, which is saved under one key of the output HDF5 store.
#
# Each prediction file (.h5) appears to hold the predictions for one tissue and
# one gene threshold (N_TISSUES * N_THRESHOLDS files are expected), with one
# score per trait/drug pair under the "full_prediction" key.
# For each tissue, the scores of each file are converted to ranks and the ranks
# are then averaged across the files of that tissue.

# mode="w": an existing output file is replaced
with pd.HDFStore(OUTPUT_FILENAME, mode="w", complevel=4) as store:
    for tissue in tqdm(all_tissues, ncols=100):
        # get all the prediction files for one tissue
        tissue_prediction_files = [
            x for x in current_prediction_files if f"-{tissue}-" in x.name
        ]
        assert len(tissue_prediction_files) == len(all_methods)

        # accumulator for the sum of ranks (traits in rows, drugs in columns)
        tissue_df = pd.DataFrame(
            data=0,
            index=all_traits.copy(),
            columns=all_drugs.copy(),
            dtype="float32",
        )

        for f in tissue_prediction_files:
            # read metadata
            metadata = pd.read_hdf(f, key="metadata")
            _data = metadata.data.values[0]
            _tissue = _get_tissue(_data)
            # the tissue in the metadata must agree with the one in the file name
            assert _tissue == tissue

            # get full predictions
            prediction_data = pd.read_hdf(f, key="full_prediction")
            # replace the raw scores with their rank among all the scores of this file
            prediction_data["score"] = prediction_data["score"].rank()
            # long format -> matrix with traits in rows and drugs in columns
            prediction_data = prediction_data.pivot(
                index="trait", columns="drug", values="score"
            )
            prediction_data = prediction_data.astype("float32")

            # sum across N_THRESHOLDS (which is equal to len(all_methods));
            # .loc selects rows/columns in the same order as the accumulator
            tissue_df += prediction_data.loc[tissue_df.index, tissue_df.columns]

        # save the average
        # NOTE(review): simplify_trait_fullcode presumably converts the tissue
        # name into a valid store key — confirm against its implementation.
        store.put(
            simplify_trait_fullcode(tissue, prefix=""),
            (tissue_df / len(all_methods)).astype("float32"),
            format="fixed",
        )

100%|███████████████████████████████████████████████████████████████| 49/49 [08:56<00:00, 10.95s/it]


## Testing

In [29]:
# tissue used for the spot checks below
_tissue = "Adipose_Subcutaneous"

In [30]:
# read back the matrix that was saved for this tissue
with pd.HDFStore(OUTPUT_FILENAME, mode="r") as store:
    tissue_df = store[simplify_trait_fullcode(_tissue, prefix="")]

In [31]:
# the saved matrix must not contain missing values
assert not tissue_df.isna().any().any()

In [32]:
# rows are traits, columns are drugs
tissue_df.shape

(4091, 1170)

In [33]:
# preview of the saved values (average ranks across thresholds)
tissue_df.head()

Unnamed: 0,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,-0.006497,0.07221,-0.000529,-0.004296,0.006141,0.003154,-0.009074,0.000412,-0.011412,-5.1e-05,...,-0.005129,-0.005269,0.00396,0.000896,-0.007339,-0.0005,-0.006592,-0.005177,0.000278,-0.00105
I71-Diagnoses_main_ICD10_I71_Aortic_aneurysm_and_dissection,0.000659,-0.005164,0.002791,0.010931,0.002671,0.005856,0.000188,0.004834,0.003271,-0.003199,...,0.004561,0.003152,-9.7e-05,0.001627,0.003118,-0.000943,-0.001265,0.012858,0.000736,0.004679
G62-Diagnoses_main_ICD10_G62_Other_polyneuropathies,0.004322,-0.017429,-0.000954,-0.008551,0.002615,0.003103,0.005615,0.006073,0.000174,0.006135,...,0.004491,0.002732,0.001949,-0.002629,-0.002929,-0.008633,0.009766,-0.006897,0.006556,0.005399
2395_4-Hairbalding_pattern_Pattern_4,0.001281,0.014856,-0.000798,-0.006296,-0.002154,-0.005253,-0.01391,-0.005741,-0.00484,0.008523,...,-0.000877,0.004396,-0.001962,-0.002118,0.005094,-0.005699,-0.011614,-0.022731,-0.004846,-0.001254
20003_1141168590-Treatmentmedication_code_pariet_10mg_ec_tablet,-0.001726,0.02415,-0.006406,0.003348,0.004601,-0.007618,-0.005133,-0.005093,-0.002373,0.000356,...,-0.0054,-0.003261,-0.001523,-0.003059,0.004404,-2.7e-05,-0.004118,-0.007614,0.003105,-0.00973


In [34]:
# prediction files of the tissue under test, selected by file name
_files = [
    pred_file
    for pred_file in current_prediction_files
    if f"-{_tissue}-" in pred_file.name
]

In [35]:
# one file per threshold is expected for the tissue
display(len(_files))
assert len(_files) == N_THRESHOLDS

5

In [42]:
# For each file of the tissue: the scores indexed by (trait, drug) and
# converted to ranks, recomputed independently of the saved output.
_files_data = [
    pd.read_hdf(pred_file, key="full_prediction")
    .set_index(["trait", "drug"])
    .squeeze()
    .rank()
    for pred_file in _files
]

In [43]:
# preview of the ranked scores of the first file
_files_data[0].head(5)

trait                                         drug   
I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities  DB00014   -0.015198
                                              DB00091    0.052318
                                              DB00121    0.007140
                                              DB00130   -0.019694
                                              DB00131    0.018786
Name: score, dtype: float64

In [44]:
# trait/drug pair used to spot-check the saved average
_trait = "I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities"
_drug = "DB00014"

In [58]:
# The saved value must match the mean of the per-file ranks for this pair.
# Compare with a relative tolerance instead of exact equality: the saved matrix
# was accumulated and stored as float32, while the reference mean here is
# computed in float64, so both values can legitimately differ by rounding.
np.testing.assert_allclose(
    tissue_df.loc[_trait, _drug],
    np.mean([x.loc[(_trait, _drug)] for x in _files_data]),
    rtol=1e-6,
)