# Description

This notebook reads the *full predictions* results (covering all traits in PhenomeXcan, regardless of whether they have a DOID mapping) generated by the `011-prediction-*` notebooks and saves them for later use.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from data.hdf5 import simplify_trait_fullcode
import conf

# Settings

In [3]:
# Expected counts, used only by the sanity-check assertions further down:
# the number of prediction files found must equal N_TISSUES * N_THRESHOLDS,
# and the tissues/thresholds read from the file metadata must match each count.
N_TISSUES = 49
N_THRESHOLDS = 5

In [4]:
# Base folder for this notebook: the "lincs" subfolder of the drug-disease
# analyses results. It must already exist, so stop here if it does not.
INPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"] / "lincs"
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs')

In [5]:
# Folder holding the prediction files (.h5) that this notebook reads.
# This is an input location, so fail fast if it is missing instead of
# silently creating an empty folder.
INPUT_PREDICTIONS_DIR = Path(INPUT_DIR, "predictions", "dotprod_neg")
display(INPUT_PREDICTIONS_DIR)
assert INPUT_PREDICTIONS_DIR.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions/dotprod_neg')

In [6]:
# Results are written to the "predictions" folder under INPUT_DIR;
# create it (and any missing parents) if needed.
OUTPUT_DIR = INPUT_DIR / "predictions"
display(OUTPUT_DIR)
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions')

In [7]:
# HDF5 file where the per-tissue results of this notebook are stored.
OUTPUT_FILENAME = OUTPUT_DIR / "full_predictions_by_tissue-rank.h5"
display(OUTPUT_FILENAME)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions/full_predictions_by_tissue-rank.h5')

# Load drug-disease predictions

In [8]:
from collections import defaultdict

In [9]:
# Collect every HDF5 prediction file whose name contains "-projection-",
# sorted by path so the processing order is deterministic.
current_prediction_files = sorted(
    path for path in INPUT_PREDICTIONS_DIR.glob("*.h5") if "-projection-" in path.name
)
display(len(current_prediction_files))

# exactly one file is expected per (tissue, threshold) combination
assert len(current_prediction_files) == (N_TISSUES * N_THRESHOLDS)

245

In [10]:
# peek at the first discovered files to check the naming pattern
current_prediction_files[:10]

[PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-all_genes-prediction_scores.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-top_10_genes-prediction_scores.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-top_25_genes-prediction_scores.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions/dotprod_neg/spredixcan-mashr-zscores-Adipose_Subcutaneous-projection-top_50_genes-prediction_scores.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/predictions/dotp

In [11]:
def _get_tissue(x):
    """
    It extracts the tissue name from a filename.
    """
    if x.endswith("-projection"):
        return x.split("spredixcan-mashr-zscores-")[1].split("-projection")[0]
    else:
        return x.split("spredixcan-mashr-zscores-")[1].split("-data")[0]

In [12]:
# Scan the metadata of every prediction file to collect the distinct tissue
# names and the distinct `n_top_genes` settings found across all files.

all_tissues = set()
all_methods = set()

for prediction_file in tqdm(current_prediction_files, ncols=100):
    file_metadata = pd.read_hdf(prediction_file, key="metadata")

    # the tissue name is embedded in the `data` field of the metadata
    all_tissues.add(_get_tissue(file_metadata.data.values[0]))

    # each file was generated with a single `n_top_genes` value
    all_methods.add(file_metadata.n_top_genes.values[0])

100%|████████████████████████████████████████████████████████████| 245/245 [00:01<00:00, 127.44it/s]


In [13]:
# the distinct `n_top_genes` values found must match the expected number of thresholds
assert len(all_methods) == N_THRESHOLDS
display(all_methods)

{-1.0, 5, 10, 25, 50}

In [14]:
# fix a deterministic order; note this rebinds `all_tissues` from a set to a sorted list
all_tissues = sorted(all_tissues)

In [15]:
# the distinct tissues found in the files' metadata must match the expected count
assert len(all_tissues) == N_TISSUES

In [16]:
# Use the first prediction file as the reference for the list of traits and
# drugs (unique values, in order of first appearance).
_tmp_df = pd.read_hdf(current_prediction_files[0], key="full_prediction")
all_traits, all_drugs = (
    _tmp_df[column].drop_duplicates().tolist() for column in ("trait", "drug")
)

In [17]:
# preview the long-format predictions: one row per (trait, drug) pair with its score
_tmp_df.head()

Unnamed: 0,trait,drug,score
0,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00014,-0.015198
1,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00091,0.052318
2,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00121,0.00714
3,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00130,-0.019694
4,I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,DB00131,0.018786


In [18]:
# sanity check: number of distinct traits expected in the reference prediction
# file (value hard-coded for this dataset)
assert len(all_traits) == 4091

In [19]:
# sanity check: number of distinct drugs expected in the reference prediction
# file (value hard-coded for this dataset)
assert len(all_drugs) == 1170

## Create predictions dataframe

In [20]:
# Iterate over all tissues and, for each one, combine its prediction files.
#
# Each prediction file (.h5) provides a `metadata` table and a `full_prediction`
# table with one score per (trait, drug) pair. For every tissue:
#   1. each of its files has its scores converted to ranks (computed over all
#      rows of that file) and pivoted into a trait x drug matrix;
#   2. the rank matrices are summed and then divided by the number of files
#      (one per threshold), giving the average rank;
#   3. the averaged matrix is stored in the output HDF5 file as float32.

with pd.HDFStore(OUTPUT_FILENAME, mode="w", complevel=4) as store:
    for tissue in tqdm(all_tissues, ncols=100):
        # get all the prediction files for one tissue
        tissue_prediction_files = [
            x for x in current_prediction_files if f"-{tissue}-" in x.name
        ]
        assert len(tissue_prediction_files) == len(all_methods)

        # Accumulate in float64: ranks go up to len(all_traits) * len(all_drugs),
        # so a sum over all thresholds can exceed 2**24, the point after which
        # float32 can no longer represent every integer (nor the .5 of tied
        # ranks). Only the final average is downcast to float32.
        tissue_df = pd.DataFrame(
            data=0.0,
            index=all_traits.copy(),
            columns=all_drugs.copy(),
            dtype="float64",
        )

        for f in tissue_prediction_files:
            # read metadata and make sure the file really belongs to this tissue
            metadata = pd.read_hdf(f, key="metadata")
            _data = metadata.data.values[0]
            _tissue = _get_tissue(_data)
            assert _tissue == tissue

            # get full predictions and replace scores by their ranks
            prediction_data = pd.read_hdf(f, key="full_prediction")
            prediction_data["score"] = prediction_data["score"].rank()
            prediction_data = prediction_data.pivot(
                index="trait", columns="drug", values="score"
            )

            # sum across N_THRESHOLDS (which is equal to len(all_methods));
            # .loc aligns rows/columns with the reference trait and drug order
            tissue_df += prediction_data.loc[tissue_df.index, tissue_df.columns]

        # save the average
        store.put(
            simplify_trait_fullcode(tissue, prefix=""),
            (tissue_df / len(all_methods)).astype("float32"),
            format="fixed",
        )

100%|███████████████████████████████████████████████████████████████| 49/49 [10:30<00:00, 12.86s/it]


## Testing

In [21]:
# tissue used to spot-check the stored results against the input files
_tissue = "Adipose_Subcutaneous"

In [22]:
# reload from disk the matrix that was stored for the test tissue
_stored_key = simplify_trait_fullcode(_tissue, prefix="")
with pd.HDFStore(OUTPUT_FILENAME, mode="r") as store:
    tissue_df = store.get(_stored_key)

In [23]:
# the stored matrix must not contain missing values
assert not tissue_df.isna().any().any()

In [24]:
# rows are traits and columns are drugs
tissue_df.shape

(4091, 1170)

In [25]:
# preview the averaged ranks (traits in rows, drugs in columns)
tissue_df.head()

Unnamed: 0,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,859149.625,4768451.0,1805252.0,1854894.375,3398083.25,2779902.25,627193.3,2718546.75,506876.6,2362122.0,...,1253533.375,1393785.375,3439556.0,2334818.0,612397.2,2318932.0,1158458.0,1771659.0,2203938.75,2170518.0
I71-Diagnoses_main_ICD10_I71_Aortic_aneurysm_and_dissection,2158918.0,1604089.75,2996882.0,4482039.0,3456084.5,2998575.75,2721814.0,3717782.5,3051466.0,1574597.375,...,3172424.0,2908268.0,2173213.5,3204648.5,3457790.0,2039608.0,2139980.0,4523702.0,2600490.75,3802814.0
G62-Diagnoses_main_ICD10_G62_Other_polyneuropathies,3461948.75,1927388.75,2176780.0,512809.5,3038566.25,3553626.75,3918880.0,4038913.5,2468866.0,3752765.5,...,3365832.75,3317607.25,2800978.75,1869802.0,1344453.0,625473.2,4325136.0,1872992.0,3932942.5,3852908.0
2395_4-Hairbalding_pattern_Pattern_4,3090963.0,4568733.0,2645423.0,1255693.875,2618424.0,2325709.25,399321.1,1472670.75,1134247.0,3924732.75,...,2988520.5,3734202.5,2425991.25,1622558.375,3532253.0,1441019.0,625913.2,76976.6,1666976.625,2402028.0
20003_1141168590-Treatmentmedication_code_pariet_10mg_ec_tablet,2202725.5,3775144.75,685390.3,3455764.5,3799727.25,621039.375,1006550.0,1001633.375,1557073.0,2179146.0,...,1229269.625,1622658.0,2122098.25,1419842.625,3815263.0,2282254.0,1187751.0,1162379.0,3317796.5,434379.8


In [26]:
# prediction files that belong to the test tissue (matched by name)
_tissue_tag = f"-{_tissue}-"
_files = [path for path in current_prediction_files if _tissue_tag in path.name]

In [27]:
# one file per threshold is expected for the test tissue
display(len(_files))
assert len(_files) == N_THRESHOLDS

5

In [28]:
# Independently recompute the ranks of each input file, indexed by
# (trait, drug), to compare against the stored averages.
_files_data = []
for _file in _files:
    _scores = (
        pd.read_hdf(_file, key="full_prediction").set_index(["trait", "drug"]).squeeze()
    )
    _files_data.append(_scores.rank())

In [29]:
# preview the recomputed ranks of the first file
_files_data[0].head(5)

trait                                         drug   
I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities  DB00014     624145.0
                                              DB00091    4719379.0
                                              DB00121    3416104.5
                                              DB00130     403035.5
                                              DB00131    4345081.0
Name: score, dtype: float64

In [30]:
# (trait, drug) pair used for the spot check below
_trait = "I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities"
_drug = "DB00014"

In [31]:
# The stored value must match the mean rank recomputed from the input files.
# Compare with a relative tolerance instead of exact equality: the stored
# matrix is float32 while the reference mean is computed in float64, so tiny
# rounding differences are legitimate.
_expected_avg_rank = np.mean([x.loc[(_trait, _drug)] for x in _files_data])
np.testing.assert_allclose(
    tissue_df.loc[_trait, _drug],
    _expected_avg_rank,
    rtol=1e-6,
)