# Description

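This notebook takes the top latent variables (LVs) driving the prediction for a drug-trait pair (the trait is configured in the Settings section). It then collects the cell types/tissues annotated for each of those LVs and saves them, together with the list of top LVs, in the drug/trait results file.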

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from data.recount2 import LVAnalysis
import conf

# Settings

In [3]:
# Trait under analysis.
# SHORT_TRAIT_NAME is used to build the keys under which results are saved
# (`traits/{SHORT_TRAIT_NAME}/...`); FULL_TRAIT_NAME is the key used to select
# this trait's values from the drug-trait predictions table.
SHORT_TRAIT_NAME = "ICD10_I70_Atherosclerosis"
FULL_TRAIT_NAME = "I70-Diagnoses_main_ICD10_I70_Atherosclerosis"

In [4]:
# Only LVs whose prediction value for the trait is strictly above this quantile
# are kept as "top LVs".
QUANTILE = 0.95

# Paths

In [5]:
# Directory holding the drug/trait results file used by this notebook; it is
# created (including missing parents) when it does not exist yet.
OUTPUT_DIR = Path(conf.RESULTS["DRUG_DISEASE_ANALYSES"], "lincs", "analyses")
display(OUTPUT_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/analyses')

In [6]:
# Absolute path of the manuscript figures sub-folder for this analysis; it is
# created (including missing parents) when it does not exist yet.
OUTPUT_FIGURES_DIR = (
    Path(conf.MANUSCRIPT["FIGURES_DIR"]) / "drug_disease_prediction"
).resolve()
display(OUTPUT_FIGURES_DIR)
OUTPUT_FIGURES_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier_manuscript/content/images/drug_disease_prediction')

In [7]:
# HDF5 store with the drug/trait results; the same path is opened later in this
# notebook, first for reading and then for appending the new tables.
OUTPUT_FILE = OUTPUT_DIR.joinpath("cardiovascular-niacin.h5")
display(OUTPUT_FILE)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/analyses/cardiovascular-niacin.h5')

# Load data

## Original data

In [8]:
# Name of the sub-folder inside the data-transformations results directory;
# it is also the prefix of the input file name.
INPUT_SUBSET = "z_score_std"

In [9]:
# Second part of the input file name: "<INPUT_SUBSET>-<INPUT_STEM>.pkl".
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [10]:
# Input file layout: <data transformations dir>/<subset>/<subset>-<stem>.pkl
input_filename = f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / INPUT_SUBSET / input_filename
).resolve()
display(input_filepath)

# Stop here if the input has not been generated yet.
assert input_filepath.exists(), "Input file does not exist"

# File name without the extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/data_transformations/z_score_std/z_score_std-projection-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-projection-smultixcan-efo_partial-mashr-zscores'

In [11]:
# Matrix with one row per trait and one column per LV (see the shape and the
# preview displayed in the next cells). It is passed to LVAnalysis below.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# generated by this project.
data = pd.read_pickle(input_filepath)

In [12]:
data.shape

(3752, 987)

In [13]:
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.695006,1.962565,0.057683,0.878731,-0.539977,1.481272,-0.396422,1.09018,0.759223,0.931395,...,1.129784,1.752343,-1.411403,2.823863,0.931116,-1.054519,0.432982,-0.633597,0.554279,-0.642479
100002_raw-Energy,-1.528127,-0.345309,-0.148953,-0.24206,0.373427,0.791092,0.263477,0.987702,0.354391,1.416059,...,0.224604,0.769882,-0.509482,0.091153,2.286789,-1.008256,-0.029764,1.737229,-0.272107,-0.526125
100003_raw-Protein,-0.704572,-1.011299,0.67142,0.143991,0.615212,0.874212,-0.040998,0.91517,0.254369,-0.084237,...,1.003019,1.044314,-2.376108,0.004778,0.053714,-0.892447,-0.1838,1.377991,-0.278794,-0.419733
100004_raw-Fat,-0.989832,-1.87549,0.261555,-1.420719,0.366238,1.167049,0.257387,0.717674,-0.997664,0.969825,...,0.585913,0.638314,0.119139,-0.140204,1.394326,-1.173402,0.555058,1.013982,-0.544506,-0.064061
100005_raw-Carbohydrate,-0.580143,0.243335,0.158966,-0.036558,0.068176,-0.202639,1.101281,0.675227,1.463432,1.010078,...,-0.249108,-0.026814,0.232713,0.323682,1.168642,-0.282935,0.653105,1.909526,0.199997,-1.656894


## Load drug/trait LVs

In [14]:
# The drug/trait results live in the HDF5 file already defined in the Paths
# section; reuse that constant instead of spelling the file name a second time.
output_file = OUTPUT_FILE
display(output_file)

# This notebook only reads from and adds to an existing store (it is opened
# with mode "r" and later "r+"), so fail early with a clear message if the
# file has not been generated yet.
assert output_file.exists(), "Drug/trait results file does not exist"

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/drug_disease_analyses/lincs/analyses/cardiovascular-niacin.h5')

In [15]:
# Read the three tables stored in the drug/trait results file (read-only).
with pd.HDFStore(output_file, mode="r") as results_store:
    traits_module_tissue_data = results_store.get("traits_module_tissue_data")
    drug_data = results_store.get("drug_data")
    drug_trait_predictions = results_store.get("drug_trait_predictions")

In [16]:
# Prediction values of every LV for the selected trait, largest first.
trait_lv_values = drug_trait_predictions[FULL_TRAIT_NAME].sort_values(ascending=False)

# Threshold: value at the configured quantile of the distribution.
q = trait_lv_values.quantile(QUANTILE)

# Keep only the LVs strictly above the threshold (descending order is preserved).
top_lvs = trait_lv_values.loc[trait_lv_values > q]

In [17]:
top_lvs.shape

(50,)

In [18]:
top_lvs

LV116    0.006180
LV931    0.004843
LV744    0.002292
LV697    0.002231
LV885    0.001984
LV536    0.001956
LV550    0.001933
LV220    0.001822
LV272    0.001785
LV739    0.001770
LV678    0.001525
LV470    0.001444
LV66     0.001375
LV189    0.001371
LV517    0.001296
LV840    0.001285
LV246    0.001253
LV502    0.001249
LV525    0.001227
LV85     0.001220
LV926    0.001217
LV824    0.001205
LV496    0.001181
LV545    0.001147
LV541    0.001083
LV143    0.001054
LV221    0.001040
LV122    0.001037
LV802    0.001008
LV106    0.001007
LV798    0.000935
LV50     0.000923
LV23     0.000912
LV631    0.000863
LV6      0.000856
LV893    0.000851
LV882    0.000841
LV142    0.000806
LV913    0.000796
LV478    0.000789
LV387    0.000737
LV879    0.000733
LV855    0.000718
LV248    0.000714
LV532    0.000706
LV512    0.000692
LV353    0.000692
LV202    0.000686
LV848    0.000684
LV319    0.000676
Name: I70-Diagnoses_main_ICD10_I70_Atherosclerosis, dtype: float64

In [19]:
# Names of the top LVs, in the same (descending prediction value) order as top_lvs.
lvs_list = top_lvs.index.tolist()

In [20]:
len(lvs_list)

50

In [21]:
def _get_attributes(x):
    """
    Return the first non-missing attribute value found in a row.

    Entries whose label starts with "LV" are ignored; the remaining entries are
    scanned in index order.

    Args:
        x: a row (pandas Series) whose index labels are strings.

    Returns:
        The first non-null value among the non-"LV" entries, or None when all
        of them are missing (or there are none).
    """
    attribute_labels = [label for label in x.index if not label.startswith("LV")]
    attribute_values = x[attribute_labels].dropna()

    return None if attribute_values.empty else attribute_values.iloc[0]

# Get cell types/tissues for top LVs

In [22]:
# Regular expressions (matched case-insensitively, anchored at the start of the
# column name) used to pick the attribute columns of each kind. Raw strings are
# used so that `\w` reaches the regex engine without relying on Python keeping
# an invalid string escape sequence as is.
CELL_TYPE_PATTERN = r"(?:cell[^\w]*type$)"
TISSUE_PATTERN = r"(?:tissue$)|(?:tissue[^\w]*type$)"


def _collect_lv_attributes(lv_data, lv_name, attr_pattern):
    """
    Build a table with one attribute value per row of an LV's experiments data.

    Args:
        lv_data: DataFrame returned by `LVAnalysis.get_experiments_data`; it
            must contain a column named `lv_name` and have string column names.
        lv_name: name of the LV column in `lv_data` (for instance "LV116").
        attr_pattern: regular expression selecting the attribute columns; it is
            matched from the start of each column name, ignoring case.

    Returns:
        DataFrame with columns "lv" (values of the `lv_name` column), "attr"
        (first non-missing value among the selected attribute columns) and
        "lv_name". Rows with a missing "lv" or "attr" are dropped, and the
        remaining rows are sorted by "lv" in descending order.
    """
    all_columns = pd.Series(lv_data.columns.tolist())

    # Attribute columns are sorted in descending name order; since
    # `_get_attributes` returns the first non-missing value, this order decides
    # which column wins when a row has several of them filled in.
    attr_columns = (
        all_columns[all_columns.str.match(attr_pattern, flags=re.IGNORECASE).values]
        .sort_values(ascending=False)
        .tolist()
    )

    attrs_data = lv_data[attr_columns + [lv_name]]

    return (
        attrs_data.assign(attr=attrs_data.apply(_get_attributes, axis=1))
        .drop(columns=attr_columns)
        .dropna()
        .sort_values(lv_name, ascending=False)
        .rename(columns={lv_name: "lv"})
        .assign(lv_name=lv_name)
    )


cell_type_dfs = []
tissue_dfs = []

pbar = tqdm(lvs_list)
for lv_name in pbar:
    pbar.set_description(lv_name)

    lv_obj = LVAnalysis(lv_name, data)

    # The experiments data is loaded once per LV and reused for both attribute kinds.
    lv_data = lv_obj.get_experiments_data(debug=False, warnings=False)

    cell_type_dfs.append(_collect_lv_attributes(lv_data, lv_name, CELL_TYPE_PATTERN))
    tissue_dfs.append(_collect_lv_attributes(lv_data, lv_name, TISSUE_PATTERN))

LV319: 100%|██████████| 50/50 [12:51<00:00, 15.43s/it]


## Prepare dataframe

In [23]:
# Stack the per-LV tables into a single one; the original row labels are
# discarded and replaced by a new 0..n-1 index.
cell_types_data = pd.concat(cell_type_dfs, ignore_index=True)

In [24]:
cell_types_data.shape

(173682, 3)

In [25]:
cell_types_data.head()

Unnamed: 0,lv,attr,lv_name
0,2.307126,Neutrophils,LV116
1,2.151321,dermal fibroblast,LV116
2,2.047151,PBMCs,LV116
3,2.039088,PBMCs,LV116
4,2.031884,PBMCs,LV116


In [26]:
# Stack the per-LV tables into a single one; the original row labels are
# discarded and replaced by a new 0..n-1 index.
tissues_data = pd.concat(tissue_dfs, ignore_index=True)

In [27]:
tissues_data.shape

(75235, 3)

In [28]:
tissues_data.head()

Unnamed: 0,lv,attr,lv_name
0,2.7808,Whole blood,LV116
1,2.34326,Whole blood,LV116
2,2.334816,Whole blood,LV116
3,2.324611,Whole blood,LV116
4,2.213722,Whole blood,LV116


## Save

In [29]:
# Tables to add to the existing drug/trait results file, all grouped under the
# key prefix of the analyzed trait.
store_entries = (
    (f"traits/{SHORT_TRAIT_NAME}/top_lvs", top_lvs),
    (f"traits/{SHORT_TRAIT_NAME}/cell_types", cell_types_data),
    (f"traits/{SHORT_TRAIT_NAME}/tissues", tissues_data),
)

# "r+" opens the existing file for reading and writing without truncating it.
with pd.HDFStore(output_file, mode="r+", complevel=4) as store:
    for store_key, store_value in store_entries:
        store.put(store_key, store_value, format="fixed")