# Description

This notebook analyzes the LVs driving the association of Niacin with some cardiovascular traits. Then it writes a table in markdown with the results.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import re

import numpy as np
import pandas as pd

from entity import Gene
import conf

# Settings

In [3]:
QUANTILE = 0.95

# Paths

In [4]:
OUTPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"] / "lincs" / "analyses"
display(OUTPUT_DIR)
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/drug_disease_analyses/lincs/analyses')

In [5]:
# Folder with the LINCS-based drug-disease predictions and the file holding
# the per-tissue predictions that is read later in the notebook.
INPUT_DIR = conf.RESULTS["DRUG_DISEASE_ANALYSES"] / "lincs" / "predictions"
input_predictions_by_tissue_file = INPUT_DIR.joinpath(
    "full_predictions_by_tissue-rank.h5"
)
display(input_predictions_by_tissue_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/drug_disease_analyses/lincs/predictions/full_predictions_by_tissue-rank.h5')

In [6]:
assert "CONTENT_DIR" in conf.MANUSCRIPT

OUTPUT_FILE_PATH = conf.MANUSCRIPT["CONTENT_DIR"] / "04.15.drug_disease_prediction.md"
display(OUTPUT_FILE_PATH)
assert OUTPUT_FILE_PATH.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier_manuscript/content/04.15.drug_disease_prediction.md')

# Data loading

## PharmacotherapyDB: load gold standard

### Final

In [7]:
gold_standard = pd.read_pickle(
    Path(conf.RESULTS["DRUG_DISEASE_ANALYSES"], "gold_standard.pkl"),
)

In [8]:
gold_standard.shape

(998, 3)

In [9]:
gold_standard.head()

Unnamed: 0,trait,drug,true_class
0,DOID:10652,DB00843,1
1,DOID:10652,DB00674,1
2,DOID:10652,DB01043,1
3,DOID:10652,DB00989,1
4,DOID:10652,DB00810,0


### Info

In [10]:
input_file = conf.PHARMACOTHERAPYDB["INDICATIONS_FILE"]
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/data/hetionet/pharmacotherapydb-v1.0/indications.tsv')

In [11]:
gold_standard_info = pd.read_csv(input_file, sep="\t")

In [12]:
gold_standard_info = gold_standard_info.rename(columns={"drug": "drug_name"})

In [13]:
gold_standard_info.shape

(1388, 7)

In [14]:
gold_standard_info.head()

Unnamed: 0,doid_id,drugbank_id,disease,drug_name,category,n_curators,n_resources
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM,2,1
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM,1,4
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM,1,3
3,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM,1,3
4,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,SYM,3,1


In [15]:
# Attach the PharmacotherapyDB indication details to each gold standard
# (trait, drug) pair; the join keeps every gold standard row.
_pair_keys = ["trait", "drug"]
_indications_by_pair = gold_standard_info.rename(
    columns={"doid_id": "trait", "drugbank_id": "drug"}
).set_index(_pair_keys)

gold_standard_info = (
    gold_standard.set_index(_pair_keys).join(_indications_by_pair).reset_index()
)

In [16]:
gold_standard_info.shape

(998, 8)

In [17]:
gold_standard_info.head()

Unnamed: 0,trait,drug,true_class,disease,drug_name,category,n_curators,n_resources
0,DOID:10652,DB00843,1,Alzheimer's disease,Donepezil,DM,2,1
1,DOID:10652,DB00674,1,Alzheimer's disease,Galantamine,DM,1,4
2,DOID:10652,DB01043,1,Alzheimer's disease,Memantine,DM,1,3
3,DOID:10652,DB00989,1,Alzheimer's disease,Rivastigmine,DM,1,3
4,DOID:10652,DB00810,0,Alzheimer's disease,Biperiden,NOT,2,1


## LINCS data

In [18]:
input_file = Path(
    conf.RESULTS["DRUG_DISEASE_ANALYSES"], "lincs", "lincs-data.pkl"
).resolve()

display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/drug_disease_analyses/lincs/lincs-data.pkl')

In [19]:
lincs_data = pd.read_pickle(input_file).T.rename(columns=Gene.GENE_ID_TO_NAME_MAP)

In [20]:
display(lincs_data.shape)

(1170, 7120)

In [21]:
display(lincs_data.head())

Unnamed: 0_level_0,ADA,CDH2,AKT3,MED6,ACOT8,ABI1,GNPDA1,CDH3,TANK,TOPORS-AS1,...,RBX1,CDC42,THOC1,RCE1,HNRNPDL,DMTF1,PPP4R1,CDH1,PTBP3,CASP8AP2
perturbagen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DB00014,-1.001,1.146,-0.693,-0.037,0.162,-0.38,0.649,-1.952,0.294,0.274,...,-0.592,0.178,0.27,-0.013,0.351,-0.128,0.274,-0.734,-0.048,0.036
DB00091,-1.835,-1.863,1.694,0.383,-0.899,0.628,-4.878,2.527,1.709,-0.981,...,-0.668,-2.795,-0.333,-0.027,0.578,6.926,-1.875,4.382,0.266,-4.995
DB00121,1.391,0.011,-0.804,0.269,0.105,-0.588,-1.899,0.306,-1.178,0.12,...,-0.962,0.45,-0.999,1.358,-1.476,0.423,-1.356,-1.897,-0.299,-0.732
DB00130,1.132,-1.02,-0.164,-0.997,-0.09,0.195,-2.341,0.494,-0.813,-1.14,...,-0.553,-0.528,0.308,0.534,-0.32,3.001,-0.414,0.134,0.147,2.158
DB00131,0.257,1.143,1.145,0.185,-1.291,-0.457,0.038,1.442,-1.692,0.593,...,-0.556,-0.6,0.092,-0.564,0.174,-1.203,-1.08,-0.575,-0.792,-0.095


## LINCS projection

In [22]:
input_file = Path(
    conf.RESULTS["DRUG_DISEASE_ANALYSES"], "lincs", "lincs-projection.pkl"
).resolve()

display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/drug_disease_analyses/lincs/lincs-projection.pkl')

In [23]:
lincs_projection = pd.read_pickle(input_file).T

In [24]:
display(lincs_projection.shape)

(1170, 987)

In [25]:
display(lincs_projection.head())

Unnamed: 0_level_0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
perturbagen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DB00014,0.036115,0.012281,-0.005533,-0.004151,-0.015156,-0.010861,0.008181,-0.008148,0.004948,-0.009869,...,-0.003139,0.003546,-0.006242,0.002449,-0.004391,-0.004334,-0.019902,0.004292,-0.010378,0.002
DB00091,-0.35861,0.009738,-0.059174,-0.032884,-0.005276,0.063369,-0.079522,-0.01898,-0.025782,0.02154,...,-0.019892,0.00361,-0.016165,0.008686,0.065335,-0.042051,0.065516,0.016261,0.056314,-0.051035
DB00121,0.091067,0.00465,0.013454,0.005934,0.025747,0.004324,-0.001865,-0.019613,0.051503,0.013032,...,0.001629,-0.012565,0.009613,-0.014404,-0.009188,0.006544,0.047466,-0.01622,-0.018844,0.034675
DB00130,0.008923,-0.006554,0.008906,-0.003991,0.000346,0.008212,0.001242,-0.012771,0.014221,0.008269,...,-0.026139,-0.002977,0.021969,0.001204,-0.012731,-7.6e-05,-0.029127,-0.022679,0.016179,-0.003029
DB00131,0.046469,0.021758,-0.004939,-0.028524,-0.013932,0.012231,-0.001325,0.003787,0.035189,-0.004377,...,0.017373,0.005304,-0.001649,0.004017,-0.000622,-0.006477,0.021663,0.000988,0.002038,-0.011747


# Niacin and cardiovascular diseases

In [26]:
from entity import Trait

In [27]:
Trait.get_traits_from_efo("atherosclerosis")

[I70-Diagnoses_main_ICD10_I70_Atherosclerosis]

In [28]:
Trait.get_traits_from_efo("coronary artery disease")

[I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,
 CARDIoGRAM_C4D_CAD_ADDITIVE]

In [29]:
_phenomexcan_traits = [
    "I70-Diagnoses_main_ICD10_I70_Atherosclerosis",
    "CARDIoGRAM_C4D_CAD_ADDITIVE",
    "I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease",
    "20002_1473-Noncancer_illness_code_selfreported_high_cholesterol",
    "6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above",
    "6150_1-Vascularheart_problems_diagnosed_by_doctor_Heart_attack",
    "I9_CHD-Major_coronary_heart_disease_event",
    "I9_CORATHER-Coronary_atherosclerosis",
    "I9_IHD-Ischaemic_heart_disease_wide_definition",
    "I9_MI-Myocardial_infarction",
    "I21-Diagnoses_main_ICD10_I21_Acute_myocardial_infarction",
    "20002_1075-Noncancer_illness_code_selfreported_heart_attackmyocardial_infarction",
]

_drug_id = "DB00627"
_drug_name = "Niacin"

In [30]:
# Print, for each trait of interest, its code followed by the tuple
# (sample size, number of cases).
for trait_code in _phenomexcan_traits:
    print(trait_code)
    trait_info = Trait.get_trait(full_code=trait_code)
    sample_sizes = (trait_info.n, trait_info.n_cases)
    print(sample_sizes)

    print("\n")

I70-Diagnoses_main_ICD10_I70_Atherosclerosis
(361194, 566.0)


CARDIoGRAM_C4D_CAD_ADDITIVE
(184305, 60801.0)


I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease
(361194, 12769.0)


20002_1473-Noncancer_illness_code_selfreported_high_cholesterol
(361141, 43957.0)


6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above
(360420, 253565.0)


6150_1-Vascularheart_problems_diagnosed_by_doctor_Heart_attack
(360420, 8288.0)


I9_CHD-Major_coronary_heart_disease_event
(361194, 10157.0)


I9_CORATHER-Coronary_atherosclerosis
(361194, 14334.0)


I9_IHD-Ischaemic_heart_disease_wide_definition
(361194, 20857.0)


I9_MI-Myocardial_infarction
(361194, 7018.0)


I21-Diagnoses_main_ICD10_I21_Acute_myocardial_infarction
(361194, 5948.0)


20002_1075-Noncancer_illness_code_selfreported_heart_attackmyocardial_infarction
(361141, 8239.0)




## Get best tissue results for Niacin

In [31]:
# Niacin's prediction scores (one value per trait), keyed by the store key of
# each tissue without its first character.
with pd.HDFStore(input_predictions_by_tissue_file, mode="r") as store:
    drugs_tissue_df = {
        tissue_key[1:]: store[tissue_key][_drug_id] for tissue_key in store.keys()
    }

In [32]:
_tmp = pd.DataFrame(drugs_tissue_df)
display(_tmp.shape)
display(_tmp.head())

(4091, 49)

Unnamed: 0,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,Artery_Aorta,Artery_Coronary,Artery_Tibial,Brain_Amygdala,Brain_Anterior_cingulate_cortex_BA24,Brain_Caudate_basal_ganglia,Brain_Cerebellar_Hemisphere,...,Skin_Not_Sun_Exposed_Suprapubic,Skin_Sun_Exposed_Lower_leg,Small_Intestine_Terminal_Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole_Blood
I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities,2225108.0,3633306.5,3960107.25,1332054.75,2182425.0,4240339.0,2487093.0,2274668.5,1708839.625,2311500.0,...,3111573.0,1680886.375,2903454.0,1799322.375,1457560.625,3024874.0,3919868.0,3272210.5,1370451.0,3027555.75
I71-Diagnoses_main_ICD10_I71_Aortic_aneurysm_and_dissection,1803573.0,711574.0,1448934.75,4231713.0,1607177.0,1551651.25,1389767.0,3638570.0,2445540.5,3317544.0,...,1935886.625,2817997.0,1179611.25,1986576.25,1191470.25,2691960.0,1236895.75,3169182.25,1650408.625,2093344.375
G62-Diagnoses_main_ICD10_G62_Other_polyneuropathies,1298787.0,2518740.25,3957170.75,2937233.5,2458553.0,3229771.5,3576149.0,1369567.0,3312298.5,2019896.0,...,2011435.75,1944383.75,4573539.0,4539101.0,2696412.0,2205179.0,785903.0,1301301.75,4430059.0,3196323.5
2395_4-Hairbalding_pattern_Pattern_4,626108.2,3038228.5,2098431.5,2826578.25,308521.8,3097367.0,649715.8,2836304.5,3183412.0,871365.2,...,2939378.25,3161076.0,3586680.75,2400041.75,863019.625,928483.8,2868830.25,1913291.25,2029917.375,3557621.5
20003_1141168590-Treatmentmedication_code_pariet_10mg_ec_tablet,3017563.0,1880510.75,1094152.375,4206760.5,4452502.0,4179176.75,2875126.0,2042720.75,2486881.75,3534752.0,...,2561731.75,516964.0,2166641.5,2783551.5,2651308.5,2219414.0,1217317.25,2210372.5,3053042.25,3362472.0


In [33]:
# for each trait of interest, show the tissue model (from TWAS) in which
# Niacin gets its maximum score
_niacin_scores_by_tissue = pd.DataFrame(drugs_tissue_df)
traits_best_tissues_df = _niacin_scores_by_tissue.loc[_phenomexcan_traits].idxmax(
    axis="columns"
)
display(traits_best_tissues_df)

I70-Diagnoses_main_ICD10_I70_Atherosclerosis                                                   Cells_Cultured_fibroblasts
CARDIoGRAM_C4D_CAD_ADDITIVE                                                                              Colon_Transverse
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease                                  Brain_Putamen_basal_ganglia
20002_1473-Noncancer_illness_code_selfreported_high_cholesterol                                            Brain_Amygdala
6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above                                               Liver
6150_1-Vascularheart_problems_diagnosed_by_doctor_Heart_attack                                   Brain_Frontal_Cortex_BA9
I9_CHD-Major_coronary_heart_disease_event                                                     Brain_Cerebellar_Hemisphere
I9_CORATHER-Coronary_atherosclerosis                                                               Heart_Atrial_Appendage
I9_IHD-Ischaemic_heart_d

In [34]:
# pick the tissue with the maximum score for each trait
drug_df = pd.DataFrame(drugs_tissue_df).max(1)

In [35]:
drug_df.shape

(4091,)

In [36]:
drug_df.head()

I9_PHLETHROMBDVTLOW-DVT_of_lower_extremities                       4240339.0
I71-Diagnoses_main_ICD10_I71_Aortic_aneurysm_and_dissection        4231713.0
G62-Diagnoses_main_ICD10_G62_Other_polyneuropathies                4573539.0
2395_4-Hairbalding_pattern_Pattern_4                               3979892.0
20003_1141168590-Treatmentmedication_code_pariet_10mg_ec_tablet    4452501.5
dtype: float32

In [37]:
drug_df.loc[_phenomexcan_traits].sort_values()

20002_1075-Noncancer_illness_code_selfreported_heart_attackmyocardial_infarction    3996588.75
6150_1-Vascularheart_problems_diagnosed_by_doctor_Heart_attack                      4014872.00
I21-Diagnoses_main_ICD10_I21_Acute_myocardial_infarction                            4242830.50
I9_MI-Myocardial_infarction                                                         4417031.00
I9_CHD-Major_coronary_heart_disease_event                                           4443732.50
6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above               4539449.50
20002_1473-Noncancer_illness_code_selfreported_high_cholesterol                     4549772.50
I70-Diagnoses_main_ICD10_I70_Atherosclerosis                                        4574511.00
CARDIoGRAM_C4D_CAD_ADDITIVE                                                         4598049.00
I9_IHD-Ischaemic_heart_disease_wide_definition                                      4671838.50
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_hea

In [38]:
drug_df.describe()

count    4.091000e+03
mean     4.240774e+06
std      3.346693e+05
min      2.317380e+06
25%      4.064316e+06
50%      4.323138e+06
75%      4.488743e+06
max      4.760801e+06
dtype: float64

In [39]:
drug_mean, drug_std = drug_df.mean(), drug_df.std()
display((drug_mean, drug_std))

(4240773.5, 334669.3)

In [40]:
drug_df_std = (drug_df - drug_mean) / drug_std
drug_df_stats = drug_df_std.describe()
display(drug_df_stats)

count    4.091000e+03
mean    -7.086703e-07
std      9.999999e-01
min     -5.747145e+00
25%     -5.272593e-01
50%      2.461071e-01
75%      7.409381e-01
max      1.553855e+00
dtype: float64

In [41]:
drug_df_std.quantile([0.80, 0.85, 0.90, 0.95])

0.80    0.834881
0.85    0.938875
0.90    1.036803
0.95    1.185430
dtype: float64

In [42]:
# Keep only the traits of interest, taking their values from the scores that
# were already standardized across all traits (`drug_df_std`). The values are
# the same as standardizing the subset with `drug_mean`/`drug_std`, but this
# cell no longer derives `drug_df` from itself, so re-running it does not
# standardize already standardized values.
drug_df = drug_df_std.loc[_phenomexcan_traits]

In [43]:
drug_df.shape

(12,)

In [44]:
drug_df.sort_values()

20002_1075-Noncancer_illness_code_selfreported_heart_attackmyocardial_infarction   -0.729630
6150_1-Vascularheart_problems_diagnosed_by_doctor_Heart_attack                     -0.674999
I21-Diagnoses_main_ICD10_I21_Acute_myocardial_infarction                            0.006146
I9_MI-Myocardial_infarction                                                         0.526662
I9_CHD-Major_coronary_heart_disease_event                                           0.606446
6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above               0.892451
20002_1473-Noncancer_illness_code_selfreported_high_cholesterol                     0.923297
I70-Diagnoses_main_ICD10_I70_Atherosclerosis                                        0.997216
CARDIoGRAM_C4D_CAD_ADDITIVE                                                         1.067548
I9_IHD-Ischaemic_heart_disease_wide_definition                                      1.288033
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease          

Most predictions of Niacin for these traits are high (above the mean), and several are more than one standard deviation above it; only two heart attack traits fall below the mean.

In [45]:
# select traits for which niacin has a high prediction: the standardized
# score must exceed the 75th percentile of the scores across all traits
_high_prediction_threshold = drug_df_stats["75%"]
_is_high_prediction = drug_df > _high_prediction_threshold
selected_traits = drug_df.loc[_is_high_prediction].index.tolist()

In [46]:
selected_traits

['I70-Diagnoses_main_ICD10_I70_Atherosclerosis',
 'CARDIoGRAM_C4D_CAD_ADDITIVE',
 'I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease',
 '20002_1473-Noncancer_illness_code_selfreported_high_cholesterol',
 '6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above',
 'I9_CORATHER-Coronary_atherosclerosis',
 'I9_IHD-Ischaemic_heart_disease_wide_definition']

## Gene module-based - LVs driving association

In [47]:
def find_best_tissue(trait_id):
    """Return the tissue stored for `trait_id` in `traits_best_tissues_df`.

    `traits_best_tissues_df` is the notebook-level series that maps each trait
    of interest to the tissue where Niacin has its maximum score.
    """
    best_tissue = traits_best_tissues_df.loc[trait_id]
    return best_tissue

In [48]:
_tmp_res = find_best_tissue("I9_CORATHER-Coronary_atherosclerosis")
display(_tmp_res)

'Heart_Atrial_Appendage'

In [49]:
# For each selected trait, read the projected S-PrediXcan z-scores of its best
# tissue and keep the column of that trait.
_projections_dir = conf.RESULTS["DRUG_DISEASE_ANALYSES"] / "spredixcan" / "proj"
traits_lv_data = []

for trait in selected_traits:
    best_module_tissue = find_best_tissue(trait)
    display(best_module_tissue)

    _projection_file = (
        _projections_dir
        / f"spredixcan-mashr-zscores-{best_module_tissue}-projection.pkl"
    )
    best_module_tissue_data = pd.read_pickle(_projection_file)[trait]

    traits_lv_data.append(best_module_tissue_data)

'Cells_Cultured_fibroblasts'

'Colon_Transverse'

'Brain_Putamen_basal_ganglia'

'Brain_Amygdala'

'Liver'

'Heart_Atrial_Appendage'

'Brain_Cerebellar_Hemisphere'

In [50]:
module_tissue_data = pd.DataFrame(traits_lv_data).T

In [51]:
module_tissue_data.shape

(987, 7)

In [52]:
module_tissue_data.head()

Unnamed: 0,I70-Diagnoses_main_ICD10_I70_Atherosclerosis,CARDIoGRAM_C4D_CAD_ADDITIVE,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,20002_1473-Noncancer_illness_code_selfreported_high_cholesterol,6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above,I9_CORATHER-Coronary_atherosclerosis,I9_IHD-Ischaemic_heart_disease_wide_definition
LV1,0.00572,-0.002536,0.003223,0.027183,-0.009975,-0.041645,-0.034174
LV2,-0.002927,-0.04091,0.009556,0.03117,-0.000601,-0.053181,-0.025907
LV3,0.002734,0.033692,-0.005872,-0.015628,0.029873,-0.018676,-0.020514
LV4,0.015454,-0.017444,0.004483,0.000231,-0.073387,0.022468,-0.046343
LV5,0.027796,0.016727,-0.009114,-0.037452,0.032017,-0.030727,-0.015489


In [53]:
drug_data = lincs_projection.loc[_drug_id]

In [54]:
drug_data.head()

LV1   -0.032338
LV2    0.007960
LV3    0.009136
LV4   -0.005717
LV5   -0.023626
Name: DB00627, dtype: float64

In [55]:
_tmp = (-1.0 * drug_data.dot(module_tissue_data)).sort_values(ascending=False)
display(_tmp)

I9_IHD-Ischaemic_heart_disease_wide_definition                           0.041041
6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above    0.033455
I9_CORATHER-Coronary_atherosclerosis                                     0.025396
I70-Diagnoses_main_ICD10_I70_Atherosclerosis                             0.023000
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease             0.021002
20002_1473-Noncancer_illness_code_selfreported_high_cholesterol          0.018520
CARDIoGRAM_C4D_CAD_ADDITIVE                                              0.016894
Name: DB00627, dtype: float64

In [56]:
# The product below pairs drug and trait values by position (raw arrays), so
# both objects must list the LVs in exactly the same order; fail loudly
# instead of silently mixing up LVs.
assert drug_data.index.equals(
    module_tissue_data.index
), "LVs in drug_data and module_tissue_data are not aligned"

# Per-LV contribution to each trait's prediction: the negated product of the
# drug and trait LV values (positive when they have opposite signs).
drug_trait_predictions = pd.DataFrame(
    -1.0 * (drug_data.to_frame().values * module_tissue_data.values),
    columns=module_tissue_data.columns.copy(),
    index=drug_data.index.copy(),
)

In [57]:
drug_trait_predictions.shape

(987, 7)

In [58]:
drug_trait_predictions.head()

Unnamed: 0,I70-Diagnoses_main_ICD10_I70_Atherosclerosis,CARDIoGRAM_C4D_CAD_ADDITIVE,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,20002_1473-Noncancer_illness_code_selfreported_high_cholesterol,6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above,I9_CORATHER-Coronary_atherosclerosis,I9_IHD-Ischaemic_heart_disease_wide_definition
LV1,0.000185,-8.2e-05,0.000104,0.000879,-0.000323,-0.001347,-0.001105
LV2,2.3e-05,0.000326,-7.6e-05,-0.000248,5e-06,0.000423,0.000206
LV3,-2.5e-05,-0.000308,5.4e-05,0.000143,-0.000273,0.000171,0.000187
LV4,8.8e-05,-0.0001,2.6e-05,1e-06,-0.00042,0.000128,-0.000265
LV5,0.000657,0.000395,-0.000215,-0.000885,0.000756,-0.000726,-0.000366


In [59]:
# For every trait, keep the LVs whose positive contribution is above the
# QUANTILE quantile of that trait's positive contributions.
common_lvs = []

for trait_code in drug_trait_predictions.columns:
    trait_info = Trait.get_trait(full_code=trait_code)
    display(f"Name: {trait_info.description}")
    display(f"Sample size: {(trait_info.n, trait_info.n_cases)}")

    lv_contributions = drug_trait_predictions[trait_code]

    # only positive contributions are considered when computing the threshold
    positive_contributions = lv_contributions[lv_contributions > 0.0]
    threshold = positive_contributions.quantile(QUANTILE)
    top_contributions = positive_contributions[positive_contributions > threshold]
    display(f"Number of LVs: {top_contributions.shape[0]}")

    top_lvs = (
        top_contributions.sort_values(ascending=False)
        .rename("lv_diff")
        .reset_index()
        .rename(columns={"index": "lv"})
        .assign(trait=trait_code)
    )
    common_lvs.append(top_lvs)

    display(top_lvs.head(20))
    print()

'Name: Diagnoses - main ICD10: I70 Atherosclerosis'

'Sample size: (361194, 566.0)'

'Number of LVs: 25'

Unnamed: 0,lv,lv_diff,trait
0,LV116,0.00618,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
1,LV931,0.004843,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
2,LV744,0.002292,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
3,LV697,0.002231,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
4,LV885,0.001984,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
5,LV536,0.001956,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
6,LV550,0.001933,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
7,LV220,0.001822,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
8,LV272,0.001785,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
9,LV739,0.00177,I70-Diagnoses_main_ICD10_I70_Atherosclerosis





'Name: Coronary Artery Disease'

'Sample size: (184305, 60801.0)'

'Number of LVs: 25'

Unnamed: 0,lv,lv_diff,trait
0,LV536,0.005978,CARDIoGRAM_C4D_CAD_ADDITIVE
1,LV74,0.004517,CARDIoGRAM_C4D_CAD_ADDITIVE
2,LV879,0.003454,CARDIoGRAM_C4D_CAD_ADDITIVE
3,LV38,0.002886,CARDIoGRAM_C4D_CAD_ADDITIVE
4,LV841,0.002169,CARDIoGRAM_C4D_CAD_ADDITIVE
5,LV530,0.001887,CARDIoGRAM_C4D_CAD_ADDITIVE
6,LV163,0.0016,CARDIoGRAM_C4D_CAD_ADDITIVE
7,LV678,0.001565,CARDIoGRAM_C4D_CAD_ADDITIVE
8,LV824,0.001456,CARDIoGRAM_C4D_CAD_ADDITIVE
9,LV58,0.001449,CARDIoGRAM_C4D_CAD_ADDITIVE





'Name: Diagnoses - main ICD10: I25 Chronic ischaemic heart disease'

'Sample size: (361194, 12769.0)'

'Number of LVs: 25'

Unnamed: 0,lv,lv_diff,trait
0,LV881,0.013063,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
1,LV163,0.005463,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
2,LV116,0.005052,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
3,LV227,0.002697,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
4,LV66,0.002403,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
5,LV442,0.002145,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
6,LV906,0.001888,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
7,LV175,0.001865,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
8,LV965,0.001627,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
9,LV395,0.001531,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...





'Name: Non-cancer illness code, self-reported: high cholesterol'

'Sample size: (361141, 43957.0)'

'Number of LVs: 25'

Unnamed: 0,lv,lv_diff,trait
0,LV227,0.002637,20002_1473-Noncancer_illness_code_selfreported...
1,LV97,0.002382,20002_1473-Noncancer_illness_code_selfreported...
2,LV547,0.002326,20002_1473-Noncancer_illness_code_selfreported...
3,LV95,0.002202,20002_1473-Noncancer_illness_code_selfreported...
4,LV530,0.00218,20002_1473-Noncancer_illness_code_selfreported...
5,LV170,0.00196,20002_1473-Noncancer_illness_code_selfreported...
6,LV525,0.001934,20002_1473-Noncancer_illness_code_selfreported...
7,LV246,0.001847,20002_1473-Noncancer_illness_code_selfreported...
8,LV738,0.001776,20002_1473-Noncancer_illness_code_selfreported...
9,LV940,0.00154,20002_1473-Noncancer_illness_code_selfreported...





'Name: Vascular/heart problems diagnosed by doctor: None of the above'

'Sample size: (360420, 253565.0)'

'Number of LVs: 26'

Unnamed: 0,lv,lv_diff,trait
0,LV743,0.006904,6150_100-Vascularheart_problems_diagnosed_by_d...
1,LV969,0.004285,6150_100-Vascularheart_problems_diagnosed_by_d...
2,LV575,0.004086,6150_100-Vascularheart_problems_diagnosed_by_d...
3,LV829,0.003023,6150_100-Vascularheart_problems_diagnosed_by_d...
4,LV707,0.002743,6150_100-Vascularheart_problems_diagnosed_by_d...
5,LV840,0.002685,6150_100-Vascularheart_problems_diagnosed_by_d...
6,LV931,0.002528,6150_100-Vascularheart_problems_diagnosed_by_d...
7,LV254,0.002311,6150_100-Vascularheart_problems_diagnosed_by_d...
8,LV885,0.00225,6150_100-Vascularheart_problems_diagnosed_by_d...
9,LV656,0.002098,6150_100-Vascularheart_problems_diagnosed_by_d...





'Name: Coronary atherosclerosis'

'Sample size: (361194, 14334.0)'

'Number of LVs: 25'

Unnamed: 0,lv,lv_diff,trait
0,LV881,0.01567,I9_CORATHER-Coronary_atherosclerosis
1,LV670,0.002761,I9_CORATHER-Coronary_atherosclerosis
2,LV824,0.00273,I9_CORATHER-Coronary_atherosclerosis
3,LV142,0.0026,I9_CORATHER-Coronary_atherosclerosis
4,LV227,0.002379,I9_CORATHER-Coronary_atherosclerosis
5,LV707,0.002191,I9_CORATHER-Coronary_atherosclerosis
6,LV530,0.002034,I9_CORATHER-Coronary_atherosclerosis
7,LV841,0.001971,I9_CORATHER-Coronary_atherosclerosis
8,LV254,0.001843,I9_CORATHER-Coronary_atherosclerosis
9,LV97,0.00181,I9_CORATHER-Coronary_atherosclerosis





'Name: Ischaemic heart disease, wide definition'

'Sample size: (361194, 20857.0)'

'Number of LVs: 26'

Unnamed: 0,lv,lv_diff,trait
0,LV881,0.007724,I9_IHD-Ischaemic_heart_disease_wide_definition
1,LV97,0.004228,I9_IHD-Ischaemic_heart_disease_wide_definition
2,LV931,0.003626,I9_IHD-Ischaemic_heart_disease_wide_definition
3,LV509,0.00336,I9_IHD-Ischaemic_heart_disease_wide_definition
4,LV395,0.002838,I9_IHD-Ischaemic_heart_disease_wide_definition
5,LV479,0.001992,I9_IHD-Ischaemic_heart_disease_wide_definition
6,LV502,0.001909,I9_IHD-Ischaemic_heart_disease_wide_definition
7,LV536,0.001816,I9_IHD-Ischaemic_heart_disease_wide_definition
8,LV220,0.001572,I9_IHD-Ischaemic_heart_disease_wide_definition
9,LV351,0.001518,I9_IHD-Ischaemic_heart_disease_wide_definition





# Get common LVs

In [60]:
common_lvs_df = pd.concat(common_lvs)  # .rename(columns={"index": "lv", 0: "value"})

In [61]:
common_lvs_df.shape

(177, 3)

In [62]:
common_lvs_df.head()

Unnamed: 0,lv,lv_diff,trait
0,LV116,0.00618,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
1,LV931,0.004843,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
2,LV744,0.002292,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
3,LV697,0.002231,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
4,LV885,0.001984,I70-Diagnoses_main_ICD10_I70_Atherosclerosis


In [63]:
# number of rows (one per trait) in which each LV was selected, largest first
_rows_per_lv = common_lvs_df.groupby("lv")["lv_diff"].count()
lvs_by_count = _rows_per_lv.squeeze().sort_values(ascending=False)
display(lvs_by_count.head(25))

lv
LV97     5
LV707    4
LV227    4
LV116    4
LV678    4
LV824    4
LV163    4
LV885    4
LV246    3
LV21     3
LV840    3
LV841    3
LV354    3
LV254    3
LV881    3
LV536    3
LV530    3
LV931    3
LV95     3
LV170    3
LV395    2
LV38     2
LV525    2
LV587    2
LV59     2
Name: lv_diff, dtype: int64

In [64]:
lvs_sel = []

In [65]:
# rows of LV116, largest contribution first; displayed without truncation
lv_df = common_lvs_df.loc[common_lvs_df["lv"] == "LV116"].sort_values(
    by="lv_diff", ascending=False
)

_untruncated_display = (
    "display.max_rows",
    None,
    "display.max_columns",
    None,
    "display.max_colwidth",
    None,
)
with pd.option_context(*_untruncated_display):
    display(lv_df)

lvs_sel.append(lv_df)

Unnamed: 0,lv,lv_diff,trait
0,LV116,0.00618,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
2,LV116,0.005052,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease
22,LV116,0.001646,6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above
12,LV116,0.001423,I9_IHD-Ischaemic_heart_disease_wide_definition


In [66]:
# rows of LV931, largest contribution first
_is_lv931 = common_lvs_df["lv"] == "LV931"
lv_df = common_lvs_df.loc[_is_lv931].sort_values(by="lv_diff", ascending=False)
display(lv_df)
lvs_sel.append(lv_df)

Unnamed: 0,lv,lv_diff,trait
1,LV931,0.004843,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
2,LV931,0.003626,I9_IHD-Ischaemic_heart_disease_wide_definition
6,LV931,0.002528,6150_100-Vascularheart_problems_diagnosed_by_d...


In [67]:
# rows of LV246, largest contribution first
_is_lv246 = common_lvs_df["lv"] == "LV246"
lv_df = common_lvs_df.loc[_is_lv246].sort_values(by="lv_diff", ascending=False)
display(lv_df)
lvs_sel.append(lv_df)

Unnamed: 0,lv,lv_diff,trait
7,LV246,0.001847,20002_1473-Noncancer_illness_code_selfreported...
15,LV246,0.001268,I9_IHD-Ischaemic_heart_disease_wide_definition
16,LV246,0.001253,I70-Diagnoses_main_ICD10_I70_Atherosclerosis


In [68]:
lv_df = pd.concat(lvs_sel, ignore_index=True)
display(lv_df.head())

Unnamed: 0,lv,lv_diff,trait
0,LV116,0.00618,I70-Diagnoses_main_ICD10_I70_Atherosclerosis
1,LV116,0.005052,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
2,LV116,0.001646,6150_100-Vascularheart_problems_diagnosed_by_d...
3,LV116,0.001423,I9_IHD-Ischaemic_heart_disease_wide_definition
4,LV931,0.004843,I70-Diagnoses_main_ICD10_I70_Atherosclerosis


In [69]:
from traits import SHORT_TRAIT_NAMES

In [70]:
def get_trait_objs(phenotype_full_code):
    """Return the trait objects for a phenotype code, ordered by case fraction.

    When `Trait.is_efo_label` accepts the code, the traits come from
    `Trait.get_traits_from_efo`; otherwise the list holds the single trait
    returned by `Trait.get_trait`. The list is sorted by `n_cases / n`,
    largest first.
    """
    traits = (
        Trait.get_traits_from_efo(phenotype_full_code)
        if Trait.is_efo_label(phenotype_full_code)
        else [Trait.get_trait(full_code=phenotype_full_code)]
    )

    def _case_fraction(trait):
        return trait.n_cases / trait.n

    # sort by proportion of cases (not by raw sample size)
    return sorted(traits, key=_case_fraction, reverse=True)


def get_trait_description(phenotype_full_code):
    """Return the description of the first trait found for a phenotype code.

    The short name in `SHORT_TRAIT_NAMES` is returned when the description has
    an entry there; otherwise the description itself is returned.
    """
    first_trait = get_trait_objs(phenotype_full_code)[0]
    description = first_trait.description

    return (
        SHORT_TRAIT_NAMES[description]
        if description in SHORT_TRAIT_NAMES
        else description
    )


def get_trait_n(phenotype_full_code):
    """Return the `n` attribute of the first trait found for a phenotype code."""
    first_trait = get_trait_objs(phenotype_full_code)[0]
    return first_trait.n


def get_trait_n_cases(phenotype_full_code):
    """Return the `n_cases` attribute of the first trait found for a phenotype code."""
    first_trait = get_trait_objs(phenotype_full_code)[0]
    return first_trait.n_cases


def num_to_int_str(num):
    """Format a number with no decimals and a thousands separator.

    Null values (as detected by `pd.isnull`) give an empty string.
    """
    return "" if pd.isnull(num) else format(num, ",.0f")


def get_part_clust(row):
    """Return the `part_k` and `cluster_id` attributes of `row` joined by " / "."""
    return "{} / {}".format(row.part_k, row.cluster_id)

In [71]:
lv_df = lv_df.assign(trait_desc=lv_df["trait"].apply(get_trait_description))

In [72]:
lv_df = lv_df.assign(n=lv_df["trait"].apply(get_trait_n))

In [73]:
lv_df = lv_df.assign(n_cases=lv_df["trait"].apply(get_trait_n_cases))

In [74]:
lv_df = lv_df.assign(n=lv_df["n"].apply(num_to_int_str))

In [75]:
lv_df = lv_df.assign(n_cases=lv_df["n_cases"].apply(num_to_int_str))

In [76]:
CELL_TYPES_LVS = {
    "LV246": "Adipose tissue, liver",
    "LV116": "Immune cells, skin",
    "LV931": "Immune cells",
}

In [77]:
lv_df["Cell type"] = lv_df["lv"].apply(lambda x: CELL_TYPES_LVS[x])

In [78]:
lv_df["Niacin effect"] = lv_df["lv"].apply(
    lambda x: "-" if drug_data.loc[x] < 0 else "+"
)

In [79]:
lv_df = lv_df.rename(
    columns={
        "lv": "LV",
        "trait_desc": "Disease",
        "n": "Sample size",
        "n_cases": "Cases",
    }
)

In [80]:
lv_df[["LV", "Cell type", "Disease", "Sample size", "Cases"]]

Unnamed: 0,LV,Cell type,Disease,Sample size,Cases
0,LV116,"Immune cells, skin",Atherosclerosis (ICD10 I70),361194,566
1,LV116,"Immune cells, skin",Chronic ischaemic heart disease (ICD10 I25),361194,12769
2,LV116,"Immune cells, skin","Heart attack, angina, stroke or hypertension",360420,253565
3,LV116,"Immune cells, skin",Ischaemic heart disease (wide definition),361194,20857
4,LV931,Immune cells,Atherosclerosis (ICD10 I70),361194,566
5,LV931,Immune cells,Ischaemic heart disease (wide definition),361194,20857
6,LV931,Immune cells,"Heart attack, angina, stroke or hypertension",360420,253565
7,LV246,"Adipose tissue, liver",High cholesterol (self-reported),361141,43957
8,LV246,"Adipose tissue, liver",Ischaemic heart disease (wide definition),361194,20857
9,LV246,"Adipose tissue, liver",Atherosclerosis (ICD10 I70),361194,566


In [81]:
lv_df = lv_df[["LV", "Cell type", "Disease"]]

In [82]:
lv_df = (
    lv_df.sort_values(["LV", "Disease"])
    .set_index("LV")
    .loc[["LV116", "LV931", "LV246"]]
    .reset_index()
)

In [83]:
# Blank the LV and cell type labels on every row except the first one of each
# LV so each label is shown once in the table. The rows are found from the
# repeated LV values instead of hard-coded positions, so the cell keeps
# working if the selected LVs or traits change.
_repeated_lv_rows = lv_df["LV"].duplicated()
lv_df.loc[_repeated_lv_rows, ["LV", "Cell type"]] = ""

In [84]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_df)

Unnamed: 0,LV,Cell type,Disease
0,LV116,"Immune cells, skin",Atherosclerosis (ICD10 I70)
1,,,Chronic ischaemic heart disease (ICD10 I25)
2,,,"Heart attack, angina, stroke or hypertension"
3,,,Ischaemic heart disease (wide definition)
4,LV931,Immune cells,Atherosclerosis (ICD10 I70)
5,,,"Heart attack, angina, stroke or hypertension"
6,,,Ischaemic heart disease (wide definition)
7,LV246,"Adipose tissue, liver",Atherosclerosis (ICD10 I70)
8,,,High cholesterol (self-reported)
9,,,Ischaemic heart disease (wide definition)


## Save table

In [85]:
# template of the HTML comment marks that delimit the table in the output
# file; `position` is formatted below as either "start" or "end"
LV_FILE_MARK_TEMPLATE = "<!-- niacin:cardiovascular:top_lvs:{position} -->"

In [88]:
# start
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(position="start")
display(lv_file_mark_start)

# end
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(position="end")
display(lv_file_mark_end)

'<!-- niacin:cardiovascular:top_lvs:start -->'

'<!-- niacin:cardiovascular:top_lvs:end -->'

In [89]:
new_content = lv_df.to_markdown(index=False, disable_numparse=True)

In [92]:
# new table wrapped by the start and end marks, one piece per line
full_new_content = "\n".join(
    (lv_file_mark_start, new_content.strip(), lv_file_mark_end)
)

In [93]:
# current content of the manuscript file that will receive the table
file_content = OUTPUT_FILE_PATH.read_text(encoding="utf8")

In [94]:
# Replace everything from the start mark to the end mark (both included) with
# the new table.
# - The marks are escaped so they are always matched literally.
# - The replacement is given as a function: a replacement *string* would be
#   parsed for backslash escapes, which corrupts (or rejects) table content
#   containing backslashes such as LaTeX commands.
_marked_section = re.compile(
    re.escape(lv_file_mark_start) + ".*?" + re.escape(lv_file_mark_end),
    flags=re.DOTALL,
)
new_file_content, _n_replaced = _marked_section.subn(
    lambda _match: full_new_content, file_content
)

# without the marks nothing is replaced and the table would be silently lost
assert _n_replaced > 0, f"Table marks not found in {OUTPUT_FILE_PATH}"

In [95]:
# overwrite the manuscript file with the updated content
with OUTPUT_FILE_PATH.open("w", encoding="utf8") as manuscript_file:
    manuscript_file.write(new_file_content)

# Niacin top LVs

In [96]:
drug_data.abs().sort_values(ascending=False).head(30)

LV881    0.154953
LV116    0.097558
LV743    0.097299
LV38     0.088232
LV841    0.086833
LV163    0.080580
LV161    0.070249
LV931    0.069832
LV9      0.069249
LV536    0.068436
LV50     0.067657
LV707    0.067467
LV142    0.064834
LV95     0.061820
LV97     0.061123
LV74     0.056454
LV66     0.055342
LV974    0.053342
LV678    0.051296
LV879    0.050419
LV969    0.047919
LV27     0.047240
LV509    0.046626
LV272    0.046522
LV905    0.045610
LV254    0.044915
LV720    0.043503
LV587    0.043327
LV608    0.043223
LV246    0.042009
Name: DB00627, dtype: float64

In [97]:
drug_data.sort_values(ascending=False).head(30)

LV881    0.154953
LV116    0.097558
LV743    0.097299
LV38     0.088232
LV974    0.053342
LV678    0.051296
LV879    0.050419
LV272    0.046522
LV254    0.044915
LV720    0.043503
LV587    0.043327
LV20     0.040827
LV840    0.039595
LV507    0.037171
LV684    0.034499
LV530    0.033392
LV55     0.033224
LV667    0.032972
LV885    0.032274
LV660    0.031523
LV762    0.031066
LV207    0.031020
LV598    0.030998
LV829    0.030331
LV420    0.029630
LV333    0.029385
LV940    0.028526
LV819    0.028512
LV175    0.028100
LV470    0.028095
Name: DB00627, dtype: float64

In [98]:
drug_data.sort_values(ascending=True).head(30)

LV841   -0.086833
LV163   -0.080580
LV161   -0.070249
LV931   -0.069832
LV9     -0.069249
LV536   -0.068436
LV50    -0.067657
LV707   -0.067467
LV142   -0.064834
LV95    -0.061820
LV97    -0.061123
LV74    -0.056454
LV66    -0.055342
LV969   -0.047919
LV27    -0.047240
LV509   -0.046626
LV905   -0.045610
LV608   -0.043223
LV246   -0.042009
LV854   -0.040733
LV903   -0.038761
LV310   -0.037819
LV824   -0.037622
LV19    -0.036921
LV517   -0.036731
LV513   -0.036216
LV864   -0.036016
LV674   -0.035938
LV670   -0.035053
LV189   -0.034486
Name: DB00627, dtype: float64