# Description

It runs GLSPhenoplier to compute an association between each selected LV and PhenomeXcan trait. Traits of interest are selected from the "complex branch" (clustering results), and LVs are those predicted (by a decision tree classifier) to be discriminative for those clusters in the "complex branch".

This notebook is the same as `30-gls_on_phenotypes-phenomexcan.ipynb`, but it includes more clusters of traits. The fact that it is separated is that we want to avoid running all again (we decided to analyze other clusters later).

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

In [7]:
OUTPUT_FILENAME = OUTPUT_DIR / "gls_phenotypes-phenomexcan-2.pkl"
display(OUTPUT_FILENAME)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-phenomexcan-2.pkl')

# Load data

## PhenomeXcan (S-MultiXcan)

In [8]:
INPUT_SUBSET = "z_score_std"

In [9]:
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [10]:
input_filepath = Path(
    conf.RESULTS["DATA_TRANSFORMATIONS_DIR"],
    INPUT_SUBSET,
    f"{INPUT_SUBSET}-{INPUT_STEM}.pkl",
).resolve()

In [11]:
data = pd.read_pickle(input_filepath)

In [12]:
data.shape

(3752, 987)

In [13]:
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.695006,1.962565,0.057683,0.878731,-0.539977,1.481272,-0.396422,1.09018,0.759223,0.931395,...,1.129784,1.752343,-1.411403,2.823863,0.931116,-1.054519,0.432982,-0.633597,0.554279,-0.642479
100002_raw-Energy,-1.528127,-0.345309,-0.148953,-0.24206,0.373427,0.791092,0.263477,0.987702,0.354391,1.416059,...,0.224604,0.769882,-0.509482,0.091153,2.286789,-1.008256,-0.029764,1.737229,-0.272107,-0.526125
100003_raw-Protein,-0.704572,-1.011299,0.67142,0.143991,0.615212,0.874212,-0.040998,0.91517,0.254369,-0.084237,...,1.003019,1.044314,-2.376108,0.004778,0.053714,-0.892447,-0.1838,1.377991,-0.278794,-0.419733
100004_raw-Fat,-0.989832,-1.87549,0.261555,-1.420719,0.366238,1.167049,0.257387,0.717674,-0.997664,0.969825,...,0.585913,0.638314,0.119139,-0.140204,1.394326,-1.173402,0.555058,1.013982,-0.544506,-0.064061
100005_raw-Carbohydrate,-0.580143,0.243335,0.158966,-0.036558,0.068176,-0.202639,1.101281,0.675227,1.463432,1.010078,...,-0.249108,-0.026814,0.232713,0.323682,1.168642,-0.282935,0.653105,1.909526,0.199997,-1.656894


## Clustering results

In [14]:
CONSENSUS_CLUSTERING_DIR = Path(
    conf.RESULTS["CLUSTERING_DIR"], "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering')

In [15]:
input_file = Path(CONSENSUS_CLUSTERING_DIR, "best_partitions_by_k.pkl").resolve()
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering/best_partitions_by_k.pkl')

In [16]:
best_partitions = pd.read_pickle(input_file)

In [17]:
best_partitions.shape

(59, 4)

In [18]:
best_partitions.head()

Unnamed: 0_level_0,method,partition,ari_median,selected
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.090117,True
22,scc_020,"[13, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18, 1...",0.0901,True
13,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.08992,True
12,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089894,True
11,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089616,True


## MultiPLIER summary

In [19]:
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [20]:
multiplier_model_summary.shape

(2157, 5)

In [21]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [22]:
well_aligned_lvs = multiplier_model_summary[
    (multiplier_model_summary["FDR"] < 0.05) | (multiplier_model_summary["AUC"] >= 0.75)
]

display(well_aligned_lvs.shape)
display(well_aligned_lvs.head())

(469, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [23]:
well_aligned_lv_codes = set([f"LV{lvi}" for lvi in well_aligned_lvs["LV index"]])

In [24]:
len(well_aligned_lv_codes)

200

In [25]:
list(well_aligned_lv_codes)[:5]

['LV7', 'LV904', 'LV928', 'LV177', 'LV827']

# Select partition / cluster pairs

In [26]:
# This dictionary specifies in the keys the partition/clusters where traits will be selected from.
# To select the LVs, we will take those LVs that are discriminative for the partition/cluster in the key,
# but we also include an additional set of partition/clusters since there is a hierarchy of clustering solutions,
# one LV in those might have not been present in the original partition/cluster tuple. This is manually inferred
# by looking at the clustering tree. For example, within the "complex branch", we have the partition/cluster tuple
# (29,11), including coronary artery disease and other traits. This tuple has a set of (at most) 20 LVs that are
# discriminative for these traits. However, at k=26, this tuple is a children of (26,13) (which is a parent of (29,16)),
# which has another set of discriminative LVs. We also take those ones for (29,11).
#
# key: a tuple (partition_k or ID, cluster_id)
# value: a list of tuples (each tuple having two elements: (partition_k or ID, cluster_id))
PHENOTYPES_LVS_CONFIG = {
    # red blood cells
    (29, 4): [],
    (29, 2): [(16, 1)],
    (29, 5): [(16, 1)],
    (29, 23): [(16, 1)],
    # platelets
    (29, 1): [],
}

In [27]:
CLUSTER_LV_DIR = conf.RESULTS["CLUSTERING_INTERPRETATION"]["BASE_DIR"] / "cluster_lvs"
assert CLUSTER_LV_DIR.exists()

display(CLUSTER_LV_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/interpretation/cluster_lvs')

In [28]:
def _get_lvs_data(part_k, cluster_idx):
    """
    For a partition/cluster pair, it returns a list of LV names that are discriminative for that cluster.
    """
    cluster_lvs = pd.read_pickle(
        CLUSTER_LV_DIR
        / f"part{part_k}"
        / f"cluster_interpreter-part{part_k}_k{cluster_idx}.pkl"
    )

    return list(cluster_lvs["name"])

In [29]:
_get_lvs_data(29, 11)[:5]

['LV140', 'LV884', 'LV727', 'LV455', 'LV136']

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

Here I get a list of phenotype/lv pairs to run GLSPhenoplier on. I do this because I don't need to train the model
for all LVs and all traits. The pairs are read from the `PHENOTYPES_LVS_CONFIG` dictionary specified before.

In [30]:
phenotypes_lvs_pairs = []

for (part_k, cluster_id), extra_for_lvs in PHENOTYPES_LVS_CONFIG.items():
    # get traits from the partition/cluster
    part = best_partitions.loc[part_k, "partition"]
    cluster_traits = data.index[part == cluster_id]

    # get first the LVs that are predictive for this partition/cluster
    # then, add extra LVs from the partition/cluster "parents" specified in
    # PHENOTYPES_LVS_CONFIG as a list of values
    lv_list = _get_lvs_data(part_k, cluster_id)

    for extra_part_k, extra_cluster_id in extra_for_lvs:
        extra_lv_list = _get_lvs_data(part_k, cluster_id)
        lv_list.extend(extra_lv_list)

    # now create the list of trait/lv pairs where GLSPhenoplier will be run on later
    for phenotype_code in cluster_traits:
        for lv_code in lv_list:
            phenotypes_lvs_pairs.append(
                {
                    "phenotype_part_k": part_k,
                    "phenotype_cluster_id": cluster_id,
                    "phenotype": phenotype_code,
                    "lv": lv_code,
                }
            )

phenotypes_lvs_pairs = pd.DataFrame(phenotypes_lvs_pairs).drop_duplicates()

In [31]:
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values("phenotype").reset_index(
    drop=True
)

In [32]:
phenotypes_lvs_pairs.shape

(400, 4)

In [33]:
phenotypes_lvs_pairs.head()

Unnamed: 0,phenotype_part_k,phenotype_cluster_id,phenotype,lv
0,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV928
1,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV895
2,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV382
3,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV677
4,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV588


## Run

In [34]:
results = []

pbar = tqdm(total=phenotypes_lvs_pairs.shape[0])

for idx, row in phenotypes_lvs_pairs.iterrows():
    phenotype_code = row["phenotype"]
    lv_code = row["lv"]

    pbar.set_description(f"{phenotype_code} - {lv_code}")

    gls_model = GLSPhenoplier(
        smultixcan_result_set_filepath=conf.PHENOMEXCAN[
            "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
        ]
    )
    gls_model.fit_named(lv_code, phenotype_code)
    res = gls_model.results

    results.append(
        {
            "part_k": row["phenotype_part_k"],
            "cluster_id": row["phenotype_cluster_id"],
            "phenotype": phenotype_code,
            "lv": lv_code,
            "lv_with_pathway": lv_code in well_aligned_lv_codes,
            "coef": res.params.loc["lv"],
            "pvalue": res.pvalues_onesided.loc["lv"],
            "pvalue_twosided": res.pvalues.loc["lv"],
            "summary": gls_model.results_summary,
        }
    )

    # save results every 10 models trained
    if (idx % 10) == 0:
        pd.DataFrame(results).to_pickle(OUTPUT_FILENAME)

    pbar.update(1)

pbar.close()

reticulocyte count - LV30: 100%|██████████| 400/400 [1:02:37<00:00,  9.39s/it]


In [35]:
results = pd.DataFrame(results)

In [36]:
results.shape

(400, 9)

In [37]:
results.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV928,True,0.10994,3.899674e-18,7.799348e-18,GLS Regression Res...
1,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV895,False,0.012789,0.1619516,0.3239032,GLS Regression Res...
2,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV382,False,0.025226,0.02491257,0.04982515,GLS Regression Res...
3,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV677,False,0.018216,0.07778114,0.1555623,GLS Regression Res...
4,29,5,30010_raw-Red_blood_cell_erythrocyte_count,LV588,False,0.001769,0.4446061,0.8892123,GLS Regression Res...


In [38]:
results.sort_values("pvalue").head(10)

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
231,29,4,30250_raw-Reticulocyte_count,LV584,False,0.230159,2.574853e-116,5.149705e-116,GLS Regression Res...
209,29,4,30240_raw-Reticulocyte_percentage,LV584,False,0.211679,5.2018320000000003e-101,1.0403659999999999e-100,GLS Regression Res...
398,29,4,reticulocyte count,LV584,False,0.211275,3.73366e-99,7.46732e-99,GLS Regression Res...
327,29,4,30300_raw-High_light_scatter_reticulocyte_count,LV584,False,0.206401,7.539141e-95,1.507828e-94,GLS Regression Res...
306,29,4,30290_raw-High_light_scatter_reticulocyte_perc...,LV584,False,0.187418,4.073651e-80,8.147302e-80,GLS Regression Res...
95,29,2,30050_raw-Mean_corpuscular_haemoglobin,LV847,True,0.234947,7.614625000000001e-76,1.5229250000000001e-75,GLS Regression Res...
243,29,2,30260_raw-Mean_reticulocyte_volume,LV30,True,0.201376,2.2009590000000002e-54,4.4019180000000004e-54,GLS Regression Res...
61,29,2,30040_raw-Mean_corpuscular_volume,LV847,True,0.197762,2.466464e-53,4.932928e-53,GLS Regression Res...
276,29,2,30270_raw-Mean_sphered_cell_volume,LV928,True,0.19308,4.152256e-49,8.304513e-49,GLS Regression Res...
242,29,2,30260_raw-Mean_reticulocyte_volume,LV928,True,0.185119,8.467597e-47,1.693519e-46,GLS Regression Res...


## Save

In [39]:
results.to_pickle(OUTPUT_FILENAME)