# Description

This notebook is similar to `30` and `35`, but here I use the LVs that we found to be significantly enriched for the lipids CRISPR analysis, which might or might not coincide with the previously used LVs (those that discriminate clusters).
The traits here are from PhenomeXcan, and we select those from the main clusters found (see below) indicated in the clustering tree figure in the manuscript.

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
# Folder where the GLS results of this notebook are written; it is created
# (together with any missing parent folders) if it does not exist yet.
OUTPUT_DIR = conf.RESULTS["GLS"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

In [7]:
OUTPUT_FILENAME = OUTPUT_DIR / "gls_phenotypes-crispr_lvs-phenomexcan.pkl"
display(OUTPUT_FILENAME)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-crispr_lvs-phenomexcan.pkl')

# Load data

## PhenomeXcan (S-MultiXcan)

In [8]:
INPUT_SUBSET = "z_score_std"

In [9]:
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [10]:
# <DATA_TRANSFORMATIONS_DIR>/<subset>/<subset>-<stem>.pkl, as an absolute path
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
).resolve()

In [11]:
data = pd.read_pickle(input_filepath)

In [12]:
data.shape

(3752, 987)

In [13]:
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.695006,1.962565,0.057683,0.878731,-0.539977,1.481272,-0.396422,1.09018,0.759223,0.931395,...,1.129784,1.752343,-1.411403,2.823863,0.931116,-1.054519,0.432982,-0.633597,0.554279,-0.642479
100002_raw-Energy,-1.528127,-0.345309,-0.148953,-0.24206,0.373427,0.791092,0.263477,0.987702,0.354391,1.416059,...,0.224604,0.769882,-0.509482,0.091153,2.286789,-1.008256,-0.029764,1.737229,-0.272107,-0.526125
100003_raw-Protein,-0.704572,-1.011299,0.67142,0.143991,0.615212,0.874212,-0.040998,0.91517,0.254369,-0.084237,...,1.003019,1.044314,-2.376108,0.004778,0.053714,-0.892447,-0.1838,1.377991,-0.278794,-0.419733
100004_raw-Fat,-0.989832,-1.87549,0.261555,-1.420719,0.366238,1.167049,0.257387,0.717674,-0.997664,0.969825,...,0.585913,0.638314,0.119139,-0.140204,1.394326,-1.173402,0.555058,1.013982,-0.544506,-0.064061
100005_raw-Carbohydrate,-0.580143,0.243335,0.158966,-0.036558,0.068176,-0.202639,1.101281,0.675227,1.463432,1.010078,...,-0.249108,-0.026814,0.232713,0.323682,1.168642,-0.282935,0.653105,1.909526,0.199997,-1.656894


## Clustering results

In [14]:
# Absolute path of the folder holding the consensus clustering results
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering')

In [15]:
input_file = Path(CONSENSUS_CLUSTERING_DIR, "best_partitions_by_k.pkl").resolve()
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering/best_partitions_by_k.pkl')

In [16]:
best_partitions = pd.read_pickle(input_file)

In [17]:
best_partitions.shape

(59, 4)

In [18]:
best_partitions.head()

Unnamed: 0_level_0,method,partition,ari_median,selected
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.090117,True
22,scc_020,"[13, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18, 1...",0.0901,True
13,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.08992,True
12,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089894,True
11,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089616,True


## MultiPLIER summary

In [19]:
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [20]:
multiplier_model_summary.shape

(2157, 5)

In [21]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [22]:
# Keep the LV/pathway rows of the MultiPLIER summary with FDR below 0.05.
# Note that no threshold on AUC is applied here.
well_aligned_lvs = multiplier_model_summary.loc[
    lambda summary: summary["FDR"] < 0.05
]

display(well_aligned_lvs.shape, well_aligned_lvs.head())

(463, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [23]:
# Unique LV codes ("LV<index>") that appear in at least one row kept above
well_aligned_lv_codes = {
    f"LV{lv_index}" for lv_index in well_aligned_lvs["LV index"]
}

In [24]:
len(well_aligned_lv_codes)

199

In [25]:
list(well_aligned_lv_codes)[:5]

['LV246', 'LV750', 'LV288', 'LV637', 'LV557']

# Select LVs from CRISPR analysis

In [26]:
# Load the fgsea enrichment results of the CRISPR analysis (tab-separated).
# The folder and the file name must be joined into ONE path: handing the file
# name to `read_csv` as a second positional argument does not make it part of
# the path and makes the call fail.
deg_enrich = pd.read_csv(
    Path(
        conf.RESULTS["CRISPR_ANALYSES"]["BASE_DIR"],
        "fgsea-hi_conf-all_lvs.tsv",
    ).resolve(),
    sep="\t",
)

In [27]:
deg_enrich.shape

(1974, 11)

In [28]:
deg_enrich.head()

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr
0,gene_set_decrease,0.116883,0.233766,0.12564,0.909509,1.237006,5,"PCYT2, UBE2J2, FBXW7",LV1,8,0.967285
1,gene_set_increase,0.285714,0.285714,0.07218,0.840253,1.258282,3,"ACACA, MBTPS1",LV1,2,0.967285
2,gene_set_decrease,0.741259,0.741259,0.026956,0.751964,1.005845,5,"TCF7L2, UBE2J2, PTEN",LV10,10,0.967285
3,gene_set_increase,0.427572,0.72028,0.052805,0.81414,1.320263,3,"MBTPS1, DGAT2",LV10,4,0.967285
4,gene_set_decrease,0.908092,0.908092,0.014514,0.5404,0.778719,5,"PTEN, TCF7L2",LV100,1,0.97425


In [31]:
# Add a boolean column telling whether the LV of each row is one of the
# codes collected in `well_aligned_lv_codes`.
deg_enrich = deg_enrich.assign(
    lv_aligned=deg_enrich["lv"].isin(well_aligned_lv_codes)
)

In [32]:
deg_enrich = deg_enrich[(deg_enrich["lv_aligned"])]

In [33]:
deg_enrich.shape

(398, 12)

## Lipids-increasing gene sets

In [34]:
# Rows of the "gene_set_increase" gene set with a p-value below 0.01,
# ordered from the smallest p-value to the largest.
deg_increase = (
    deg_enrich.loc[
        (deg_enrich["pathway"] == "gene_set_increase") & (deg_enrich["pval"] < 0.01)
    ]
    .sort_values("pval")
)

In [35]:
deg_increase.shape

(4, 12)

In [36]:
deg_increase.head()

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr,lv_aligned
327,gene_set_increase,0.003533,0.007067,0.431708,0.998221,1.582398,3,"DGAT2, ACACA",LV246,6,0.967285,True
1129,gene_set_increase,0.005837,0.011675,0.407018,0.996739,1.51066,3,"ACACA, DGAT2",LV607,10,0.967285,True
1423,gene_set_increase,0.007847,0.015693,0.38073,0.990514,1.474472,3,"MBTPS1, DGAT2",LV74,6,0.967285,True
1701,gene_set_increase,0.009192,0.018383,0.38073,0.993627,1.457584,3,"ACACA, DGAT2",LV865,10,0.967285,True


In [37]:
lvs_increase = deg_increase["lv"].unique()

In [38]:
lvs_increase.shape

(4,)

In [39]:
lvs_increase

array(['LV246', 'LV607', 'LV74', 'LV865'], dtype=object)

## Lipids-decreasing gene sets

In [40]:
# Rows of the "gene_set_decrease" gene set with a p-value below 0.01,
# ordered from the smallest p-value to the largest.
deg_decrease = (
    deg_enrich.loc[
        (deg_enrich["pathway"] == "gene_set_decrease") & (deg_enrich["pval"] < 0.01)
    ]
    .sort_values("pval")
)

In [41]:
deg_decrease.shape

(2, 12)

In [42]:
deg_decrease.head()

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr,lv_aligned
1140,gene_set_decrease,0.00357,0.007141,0.431708,0.999299,1.400503,5,"PTEN, FBXW7",LV612,3,0.967285,True
1640,gene_set_decrease,0.006953,0.013906,0.407018,0.99407,1.374274,5,"UBE2J2, TCF7L2",LV838,6,0.967285,True


In [43]:
lvs_decrease = deg_decrease["lv"].unique()

In [44]:
lvs_decrease.shape

(2,)

In [45]:
lvs_decrease

array(['LV612', 'LV838'], dtype=object)

## Merge into one dataframe

In [46]:
# One two-column frame (lv, lv_set) per group of selected LVs; the `lv_set`
# label is the same for every row of a group.
_tmp0 = pd.DataFrame({"lv": lvs_increase}).assign(lv_set="lipids-increasing")
_tmp1 = pd.DataFrame({"lv": lvs_decrease}).assign(lv_set="lipids-decreasing")

In [47]:
gls_selected_lvs = pd.concat([_tmp0, _tmp1], ignore_index=True)

In [48]:
gls_selected_lvs.shape

(6, 2)

In [49]:
gls_selected_lvs.head()

Unnamed: 0,lv,lv_set
0,LV246,lipids-increasing
1,LV607,lipids-increasing
2,LV74,lipids-increasing
3,LV865,lipids-increasing
4,LV612,lipids-decreasing


# Select traits from main clusters

For this run on the LVs related to the lipids CRISPR analysis, I use the traits from all the main clusters of the selected partition (`k=29`); only the cluster labeled `0`, which groups the non-clustered traits, is excluded.

In [50]:
k = 29

In [51]:
part29 = pd.Series(best_partitions.loc[k, "partition"])

In [52]:
part29.value_counts()

0     3293
28     135
25      87
27      47
20      18
19      17
22      15
17      14
18      13
15      12
13      10
14       9
9        8
10       8
6        7
21       7
16       6
11       6
4        6
1        5
12       5
2        4
3        4
5        4
24       3
7        3
8        3
26       2
23       1
dtype: int64

In [53]:
part29_clusters = part29.unique()

In [54]:
part29_clusters

array([ 0, 22, 28,  6, 17, 13, 19, 27, 18, 20,  9,  5,  2, 23,  1, 12,  7,
        3, 24,  4, 15, 25, 10, 14, 21, 16,  8, 11, 26], dtype=int32)

In [55]:
# Number of traits assigned to any cluster other than the one labeled 0
(part29 != 0).sum()

459

In [56]:
# (partition k, cluster id) pairs to analyze. Cluster 0 holds the
# "not-clustered" traits and is the only one left out.
PHENOTYPES_CONFIG = [
    (k, cluster_id) for cluster_id in part29_clusters if cluster_id != 0
]

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [57]:
# (lv, lv_set) tuples of the selected LVs, in the row order of the frame
_lv_records = list(
    gls_selected_lvs[["lv", "lv_set"]].itertuples(index=False, name=None)
)

# Pair every trait of every configured cluster with every selected LV.
# The traits of a cluster are obtained by using the partition (one label per
# row of `data`) as a boolean mask over `data.index`.
phenotypes_lvs_pairs = pd.DataFrame(
    [
        {
            "phenotype_part_k": part_k,
            "phenotype_cluster_id": cluster_id,
            "phenotype": phenotype_code,
            "lv": lv_code,
            "lv_set": lv_set,
        }
        for part_k, cluster_id in PHENOTYPES_CONFIG
        for phenotype_code in data.index[
            best_partitions.loc[part_k, "partition"] == cluster_id
        ]
        for lv_code, lv_set in _lv_records
    ]
).drop_duplicates()

In [58]:
# Order the pairs by trait and then by LV, and renumber the rows from 0
# (the run loop uses the row label to decide when to save partial results).
phenotypes_lvs_pairs = (
    phenotypes_lvs_pairs
    .sort_values(by=["phenotype", "lv"])
    .reset_index(drop=True)
)

In [59]:
phenotypes_lvs_pairs.shape

(2754, 5)

In [60]:
phenotypes_lvs_pairs.head()

Unnamed: 0,phenotype_part_k,phenotype_cluster_id,phenotype,lv,lv_set
0,29,22,100002_raw-Energy,LV246,lipids-increasing
1,29,22,100002_raw-Energy,LV607,lipids-increasing
2,29,22,100002_raw-Energy,LV612,lipids-decreasing
3,29,22,100002_raw-Energy,LV74,lipids-increasing
4,29,22,100002_raw-Energy,LV838,lipids-decreasing


## Run

In [61]:
# Fit one GLS model per (phenotype, LV) pair and collect one record per fit.
results = []

# The progress bar is used as a context manager so it is always closed, even
# if one of the fits raises in the middle of this long run.
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for idx, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        # a fresh model object is created for every pair
        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=conf.PHENOMEXCAN[
                "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
            ]
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "part_k": row["phenotype_part_k"],
                "cluster_id": row["phenotype_cluster_id"],
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_set": row["lv_set"],
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                # "pvalue" stores the one-sided p-value of the LV term and
                # "pvalue_twosided" the two-sided one
                "pvalue": res.pvalues_onesided.loc["lv"],
                "pvalue_twosided": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # checkpoint: overwrite the output file with everything fitted so far
        # whenever the row label is a multiple of 10 (0, 10, 20, ...)
        if (idx % 10) == 0:
            pd.DataFrame(results).to_pickle(OUTPUT_FILENAME)

        pbar.update(1)

worry measurement - LV865: 100%|██████████| 2754/2754 [7:11:25<00:00,  9.40s/it]


In [62]:
results = pd.DataFrame(results)

In [63]:
results.shape

(2754, 10)

In [64]:
results.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,29,22,100002_raw-Energy,LV246,lipids-increasing,True,0.002401,0.424213,0.848425,GLS Regression Res...
1,29,22,100002_raw-Energy,LV607,lipids-increasing,True,-0.006323,0.691499,0.617002,GLS Regression Res...
2,29,22,100002_raw-Energy,LV612,lipids-decreasing,True,-0.000822,0.525786,0.948428,GLS Regression Res...
3,29,22,100002_raw-Energy,LV74,lipids-increasing,True,-0.006035,0.685313,0.629373,GLS Regression Res...
4,29,22,100002_raw-Energy,LV838,lipids-decreasing,True,0.024454,0.023446,0.046891,GLS Regression Res...


In [65]:
results.sort_values("pvalue").head(10)

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
2238,29,16,MAGNETIC_CH2.DB.ratio,LV246,lipids-increasing,True,0.095366,2.183932e-14,4.367864e-14,GLS Regression Res...
2490,29,17,hypercholesterolemia,LV246,lipids-increasing,True,0.066665,3.672563e-08,7.345126e-08,GLS Regression Res...
1362,29,18,3143_raw-Ankle_spacing_width,LV246,lipids-increasing,True,0.059177,1.331584e-06,2.663168e-06,GLS Regression Res...
1943,29,25,6160_2-Leisuresocial_activities_Pub_or_social_...,LV865,lipids-increasing,True,0.055382,5.590409e-06,1.118082e-05,GLS Regression Res...
2450,29,11,fasting blood insulin measurement,LV612,lipids-decreasing,True,0.056333,5.754859e-06,1.150972e-05,GLS Regression Res...
1452,29,18,4119_raw-Ankle_spacing_width_right,LV246,lipids-increasing,True,0.054647,6.822593e-06,1.364519e-05,GLS Regression Res...
1598,29,10,5135_raw-3mm_strong_meridian_left,LV612,lipids-decreasing,True,0.053394,1.218497e-05,2.436994e-05,GLS Regression Res...
2244,29,16,MAGNETIC_HDL.C,LV246,lipids-increasing,True,0.050872,2.096086e-05,4.192172e-05,GLS Regression Res...
1824,29,17,6153_1-Medication_for_cholesterol_blood_pressu...,LV246,lipids-increasing,True,0.050766,2.17941e-05,4.358819e-05,GLS Regression Res...
1586,29,10,5133_raw-6mm_strong_meridian_right,LV612,lipids-decreasing,True,0.051322,2.389128e-05,4.778256e-05,GLS Regression Res...


## Save

In [66]:
results.to_pickle(OUTPUT_FILENAME)