# Description

This notebook is similar to notebooks `30` and `35`, but here I use the LVs that were found to be significantly enriched in the lipids CRISPR analysis; these may or may not coincide with the LVs used previously (those that discriminate clusters).
The traits here are from PhenomeXcan.

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
# Value of the N_JOBS setting from the project's general configuration; the
# next cell exports it through several *_NUM_THREADS environment variables.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
# Directory for GLS results, taken from the project configuration.
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# create the directory (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

In [7]:
# Pickle file that receives this notebook's results: both the periodic
# checkpoints written while the models are fitted and the final table.
OUTPUT_FILENAME = OUTPUT_DIR / "gls_phenotypes-crispr_lvs-phenomexcan.pkl"
display(OUTPUT_FILENAME)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-crispr_lvs-phenomexcan.pkl')

# Load data

## PhenomeXcan (S-MultiXcan)

In [8]:
# Used below both as the sub-directory name and as the file-name prefix of the input data.
INPUT_SUBSET = "z_score_std"

In [9]:
# File-name stem of the input data; combined with INPUT_SUBSET to build the path below.
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [10]:
# <data transformations dir>/<subset>/<subset>-<stem>.pkl, as an absolute path
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
).resolve()

In [11]:
# Load the input matrix; per the preview below, rows are traits and columns are LVs.
data = pd.read_pickle(input_filepath)

In [12]:
data.shape

(3752, 987)

In [13]:
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.695006,1.962565,0.057683,0.878731,-0.539977,1.481272,-0.396422,1.09018,0.759223,0.931395,...,1.129784,1.752343,-1.411403,2.823863,0.931116,-1.054519,0.432982,-0.633597,0.554279,-0.642479
100002_raw-Energy,-1.528127,-0.345309,-0.148953,-0.24206,0.373427,0.791092,0.263477,0.987702,0.354391,1.416059,...,0.224604,0.769882,-0.509482,0.091153,2.286789,-1.008256,-0.029764,1.737229,-0.272107,-0.526125
100003_raw-Protein,-0.704572,-1.011299,0.67142,0.143991,0.615212,0.874212,-0.040998,0.91517,0.254369,-0.084237,...,1.003019,1.044314,-2.376108,0.004778,0.053714,-0.892447,-0.1838,1.377991,-0.278794,-0.419733
100004_raw-Fat,-0.989832,-1.87549,0.261555,-1.420719,0.366238,1.167049,0.257387,0.717674,-0.997664,0.969825,...,0.585913,0.638314,0.119139,-0.140204,1.394326,-1.173402,0.555058,1.013982,-0.544506,-0.064061
100005_raw-Carbohydrate,-0.580143,0.243335,0.158966,-0.036558,0.068176,-0.202639,1.101281,0.675227,1.463432,1.010078,...,-0.249108,-0.026814,0.232713,0.323682,1.168642,-0.282935,0.653105,1.909526,0.199997,-1.656894


## Clustering results

In [14]:
# Absolute path of the "consensus_clustering" folder inside the clustering results dir.
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering')

In [15]:
# Absolute path of the partitions file inside the consensus clustering directory.
input_file = (CONSENSUS_CLUSTERING_DIR / "best_partitions_by_k.pkl").resolve()
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering/best_partitions_by_k.pkl')

In [16]:
# Table indexed by k; its "partition" column is used later to pick the traits of a cluster.
best_partitions = pd.read_pickle(input_file)

In [17]:
best_partitions.shape

(59, 4)

In [18]:
best_partitions.head()

Unnamed: 0_level_0,method,partition,ari_median,selected
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.090117,True
22,scc_020,"[13, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18, 1...",0.0901,True
13,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.08992,True
12,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089894,True
11,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089616,True


## MultiPLIER summary

In [19]:
# Per the preview below: one row per (pathway, LV index) with AUC, p-value and FDR.
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [20]:
multiplier_model_summary.shape

(2157, 5)

In [21]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [22]:
# Keep the pathway/LV rows that pass EITHER criterion: FDR below 0.05 or AUC of
# at least 0.75 (a row needs only one of the two to be retained).
well_aligned_lvs = multiplier_model_summary.loc[
    lambda summary: (summary["FDR"] < 0.05) | (summary["AUC"] >= 0.75)
]

display(well_aligned_lvs.shape, well_aligned_lvs.head())

(469, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [23]:
# Unique LV codes ("LV<index>") that have at least one retained pathway row.
well_aligned_lv_codes = {
    f"LV{lv_index}" for lv_index in well_aligned_lvs["LV index"]
}

In [24]:
len(well_aligned_lv_codes)

200

In [25]:
list(well_aligned_lv_codes)[:5]

['LV750', 'LV976', 'LV746', 'LV68', 'LV921']

# Select LVs from CRISPR analysis

In [26]:
# FIXME: there will be a specific folder for crispr analysis in the future, that should be replaced here
# Tab-separated fgsea results for all LVs.
deg_enrich = pd.read_csv(
    (
        Path(conf.RESULTS["BASE_DIR"]) / "crispr_analyses" / "fgsea-all_lvs.tsv"
    ).resolve(),
    sep="\t",
)

In [27]:
deg_enrich.shape

(1973, 11)

In [28]:
deg_enrich.head()

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr
0,gene_set_decrease,0.198801,0.251748,0.09168,0.578418,1.123767,35,"PTBP1, KEAP1, PEX14, DLST, PCYT2, MAD2L2, GLRX...",LV1,2,0.755621
1,gene_set_increase,0.251748,0.251748,0.078711,0.539155,1.078565,63,"CHERP, RANGAP1, HNRNPL, RPS2, E4F1, TAF1C, GAT...",LV1,2,0.779947
2,gene_set_decrease,0.963037,0.999001,0.008938,0.434799,0.678421,20,"VDR, ACVR1B, KEAP1, NDUFV2, PEX14, NDUFS3, WDR26",LV10,5,1.0
3,gene_set_increase,1.0,1.0,0.0,0.236619,0.378164,25,"USP39, SRP19, ZNF3, RPL18, RPS19, RAP1GDS1, CH...",LV10,2,1.0
4,gene_set_decrease,0.042957,0.085914,0.216543,0.831205,1.711885,27,"NDUFB7, PTBP1, RRAGC, PPP2R2B, SQLE",LV100,6,0.718678


In [29]:
# For every (lv, pathway) group, the row label holding the largest adjusted p-value.
deg_enrich_max_idx = deg_enrich.groupby(["lv", "pathway"])["padj"].idxmax()

In [30]:
# Keep only the selected row of each (lv, pathway) group and renumber the rows.
# NOTE: this rebinds `deg_enrich`, so the unfiltered table is no longer
# available after this cell runs.
deg_enrich = deg_enrich.loc[deg_enrich_max_idx].reset_index(drop=True)
display(deg_enrich.shape)
display(deg_enrich.head())

(1973, 11)

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr
0,gene_set_decrease,0.198801,0.251748,0.09168,0.578418,1.123767,35,"PTBP1, KEAP1, PEX14, DLST, PCYT2, MAD2L2, GLRX...",LV1,2,0.755621
1,gene_set_increase,0.251748,0.251748,0.078711,0.539155,1.078565,63,"CHERP, RANGAP1, HNRNPL, RPS2, E4F1, TAF1C, GAT...",LV1,2,0.779947
2,gene_set_decrease,0.963037,0.999001,0.008938,0.434799,0.678421,20,"VDR, ACVR1B, KEAP1, NDUFV2, PEX14, NDUFS3, WDR26",LV10,5,1.0
3,gene_set_increase,1.0,1.0,0.0,0.236619,0.378164,25,"USP39, SRP19, ZNF3, RPL18, RPS19, RAP1GDS1, CH...",LV10,2,1.0
4,gene_set_decrease,0.042957,0.085914,0.216543,0.831205,1.711885,27,"NDUFB7, PTBP1, RRAGC, PPP2R2B, SQLE",LV100,6,0.718678


## Lipids-increasing gene sets

In [31]:
# Rows of the "gene_set_increase" pathway with padj < 0.05, most significant first.
deg_increase = deg_enrich.loc[
    lambda enrich: (enrich["pathway"] == "gene_set_increase")
    & (enrich["padj"] < 0.05)
].sort_values("padj")

In [32]:
deg_increase.shape

(27, 11)

In [33]:
deg_increase.head()

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr
1350,gene_set_increase,1.738856e-07,3.477712e-07,0.690132,0.905734,1.565818,49,"RPS14, RPL31, RPS19, RPS11, RPS6, RPL37, RPSA,...",LV707,3,0.000257
1790,gene_set_increase,4.293645e-05,8.587291e-05,0.557332,0.732205,1.486815,60,"RPS6, RPL6, RPLP0, RPL19, RPL31, RPL7, RPS13, ...",LV905,6,0.028238
1812,gene_set_increase,6.373604e-05,0.0001274721,0.538434,0.627667,1.444149,83,"SAFB, LUC7L3, HSP90B1, CHD4, SNRPD3, ISY1, DKC...",LV915,4,0.031438
1446,gene_set_increase,0.0005632729,0.001126546,0.477271,0.803565,1.288959,61,"RPL34, RPL35A, RPL31, RPS13, RPS6, RPL6, RPL7,...",LV750,8,0.222267
539,gene_set_increase,0.0007830825,0.001566165,0.477271,0.971974,1.978913,33,"ZNF3, MDM2, RPS6",LV341,9,0.257504


In [34]:
# Distinct LV codes among the selected rows, in order of first appearance.
lvs_increase = deg_increase["lv"].unique()

In [35]:
lvs_increase.shape

(27,)

In [36]:
lvs_increase

array(['LV707', 'LV905', 'LV915', 'LV750', 'LV341', 'LV310', 'LV48',
       'LV509', 'LV467', 'LV64', 'LV490', 'LV550', 'LV621', 'LV775',
       'LV415', 'LV504', 'LV507', 'LV494', 'LV399', 'LV246', 'LV120',
       'LV122', 'LV515', 'LV489', 'LV783', 'LV768', 'LV577'], dtype=object)

## Lipids-decreasing gene sets

In [37]:
# Rows of the "gene_set_decrease" pathway with padj < 0.05, most significant first.
deg_decrease = deg_enrich.loc[
    lambda enrich: (enrich["pathway"] == "gene_set_decrease")
    & (enrich["padj"] < 0.05)
].sort_values("padj")

In [38]:
deg_decrease.shape

(24, 11)

In [39]:
deg_decrease.head()

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr
1283,gene_set_decrease,2.605291e-07,5.210582e-07,0.674963,0.858542,1.636153,43,"NDUFA4, COX6A1, ATP5O, NDUFB10, COX5A, NDUFS3,...",LV678,3,0.000257
1199,gene_set_decrease,0.00120574,0.002411481,0.45506,0.753059,1.430708,35,"PEX14, COX17, NDUFS3, ATP5O, COX6A1, DLST, NDU...",LV64,2,0.339847
1769,gene_set_decrease,0.001610715,0.00322143,0.45506,0.807492,1.596196,25,"NDUFA4, PEX14, STX18, OGDH, COX6A1, COX17, PIK...",LV897,3,0.397243
1055,gene_set_decrease,0.004275828,0.008551656,0.407018,0.897219,1.742375,23,"CSK, MRPS12",LV575,4,0.718678
1413,gene_set_decrease,0.00576304,0.01152608,0.407018,0.936411,1.741728,19,"COX7C, NDUFS6, FLCN, COX5A, NDUFS2",LV736,3,0.718678


In [40]:
# Distinct LV codes among the selected rows, in order of first appearance.
lvs_decrease = deg_decrease["lv"].unique()

In [41]:
lvs_decrease.shape

(24,)

In [42]:
lvs_decrease

array(['LV678', 'LV64', 'LV897', 'LV575', 'LV736', 'LV157', 'LV421',
       'LV469', 'LV250', 'LV420', 'LV558', 'LV630', 'LV750', 'LV99',
       'LV676', 'LV775', 'LV131', 'LV199', 'LV896', 'LV783', 'LV467',
       'LV498', 'LV515', 'LV343'], dtype=object)

## Merge into one dataframe

In [43]:
# One two-column frame per LV set: the LV code plus a constant label naming the set.
_tmp0 = pd.DataFrame({"lv": lvs_increase}).assign(lv_set="lipids-increasing")

_tmp1 = pd.DataFrame({"lv": lvs_decrease}).assign(lv_set="lipids-decreasing")

In [44]:
# Stack both LV sets into one table with a fresh 0..n-1 index. An LV that
# belongs to both sets appears twice, once per `lv_set` label.
gls_selected_lvs = pd.concat([_tmp0, _tmp1], ignore_index=True)

In [45]:
gls_selected_lvs.shape

(51, 2)

In [46]:
gls_selected_lvs.head()

Unnamed: 0,lv,lv_set
0,LV707,lipids-increasing
1,LV905,lipids-increasing
2,LV915,lipids-increasing
3,LV750,lipids-increasing
4,LV341,lipids-increasing


# Select traits from specific partition/cluster

For this run on the LVs related to the lipids CRISPR analysis, I'm only interested in the main clusters of the cardiovascular sub-branch.

In [47]:
# Each entry is a (k, cluster id) pair: `k` selects a row of `best_partitions`
# and `cluster id` selects the traits assigned to that cluster in that partition.
PHENOTYPES_CONFIG = [
    # cardiovascular
    (29, 14),
    (29, 16),
    (29, 11),
    (29, 21),
    (29, 17),
]

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [48]:
# Cross every trait of each configured (partition k, cluster id) with every
# selected LV. Iteration order is: configured clusters, then the traits of the
# cluster, then the rows of `gls_selected_lvs`. Exact duplicate rows are dropped.
phenotypes_lvs_pairs = pd.DataFrame(
    [
        {
            "phenotype_part_k": part_k,
            "phenotype_cluster_id": cluster_id,
            "phenotype": phenotype_code,
            "lv": lv_code,
            "lv_set": lv_set,
        }
        for part_k, cluster_id in PHENOTYPES_CONFIG
        # traits whose label in the partition for this k equals the cluster id
        for phenotype_code in data.index[
            best_partitions.loc[part_k, "partition"] == cluster_id
        ]
        for lv_code, lv_set in zip(
            gls_selected_lvs["lv"], gls_selected_lvs["lv_set"]
        )
    ]
).drop_duplicates()

In [49]:
# Order the pairs by trait and then LV, and renumber rows 0..n-1; the run loop
# below uses this index to decide when to write a checkpoint.
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values(
    ["phenotype", "lv"]
).reset_index(drop=True)

In [50]:
phenotypes_lvs_pairs.shape

(2142, 5)

In [51]:
phenotypes_lvs_pairs.head()

Unnamed: 0,phenotype_part_k,phenotype_cluster_id,phenotype,lv,lv_set
0,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV120,lipids-increasing
1,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV122,lipids-increasing
2,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV131,lipids-decreasing
3,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV157,lipids-decreasing
4,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV199,lipids-decreasing


## Run

In [52]:
results = []

# Some LVs belong to both the lipids-increasing and the lipids-decreasing sets,
# so the same (phenotype, lv) pair appears more than once in
# `phenotypes_lvs_pairs`, differing only in `lv_set`. The model is fitted from
# the LV and phenotype codes alone, so each pair is fitted once and its
# statistics are reused for the repeated rows instead of refitting.
fitted_stats = {}

# the context manager closes the progress bar even if a model fit raises
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for idx, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        pair_key = (phenotype_code, lv_code)
        if pair_key not in fitted_stats:
            gls_model = GLSPhenoplier(
                smultixcan_result_set_filepath=conf.PHENOMEXCAN[
                    "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
                ]
            )
            gls_model.fit_named(lv_code, phenotype_code)
            res = gls_model.results

            fitted_stats[pair_key] = {
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues_onesided.loc["lv"],
                "pvalue_twosided": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }

        # row-specific fields first, then the (possibly reused) model statistics,
        # so the column order of the output table is unchanged
        results.append(
            {
                "part_k": row["phenotype_part_k"],
                "cluster_id": row["phenotype_cluster_id"],
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_set": row["lv_set"],
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                **fitted_stats[pair_key],
            }
        )

        # save results every 10 models trained
        if (idx % 10) == 0:
            pd.DataFrame(results).to_pickle(OUTPUT_FILENAME)

        pbar.update(1)

schizophrenia - LV99: 100%|██████████| 2142/2142 [5:41:31<00:00,  9.57s/it]


In [53]:
# Turn the accumulated list of per-model dicts into a table.
# NOTE: `results` is rebound from a list to a DataFrame here.
results = pd.DataFrame(results)

In [54]:
results.shape

(2142, 10)

In [55]:
results.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV120,lipids-increasing,False,0.011685,0.172618,0.345237,GLS Regression Res...
1,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV122,lipids-increasing,False,-0.00437,0.63817,0.723659,GLS Regression Res...
2,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV131,lipids-decreasing,False,-0.000511,0.51659,0.966819,GLS Regression Res...
3,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV157,lipids-decreasing,False,0.010968,0.176044,0.352087,GLS Regression Res...
4,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV199,lipids-decreasing,False,-0.009505,0.7824,0.435201,GLS Regression Res...


In [56]:
results.sort_values("pvalue").head(10)

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
1076,29,16,MAGNETIC_CH2.DB.ratio,LV246,lipids-increasing,True,0.095366,2.183932e-14,4.367864e-14,GLS Regression Res...
2100,29,21,schizophrenia,LV343,lipids-decreasing,False,0.070336,1.388092e-09,2.776184e-09,GLS Regression Res...
1841,29,17,hypercholesterolemia,LV246,lipids-increasing,True,0.066665,3.672563e-08,7.345126e-08,GLS Regression Res...
484,29,17,6153_2-Medication_for_cholesterol_blood_pressu...,LV515,lipids-increasing,False,0.058867,3.298919e-06,6.597838e-06,GLS Regression Res...
485,29,17,6153_2-Medication_for_cholesterol_blood_pressu...,LV515,lipids-decreasing,False,0.058867,3.298919e-06,6.597838e-06,GLS Regression Res...
1990,29,21,intracranial volume measurement,LV122,lipids-increasing,False,0.053315,7.984697e-06,1.596939e-05,GLS Regression Res...
77,29,17,20003_1141194794-Treatmentmedication_code_bend...,LV515,lipids-decreasing,False,0.055429,1.028495e-05,2.05699e-05,GLS Regression Res...
76,29,17,20003_1141194794-Treatmentmedication_code_bend...,LV515,lipids-increasing,False,0.055429,1.028495e-05,2.05699e-05,GLS Regression Res...
1127,29,16,MAGNETIC_HDL.C,LV246,lipids-increasing,True,0.050872,2.096086e-05,4.192172e-05,GLS Regression Res...
362,29,17,6153_1-Medication_for_cholesterol_blood_pressu...,LV246,lipids-increasing,True,0.050766,2.17941e-05,4.358819e-05,GLS Regression Res...


## Save

In [57]:
# Write the complete table, overwriting the last checkpoint saved during the run.
results.to_pickle(OUTPUT_FILENAME)