# Description

This notebook is similar to `40` (using LVs that we found to be significantly enriched for the lipids CRISPR analysis), but traits here are from eMERGE.

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
# maximum number of eMERGE traits (ranked by projected value) taken for each LV
N_TOP_TRAITS_FROM_LV = 20

In [7]:
# directory for GLS results, taken from the project configuration
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# create the directory (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

In [8]:
# pickle file where the GLS results computed in this notebook are written
OUTPUT_FILENAME = OUTPUT_DIR / "gls_phenotypes-crispr_lvs-emerge.pkl"
display(OUTPUT_FILENAME)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-crispr_lvs-emerge.pkl')

# Load data

## MultiPLIER summary

In [9]:
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [10]:
multiplier_model_summary.shape

(2157, 5)

In [11]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [12]:
# keep LV/pathway rows where either criterion holds:
# FDR below 0.05, or AUC of at least 0.75
well_aligned_lvs = multiplier_model_summary.loc[
    lambda summary: summary["FDR"].lt(0.05) | summary["AUC"].ge(0.75)
]

display(well_aligned_lvs.shape)
display(well_aligned_lvs.head())

(469, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [13]:
# unique LV codes (e.g. "LV42") built from the "LV index" column
well_aligned_lv_codes = {f"LV{lvi}" for lvi in well_aligned_lvs["LV index"]}

In [14]:
len(well_aligned_lv_codes)

200

In [15]:
list(well_aligned_lv_codes)[:5]

['LV42', 'LV707', 'LV34', 'LV898', 'LV735']

## eMERGE traits info

In [16]:
# FIXME: in the future, there will be a specific entry in config for the eMERGE directory that should be replaced here
# file with the eMERGE phecode dictionary (phecode, phenotype and category columns are read from it)
input_filepath = (
    Path(conf.DATA_DIR)
    / "emerge"
    / "eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt"
).resolve()
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt')

In [17]:
# Read only the trait description columns from the tab-separated file.
# `phecode` is forced to `str` so the codes are not parsed as numbers.
emerge_traits_df = pd.read_csv(
    input_filepath,
    sep="\t",
    dtype={"phecode": str},
    usecols=["phecode", "phenotype", "category"],
)

In [18]:
# prefix the descriptive columns with "phecode_"
emerge_traits_df = emerge_traits_df.rename(
    columns=dict(phenotype="phecode_phenotype", category="phecode_category")
)

In [19]:
emerge_traits_df.shape

(309, 3)

In [20]:
emerge_traits_df.head()

Unnamed: 0,phecode,phecode_phenotype,phecode_category
0,8.0,Intestinal infection,infectious diseases
1,8.5,Bacterial enteritis,infectious diseases
2,8.52,Intestinal infection due to C. difficile,infectious diseases
3,38.0,Septicemia,infectious diseases
4,38.3,Bacteremia,infectious diseases


## eMERGE (S-MultiXcan)

In [21]:
# FIXME: in the future, there will be a specific entry in config for the eMERGE directory that should be replaced here
# pickle with the S-MultiXcan z-scores for eMERGE
emerge_smultixcan_zscores_filepath = (
    Path(conf.DATA_DIR) / "emerge" / "gene_assoc" / "emerge-smultixcan-mashr-zscores.pkl"
).resolve()

display(emerge_smultixcan_zscores_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/gene_assoc/emerge-smultixcan-mashr-zscores.pkl')

In [22]:
# Loaded only to inspect its shape and first rows in the next cells; the GLS
# model below is given the file path, not this object.
# NOTE: pd.read_pickle must only be used on trusted files.
_tmp = pd.read_pickle(emerge_smultixcan_zscores_filepath)

In [23]:
_tmp.shape

(22198, 309)

In [24]:
_tmp.head()

Unnamed: 0_level_0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.043436,1.134028,1.595979,0.399451,0.7251,1.655468,1.803598,0.12525,1.053218,0.922592,...,0.364789,0.453873,0.759558,1.173671,0.298944,1.029646,0.826238,0.685711,2.069701,1.544766
ENSG00000000457,1.236752,0.119837,0.22412,0.20061,1.30419,0.541478,0.646474,0.493759,0.18968,1.563273,...,1.5195,0.20129,1.020996,0.592006,0.632661,0.995453,0.069362,0.460979,0.217046,1.350475
ENSG00000000460,0.497108,0.00569,0.31771,0.184918,1.136574,1.708245,0.288962,1.070026,0.40351,1.106321,...,0.678253,1.166204,1.138126,0.608565,0.449551,0.096689,0.887467,0.143815,0.236647,1.048859
ENSG00000000938,0.374442,1.099899,1.975795,0.107667,0.90352,0.673819,0.855749,0.499088,0.375406,0.815757,...,1.585687,0.584143,0.751384,2.75823,0.572712,0.141023,1.437022,0.288943,0.665727,1.225229
ENSG00000000971,1.108762,1.133163,0.227188,0.222948,0.072709,0.256124,0.540911,0.606947,0.229133,1.557433,...,2.349373,1.122458,0.155346,0.990123,0.326914,1.10021,1.099537,2.511358,0.950262,1.467305


## eMERGE (S-MultiXcan) projection

In [25]:
# pickle with the eMERGE S-MultiXcan z-scores projection (rows are LVs, columns are traits)
input_filepath = (
    Path(conf.RESULTS["PROJECTIONS_DIR"])
    / "projection-emerge-smultixcan-mashr-zscores.pkl"
).resolve()
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-emerge-smultixcan-mashr-zscores.pkl')

In [26]:
emerge_projection = pd.read_pickle(input_filepath)

In [27]:
emerge_projection.shape

(987, 309)

In [28]:
emerge_projection.head()

Unnamed: 0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
LV1,0.055748,-0.019274,0.024491,0.012926,-0.026501,-0.043919,0.025264,-0.053251,0.043269,0.002465,...,-0.031739,0.014192,0.00871,-0.024261,-0.00478,0.057256,0.059702,0.036648,-0.02204,0.034427
LV2,-0.04563,-0.00857,0.040969,-0.016198,-0.032321,-0.017629,0.009238,-0.011634,-0.005854,0.033497,...,0.019678,0.040569,-0.009344,-0.012921,-0.055292,0.003193,0.013844,0.008327,0.004302,0.035154
LV3,0.025341,-0.059962,-0.027354,-0.033864,0.009571,0.023318,-0.003872,0.001095,0.028179,0.003639,...,0.041022,-0.036422,0.006088,-0.041302,0.017492,-0.039591,-0.000801,0.009127,0.045741,0.004434
LV4,-0.039736,0.013238,-0.002819,0.029253,-0.029388,-0.003176,0.037212,-0.02237,0.003168,0.039998,...,-0.029197,0.078351,0.044476,-0.0273,0.005332,0.028581,0.004288,0.015797,-0.074106,-0.020936
LV5,-0.01225,-0.002846,-0.025161,0.01388,0.009457,0.024885,0.001274,0.064774,0.039294,-0.000256,...,0.032792,-0.035128,-0.011787,-0.006511,-0.035433,-0.008735,-0.011139,-0.036651,0.003044,0.006505


## GLS results on PhenomeXcan

In [29]:
input_filepath = conf.RESULTS["GLS"] / "gls_phenotypes-crispr_lvs-phenomexcan.pkl"
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-crispr_lvs-phenomexcan.pkl')

In [30]:
gls_phenomexcan_crispr = pd.read_pickle(input_filepath)

In [31]:
gls_phenomexcan_crispr.shape

(300, 8)

In [32]:
gls_phenomexcan_crispr.head()

Unnamed: 0,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,100007_raw-Polyunsaturated_fat,LV890,lipids-increasing,False,0.044519,0.000228,0.000455,GLS Regression Res...
1,100360-Decaffeinated_coffee,LV520,lipids-decreasing,False,0.041107,0.000391,0.000782,GLS Regression Res...
2,102280-Milk_chocolate_intake,LV702,lipids-increasing,False,0.043164,0.000335,0.00067,GLS Regression Res...
3,102_raw-Pulse_rate_automated_reading,LV246,lipids-increasing,True,0.037216,0.001377,0.002754,GLS Regression Res...
4,102_raw-Pulse_rate_automated_reading,LV801,lipids-decreasing,False,0.048699,6.6e-05,0.000131,GLS Regression Res...


# Select LV from previous GLS run on PhenomeXcan

This selects the LVs enriched for the lipids CRISPR analysis.

In [33]:
# unique (lv, lv_set) combinations from the PhenomeXcan GLS results, renumbered from 0
gls_phenomexcan_lvs = gls_phenomexcan_crispr.loc[:, ["lv", "lv_set"]]
gls_phenomexcan_lvs = gls_phenomexcan_lvs.drop_duplicates().reset_index(drop=True)

In [34]:
gls_phenomexcan_lvs.shape

(15, 2)

In [35]:
gls_phenomexcan_lvs.head()

Unnamed: 0,lv,lv_set
0,LV890,lipids-increasing
1,LV520,lipids-decreasing
2,LV702,lipids-increasing
3,LV246,lipids-increasing
4,LV801,lipids-decreasing


# Select relevant traits from eMERGE

~Here we don't have partitions/clusters as with PhenomeXcan. So for this analysis (LVs related to lipids), I select relevant categories of traits from eMERGE (we have a "category" column for these results).~

In [36]:
# emerge_traits_df["phecode_category"].unique()

In [37]:
# gls_traits = emerge_traits_df[
#     emerge_traits_df["phecode_category"].isin(
#         [
#             "circulatory system",
#             "endocrine/metabolic",
#             "neurological",
#             "mental disorders",
#         ]
#     )
# ]["phecode"].unique()

In [38]:
# gls_traits.shape

In [39]:
# gls_traits

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [40]:
phenotypes_lvs_pairs = []

# For each LV selected from the PhenomeXcan GLS run, take the eMERGE traits
# with the largest positive projected values (at most `N_TOP_TRAITS_FROM_LV`).
for _, row in gls_phenomexcan_lvs.iterrows():
    lv_name = row["lv"]
    lv_set = row["lv_set"]

    # projected values of this LV across all eMERGE traits
    lv_traits = emerge_projection.loc[lv_name]
    # only traits with a strictly positive value are candidates
    lv_traits = lv_traits[lv_traits > 0.0]
    lv_traits = lv_traits.sort_values(ascending=False).head(N_TOP_TRAITS_FROM_LV)

    # Iterate in ranked order instead of through a `set`: set iteration order
    # for strings can change between interpreter runs, which made the row
    # order of the resulting table non-reproducible. Repeated rows, if any,
    # are removed by `drop_duplicates` below.
    for phenotype_code in lv_traits.index:
        phenotypes_lvs_pairs.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_name,
                "lv_set": lv_set,
            }
        )

phenotypes_lvs_pairs = pd.DataFrame(phenotypes_lvs_pairs).drop_duplicates()

In [41]:
# order the pairs by phenotype code and renumber the rows from 0
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values(by="phenotype")
phenotypes_lvs_pairs = phenotypes_lvs_pairs.reset_index(drop=True)

In [42]:
phenotypes_lvs_pairs.shape

(300, 3)

In [43]:
phenotypes_lvs_pairs.head()

Unnamed: 0,phenotype,lv,lv_set
0,8.0,LV74,lipids-increasing
1,8.5,LV959,lipids-decreasing
2,38.0,LV612,lipids-decreasing
3,53.0,LV841,lipids-increasing
4,70.0,LV838,lipids-decreasing


## Run

In [44]:
results = []

# Fit one GLS model per phenotype/LV pair and collect its statistics.
# The context manager closes the progress bar even if a model fit raises.
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for _, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        # a new model object is created for every pair
        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=emerge_smultixcan_zscores_filepath
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_set": row["lv_set"],
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues_onesided.loc["lv"],
                "pvalue_twosided": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # Save partial results every 10 models trained. Counting the fitted
        # models (rather than testing the row label) does not depend on the
        # index of `phenotypes_lvs_pairs` being 0..n-1.
        if len(results) % 10 == 0:
            pd.DataFrame(results).to_pickle(OUTPUT_FILENAME)

        pbar.update(1)

747.11 - LV41: 100%|██████████| 300/300 [46:13<00:00,  9.25s/it]


In [45]:
results = pd.DataFrame(results)

In [46]:
results.shape

(300, 8)

In [47]:
results.head()

Unnamed: 0,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,8.0,LV74,lipids-increasing,True,0.019816,0.056707,0.113414,GLS Regression Res...
1,8.5,LV959,lipids-decreasing,False,0.026731,0.016509,0.033018,GLS Regression Res...
2,38.0,LV612,lipids-decreasing,True,0.028221,0.013968,0.027937,GLS Regression Res...
3,53.0,LV841,lipids-increasing,False,0.019209,0.064738,0.129477,GLS Regression Res...
4,70.0,LV838,lipids-decreasing,True,0.016759,0.087191,0.174381,GLS Regression Res...


In [48]:
results.sort_values("pvalue").head(10)

Unnamed: 0,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
46,276.41,LV246,lipids-increasing,True,0.054395,8e-06,1.6e-05,GLS Regression Res...
45,276.4,LV246,lipids-increasing,True,0.053259,1.2e-05,2.4e-05,GLS Regression Res...
141,427.21,LV512,lipids-decreasing,False,0.051893,1.2e-05,2.5e-05,GLS Regression Res...
139,427.2,LV512,lipids-decreasing,False,0.050576,1.9e-05,3.8e-05,GLS Regression Res...
146,427.5,LV512,lipids-decreasing,False,0.050703,2.3e-05,4.7e-05,GLS Regression Res...
51,278.1,LV702,lipids-increasing,False,0.046992,0.000107,0.000214,GLS Regression Res...
162,429.1,LV838,lipids-decreasing,True,0.045425,0.000112,0.000223,GLS Regression Res...
291,743.11,LV702,lipids-increasing,False,0.044913,0.000189,0.000378,GLS Regression Res...
154,428.2,LV512,lipids-decreasing,False,0.044195,0.000195,0.000389,GLS Regression Res...
257,694.0,LV959,lipids-decreasing,False,0.042986,0.000298,0.000595,GLS Regression Res...


## Save

In [49]:
results.to_pickle(OUTPUT_FILENAME)