# Description

This notebook is similar to `40` (using LVs that we found to be significantly enriched for the lipids CRISPR analysis), but traits here are from eMERGE.

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

4

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=4
env: OPEN_BLAS_NUM_THREADS=4
env: NUMEXPR_NUM_THREADS=4
env: OMP_NUM_THREADS=4


# Modules

In [5]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
N_TOP_TRAITS_FROM_LV = 20

In [7]:
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/gls')

In [8]:
OUTPUT_FILENAME = OUTPUT_DIR / "gls_phenotypes-crispr_lvs-emerge.pkl"
display(OUTPUT_FILENAME)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/gls/gls_phenotypes-crispr_lvs-emerge.pkl')

# Load data

## MultiPLIER summary

In [9]:
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [10]:
multiplier_model_summary.shape

(2157, 5)

In [11]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [12]:
# Keep the pathway/LV rows that are either significant (FDR < 0.05) or have
# an AUC of at least 0.75.
well_aligned_lvs = multiplier_model_summary.loc[
    lambda summary: (summary["FDR"] < 0.05) | (summary["AUC"] >= 0.75)
]

display(well_aligned_lvs.shape, well_aligned_lvs.head())

(469, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [13]:
# Unique LV identifiers ("LV<index>") among the rows selected above.
well_aligned_lv_codes = {
    f"LV{lv_index}" for lv_index in well_aligned_lvs["LV index"]
}

In [14]:
len(well_aligned_lv_codes)

200

In [15]:
list(well_aligned_lv_codes)[:5]

['LV649', 'LV57', 'LV937', 'LV249', 'LV11']

## eMERGE traits info

In [16]:
# FIXME: in the future, there will be a specific entry in config for the eMERGE directory that should be replaced here
input_filepath = (
    Path(conf.DATA_DIR)
    / "emerge"
    / "eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt"
).resolve()
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/data/emerge/eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt')

In [17]:
# Read only the phecode and its descriptive columns from the tab-separated
# eMERGE file.
# The phecode is forced to `str`; presumably so codes keep their exact text
# (e.g. leading zeros) and match the projection's column labels -- TODO confirm.
emerge_traits_df = pd.read_csv(
    input_filepath,
    sep="\t",
    dtype={"phecode": str},
    usecols=["phecode", "phenotype", "category"],
)

In [18]:
emerge_traits_df = emerge_traits_df.rename(
    columns={
        "phenotype": "phecode_phenotype",
        "category": "phecode_category",
    }
)

In [19]:
emerge_traits_df.shape

(309, 3)

In [20]:
emerge_traits_df.head()

Unnamed: 0,phecode,phecode_phenotype,phecode_category
0,8.0,Intestinal infection,infectious diseases
1,8.5,Bacterial enteritis,infectious diseases
2,8.52,Intestinal infection due to C. difficile,infectious diseases
3,38.0,Septicemia,infectious diseases
4,38.3,Bacteremia,infectious diseases


## eMERGE (S-MultiXcan)

In [None]:
# FIXME: in the future, there will be a specific entry in config for the eMERGE directory that should be replaced here
emerge_smultixcan_zscores_filepath = (
    Path(conf.DATA_DIR)
    / "emerge"
    / "gene_assoc"
    / "emerge-smultixcan-mashr-zscores.pkl"
).resolve()

display(emerge_smultixcan_zscores_filepath)

In [None]:
_tmp = pd.read_pickle(emerge_smultixcan_zscores_filepath)

In [None]:
_tmp.shape

In [None]:
_tmp.head()

## eMERGE (S-MultiXcan) projection

In [25]:
# Location of the eMERGE S-MultiXcan z-scores projected into the LV space.
input_filepath = (
    Path(conf.RESULTS["PROJECTIONS_DIR"])
    / "projection-emerge-smultixcan-mashr-zscores.pkl"
).resolve()
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/projections/projection-emerge-smultixcan-mashr-zscores.pkl')

In [26]:
emerge_projection = pd.read_pickle(input_filepath)

In [27]:
emerge_projection.shape

(987, 309)

In [28]:
emerge_projection.head()

Unnamed: 0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
LV1,0.055748,-0.019274,0.024491,0.012926,-0.026501,-0.043919,0.025264,-0.053251,0.043269,0.002465,...,-0.031739,0.014192,0.00871,-0.024261,-0.00478,0.057256,0.059702,0.036648,-0.02204,0.034427
LV2,-0.04563,-0.00857,0.040969,-0.016198,-0.032321,-0.017629,0.009238,-0.011634,-0.005854,0.033497,...,0.019678,0.040569,-0.009344,-0.012921,-0.055292,0.003193,0.013844,0.008327,0.004302,0.035154
LV3,0.025341,-0.059962,-0.027354,-0.033864,0.009571,0.023318,-0.003872,0.001095,0.028179,0.003639,...,0.041022,-0.036422,0.006088,-0.041302,0.017492,-0.039591,-0.000801,0.009127,0.045741,0.004434
LV4,-0.039736,0.013238,-0.002819,0.029253,-0.029388,-0.003176,0.037212,-0.02237,0.003168,0.039998,...,-0.029197,0.078351,0.044476,-0.0273,0.005332,0.028581,0.004288,0.015797,-0.074106,-0.020936
LV5,-0.01225,-0.002846,-0.025161,0.01388,0.009457,0.024885,0.001274,0.064774,0.039294,-0.000256,...,0.032792,-0.035128,-0.011787,-0.006511,-0.035433,-0.008735,-0.011139,-0.036651,0.003044,0.006505


## GLS results on PhenomeXcan

In [29]:
input_filepath = conf.RESULTS["GLS"] / "gls_phenotypes-crispr_lvs-phenomexcan.pkl"
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/gls/gls_phenotypes-crispr_lvs-phenomexcan.pkl')

In [30]:
gls_phenomexcan_crispr = pd.read_pickle(input_filepath)

In [31]:
gls_phenomexcan_crispr.shape

(630, 10)

In [32]:
gls_phenomexcan_crispr.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV246,lipids-increasing,True,0.036054,0.001882,0.003763,GLS Regression Res...
1,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV302,lipids-decreasing,False,-0.006249,0.699373,0.601253,GLS Regression Res...
2,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV41,lipids-decreasing,False,-0.019236,0.942844,0.114313,GLS Regression Res...
3,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV512,lipids-decreasing,False,-0.02241,0.965432,0.069135,GLS Regression Res...
4,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV520,lipids-decreasing,False,-0.011205,0.822209,0.355583,GLS Regression Res...


# Select LV from previous GLS run on PhenomeXcan

This selects the LVs enriched for the lipids CRISPR analysis.

In [33]:
# Unique (lv, lv_set) combinations found in the PhenomeXcan GLS results.
gls_phenomexcan_lvs = (
    gls_phenomexcan_crispr.loc[:, ["lv", "lv_set"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [34]:
gls_phenomexcan_lvs.shape

(15, 2)

In [35]:
gls_phenomexcan_lvs.head()

Unnamed: 0,lv,lv_set
0,LV246,lipids-increasing
1,LV302,lipids-decreasing
2,LV41,lipids-decreasing
3,LV512,lipids-decreasing
4,LV520,lipids-decreasing


# Select relevant traits from eMERGE

~Here we don't have partitions/clusters as with PhenomeXcan. So for this analysis (LVs related to lipids), I select relevant categories of traits from eMERGE (we have a "category" column for these results).~

In [49]:
# emerge_traits_df["phecode_category"].unique()

In [37]:
# gls_traits = emerge_traits_df[
#     emerge_traits_df["phecode_category"].isin(
#         [
#             "circulatory system",
#             "endocrine/metabolic",
#             "neurological",
#             "mental disorders",
#         ]
#     )
# ]["phecode"].unique()

In [38]:
# gls_traits.shape

In [39]:
# gls_traits

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [45]:
# Build the (phenotype, LV) pairs to test: for each LV, take the top
# `N_TOP_TRAITS_FROM_LV` eMERGE traits, considering only traits with a
# positive projection value.
pair_records = []

for lv_name, lv_set in zip(
    gls_phenomexcan_lvs["lv"], gls_phenomexcan_lvs["lv_set"]
):
    lv_traits = emerge_projection.loc[lv_name]
    lv_traits = lv_traits[lv_traits > 0.0]
    lv_traits = lv_traits.sort_values(ascending=False).head(N_TOP_TRAITS_FROM_LV)

    # iterate the index directly (instead of through a `set`) so the order of
    # the records is the same on every run; duplicates are removed below
    pair_records.extend(
        {
            "phenotype": phenotype_code,
            "lv": lv_name,
            "lv_set": lv_set,
        }
        for phenotype_code in lv_traits.index
    )

# explicit columns keep the frame well-formed even when no pair was selected
phenotypes_lvs_pairs = pd.DataFrame(
    pair_records, columns=["phenotype", "lv", "lv_set"]
).drop_duplicates()

In [46]:
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values("phenotype").reset_index(
    drop=True
)

In [47]:
phenotypes_lvs_pairs.shape

(300, 3)

In [48]:
phenotypes_lvs_pairs.head()

Unnamed: 0,phenotype,lv,lv_set
0,8.0,LV74,lipids-increasing
1,8.5,LV959,lipids-decreasing
2,38.0,LV612,lipids-decreasing
3,53.0,LV841,lipids-increasing
4,70.0,LV838,lipids-decreasing


## Run

In [39]:
# Fit one GLS model per (phenotype, LV) pair and collect, for the LV term,
# its coefficient, one-sided and two-sided p-values, and the model summary.
results = []

# the context manager closes the progress bar even if a model fit raises
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for pair in phenotypes_lvs_pairs.itertuples(index=False):
        phenotype_code = pair.phenotype
        lv_code = pair.lv

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=emerge_smultixcan_zscores_filepath
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_set": pair.lv_set,
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues_onesided.loc["lv"],
                "pvalue_twosided": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # save partial results every 10 models trained; counting the fitted
        # models keeps this independent of the DataFrame index labels
        if len(results) % 10 == 0:
            pd.DataFrame(results).to_pickle(OUTPUT_FILENAME)

        pbar.update(1)

458.9 - LV959: 100%|██████████| 2100/2100 [5:26:31<00:00,  9.33s/it]


In [40]:
results = pd.DataFrame(results)

In [41]:
results.shape

(2100, 8)

In [42]:
results.head()

Unnamed: 0,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,241,LV246,lipids-increasing,True,0.004652,0.355387,0.710773,GLS Regression Res...
1,241,LV302,lipids-decreasing,False,0.002508,0.417611,0.835222,GLS Regression Res...
2,241,LV512,lipids-decreasing,False,-0.014417,0.876977,0.246046,GLS Regression Res...
3,241,LV865,lipids-increasing,True,0.011181,0.184413,0.368827,GLS Regression Res...
4,241,LV890,lipids-increasing,False,0.026705,0.016937,0.033874,GLS Regression Res...


In [43]:
results.sort_values("pvalue").head(10)

Unnamed: 0,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
579,276.41,LV246,lipids-increasing,True,0.054395,8e-06,1.6e-05,GLS Regression Res...
567,276.4,LV246,lipids-increasing,True,0.053259,1.2e-05,2.4e-05,GLS Regression Res...
1543,427.21,LV512,lipids-decreasing,False,0.051893,1.2e-05,2.5e-05,GLS Regression Res...
1528,427.2,LV512,lipids-decreasing,False,0.050576,1.9e-05,3.8e-05,GLS Regression Res...
1588,427.5,LV512,lipids-decreasing,False,0.050703,2.3e-05,4.7e-05,GLS Regression Res...
656,278.1,LV702,lipids-increasing,False,0.046992,0.000107,0.000214,GLS Regression Res...
1728,429.1,LV838,lipids-decreasing,True,0.045425,0.000112,0.000223,GLS Regression Res...
1640,428.0,LV865,lipids-increasing,True,0.045718,0.00012,0.000239,GLS Regression Res...
1671,428.2,LV512,lipids-decreasing,False,0.044195,0.000195,0.000389,GLS Regression Res...
30,241.2,LV959,lipids-decreasing,False,0.043105,0.000315,0.000631,GLS Regression Res...


## Save

In [44]:
results.to_pickle(OUTPUT_FILENAME)