# Description

This notebook is similar to `40` (using LVs that we found to be significantly enriched for the lipids CRISPR analysis), but traits here are from eMERGE.

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
# Number of jobs/threads taken from the project configuration; it is used in the
# next cell to cap the thread pools of the numerical libraries.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
# Directory where GLS results are written, taken from the project configuration.
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# Create the directory (and any missing parents); no error if it already exists.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

In [7]:
# Pickle file that receives both the periodic checkpoints written during the run
# and the final results table.
OUTPUT_FILENAME = OUTPUT_DIR / "gls_phenotypes-crispr_lvs-emerge.pkl"
display(OUTPUT_FILENAME)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-crispr_lvs-emerge.pkl')

# Load data

## MultiPLIER summary

In [8]:
# Load the MultiPLIER model summary from the path given in the project configuration.
# NOTE(review): pickle files execute code on load; only read files from a trusted source.
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [9]:
multiplier_model_summary.shape

(2157, 5)

In [10]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [11]:
# Keep the summary rows that pass either threshold: FDR below 0.05 or AUC of at
# least 0.75.
well_aligned_lvs = multiplier_model_summary.loc[
    multiplier_model_summary["FDR"].lt(0.05)
    | multiplier_model_summary["AUC"].ge(0.75)
]

display(well_aligned_lvs.shape, well_aligned_lvs.head())

(469, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [12]:
# Unique LV codes ("LV<index>") of the rows selected above.
well_aligned_lv_codes = {f"LV{lvi}" for lvi in well_aligned_lvs["LV index"]}

In [13]:
len(well_aligned_lv_codes)

200

In [14]:
list(well_aligned_lv_codes)[:5]

['LV123', 'LV310', 'LV931', 'LV735', 'LV39']

## eMERGE traits info

In [15]:
# FIXME: in the future, there will be a specific entry in config for the eMERGE directory that should be replaced here
# Absolute path to the eMERGE trait information file under conf.DATA_DIR; the
# phecode, phenotype and category columns are read from it in the next cell.
input_filepath = Path(
    conf.DATA_DIR,
    "emerge",
    "eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt",
).resolve()
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt')

In [16]:
# Read only the three trait-description columns from the tab-separated file.
# "phecode" is forced to str so the codes are kept as text instead of being
# inferred as numbers.
emerge_traits_df = pd.read_csv(
    input_filepath,
    sep="\t",
    dtype={"phecode": str},
    usecols=["phecode", "phenotype", "category"],
)

In [17]:
# Prefix the descriptive columns with "phecode_"; the "phecode" column keeps its name.
emerge_traits_df = emerge_traits_df.rename(
    columns={
        "phenotype": "phecode_phenotype",
        "category": "phecode_category",
    }
)

In [18]:
emerge_traits_df.shape

(309, 3)

In [19]:
emerge_traits_df.head()

Unnamed: 0,phecode,phecode_phenotype,phecode_category
0,8.0,Intestinal infection,infectious diseases
1,8.5,Bacterial enteritis,infectious diseases
2,8.52,Intestinal infection due to C. difficile,infectious diseases
3,38.0,Septicemia,infectious diseases
4,38.3,Bacteremia,infectious diseases


## eMERGE (S-MultiXcan)

In [20]:
# FIXME: in the future, there will be a specific entry in config for the eMERGE directory that should be replaced here
# Absolute path to the pickled eMERGE S-MultiXcan z-scores; this path (not the
# loaded data) is what is handed to GLSPhenoplier in the "Run" section.
emerge_smultixcan_zscores_filepath = Path(
    conf.DATA_DIR,
    "emerge",
    "gene_assoc",
    "emerge-smultixcan-mashr-zscores.pkl",
).resolve()

display(emerge_smultixcan_zscores_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/gene_assoc/emerge-smultixcan-mashr-zscores.pkl')

In [21]:
# Load the z-scores file only to inspect its shape and first rows in the next
# cells; the models below receive the file path rather than this object.
_tmp = pd.read_pickle(emerge_smultixcan_zscores_filepath)

In [22]:
_tmp.shape

(22198, 309)

In [23]:
_tmp.head()

Unnamed: 0_level_0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.043436,1.134028,1.595979,0.399451,0.7251,1.655468,1.803598,0.12525,1.053218,0.922592,...,0.364789,0.453873,0.759558,1.173671,0.298944,1.029646,0.826238,0.685711,2.069701,1.544766
ENSG00000000457,1.236752,0.119837,0.22412,0.20061,1.30419,0.541478,0.646474,0.493759,0.18968,1.563273,...,1.5195,0.20129,1.020996,0.592006,0.632661,0.995453,0.069362,0.460979,0.217046,1.350475
ENSG00000000460,0.497108,0.00569,0.31771,0.184918,1.136574,1.708245,0.288962,1.070026,0.40351,1.106321,...,0.678253,1.166204,1.138126,0.608565,0.449551,0.096689,0.887467,0.143815,0.236647,1.048859
ENSG00000000938,0.374442,1.099899,1.975795,0.107667,0.90352,0.673819,0.855749,0.499088,0.375406,0.815757,...,1.585687,0.584143,0.751384,2.75823,0.572712,0.141023,1.437022,0.288943,0.665727,1.225229
ENSG00000000971,1.108762,1.133163,0.227188,0.222948,0.072709,0.256124,0.540911,0.606947,0.229133,1.557433,...,2.349373,1.122458,0.155346,0.990123,0.326914,1.10021,1.099537,2.511358,0.950262,1.467305


## GLS results on PhenomeXcan

In [24]:
# Path to the GLS results previously computed on PhenomeXcan for the CRISPR LVs.
# NOTE: this rebinds `input_filepath`, which earlier pointed to the eMERGE traits file.
input_filepath = conf.RESULTS["GLS"] / "gls_phenotypes-crispr_lvs-phenomexcan.pkl"
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-crispr_lvs-phenomexcan.pkl')

In [25]:
# Load the PhenomeXcan GLS results; only their "lv" and "lv_set" columns are used
# later in this notebook, to pick the LVs to test on eMERGE.
gls_phenomexcan_crispr = pd.read_pickle(input_filepath)

In [26]:
gls_phenomexcan_crispr.shape

(630, 10)

In [27]:
gls_phenomexcan_crispr.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV246,lipids-increasing,True,0.036054,0.001882,0.003763,GLS Regression Res...
1,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV302,lipids-decreasing,False,-0.006249,0.699373,0.601253,GLS Regression Res...
2,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV41,lipids-decreasing,False,-0.019236,0.942844,0.114313,GLS Regression Res...
3,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV512,lipids-decreasing,False,-0.02241,0.965432,0.069135,GLS Regression Res...
4,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV520,lipids-decreasing,False,-0.011205,0.822209,0.355583,GLS Regression Res...


# Select LV from previous GLS run on PhenomeXcan

This selects the LVs enriched for the lipids CRISPR analysis.

In [28]:
# One row per distinct (lv, lv_set) combination found in the PhenomeXcan GLS
# results, re-indexed from zero.
gls_phenomexcan_lvs = gls_phenomexcan_crispr.loc[:, ["lv", "lv_set"]]
gls_phenomexcan_lvs = gls_phenomexcan_lvs.drop_duplicates().reset_index(drop=True)

In [29]:
gls_phenomexcan_lvs.shape

(15, 2)

In [30]:
gls_phenomexcan_lvs.head()

Unnamed: 0,lv,lv_set
0,LV246,lipids-increasing
1,LV302,lipids-decreasing
2,LV41,lipids-decreasing
3,LV512,lipids-decreasing
4,LV520,lipids-decreasing


# Select relevant traits from eMERGE

Here we don't have partitions/clusters as with PhenomeXcan. So for this analysis (LVs related to lipids), I select relevant categories of traits from eMERGE (we have a "category" column for these results).

In [31]:
emerge_traits_df["phecode_category"].unique()

array(['infectious diseases', 'neoplasms', 'endocrine/metabolic',
       'hematopoietic', 'mental disorders', 'neurological',
       'sense organs', 'circulatory system', 'respiratory', 'digestive',
       'genitourinary', 'dermatologic', 'musculoskeletal',
       'congenital anomalies'], dtype=object)

In [32]:
# Unique phecodes of the eMERGE traits that belong to the categories selected
# for the lipids-related analysis.
gls_traits = emerge_traits_df.loc[
    emerge_traits_df["phecode_category"].isin(
        [
            "circulatory system",
            "endocrine/metabolic",
            "neurological",
            "mental disorders",
        ]
    ),
    "phecode",
].unique()

In [33]:
gls_traits.shape

(140,)

In [34]:
gls_traits

array(['241', '241.1', '241.2', '244', '244.2', '244.4', '249', '250',
       '250.1', '250.2', '250.22', '250.24', '250.4', '250.41', '250.6',
       '260', '260.2', '261', '261.4', '269', '270', '272', '272.1',
       '272.11', '272.13', '274', '274.1', '274.11', '275', '275.3',
       '275.5', '275.53', '276', '276.1', '276.12', '276.13', '276.14',
       '276.4', '276.41', '276.5', '276.6', '277.7', '278', '278.1',
       '278.11', '279', '292', '292.3', '292.4', '327', '327.3', '327.32',
       '327.4', '327.7', '340', '345', '350.2', '356', '394', '394.2',
       '394.7', '395', '395.1', '395.2', '401', '401.1', '401.2',
       '401.21', '401.22', '411', '411.1', '411.2', '411.3', '411.4',
       '411.8', '414', '415', '415.1', '415.11', '415.2', '415.21', '416',
       '418', '418.1', '420', '420.2', '420.3', '425', '425.1', '425.2',
       '426', '426.2', '426.21', '426.24', '426.3', '426.31', '426.32',
       '427', '427.1', '427.11', '427.12', '427.2', '427.21', '427.22',
   

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [35]:
# Cross every selected LV (with its lv_set label) with every selected eMERGE
# phecode, then discard any repeated combination.
phenotypes_lvs_pairs = pd.DataFrame(
    [
        {
            "phenotype": phenotype_code,
            "lv": lv_row["lv"],
            "lv_set": lv_row["lv_set"],
        }
        for _, lv_row in gls_phenomexcan_lvs.iterrows()
        for phenotype_code in gls_traits
    ]
).drop_duplicates()

In [36]:
# Order the pairs by phenotype code (the codes are strings, so the order is
# lexicographic) and renumber the index from zero; the run loop below uses this
# index to decide when to write a checkpoint.
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values("phenotype").reset_index(
    drop=True
)

In [37]:
phenotypes_lvs_pairs.shape

(2100, 3)

In [38]:
phenotypes_lvs_pairs.head()

Unnamed: 0,phenotype,lv,lv_set
0,241,LV246,lipids-increasing
1,241,LV302,lipids-decreasing
2,241,LV512,lipids-decreasing
3,241,LV865,lipids-increasing
4,241,LV890,lipids-increasing


## Run

In [39]:
# Fit one GLS model per (phenotype, LV) pair and collect one result record per fit.
results = []

# The context manager closes the progress bar even if a fit raises or the run is
# interrupted, so a failed multi-hour run does not leave a dangling bar behind.
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for idx, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        # A fresh model is built for every pair.
        # NOTE(review): it may be possible to build it once outside the loop if
        # GLSPhenoplier keeps no per-fit state -- confirm before hoisting.
        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=emerge_smultixcan_zscores_filepath
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_set": row["lv_set"],
                # whether this LV is among the well-aligned LVs selected above
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues_onesided.loc["lv"],
                "pvalue_twosided": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # save results every 10 models trained (checkpoint keyed on the row index,
        # which runs 0..n-1 after the reset_index above)
        if (idx % 10) == 0:
            pd.DataFrame(results).to_pickle(OUTPUT_FILENAME)

        pbar.update(1)

458.9 - LV959: 100%|██████████| 2100/2100 [5:26:31<00:00,  9.33s/it]


In [40]:
# Turn the list of per-model records into a table.
# NOTE: this rebinds `results` from a list of dicts to a DataFrame.
results = pd.DataFrame(results)

In [41]:
results.shape

(2100, 8)

In [42]:
results.head()

Unnamed: 0,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
0,241,LV246,lipids-increasing,True,0.004652,0.355387,0.710773,GLS Regression Res...
1,241,LV302,lipids-decreasing,False,0.002508,0.417611,0.835222,GLS Regression Res...
2,241,LV512,lipids-decreasing,False,-0.014417,0.876977,0.246046,GLS Regression Res...
3,241,LV865,lipids-increasing,True,0.011181,0.184413,0.368827,GLS Regression Res...
4,241,LV890,lipids-increasing,False,0.026705,0.016937,0.033874,GLS Regression Res...


In [43]:
# Ten models with the smallest "pvalue" (the one-sided p-value stored by the run loop).
results.sort_values("pvalue").head(10)

Unnamed: 0,phenotype,lv,lv_set,lv_with_pathway,coef,pvalue,pvalue_twosided,summary
579,276.41,LV246,lipids-increasing,True,0.054395,8e-06,1.6e-05,GLS Regression Res...
567,276.4,LV246,lipids-increasing,True,0.053259,1.2e-05,2.4e-05,GLS Regression Res...
1543,427.21,LV512,lipids-decreasing,False,0.051893,1.2e-05,2.5e-05,GLS Regression Res...
1528,427.2,LV512,lipids-decreasing,False,0.050576,1.9e-05,3.8e-05,GLS Regression Res...
1588,427.5,LV512,lipids-decreasing,False,0.050703,2.3e-05,4.7e-05,GLS Regression Res...
656,278.1,LV702,lipids-increasing,False,0.046992,0.000107,0.000214,GLS Regression Res...
1728,429.1,LV838,lipids-decreasing,True,0.045425,0.000112,0.000223,GLS Regression Res...
1640,428.0,LV865,lipids-increasing,True,0.045718,0.00012,0.000239,GLS Regression Res...
1671,428.2,LV512,lipids-decreasing,False,0.044195,0.000195,0.000389,GLS Regression Res...
30,241.2,LV959,lipids-decreasing,False,0.043105,0.000315,0.000631,GLS Regression Res...


## Save

In [44]:
# Write the complete results table, replacing the last checkpoint saved during the run.
results.to_pickle(OUTPUT_FILENAME)