# Description

This notebook is similar to `30-gls_on_phenotypes...` but for eMERGE S-MultiXcan results instead of PhenomeXcan.

Since we don't have partitions/clusters of eMERGE results, we select the same set of LVs used in the PhenomeXcan run (`30-gls_on_phenotypes...`). Regarding traits, we take the top 20 traits from each LV (those with the largest positive projection values) to create the list of trait/LV pairs to run GLSPhenoplier on.

# Environment variables

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# re-imported automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
# Number of parallel jobs, read from the project configuration; it is used
# below to cap the thread pools of the numerical libraries.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [None]:
# How many of the top-ranked eMERGE traits to take from each LV when building
# the list of trait/LV pairs to run the GLS model on.
N_TOP_TRAITS_FROM_LV = 20

In [6]:
# Folder where the GLS results of this notebook are written; it is created
# (together with any missing parents) if it does not exist yet.
OUTPUT_DIR = conf.RESULTS["GLS"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

# Load data

## eMERGE traits info

In [7]:
# FIXME: replace with a dedicated eMERGE directory entry from `conf` once it exists
input_filepath = (
    Path(conf.DATA_DIR)
    / "emerge"
    / "eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt"
).resolve()
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt')

In [8]:
# Tab-separated file; only three columns are loaded and phecodes are read as
# strings rather than being parsed as numbers.
emerge_traits_df = pd.read_csv(
    input_filepath,
    usecols=["phecode", "phenotype", "category"],
    dtype={"phecode": str},
    sep="\t",
)

In [9]:
# Prefix the descriptive columns with "phecode_".
emerge_traits_df = emerge_traits_df.rename(
    columns=dict(
        phenotype="phecode_phenotype",
        category="phecode_category",
    )
)

In [10]:
# Sanity check: (rows, columns) of the traits table.
emerge_traits_df.shape

(309, 3)

In [11]:
# Preview the first rows of the traits table.
emerge_traits_df.head()

Unnamed: 0,phecode,phecode_phenotype,phecode_category
0,8.0,Intestinal infection,infectious diseases
1,8.5,Bacterial enteritis,infectious diseases
2,8.52,Intestinal infection due to C. difficile,infectious diseases
3,38.0,Septicemia,infectious diseases
4,38.3,Bacteremia,infectious diseases


## eMERGE to PhenomeXcan maps

In [12]:
# FIXME: replace with a dedicated eMERGE directory entry from `conf` once it exists
emerge_phenomexcan_maps_filepath = (
    Path(conf.DATA_DIR) / "emerge" / "phecodes_phenomexcan_maps.tsv"
).resolve()
display(emerge_phenomexcan_maps_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/phecodes_phenomexcan_maps.tsv')

In [13]:
# Tab-separated mapping file; phecodes are read as strings.
emerge_phenomexcan_maps = pd.read_csv(
    emerge_phenomexcan_maps_filepath,
    dtype={"phecode": str},
    sep="\t",
)

In [14]:
# Discard mapping rows that lack either the phecode or the PhenomeXcan trait.
emerge_phenomexcan_maps = emerge_phenomexcan_maps.dropna(
    how="any",
    subset=["phecode", "phenomexcan"],
)

In [15]:
# Sanity check: (rows, columns) left after dropping incomplete mappings.
emerge_phenomexcan_maps.shape

(381, 6)

In [16]:
# Preview the first rows of the phecode/PhenomeXcan mapping.
emerge_phenomexcan_maps.head()

Unnamed: 0,phecode,phecode_phenotype,phecode_category,icd10,efo,phenomexcan
0,8.0,Intestinal infection,infectious diseases,A09,gastroenteritis AND gastroenteritis,A09-Diagnoses_main_ICD10_A09_Diarrhoea_and_gas...
5,8.5,Bacterial enteritis,infectious diseases,A04,intestinal disease AND bacterial disease,A04-Diagnoses_main_ICD10_A04_Other_bacterial_i...
6,38.0,Septicemia,infectious diseases,A41,sepsis,20002_1657-Noncancer_illness_code_selfreported...
7,38.0,Septicemia,infectious diseases,A41,sepsis,A41-Diagnoses_main_ICD10_A41_Other_septicaemia
10,53.0,Herpes zoster,infectious diseases,B02,herpes zoster,B02-Diagnoses_main_ICD10_B02_Zoster_herpes_zoster


## eMERGE (S-MultiXcan) projection

In [17]:
# Location of the eMERGE S-MultiXcan projection file inside the projections folder.
input_filepath = (
    Path(conf.RESULTS["PROJECTIONS_DIR"])
    / "projection-emerge-smultixcan-mashr-zscores.pkl"
).resolve()
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-emerge-smultixcan-mashr-zscores.pkl')

In [18]:
# Projection table; it is later indexed by LV code (`.loc[lv]`) to obtain one
# value per eMERGE trait.
emerge_projection = pd.read_pickle(input_filepath)

In [19]:
# Sanity check: (rows, columns) of the projection table.
emerge_projection.shape

(987, 309)

In [20]:
# Preview the first rows of the projection table.
emerge_projection.head()

Unnamed: 0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
LV1,0.055748,-0.019274,0.024491,0.012926,-0.026501,-0.043919,0.025264,-0.053251,0.043269,0.002465,...,-0.031739,0.014192,0.00871,-0.024261,-0.00478,0.057256,0.059702,0.036648,-0.02204,0.034427
LV2,-0.04563,-0.00857,0.040969,-0.016198,-0.032321,-0.017629,0.009238,-0.011634,-0.005854,0.033497,...,0.019678,0.040569,-0.009344,-0.012921,-0.055292,0.003193,0.013844,0.008327,0.004302,0.035154
LV3,0.025341,-0.059962,-0.027354,-0.033864,0.009571,0.023318,-0.003872,0.001095,0.028179,0.003639,...,0.041022,-0.036422,0.006088,-0.041302,0.017492,-0.039591,-0.000801,0.009127,0.045741,0.004434
LV4,-0.039736,0.013238,-0.002819,0.029253,-0.029388,-0.003176,0.037212,-0.02237,0.003168,0.039998,...,-0.029197,0.078351,0.044476,-0.0273,0.005332,0.028581,0.004288,0.015797,-0.074106,-0.020936
LV5,-0.01225,-0.002846,-0.025161,0.01388,0.009457,0.024885,0.001274,0.064774,0.039294,-0.000256,...,0.032792,-0.035128,-0.011787,-0.006511,-0.035433,-0.008735,-0.011139,-0.036651,0.003044,0.006505


## eMERGE (S-MultiXcan)

In [21]:
# FIXME: replace with a dedicated eMERGE directory entry from `conf` once it exists
emerge_smultixcan_zscores_filepath = (
    Path(conf.DATA_DIR)
    / "emerge"
    / "gene_assoc"
    / "emerge-smultixcan-mashr-zscores.pkl"
).resolve()

display(emerge_smultixcan_zscores_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/gene_assoc/emerge-smultixcan-mashr-zscores.pkl')

In [22]:
# Loaded only to inspect the file contents; GLSPhenoplier is later given the
# file path, not this in-memory copy.
_tmp = pd.read_pickle(emerge_smultixcan_zscores_filepath)

In [23]:
# Sanity check: (rows, columns) of the S-MultiXcan z-scores table.
_tmp.shape

(22198, 309)

In [24]:
# Preview the first rows of the S-MultiXcan z-scores table.
_tmp.head()

Unnamed: 0_level_0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.043436,1.134028,1.595979,0.399451,0.7251,1.655468,1.803598,0.12525,1.053218,0.922592,...,0.364789,0.453873,0.759558,1.173671,0.298944,1.029646,0.826238,0.685711,2.069701,1.544766
ENSG00000000457,1.236752,0.119837,0.22412,0.20061,1.30419,0.541478,0.646474,0.493759,0.18968,1.563273,...,1.5195,0.20129,1.020996,0.592006,0.632661,0.995453,0.069362,0.460979,0.217046,1.350475
ENSG00000000460,0.497108,0.00569,0.31771,0.184918,1.136574,1.708245,0.288962,1.070026,0.40351,1.106321,...,0.678253,1.166204,1.138126,0.608565,0.449551,0.096689,0.887467,0.143815,0.236647,1.048859
ENSG00000000938,0.374442,1.099899,1.975795,0.107667,0.90352,0.673819,0.855749,0.499088,0.375406,0.815757,...,1.585687,0.584143,0.751384,2.75823,0.572712,0.141023,1.437022,0.288943,0.665727,1.225229
ENSG00000000971,1.108762,1.133163,0.227188,0.222948,0.072709,0.256124,0.540911,0.606947,0.229133,1.557433,...,2.349373,1.122458,0.155346,0.990123,0.326914,1.10021,1.099537,2.511358,0.950262,1.467305


## GLS results on PhenomeXcan

Read results obtained with `30-gls_on_phenotypes.ipynb` (PhenomeXcan)

In [25]:
# Results file of the GLS run on PhenomeXcan, stored in the GLS results folder.
input_filepath = Path(conf.RESULTS["GLS"], "gls_phenotypes.pkl")
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes.pkl')

In [26]:
# GLS results from the PhenomeXcan run; its "lv" and "phenotype" columns are
# used below to choose which LVs and traits to analyze here.
gls_phenomexcan = pd.read_pickle(input_filepath)

In [27]:
# Sanity check: (rows, columns) of the PhenomeXcan GLS results.
gls_phenomexcan.shape

(2580, 8)

In [28]:
# Preview the first rows of the PhenomeXcan GLS results.
gls_phenomexcan.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,lv_with_pathway,coef,pvalue,summary
0,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV744,False,0.017663,0.166692,GLS Regression Res...
1,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV42,True,-0.013276,0.294912,GLS Regression Res...
2,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV289,False,-0.001951,0.871501,GLS Regression Res...
3,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV147,False,0.00649,0.602219,GLS Regression Res...
4,29,17,20003_1140861958-Treatmentmedication_code_simv...,LV514,False,0.025729,0.041014,GLS Regression Res...


## MultiPLIER summary

In [29]:
# MultiPLIER model summary; its "LV index", "AUC" and "FDR" columns are used
# below to flag LVs associated with pathways.
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [30]:
# Sanity check: (rows, columns) of the MultiPLIER summary.
multiplier_model_summary.shape

(2157, 5)

In [31]:
# Preview the first rows of the MultiPLIER summary.
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [32]:
# Keep the summary rows that pass either threshold: FDR below 0.05 or AUC of
# at least 0.75.
well_aligned_lvs = multiplier_model_summary.loc[
    multiplier_model_summary["FDR"].lt(0.05)
    | multiplier_model_summary["AUC"].ge(0.75)
]

display(well_aligned_lvs.shape, well_aligned_lvs.head())

(469, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [33]:
# Unique LV codes (e.g. "LV12") built from the "LV index" column of the selected rows.
well_aligned_lv_codes = {f"LV{lvi}" for lvi in well_aligned_lvs["LV index"]}

In [34]:
# Number of distinct LVs that passed the selection above.
len(well_aligned_lv_codes)

200

In [35]:
# Peek at a few codes; a set has no defined order, so which five are shown is arbitrary.
list(well_aligned_lv_codes)[:5]

['LV524', 'LV623', 'LV848', 'LV921', 'LV30']

# Select LVs from previous GLS run on PhenomeXcan

In [36]:
# Distinct LVs that appear in the PhenomeXcan GLS results; the same LVs are analyzed here.
gls_phenomexcan_lvs = gls_phenomexcan["lv"].unique()

In [37]:
# Number of distinct LVs selected.
gls_phenomexcan_lvs.shape

(190,)

In [38]:
# Show the selected LV codes.
gls_phenomexcan_lvs

array(['LV744', 'LV42', 'LV289', 'LV147', 'LV514', 'LV367', 'LV797',
       'LV91', 'LV530', 'LV257', 'LV224', 'LV575', 'LV381', 'LV455',
       'LV516', 'LV847', 'LV515', 'LV65', 'LV841', 'LV330', 'LV502',
       'LV610', 'LV342', 'LV913', 'LV568', 'LV271', 'LV605', 'LV984',
       'LV844', 'LV983', 'LV155', 'LV948', 'LV942', 'LV54', 'LV987',
       'LV57', 'LV425', 'LV864', 'LV11', 'LV453', 'LV338', 'LV301',
       'LV270', 'LV238', 'LV307', 'LV51', 'LV277', 'LV5', 'LV591',
       'LV613', 'LV856', 'LV877', 'LV403', 'LV391', 'LV954', 'LV808',
       'LV510', 'LV387', 'LV443', 'LV851', 'LV985', 'LV34', 'LV444',
       'LV705', 'LV941', 'LV853', 'LV586', 'LV335', 'LV679', 'LV363',
       'LV882', 'LV390', 'LV774', 'LV96', 'LV525', 'LV564', 'LV207',
       'LV644', 'LV294', 'LV874', 'LV359', 'LV50', 'LV418', 'LV117',
       'LV879', 'LV241', 'LV24', 'LV402', 'LV237', 'LV835', 'LV171',
       'LV452', 'LV904', 'LV918', 'LV587', 'LV94', 'LV300', 'LV132',
       'LV128', 'LV107', 'LV484', 

# Select eMERGE traits

This is an attempt to first get those eMERGE traits that correspond to the same EFO code in PhenomeXcan.
However, it turns out that there are only a few matches. That's why I'm taking the top 20 traits from each LV in addition
to these ones.

In [39]:
# Distinct traits that appear in the PhenomeXcan GLS results.
gls_phenomexcan_traits = gls_phenomexcan["phenotype"].unique()

In [40]:
# Number of distinct PhenomeXcan traits.
gls_phenomexcan_traits.shape

(129,)

In [41]:
# A mapping row is kept when either its PhenomeXcan trait code or its EFO
# label appears among the traits of the PhenomeXcan GLS run.
gls_phenomexcan_in_emerge = emerge_phenomexcan_maps.loc[
    emerge_phenomexcan_maps["phenomexcan"].isin(gls_phenomexcan_traits)
    | emerge_phenomexcan_maps["efo"].isin(gls_phenomexcan_traits)
]

In [42]:
# Show the mapping rows that matched.
gls_phenomexcan_in_emerge

Unnamed: 0,phecode,phecode_phenotype,phecode_category,icd10,efo,phenomexcan
143,411.2,Myocardial infarction,circulatory system,I21,acute myocardial infarction,I21-Diagnoses_main_ICD10_I21_Acute_myocardial_...
148,411.2,Myocardial infarction,circulatory system,I25,coronary artery disease,CARDIoGRAM_C4D_CAD_ADDITIVE
149,411.2,Myocardial infarction,circulatory system,I25,coronary artery disease,I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
289,577.0,Diseases of pancreas,digestive,K90,malabsorption syndrome,K90-Diagnoses_main_ICD10_K90_Intestinal_malabs...
329,714.0,Rheumatoid arthritis and other inflammatory po...,musculoskeletal,M06,rheumatoid arthritis,M05-Diagnoses_main_ICD10_M05_Seropositive_rheu...
330,714.0,Rheumatoid arthritis and other inflammatory po...,musculoskeletal,M06,rheumatoid arthritis,20002_1464-Noncancer_illness_code_selfreported...
331,714.0,Rheumatoid arthritis and other inflammatory po...,musculoskeletal,M06,rheumatoid arthritis,M06-Diagnoses_main_ICD10_M06_Other_rheumatoid_...
332,714.0,Rheumatoid arthritis and other inflammatory po...,musculoskeletal,M06,rheumatoid arthritis,RA_OKADA_TRANS_ETHNIC
333,714.1,Rheumatoid arthritis,musculoskeletal,M05,rheumatoid arthritis,M05-Diagnoses_main_ICD10_M05_Seropositive_rheu...
334,714.1,Rheumatoid arthritis,musculoskeletal,M05,rheumatoid arthritis,20002_1464-Noncancer_illness_code_selfreported...


In [43]:
# Distinct phecodes of the matched rows, as a plain list so it can be
# concatenated with each LV's top-trait list later on.
gls_emerge_phecodes = gls_phenomexcan_in_emerge["phecode"].unique().tolist()

In [44]:
# these are the phecodes reached through the PhenomeXcan-to-phecode mapping;
# they are added to the trait list of every LV when building the pairs below
gls_emerge_phecodes

['411.2', '577', '714', '714.1', '743.9']

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [46]:
# Build the list of phenotype/LV pairs to fit. Each LV that was run on
# PhenomeXcan is paired with:
#   * its top `N_TOP_TRAITS_FROM_LV` eMERGE traits, ranked by projection value
#     and restricted to strictly positive values, and
#   * the globally mapped phecodes (variable `gls_emerge_phecodes`).
phenotypes_lvs_pairs = []

for lv in gls_phenomexcan_lvs:
    lv_traits = emerge_projection.loc[lv]
    lv_traits = lv_traits[lv_traits > 0.0]
    lv_traits = lv_traits.sort_values(ascending=False).head(N_TOP_TRAITS_FROM_LV)

    # sorted() gives a reproducible order; iterating the bare set would yield
    # a string-hash-dependent order that can change between kernel sessions,
    # which in turn changes the row order of the results
    lv_phenotype_codes = sorted(set(lv_traits.index.tolist() + gls_emerge_phecodes))

    for phenotype_code in lv_phenotype_codes:
        phenotypes_lvs_pairs.append(
            {
                "phenotype": phenotype_code,
                "lv": lv,
            }
        )

phenotypes_lvs_pairs = pd.DataFrame(phenotypes_lvs_pairs).drop_duplicates()

In [47]:
# Order the pairs by phenotype code, then renumber the rows from zero; the
# row labels are used as the iteration counter in the run loop.
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values(by="phenotype")
phenotypes_lvs_pairs = phenotypes_lvs_pairs.reset_index(drop=True)

In [48]:
# Number of phenotype/LV pairs, i.e. the number of GLS models to fit.
phenotypes_lvs_pairs.shape

(4670, 2)

In [49]:
# Preview the first phenotype/LV pairs.
phenotypes_lvs_pairs.head()

Unnamed: 0,phenotype,lv
0,8,LV631
1,8,LV18
2,8,LV906
3,8,LV650
4,8,LV514


## Run

In [50]:
# Pickle file that receives both the periodic checkpoints and the final results.
output_file = OUTPUT_DIR / "gls_phenotypes-emerge.pkl"
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-emerge.pkl')

In [51]:
# Fit one GLSPhenoplier model per phenotype/LV pair and collect, for each one,
# the coefficient and p-value of the "lv" term plus the model summary.
results = []

# The context manager guarantees the progress bar is closed even if a model
# fit raises part-way through this long-running loop.
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for idx, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=emerge_smultixcan_zscores_filepath
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # checkpoint: save partial results every 10 models trained
        # (`idx` is the row label, which runs 0..n-1 after the index reset)
        if (idx % 10) == 0:
            pd.DataFrame(results).to_pickle(output_file)

        pbar.update(1)

796 - LV913: 100%|██████████| 4670/4670 [12:03:46<00:00,  9.30s/it]


In [52]:
# Turn the list of per-model records into a table.
# Note: this rebinds `results` from a list to a DataFrame.
results = pd.DataFrame(results)

In [53]:
# Sanity check: one row per fitted phenotype/LV pair.
results.shape

(4670, 6)

In [54]:
# Preview the first rows of the results.
results.head()

Unnamed: 0,phenotype,lv,lv_with_pathway,coef,pvalue,summary
0,8,LV631,False,0.016437,0.1736,GLS Regression Res...
1,8,LV18,True,0.024829,0.043225,GLS Regression Res...
2,8,LV906,False,0.052092,5e-06,GLS Regression Res...
3,8,LV650,False,0.026647,0.035321,GLS Regression Res...
4,8,LV514,False,0.027276,0.031988,GLS Regression Res...


In [55]:
# The ten phenotype/LV pairs with the smallest p-values.
results.sort_values("pvalue").head(10)

Unnamed: 0,phenotype,lv,lv_with_pathway,coef,pvalue,summary
440,250.1,LV54,False,0.105369,4.3693610000000004e-17,GLS Regression Res...
3494,585.32,LV45,True,0.101602,8.892851e-16,GLS Regression Res...
446,250.1,LV425,False,0.093755,4.106841e-14,GLS Regression Res...
2483,443.0,LV45,True,0.091128,5.454329e-13,GLS Regression Res...
390,244.4,LV57,True,0.089947,5.301912e-12,GLS Regression Res...
4177,714.1,LV425,False,0.086154,5.624855e-12,GLS Regression Res...
364,244.0,LV57,True,0.089541,5.949932e-12,GLS Regression Res...
699,274.1,LV45,True,0.085661,9.787409e-12,GLS Regression Res...
4192,714.1,LV844,True,0.088236,1.584024e-11,GLS Regression Res...
3493,585.32,LV847,True,0.085519,4.019723e-11,GLS Regression Res...


## Save

In [56]:
# Final write of the complete table, to the same file used for the periodic
# checkpoints during the run.
results.to_pickle(output_file)