# Description

This notebook takes all results per cohort (PhenomeXcan and eMERGE), removes repeated runs, and adjust their pvalues.

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

assert OUTPUT_DIR.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

# Get results files

In [7]:
INPUT_FILES_PER_COHORT = {
    "phenomexcan": list(OUTPUT_DIR.glob("gls_phenotypes*phenomexcan*.pkl")),
    "emerge": list(OUTPUT_DIR.glob("gls_phenotypes*emerge*.pkl")),
}

In [8]:
display(INPUT_FILES_PER_COHORT)

{'phenomexcan': [PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-combined-phenomexcan.pkl'),
  PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-phenomexcan-2.pkl'),
  PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-crispr_lvs-phenomexcan.pkl'),
  PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-phenomexcan.pkl')],
 'emerge': [PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-combined-emerge.pkl'),
  PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-emerge.pkl')]}

# Combine by cohort

In [9]:
for cohort, result_files in INPUT_FILES_PER_COHORT.items():
    display(cohort)

    dfs = []
    for res in result_files:
        dfs.append(pd.read_pickle(res))

    dfs = pd.concat(dfs, ignore_index=True)

    # remove duplicate runs
    dfs = dfs.drop_duplicates(subset=["phenotype", "lv"])

    # adjust pvalues
    adj_pval = multipletests(dfs["pvalue"], alpha=0.05, method="fdr_bh")
    dfs = dfs.assign(fdr=adj_pval[1])

    # drop unneeded columns
    dfs = dfs.drop(columns=["lv_set", "lv_with_pathway", "summary"], errors="ignore")

    output_file = OUTPUT_DIR / f"gls_phenotypes-combined-{cohort}.pkl"
    display(output_file)

    dfs.to_pickle(output_file)

'phenomexcan'

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-combined-phenomexcan.pkl')

'emerge'

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-combined-emerge.pkl')