# Description

* reads MultiXcan results on a random phenotype file (using Elastic Net models)
* runs PhenoPLIER on all LVs to compute the null

# Environment variables

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
# N_JOBS is the per-process thread count that the next cell exports to the
# numerical libraries through the *_NUM_THREADS environment variables.
# It is pinned to 1 (instead of conf.GENERAL["N_JOBS"]) because parallelism is
# applied at the process level: the run cell creates a ProcessPoolExecutor with
# conf.GENERAL["N_JOBS"] workers.
# N_JOBS = conf.GENERAL["N_JOBS"]
N_JOBS = 1
display(N_JOBS)

1

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=1
env: OPEN_BLAS_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


# Modules

In [5]:
import itertools
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from scipy import stats
import pandas as pd
from tqdm import tqdm

from utils import chunker
from entity import Gene
from gls import GLSPhenoplier

# Settings

In [6]:
N_SIMULATED_PHENOTYPES = 1  # disable generation of derived random phenotypes
CHUNK_SIZE = 50
EQTL_MODEL = "ELASTIC_NET"

# Paths

In [7]:
OUTPUT_DIR = conf.RESULTS["GLS"] / "null_simulations"
display(OUTPUT_DIR)

OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/opt/data/results/gls/null_simulations')

In [8]:
OUTPUT_FILENAME = OUTPUT_DIR / "en-null_simulations.pkl"
display(OUTPUT_FILENAME)

PosixPath('/opt/data/results/gls/null_simulations/en-null_simulations.pkl')

# Load data

## MultiXcan on random phenotype

This result was downloaded from the MultiXcan paper here: https://github.com/hakyimlab/multixcan-paper

In [9]:
multixcan_random_phenotype = pd.read_csv(
    conf.PHENOMEXCAN["BASE_DIR"] / "random__ccn30__mt_results.txt",
    sep="\t",
    usecols=["gene", "pvalue"],
)

In [10]:
multixcan_random_phenotype.shape

(17434, 2)

In [11]:
multixcan_random_phenotype.head()

Unnamed: 0,gene,pvalue
0,ENSG00000111215.7,0.000274
1,ENSG00000160752.10,0.00034
2,ENSG00000119950.16,0.000437
3,ENSG00000196505.6,0.000552
4,ENSG00000166762.12,0.000557


In [12]:
multixcan_random_phenotype["gene"] = multixcan_random_phenotype["gene"].str.split(
    ".", n=1, expand=True
)[0]

In [13]:
multixcan_random_phenotype = multixcan_random_phenotype.set_index("gene")

In [14]:
multixcan_random_phenotype.head()

Unnamed: 0_level_0,pvalue
gene,Unnamed: 1_level_1
ENSG00000111215,0.000274
ENSG00000160752,0.00034
ENSG00000119950,0.000437
ENSG00000196505,0.000552
ENSG00000166762,0.000557


In [15]:
assert multixcan_random_phenotype.index.is_unique

## MultiPLIER Z matrix

In [16]:
# multiplier_z_matrix = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [17]:
# multiplier_z_matrix.shape

In [18]:
# multiplier_z_matrix.head()

In [19]:
# lv_codes = list(multiplier_z_matrix.columns)
# display(lv_codes[:5])

# Preprocess MultiXcan results

## Convert gene IDs to Gene names

In [20]:
smultixcan_results = multixcan_random_phenotype.rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [21]:
smultixcan_results.shape

(17434, 1)

In [22]:
smultixcan_results.head()

Unnamed: 0_level_0,pvalue
gene,Unnamed: 1_level_1
PRR4,0.000274
FDPS,0.00034
MXI1,0.000437
GDAP2,0.000552
CATSPER2,0.000557


## Check that there are no duplicated gene entries

In [23]:
_tmp = smultixcan_results.index[smultixcan_results.index.duplicated(keep="first")]
assert _tmp.shape[0] == 0

## Convert p-values to z-scores

In [24]:
# Map each p-value to the absolute standard-normal quantile at p/2 (the usual
# two-sided p-value -> |z| conversion). The resulting z-scores are therefore
# non-negative; no effect direction is encoded.
smultixcan_results = smultixcan_results.assign(
    zscore=np.abs(stats.norm.ppf(smultixcan_results["pvalue"].to_numpy() / 2))
)

In [25]:
smultixcan_results = smultixcan_results.drop(columns="pvalue").squeeze()

In [26]:
smultixcan_results.head()

gene
PRR4        3.638969
FDPS        3.583010
MXI1        3.516956
GDAP2       3.454321
CATSPER2    3.451753
Name: zscore, dtype: float64

In [27]:
smultixcan_results.describe()

count    17434.000000
mean         0.799982
std          0.601722
min          0.000017
25%          0.319986
50%          0.677941
75%          1.151688
max          3.638969
Name: zscore, dtype: float64

## Some checks

In [28]:
# the data should have no NaN values
assert smultixcan_results.shape == smultixcan_results.dropna(how="any").shape

# GLSPhenoplier

## Identify clusters of non-related genes

In [29]:
# Gene-gene correlation matrix: first element returned by
# GLSPhenoplier._get_data. The prediction-model type comes from the
# notebook-wide EQTL_MODEL setting (instead of a hard-coded string) so that it
# cannot drift from the model type used for the LV weights and the GLS fits.
en_gene_corr = GLSPhenoplier._get_data(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"],
    model_type=EQTL_MODEL,
)[0]

In [30]:
_comm_genes = en_gene_corr.index.intersection(smultixcan_results.index)

In [31]:
en_gene_corr = en_gene_corr.loc[_comm_genes, _comm_genes]

In [32]:
en_gene_corr.shape

(6324, 6324)

In [33]:
en_gene_corr.head()

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,LDHB,LDHC,ACAP2,ACAP3,CFL2,CFL1,NFIB,PLEKHG6,GNGT2,SERPINH1
GAS6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MMP14,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.000186,0.0,0.0,0.0,0.0,0.0
DSP,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MARCKSL1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SPARC,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
from sklearn.cluster import AgglomerativeClustering

In [35]:
# Use the absolute gene-gene correlation as a dissimilarity matrix for the
# clustering below: 0 means the two genes are uncorrelated.
en_gene_dist = en_gene_corr.abs().copy()
# Zero the diagonal (self-correlation of 1.0) so each gene is at "distance" 0
# from itself.
# NOTE(review): the in-place write goes through `.values`, which assumes it is
# a view of the frame's data and not a copy -- confirm for the pandas version
# in use.
np.fill_diagonal(en_gene_dist.values, 0.0)

In [36]:
en_gene_dist

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,LDHB,LDHC,ACAP2,ACAP3,CFL2,CFL1,NFIB,PLEKHG6,GNGT2,SERPINH1
GAS6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00000
MMP14,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000186,0.00000,0.0,0.0,0.0,0.00000
DSP,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00000
MARCKSL1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00000
SPARC,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CFL1,0.0,0.0,0.0,0.0,0.0,0.002901,0.0,0.0,0.0,0.000000,...,0.000000,0.009373,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00016
NFIB,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00000
PLEKHG6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000741,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00000
GNGT2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.002965,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.00000


In [37]:
_tmp = en_gene_dist.unstack()
_tmp = _tmp[(_tmp > 0.0) & (_tmp < 1.0)]

In [38]:
_tmp.sort_values()

PADI4     POU2F1      1.016482e-09
POU2F1    PADI4       1.016482e-09
SLC6A6    PRICKLE2    1.415990e-09
PRICKLE2  SLC6A6      1.415990e-09
ESPL1     CRADD       1.634566e-09
                          ...     
CYP2D6    NAGA        7.044598e-01
ZNF589    NME6        7.159745e-01
NME6      ZNF589      7.159745e-01
SUOX      RPS26       7.884358e-01
RPS26     SUOX        7.884358e-01
Length: 1864056, dtype: float32

In [39]:
# Cluster genes on the precomputed |correlation| matrix. With complete linkage
# two groups are merged only while their largest pairwise value stays below
# `distance_threshold`; a threshold of 1e-100 therefore effectively requires
# every pair of genes inside a cluster to have exactly zero correlation.
# The number of clusters is not fixed (n_clusters=None); it follows from the
# threshold.
# NOTE(review): `affinity` was renamed to `metric` in newer scikit-learn
# releases -- confirm against the pinned version before upgrading.
ac = AgglomerativeClustering(
    n_clusters=None,
    compute_full_tree=True,
    linkage="complete",
    affinity="precomputed",
    distance_threshold=1e-100,
)

In [40]:
ac.fit(en_gene_dist)

AgglomerativeClustering(affinity='precomputed', compute_full_tree=True,
                        distance_threshold=1e-100, linkage='complete',
                        n_clusters=None)

In [41]:
gene_part = pd.Series(ac.labels_)
display(gene_part.value_counts())

141    67
285    62
244    58
236    58
98     58
       ..
388     1
384     1
365     1
377     1
391     1
Length: 393, dtype: int64

In [42]:
en_gene_dist.loc[(ac.labels_ == 141), (ac.labels_ == 141)]

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,AGRP,TRA2A,GH1,OSBP,RPLP1,SEC14L2,GNPNAT1,RPL24,TP53,MVD
GAS6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MARCKSL1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SEC14L2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GNPNAT1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RPL24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TP53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# For every cluster label (largest clusters first, as ordered by value_counts),
# keep the z-scores of the genes assigned to that cluster.
phenotype_gene_clusters = {
    label: smultixcan_results.loc[
        en_gene_dist.index[gene_part.eq(label).to_numpy()]
    ]
    for label in gene_part.value_counts().index
}

In [44]:
phenotype_gene_clusters[141]

GAS6        0.222637
MMP14       0.787712
DSP         0.152267
MARCKSL1    1.260680
SPARC       0.437722
              ...   
SEC14L2     1.189622
GNPNAT1     1.474504
RPL24       2.842350
TP53        0.492931
MVD         1.201320
Name: zscore, Length: 67, dtype: float64

## Functions

In [45]:
rs = np.random.RandomState(0)

In [46]:
def get_shuffled_phenotype():
    """
    Build one simulated phenotype by permuting z-scores within each gene cluster.

    For every entry of the module-level `phenotype_gene_clusters`, the z-scores
    are randomly reassigned among the genes of that same cluster: gene labels
    keep their position and only the values move. Randomness comes from the
    module-level `rs` RandomState, so each call advances its stream.

    Returns:
        A pandas Series with the concatenation of all shuffled clusters, in the
        iteration order of `phenotype_gene_clusters`.
    """
    shuffled_gene_clusters = []
    for gene_assoc_cluster in phenotype_gene_clusters.values():
        # Permute the underlying values array instead of shuffling the Series
        # in place: RandomState.shuffle swaps elements through integer keys
        # (x[i], x[j] = x[j], x[i]), which on a gene-labelled Series depends on
        # pandas' deprecated positional fallback for integer keys.
        shuffled_values = rs.permutation(gene_assoc_cluster.to_numpy())
        shuffled_gene_clusters.append(
            pd.Series(
                shuffled_values,
                index=gene_assoc_cluster.index,
                name=gene_assoc_cluster.name,
            )
        )

    return pd.concat(shuffled_gene_clusters)

In [47]:
def get_df_from_results(results_list):
    """
    Turn a list of per-run result records into a DataFrame.

    Args:
        results_list: list of dicts, one per run; each record must provide at
            least the "phenotype" and "lv" keys.

    Returns:
        A DataFrame with one row per record in which the "phenotype" and "lv"
        columns are stored as pandas categoricals.
    """
    categorical_columns = ("phenotype", "lv")
    results_df = pd.DataFrame(results_list)

    return results_df.astype(dict.fromkeys(categorical_columns, "category"))

## Load `lv_weights`

In [48]:
lv_weights = GLSPhenoplier._get_data(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"],
    model_type=EQTL_MODEL,
)[2]

In [49]:
lv_weights.shape

(6450, 987)

In [50]:
lv_weights.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## Generate simulated phenotypes

In [51]:
# Phenotype 0 is the observed (unshuffled) MultiXcan z-score vector, restricted
# to the genes present in `en_gene_dist`. Any further phenotypes (indices
# 1 .. N_SIMULATED_PHENOTYPES - 1) are derived from it by shuffling z-scores
# within gene clusters; with N_SIMULATED_PHENOTYPES = 1 the loop below does
# not execute and only the original phenotype is kept.

simulated_phenotypes = {
    "smultixcan phenotype 0": smultixcan_results.loc[en_gene_dist.index]
}

for idx in tqdm(range(1, N_SIMULATED_PHENOTYPES)):
    simulated_phenotypes[f"smultixcan phenotype {idx}"] = get_shuffled_phenotype()

0it [00:00, ?it/s]


In [52]:
display(len(simulated_phenotypes))
assert len(simulated_phenotypes) == N_SIMULATED_PHENOTYPES

1

In [53]:
simulated_phenotypes[list(simulated_phenotypes.keys())[0]]

GAS6        0.222637
MMP14       0.787712
DSP         0.152267
MARCKSL1    1.260680
SPARC       0.437722
              ...   
CFL1        0.542548
NFIB        1.618520
PLEKHG6     1.668045
GNGT2       0.281426
SERPINH1    1.165463
Name: zscore, Length: 6324, dtype: float64

In [54]:
simulated_phenotypes = pd.DataFrame(simulated_phenotypes)

In [55]:
simulated_phenotypes.shape

(6324, 1)

In [56]:
simulated_phenotypes.head()

Unnamed: 0,smultixcan phenotype 0
GAS6,0.222637
MMP14,0.787712
DSP,0.152267
MARCKSL1,1.26068
SPARC,0.437722


In [57]:
simulated_phenotypes.describe()

Unnamed: 0,smultixcan phenotype 0
count,6324.0
mean,0.804025
std,0.605742
min,0.000122
25%,0.31945
50%,0.679934
75%,1.159146
max,3.58301


## Build run configurations (one row per phenotype/LV pair)

In [58]:
# smultixcan_results = smultixcan_results.loc[smultixcan_results.index.intersection(lv_weights.index)]

In [59]:
# smultixcan_results.shape

In [60]:
# smultixcan_results.head()

In [61]:
# assert not smultixcan_results.isna().any()

In [62]:
# simulated_phenotypes = pd.DataFrame({"smultixcan_random_phenotype": smultixcan_results})

In [63]:
# simulated_phenotypes.shape

In [64]:
# simulated_phenotypes.head()

In [65]:
run_confs = pd.DataFrame(
    data=itertools.product(
        list(simulated_phenotypes.columns), list(lv_weights.columns)
    ),
    columns=["phenotype", "lv"],
)

In [66]:
run_confs

Unnamed: 0,phenotype,lv
0,smultixcan phenotype 0,LV1
1,smultixcan phenotype 0,LV2
2,smultixcan phenotype 0,LV3
3,smultixcan phenotype 0,LV4
4,smultixcan phenotype 0,LV5
...,...,...
982,smultixcan phenotype 0,LV983
983,smultixcan phenotype 0,LV984
984,smultixcan phenotype 0,LV985
985,smultixcan phenotype 0,LV986


## Split run configurations

In [67]:
run_confs_chunks = chunker(run_confs.sample(frac=1, random_state=rs), CHUNK_SIZE)

## Run

In [68]:
def run(run_confs_subset):
    """
    Fit one GLSPhenoplier model for each (phenotype, LV) pair of a chunk.

    Args:
        run_confs_subset: DataFrame whose rows are (phenotype code, LV code)
            pairs, in that column order.

    Returns:
        DataFrame built with `get_df_from_results`, with one row per input
        pair and the columns "phenotype", "lv", "coef" and "pvalue".
    """

    def _fit_pair(phenotype_code, lv_code):
        # Look the phenotype up first, then build a fresh model for this pair.
        phenotype_values = simulated_phenotypes[phenotype_code]

        model = GLSPhenoplier(
            smultixcan_result_set_filepath=conf.PHENOMEXCAN[
                "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
            ],
            model_type=EQTL_MODEL,
        )
        model.fit_named(lv_code, phenotype_values)
        fit = model.results

        # Keep only the LV term: its coefficient and its `pvalues_onesided` entry.
        return {
            "phenotype": phenotype_code,
            "lv": lv_code,
            "coef": fit.params.loc["lv"],
            "pvalue": fit.pvalues_onesided.loc["lv"],
        }

    pairs = run_confs_subset.itertuples(name=None, index=False)
    records = [
        _fit_pair(phenotype_code, lv_code) for phenotype_code, lv_code in pairs
    ]

    return get_df_from_results(records)

In [69]:
# Per-chunk result DataFrames, appended in the order in which tasks complete.
all_results = []

# The progress bar counts individual (phenotype, LV) runs, not chunks.
with tqdm(total=run_confs.shape[0]) as pbar:
    # Process-level parallelism: up to conf.GENERAL["N_JOBS"] worker processes,
    # one submitted task per chunk of run configurations.
    with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
        tasks = [executor.submit(run, chunk) for chunk in run_confs_chunks]

        for future in as_completed(tasks):
            # .result() re-raises here any exception raised inside the worker.
            res = future.result()
            all_results.append(res)

            # Checkpoint: every N_JOBS completed chunks, overwrite the output
            # file with everything gathered so far (presumably so partial
            # results are available if the run is interrupted).
            if (len(all_results) % conf.GENERAL["N_JOBS"]) == 0:
                df = pd.concat(all_results, ignore_index=True)
                df.to_pickle(OUTPUT_FILENAME)

            # Advance by the number of rows returned for this chunk.
            pbar.update(res.shape[0])

100%|██████████| 987/987 [2:12:41<00:00,  8.07s/it]


In [70]:
all_results = pd.concat(all_results, ignore_index=True)

In [71]:
# results = get_df_from_results(results)

In [72]:
all_results.shape

(987, 4)

In [73]:
all_results.head()

Unnamed: 0,phenotype,lv,coef,pvalue
0,smultixcan phenotype 0,LV936,-0.001195,0.53807
1,smultixcan phenotype 0,LV962,3.3e-05,0.49897
2,smultixcan phenotype 0,LV248,0.028117,0.013027
3,smultixcan phenotype 0,LV895,0.009369,0.229212
4,smultixcan phenotype 0,LV368,0.007564,0.274593


In [74]:
all_results.sort_values("pvalue").head(10)

Unnamed: 0,phenotype,lv,coef,pvalue
459,smultixcan phenotype 0,LV303,0.051052,2.5e-05
869,smultixcan phenotype 0,LV198,0.039076,0.000789
826,smultixcan phenotype 0,LV33,0.039386,0.000906
640,smultixcan phenotype 0,LV456,0.035331,0.001697
770,smultixcan phenotype 0,LV789,0.03651,0.001839
426,smultixcan phenotype 0,LV877,0.034665,0.002816
949,smultixcan phenotype 0,LV43,0.034236,0.003367
231,smultixcan phenotype 0,LV565,0.03396,0.003379
496,smultixcan phenotype 0,LV314,0.033711,0.003778
312,smultixcan phenotype 0,LV49,0.033041,0.004258


## Save

In [75]:
all_results.to_pickle(OUTPUT_FILENAME)