# Description

(Please take a look at the README.md file in this directory for instructions on how to run this notebook.)

**TODO:** update

This notebook computes predicted expression correlations between all genes in the MultiPLIER models.

It also has a parameter set for papermill so that it can be run on a single chromosome, which allows chromosomes to be processed in parallel (see under `Settings` below).

This notebook is not directly run. See README.md.

# Modules

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution.
%load_ext autoreload
%autoreload 2

In [2]:
# from random import sample, seed
# import warnings
from pathlib import Path
import pickle

import numpy as np

# from scipy.spatial.distance import squareform
import pandas as pd
from tqdm import tqdm

# import matplotlib.pyplot as plt
# import seaborn as sns

import conf
from entity import Gene

# Settings

In [3]:
# a cohort name (it could be something like UK_BIOBANK, etc)
COHORT_NAME = None

# a string with a path pointing to an imputed GWAS
GWAS_FILE = None

# a string with a path pointing to where the S-PrediXcan results (tissue-specific) are located
SPREDIXCAN_FOLDER = None

# a string template (filled in with str.format) with one placeholder {tissue}
SPREDIXCAN_FILE_PATTERN = None

# a string with a path pointing to an S-MultiXcan result
SMULTIXCAN_FILE = None

# prediction model, such as MASHR or ELASTIC_NET
EQTL_MODEL = None

In [4]:
# Parameters
# These assignments override the `None` defaults declared in the previous cell
# (presumably injected by papermill for this particular run -- confirm).
COHORT_NAME = "1000G_EUR"
GWAS_FILE = "/opt/data/results/gls/null_sims/final_imputed_gwas/random.pheno0.glm-imputed.txt.gz"
SPREDIXCAN_FOLDER = "/opt/data/results/gls/null_sims/twas/spredixcan/"
SPREDIXCAN_FILE_PATTERN = "random.pheno0-gtex_v8-mashr-{tissue}.csv"
SMULTIXCAN_FILE = "/opt/data/results/gls/null_sims/twas/smultixcan/random.pheno0-gtex_v8-mashr-smultixcan.txt"
EQTL_MODEL = "MASHR"


In [5]:
# A non-empty cohort name is mandatory. It is normalized to lower case here and
# later used as the last component of OUTPUT_DIR_BASE.
assert not (
    COHORT_NAME is None or len(COHORT_NAME) == 0
), "A cohort name must be given"

COHORT_NAME = COHORT_NAME.lower()
display(f"Cohort name: {COHORT_NAME}")

'Cohort name: 1000g_eur'

In [6]:
# Validate the GWAS file parameter and normalize it to an absolute Path.
# `str()` is applied before `len()` so that this cell can be re-run after
# GWAS_FILE has already been rebound to a Path (Path objects do not support
# `len()`, so the original check raised TypeError on a second execution).
assert (
    GWAS_FILE is not None and len(str(GWAS_FILE)) > 0
), "A GWAS file path must be given"
GWAS_FILE = Path(GWAS_FILE).resolve()
assert GWAS_FILE.exists(), "GWAS file does not exist"

display(f"GWAS file path: {str(GWAS_FILE)}")

'GWAS file path: /opt/data/results/gls/null_sims/final_imputed_gwas/random.pheno0.glm-imputed.txt.gz'

In [7]:
# Validate the S-PrediXcan folder parameter and normalize it to an absolute Path.
# `str()` is applied before `len()` so that this cell can be re-run after
# SPREDIXCAN_FOLDER has already been rebound to a Path (Path objects do not
# support `len()`, so the original check raised TypeError on a second execution).
assert (
    SPREDIXCAN_FOLDER is not None and len(str(SPREDIXCAN_FOLDER)) > 0
), "An S-PrediXcan folder path must be given"
SPREDIXCAN_FOLDER = Path(SPREDIXCAN_FOLDER).resolve()
assert SPREDIXCAN_FOLDER.exists(), "S-PrediXcan folder does not exist"

display(f"S-PrediXcan folder path: {str(SPREDIXCAN_FOLDER)}")

'S-PrediXcan folder path: /opt/data/results/gls/null_sims/twas/spredixcan'

In [8]:
# The file name template must be given and must contain the '{tissue}'
# placeholder, which is later filled in with each tissue name via str.format.
assert not (
    SPREDIXCAN_FILE_PATTERN is None or len(SPREDIXCAN_FILE_PATTERN) == 0
), "An S-PrediXcan file pattern must be given"
assert not (
    "{tissue}" not in SPREDIXCAN_FILE_PATTERN
), "S-PrediXcan file pattern must have a '{tissue}' placeholder"

display(f"S-PrediXcan file template: {SPREDIXCAN_FILE_PATTERN}")

'S-PrediXcan file template: random.pheno0-gtex_v8-mashr-{tissue}.csv'

In [9]:
# Validate the S-MultiXcan result file parameter and normalize it to an absolute Path.
# `str()` is applied before `len()` so that this cell can be re-run after
# SMULTIXCAN_FILE has already been rebound to a Path (Path objects do not
# support `len()`, so the original check raised TypeError on a second execution).
assert (
    SMULTIXCAN_FILE is not None and len(str(SMULTIXCAN_FILE)) > 0
), "An S-MultiXcan result file path must be given"
SMULTIXCAN_FILE = Path(SMULTIXCAN_FILE).resolve()
assert SMULTIXCAN_FILE.exists(), "S-MultiXcan result file does not exist"

display(f"S-MultiXcan file path: {str(SMULTIXCAN_FILE)}")

'S-MultiXcan file path: /opt/data/results/gls/null_sims/twas/smultixcan/random.pheno0-gtex_v8-mashr-smultixcan.txt'

In [10]:
# The model name is mandatory: it is used later to look up the list of tissues
# ("<EQTL_MODEL>_TISSUES") in the project configuration.
assert not (
    EQTL_MODEL is None or len(EQTL_MODEL) == 0
), "A prediction/eQTL model must be given"

display(f"eQTL model: {EQTL_MODEL}")

'eQTL model: MASHR'

In [11]:
# Cohort-specific output folder: <GLS results>/gene_corrs/cohorts/<cohort name>.
# It is created (with any missing parents) if it does not exist yet.
OUTPUT_DIR_BASE = conf.RESULTS["GLS"].joinpath("gene_corrs", "cohorts", COHORT_NAME)
OUTPUT_DIR_BASE.mkdir(exist_ok=True, parents=True)

display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/results/gls/gene_corrs/cohorts/1000g_eur'

# Load MultiPLIER Z genes

In [12]:
# Only the row labels of the MultiPLIER Z matrix are needed; the matrix itself
# is discarded once its index has been extracted as a plain list.
multiplier_z_genes = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])
multiplier_z_genes = multiplier_z_genes.index.tolist()

In [13]:
# number of genes (row labels) in the MultiPLIER Z matrix
len(multiplier_z_genes)

6750

In [14]:
# gene labels in the MultiPLIER Z matrix must not be repeated
assert len(set(multiplier_z_genes)) == len(multiplier_z_genes)

In [15]:
# preview of the first gene labels
multiplier_z_genes[:5]

['GAS6', 'MMP14', 'DSP', 'MARCKSL1', 'SPARC']

# GWAS

In [16]:
# Read just a couple of rows: enough to learn the header without loading the
# whole GWAS file.
gwas_file_columns = pd.read_csv(GWAS_FILE, sep="\t", nrows=2).columns
assert (
    "panel_variant_id" in gwas_file_columns
), "GWAS file must be final imputed one with column 'panel_variant_id'"

# The remaining columns are the ones read when the full GWAS is loaded; fail
# early with a clear message instead of a pandas `usecols` error later on.
_gwas_missing_columns = [
    column
    for column in ("pvalue", "zscore", "imputation_status")
    if column not in gwas_file_columns
]
assert (
    not _gwas_missing_columns
), f"GWAS file is missing required columns: {_gwas_missing_columns}"

In [17]:
# Load the full GWAS, restricted to the four columns used in this notebook.
gwas_data = pd.read_csv(
    GWAS_FILE,
    usecols=["panel_variant_id", "pvalue", "zscore", "imputation_status"],
    sep="\t",
)

In [18]:
# (number of variants, number of columns) as loaded
gwas_data.shape

(8339505, 4)

In [19]:
# preview of the loaded GWAS rows
gwas_data.head()

Unnamed: 0,panel_variant_id,zscore,pvalue,imputation_status
0,chr1_54490_G_A_b38,1.567452,0.117009,original
1,chr1_87021_T_C_b38,0.244372,0.806943,imputed
2,chr1_263722_C_G_b38,1.155239,0.247993,imputed
3,chr1_594402_C_T_b38,0.851234,0.394639,imputed
4,chr1_630555_C_T_b38,0.855129,0.39248,imputed


In [20]:
# distinct values present in the imputation status column
gwas_data["imputation_status"].unique()

array(['original', 'imputed'], dtype=object)

In [21]:
# preview of the shape after dropping rows with missing values
# (gwas_data itself is not modified here)
gwas_data.dropna().shape

(8339505, 4)

In [22]:
# remove SNPs with no results: drop every row that has a missing value in any
# of the loaded columns
gwas_data = gwas_data.dropna(axis=0, how="any")

In [23]:
# shape after rows with missing values were removed
gwas_data.shape

(8339505, 4)

## Save GWAS variants

In [24]:
# preview of the data whose variant ids are saved below
gwas_data.head()

Unnamed: 0,panel_variant_id,zscore,pvalue,imputation_status
0,chr1_54490_G_A_b38,1.567452,0.117009,original
1,chr1_87021_T_C_b38,0.244372,0.806943,imputed
2,chr1_263722_C_G_b38,1.155239,0.247993,imputed
3,chr1_594402_C_T_b38,0.851234,0.394639,imputed
4,chr1_630555_C_T_b38,0.855129,0.39248,imputed


In [25]:
# each variant id must appear only once in the GWAS
assert not gwas_data["panel_variant_id"].duplicated().any()

In [26]:
# immutable set with all the variant ids present in the GWAS
gwas_variants_ids_set = frozenset(gwas_data["panel_variant_id"].tolist())
# show a few elements (set iteration order is arbitrary)
list(gwas_variants_ids_set)[:5]

['chr3_36680774_C_T_b38',
 'chr7_8936021_C_G_b38',
 'chr17_33558929_G_A_b38',
 'chr1_88354607_TAAAAC_T_b38',
 'chr8_61816515_T_C_b38']

In [27]:
# persist the set of GWAS variant ids in the cohort output folder
with (OUTPUT_DIR_BASE / "gwas_variant_ids.pkl").open("wb") as out_file:
    pickle.dump(gwas_variants_ids_set, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# TWAS

## Available tissues for eQTL model

In [28]:
# The configuration stores the tissues of each prediction model as a single
# space-separated string under the key "<EQTL_MODEL>_TISSUES".
prediction_model_tissues = conf.PHENOMEXCAN["PREDICTION_MODELS"][f"{EQTL_MODEL}_TISSUES"]
prediction_model_tissues = prediction_model_tissues.split(" ")

In [29]:
# number of tissues available for the selected prediction model
len(prediction_model_tissues)

49

In [30]:
# preview of the first tissue names
prediction_model_tissues[:5]

['Skin_Not_Sun_Exposed_Suprapubic',
 'Cells_EBV-transformed_lymphocytes',
 'Brain_Frontal_Cortex_BA9',
 'Kidney_Cortex',
 'Brain_Substantia_nigra']

## S-PrediXcan results

### Load results across all tissues

In [31]:
# map each tissue name to the path of its S-PrediXcan result file, obtained by
# filling the '{tissue}' placeholder of the file name template
spredixcan_result_files = dict(
    (tissue_name, SPREDIXCAN_FOLDER / SPREDIXCAN_FILE_PATTERN.format(tissue=tissue_name))
    for tissue_name in prediction_model_tissues
)

In [32]:
# one result file per tissue is expected (no tissue name repeated)
assert len(prediction_model_tissues) == len(spredixcan_result_files)
display([*spredixcan_result_files.values()][:5])

[PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Skin_Not_Sun_Exposed_Suprapubic.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Cells_EBV-transformed_lymphocytes.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Brain_Frontal_Cortex_BA9.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Kidney_Cortex.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Brain_Substantia_nigra.csv')]

In [33]:
# every expected S-PrediXcan result file must be present on disk
assert not any(
    not result_file.exists() for result_file in spredixcan_result_files.values()
)

In [34]:
# Read each tissue's results (gene, zscore and pvalue only), drop rows with
# missing values and tag every remaining row with the tissue it came from.
spredixcan_dfs = [
    pd.read_csv(results_path, usecols=["gene", "zscore", "pvalue"])
    .dropna()
    .assign(tissue=tissue_name)
    for tissue_name, results_path in spredixcan_result_files.items()
]

In [35]:
# one data frame per tissue
assert len(prediction_model_tissues) == len(spredixcan_dfs)

In [36]:
# stack the per-tissue data frames row-wise into a single data frame
# (from here on `spredixcan_dfs` is a DataFrame, no longer a list)
spredixcan_dfs = pd.concat(spredixcan_dfs, axis=0)

In [37]:
# every tissue must contribute at least one row to the combined results
assert len(spredixcan_dfs["tissue"].unique()) == len(prediction_model_tissues)

In [38]:
# (number of gene-tissue results, number of columns)
spredixcan_dfs.shape

(653140, 4)

In [39]:
# preview of the combined S-PrediXcan results
spredixcan_dfs.head()

Unnamed: 0,gene,zscore,pvalue,tissue
0,ENSG00000131236.16,-3.882684,0.000103,Skin_Not_Sun_Exposed_Suprapubic
1,ENSG00000196172.9,-3.710444,0.000207,Skin_Not_Sun_Exposed_Suprapubic
2,ENSG00000280789.1,3.403263,0.000666,Skin_Not_Sun_Exposed_Suprapubic
3,ENSG00000215218.3,3.385093,0.000712,Skin_Not_Sun_Exposed_Suprapubic
4,ENSG00000167371.17,3.381834,0.00072,Skin_Not_Sun_Exposed_Suprapubic


### Count number of tissues available per gene

In [40]:
# for each gene, count the distinct tissues that have an S-PrediXcan result
spredixcan_genes_n_models = (
    spredixcan_dfs
    .groupby("gene")["tissue"]
    .nunique()
)

In [41]:
# number of tissues per gene
spredixcan_genes_n_models

gene
ENSG00000000419.12     2
ENSG00000000457.13    48
ENSG00000000460.16    39
ENSG00000000938.12    36
ENSG00000000971.15    34
                      ..
ENSG00000284430.1      6
ENSG00000284452.1      1
ENSG00000284513.1      1
ENSG00000284526.1     42
ENSG00000284552.1      1
Name: tissue, Length: 22314, dtype: int64

### Get tissues available per gene

In [42]:
# for each gene, collect the tissues that have an S-PrediXcan result into an
# immutable set
spredixcan_genes_models = (
    spredixcan_dfs
    .groupby("gene")["tissue"]
    .apply(lambda gene_tissues: frozenset(gene_tissues.tolist()))
)

In [43]:
# set of tissues per gene
spredixcan_genes_models

gene
ENSG00000000419.12         (Brain_Substantia_nigra, Brain_Hypothalamus)
ENSG00000000457.13    (Adrenal_Gland, Brain_Cerebellum, Small_Intest...
ENSG00000000460.16    (Adrenal_Gland, Brain_Cerebellum, Small_Intest...
ENSG00000000938.12    (Brain_Cerebellum, Brain_Caudate_basal_ganglia...
ENSG00000000971.15    (Brain_Cerebellum, Small_Intestine_Terminal_Il...
                                            ...                        
ENSG00000284430.1     (Ovary, Vagina, Esophagus_Gastroesophageal_Jun...
ENSG00000284452.1                                              (Testis)
ENSG00000284513.1                         (Brain_Cerebellar_Hemisphere)
ENSG00000284526.1     (Adrenal_Gland, Small_Intestine_Terminal_Ileum...
ENSG00000284552.1                                              (Spleen)
Name: tissue, Length: 22314, dtype: object

In [44]:
# both per-gene summaries must cover the same number of genes
assert len(spredixcan_genes_n_models) == len(spredixcan_genes_models)

In [45]:
# both per-gene summaries must be indexed by the same genes, in the same order
assert spredixcan_genes_n_models.index.equals(spredixcan_genes_models.index)

In [46]:
# no gene can have more tissues than the prediction model provides
assert spredixcan_genes_models.map(len).le(len(prediction_model_tissues)).all()

In [47]:
# summary statistics of the number of tissues per gene
spredixcan_genes_models.apply(len).describe()

count    22314.000000
mean        29.270413
std         17.208404
min          1.000000
25%         13.000000
50%         35.000000
75%         45.000000
max         49.000000
Name: tissue, dtype: float64

In [48]:
# testing: the size of each gene's tissue set must equal the per-gene tissue
# count computed independently above
assert (
    spredixcan_genes_models.loc[spredixcan_genes_n_models.index]
    .map(len)
    .equals(spredixcan_genes_n_models)
)

### Get simple gene id and add gene name

In [49]:
# turn the gene-indexed Series into a two-column data frame ("gene", "tissue")
spredixcan_genes_models = spredixcan_genes_models.to_frame()
spredixcan_genes_models = spredixcan_genes_models.reset_index()

In [50]:
# preview after converting to a data frame
spredixcan_genes_models.head()

Unnamed: 0,gene,tissue
0,ENSG00000000419.12,"(Brain_Substantia_nigra, Brain_Hypothalamus)"
1,ENSG00000000457.13,"(Adrenal_Gland, Brain_Cerebellum, Small_Intest..."
2,ENSG00000000460.16,"(Adrenal_Gland, Brain_Cerebellum, Small_Intest..."
3,ENSG00000000938.12,"(Brain_Cerebellum, Brain_Caudate_basal_ganglia..."
4,ENSG00000000971.15,"(Brain_Cerebellum, Small_Intestine_Terminal_Il..."


In [51]:
# add the simple gene id: the text before the first "." of the "gene" column
# (e.g. "ENSG00000000419.12" -> "ENSG00000000419")
spredixcan_genes_models = spredixcan_genes_models.assign(
    gene_id=spredixcan_genes_models["gene"].map(
        lambda versioned_id: versioned_id.partition(".")[0]
    )
)

In [52]:
# preview with the new "gene_id" column
spredixcan_genes_models.head()

Unnamed: 0,gene,tissue,gene_id
0,ENSG00000000419.12,"(Brain_Substantia_nigra, Brain_Hypothalamus)",ENSG00000000419
1,ENSG00000000457.13,"(Adrenal_Gland, Brain_Cerebellum, Small_Intest...",ENSG00000000457
2,ENSG00000000460.16,"(Adrenal_Gland, Brain_Cerebellum, Small_Intest...",ENSG00000000460
3,ENSG00000000938.12,"(Brain_Cerebellum, Brain_Caudate_basal_ganglia...",ENSG00000000938
4,ENSG00000000971.15,"(Brain_Cerebellum, Small_Intestine_Terminal_Il...",ENSG00000000971


In [53]:
# add the gene name by looking up each gene id in the project's id -> name map
# (a gene id missing from the map raises KeyError)
spredixcan_genes_models = spredixcan_genes_models.assign(
    gene_name=spredixcan_genes_models["gene_id"].map(
        lambda ensembl_id: Gene.GENE_ID_TO_NAME_MAP[ensembl_id]
    )
)

In [54]:
# keep only the final columns, dropping the original versioned "gene" column
spredixcan_genes_models = spredixcan_genes_models.loc[
    :, ["gene_id", "gene_name", "tissue"]
]

In [55]:
# preview of the final gene/tissues table
spredixcan_genes_models.head()

Unnamed: 0,gene_id,gene_name,tissue
0,ENSG00000000419,DPM1,"(Brain_Substantia_nigra, Brain_Hypothalamus)"
1,ENSG00000000457,SCYL3,"(Adrenal_Gland, Brain_Cerebellum, Small_Intest..."
2,ENSG00000000460,C1orf112,"(Adrenal_Gland, Brain_Cerebellum, Small_Intest..."
3,ENSG00000000938,FGR,"(Brain_Cerebellum, Brain_Caudate_basal_ganglia..."
4,ENSG00000000971,CFH,"(Brain_Cerebellum, Small_Intestine_Terminal_Il..."


### Save

In [56]:
# persist the gene/tissues table in the cohort output folder
spredixcan_genes_models.to_pickle(OUTPUT_DIR_BASE.joinpath("gene_tissues.pkl"))

## S-MultiXcan results

In [57]:
# TODO: something that could be interesting to do is to compare `n_indep` with the number of independent components I get
# Load the S-MultiXcan results, restricted to the columns used in this notebook.
smultixcan_results = pd.read_csv(
    SMULTIXCAN_FILE,
    usecols=["gene", "gene_name", "pvalue", "n"],
    sep="\t",
)

In [58]:
# (number of genes, number of columns) as loaded
smultixcan_results.shape

(22317, 4)

In [59]:
# drop every row that has a missing value in any of the loaded columns
smultixcan_results = smultixcan_results.dropna(axis=0, how="any")

In [60]:
# shape after rows with missing values were removed
smultixcan_results.shape

(22314, 4)

In [61]:
# preview of the S-MultiXcan results
smultixcan_results.head()

Unnamed: 0,gene,gene_name,pvalue,n
0,ENSG00000131941.7,RHPN2,4e-05,48.0
1,ENSG00000076650.6,GPATCH1,7.8e-05,40.0
2,ENSG00000100906.10,NFKBIA,9.6e-05,1.0
3,ENSG00000136319.11,TTC5,0.000109,47.0
4,ENSG00000152990.13,ADGRA3,0.000135,41.0


In [62]:
# each (versioned) gene id must appear only once in the S-MultiXcan results
assert not smultixcan_results["gene"].duplicated().any()

In [63]:
# testing: the number of tissues per gene derived from the S-PrediXcan results
# must match the `n` column reported by S-MultiXcan for the same genes
_tmp_smultixcan_results_n_models = smultixcan_results.set_index("gene")["n"]
_tmp_smultixcan_results_n_models = _tmp_smultixcan_results_n_models.astype(int).rename(
    "tissue"
)

assert len(spredixcan_genes_n_models) == len(_tmp_smultixcan_results_n_models)
assert spredixcan_genes_n_models.equals(
    _tmp_smultixcan_results_n_models.loc[spredixcan_genes_n_models.index]
)

### Remove duplicated gene names

In [64]:
# check whether gene names (unlike gene ids) are unique
smultixcan_results["gene_name"].is_unique

False

In [65]:
# list duplicated gene names; keep=False flags every occurrence of a repeated
# name, not only the second and later ones
_smultixcan_duplicated_gene_names = smultixcan_results.loc[
    smultixcan_results["gene_name"].duplicated(keep=False)
]
display(_smultixcan_duplicated_gene_names)

Unnamed: 0,gene,gene_name,pvalue,n
1461,ENSG00000235641.4,LINC00484,0.06845,35.0
2390,ENSG00000237667.5,LINC01115,0.112329,22.0
5198,ENSG00000272342.1,LINC01115,0.243122,4.0
5883,ENSG00000229694.6,LINC00484,0.274814,40.0
6129,ENSG00000147676.13,MAL2,0.286003,29.0
6207,ENSG00000283992.1,LYNX1,0.289628,45.0
10559,ENSG00000180155.19,LYNX1,0.48696,48.0
14026,ENSG00000235271.5,LINC01422,0.643355,38.0
14251,ENSG00000253972.5,MAL2,0.654873,6.0
20086,ENSG00000182957.15,SPATA13,0.906729,40.0


In [66]:
# TODO: the strategy below for handling duplicated gene names is to keep the first occurrence;
#  a better strategy might be to keep the most significant one instead

In [67]:
# for each repeated gene name keep only its first row (in current row order)
smultixcan_results = smultixcan_results.loc[
    ~smultixcan_results["gene_name"].duplicated(keep="first")
]
display(smultixcan_results.shape)

(22308, 4)

### Get common genes with MultiPLIER

In [68]:
# gene names present both in the MultiPLIER Z matrix and in the S-MultiXcan results
common_genes = set(multiplier_z_genes) & set(smultixcan_results["gene_name"])

In [69]:
# number of genes shared by MultiPLIER and the S-MultiXcan results
len(common_genes)

6444

In [70]:
# first common gene names in alphabetical order
sorted(common_genes)[:5]

['A2M', 'AAAS', 'AANAT', 'AARS', 'AARS2']

In [71]:
# each common gene must match exactly one row of the S-MultiXcan results
assert smultixcan_results["gene_name"].isin(common_genes).sum() == len(common_genes)

### Save

In [72]:
# persist the set of common gene names in the cohort output folder
with (OUTPUT_DIR_BASE / "common_genes.pkl").open("wb") as out_file:
    pickle.dump(common_genes, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Get gene objects

In [73]:
# build a Gene object for every common gene whose name is known to the
# project's name -> id map; names missing from the map are skipped
multiplier_gene_obj = dict(
    (gene_symbol, Gene(name=gene_symbol))
    for gene_symbol in common_genes
    if gene_symbol in Gene.GENE_NAME_TO_ID_MAP
)

In [74]:
# number of Gene objects created
len(multiplier_gene_obj)

6444

In [75]:
# sanity check on one known gene: GAS6 must resolve to this Ensembl id
# (this lookup raises KeyError if GAS6 is not among the common genes)
assert multiplier_gene_obj["GAS6"].ensembl_id == "ENSG00000183087"

In [76]:
# table with one row per Gene object: name, Ensembl id, chromosome, band and
# start/end positions
_gene_obj = [*multiplier_gene_obj.values()]

genes_info = pd.DataFrame(
    dict(
        name=[gene.name for gene in _gene_obj],
        id=[gene.ensembl_id for gene in _gene_obj],
        chr=[gene.chromosome for gene in _gene_obj],
        band=[gene.band for gene in _gene_obj],
        start_position=[gene.get_attribute("start_position") for gene in _gene_obj],
        end_position=[gene.get_attribute("end_position") for gene in _gene_obj],
    )
)

In [77]:
# Gene length is the difference between end and start positions. The
# subtraction is done column-wise (vectorized) rather than with a row-wise
# `apply`, which calls a Python function once per row for the same result.
genes_info = genes_info.assign(
    gene_length=genes_info["end_position"] - genes_info["start_position"]
)

In [78]:
# (number of genes, number of columns)
genes_info.shape

(6444, 7)

In [79]:
# preview of the genes information table
genes_info.head()

Unnamed: 0,name,id,chr,band,start_position,end_position,gene_length
0,MFN2,ENSG00000116688,1,1p36.22,11980181.0,12013514.0,33333.0
1,PPA2,ENSG00000138777,4,4q24,105369077.0,105474067.0,104990.0
2,ADRA2A,ENSG00000150594,10,10q25.2,111077163.0,111080907.0,3744.0
3,RYR2,ENSG00000198626,1,1q43,237042184.0,237833988.0,791804.0
4,VAV2,ENSG00000160293,9,9q34.2,133761894.0,133992604.0,230710.0


## Save

In [80]:
# persist the genes information table in the cohort output folder
genes_info.to_pickle(OUTPUT_DIR_BASE.joinpath("genes_info.pkl"))