# Description

This notebook builds a single gene-by-trait matrix covering all traits and genes in PhenomeXcan from S-PrediXcan results, which, in contrast to S-MultiXcan results, carry the direction of effect. For each gene-trait pair, it keeps the most significant result across all tissues, i.e., the z-score with the largest absolute value.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

import conf
from data.cache import read_data
from data.hdf5 import simplify_trait_fullcode, HDF5_FILE_PATTERN

# Settings

In [3]:
# Folder with the S-PrediXcan HDF5 files: the "hdf5" subfolder of the configured
# S-PrediXcan z-scores folder. Fail early if it is missing.
SPREDIXCAN_H5_FOLDER = (
    Path(conf.PHENOMEXCAN["SPREDIXCAN_MASHR_ZSCORES_FOLDER"]) / "hdf5"
)
assert SPREDIXCAN_H5_FOLDER.is_dir(), "The folder does not exist"

# Get all PhenomeXcan traits

## Get all PhenomeXcan trait full codes

In [4]:
from entity import Trait

In [5]:
# The trait full codes are the column labels of the S-MultiXcan z-scores file.
all_phenomexcan_traits = list(
    read_data(conf.PHENOMEXCAN["SMULTIXCAN_MASHR_ZSCORES_FILE"]).columns
)

In [6]:
# Number of distinct trait full codes; 4091 is the expected count.
_tmp = set(all_phenomexcan_traits)
_n_unique_traits = len(_tmp)
display(_n_unique_traits)
assert _n_unique_traits == 4091

4091

# Get list of files

In [7]:
from glob import glob

In [8]:
# All HDF5 files found directly in the S-PrediXcan folder. The list keeps the
# order in which glob yields the paths, which is not sorted.
spredixcan_files = list(SPREDIXCAN_H5_FOLDER.glob("*.h5"))

In [9]:
# Show a few of the files found, then check that exactly 49 HDF5 files are present.
display(spredixcan_files[:5])
assert len(spredixcan_files) == 49

[PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Vagina-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Ovary-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Cells_Cultured_fibroblasts-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Pituitary-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Spleen-zscore.h5')]

# Get all tissues

In [10]:
import re

In [11]:
# The tissue name is the "tissue" named group of HDF5_FILE_PATTERN matched against
# each file name; the list follows the order of `spredixcan_files`.
_tissue_regex = re.compile(HDF5_FILE_PATTERN)
all_tissues = [
    _tissue_regex.search(h5_file.name).group("tissue")
    for h5_file in spredixcan_files
]

In [12]:
# Show a few tissue names and check that one was extracted for every file.
display(all_tissues[:5])
assert len(all_tissues) == len(spredixcan_files)

['Vagina', 'Ovary', 'Cells_Cultured_fibroblasts', 'Pituitary', 'Spleen']

# Function to get most significant results

In [13]:
from data.hdf5 import read_spredixcan

In [14]:
def get_most_signif(trait_fullcode):
    """
    Get, for each gene, the most significant S-PrediXcan result of a trait
    across all tissues.

    The results of the trait are read for every tissue in `all_tissues`, missing
    values are discarded and, for each gene, the value with the largest absolute
    value is kept together with its sign (direction of effect).

    It assumes that `read_spredixcan` returns a pandas Series named after the
    trait and indexed by gene ID (index name "gene_id").

    Args:
        trait_fullcode: full code of the trait (for example, "pgc.scz2").

    Returns:
        A pandas Series named `trait_fullcode` and indexed by "gene_id" with one
        value per gene. A Series is always returned, even when a single gene has
        results.
    """
    trait_data = (
        pd.concat(
            [
                read_spredixcan(SPREDIXCAN_H5_FOLDER, trait_fullcode, t)
                for t in all_tissues
            ],
            keys=all_tissues,
            axis=0,
        )
        # after this, rows have a unique integer index, and the tissue level
        # (added by `keys`) is discarded by keeping only the two columns needed
        .reset_index()[["gene_id", trait_fullcode]]
        .dropna()
    )

    # row label of the entry with the largest absolute value for each gene
    most_signif_rows = (
        trait_data[trait_fullcode].abs().groupby(trait_data["gene_id"]).idxmax()
    )

    # Select the column explicitly instead of using .squeeze(): squeezing a
    # one-row frame yields a scalar, which pd.DataFrame would later broadcast
    # to all genes when combining the results of all traits.
    return trait_data.loc[most_signif_rows].set_index("gene_id")[trait_fullcode]

In [15]:
# Quick look at the result for one trait.
get_most_signif("100002_raw-Energy")

gene_id
ENSG00000000419    1.203957
ENSG00000000457    2.236525
ENSG00000000460    1.716387
ENSG00000000938    0.881469
ENSG00000000971   -1.372867
                     ...   
ENSG00000284430   -1.120714
ENSG00000284452    0.975901
ENSG00000284513    1.817369
ENSG00000284526    2.203900
ENSG00000284552   -0.729526
Name: 100002_raw-Energy, Length: 22255, dtype: float64

## Testing

In [16]:
_trait = "100001_raw-Food_weight"

# Results of this trait with one column per tissue, used to check values by hand.
_trait_data = pd.DataFrame(
    {
        tissue: read_spredixcan(SPREDIXCAN_H5_FOLDER, _trait, tissue)
        for tissue in all_tissues
    }
)

In [17]:
# Values available for one gene across tissues (tissues without a value are dropped).
_trait_data.loc["ENSG00000225595"].dropna().sort_values()

Brain_Hypothalamus   -0.561372
Testis                0.242423
Name: ENSG00000225595, dtype: float64

In [18]:
# Most significant results for this trait, checked in the next cell.
_trait = "pgc.scz2"
_data_ms = get_most_signif(_trait)

In [19]:
# Expected values (rounded to three decimals) for a few genes.
_expected_values = {
    "ENSG00000158691": 11.067,
    "ENSG00000204713": 10.825,
    "ENSG00000225595": -10.956,
}

for _gene_id, _expected in _expected_values.items():
    assert _data_ms.loc[_gene_id].round(3) == _expected

In [20]:
# Most significant results for this trait, checked in the next cell.
_trait = "100001_raw-Food_weight"
_data_ms = get_most_signif(_trait)

In [21]:
# Expected values (rounded to three decimals) for a few genes. The first gene has
# values in two tissues (-0.561 and 0.242); the one with the largest absolute
# value must be kept with its sign.
_expected_values = {
    "ENSG00000225595": -0.561,
    "ENSG00000183323": 4.444,
    "ENSG00000182901": -4.369,
}

for _gene_id, _expected in _expected_values.items():
    assert _data_ms.loc[_gene_id].round(3) == _expected

# Compute most significant for all traits

In [22]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

In [23]:
def _run(trait):
    return {trait: get_most_signif(trait)}

In [24]:
# Process all traits in parallel. Results are collected as the tasks finish, so
# the entries of `all_results` follow completion order, not the order of
# `all_phenomexcan_traits`.
all_results = {}
with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    tasks = [executor.submit(_run, trait) for trait in all_phenomexcan_traits]
    progress = tqdm(as_completed(tasks), total=len(tasks))
    for future in progress:
        all_results.update(future.result())

100%|██████████| 4091/4091 [1:11:26<00:00,  1.05s/it]


In [25]:
# Combine the per-trait Series into a single DataFrame with one column per trait;
# pandas aligns the Series on their gene ID index.
data_most_signif = pd.DataFrame(all_results)

In [26]:
# (number of genes, number of traits)
data_most_signif.shape

(22515, 4091)

In [27]:
# Preview: genes in rows, traits in columns.
data_most_signif.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
ENSG00000000419,-0.521616,-0.354356,-0.647688,1.110194,-1.786116,1.714687,0.999024,-1.662682,1.576522,-1.402812,...,1.614588,0.954165,-1.325516,-0.290283,1.00958,-1.237774,-1.083244,1.71117,-1.144057,-0.803475
ENSG00000000457,-1.885895,-2.056025,-0.693315,1.342271,1.631252,-1.902825,-1.485516,-2.356658,3.181447,2.094172,...,2.582993,2.004654,2.918376,1.886111,1.242073,1.184227,1.190373,0.773194,-2.302876,1.702888
ENSG00000000460,-1.269773,-2.088193,-2.868509,1.653372,1.307943,-2.114711,1.162425,-1.418749,2.363683,-1.48093,...,1.57785,1.156873,-1.480989,-1.38481,1.820841,1.872488,-1.279276,-1.054766,2.876039,-2.125799
ENSG00000000938,1.521316,-1.344366,-2.155603,1.313485,1.537092,4.348633,-0.60439,-1.816184,-1.049993,1.669054,...,1.369958,-0.446664,-3.131358,1.459237,1.007082,2.096814,0.932845,-2.452785,1.710248,1.730114
ENSG00000000971,-1.409875,-1.325785,-1.674708,1.316074,-1.145304,-0.788946,1.549621,1.224362,-0.670558,-1.915256,...,-1.882554,2.691985,-3.261818,2.756464,-1.038795,-0.576466,-3.605774,1.423109,-1.578721,-1.200703


# Save

In [28]:
# Results go to a "most_signif" subfolder of the configured S-PrediXcan z-scores
# folder; it is created if it does not exist yet.
output_folder = (
    Path(conf.PHENOMEXCAN["SPREDIXCAN_MASHR_ZSCORES_FOLDER"]) / "most_signif"
).resolve()
output_folder.mkdir(exist_ok=True)

In [29]:
# Full path of the pickle file that will hold the matrix.
output_file = (output_folder / "spredixcan-most_signif.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/most_signif/spredixcan-most_signif.pkl')

In [30]:
# Save the gene-by-trait matrix as a pickle file.
data_most_signif.to_pickle(output_file)