# Description

This notebook reads .h5 files generated in PhenomeXcan by [this notebook](https://github.com/hakyimlab/phenomexcan/blob/master/scripts/100_postprocessing/05_spredixcan.ipynb), and saves one file per tissue with the results in pandas DataFrame format (genes in rows, traits in columns). It saves these in two formats: pickle and tsv.gz

The notebook will generate two other folders in the parent of `SPREDIXCAN_H5_FOLDER`: `pkl` and `tsv`

**The idea** is to have the data in a friendly format.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

import conf
from data.cache import read_data
from data.hdf5 import simplify_trait_fullcode, HDF5_FILE_PATTERN

# Settings

In [3]:
SPREDIXCAN_H5_FOLDER = Path(
    conf.PHENOMEXCAN["SPREDIXCAN_MASHR_ZSCORES_FOLDER"],
    "hdf5",
)
display(SPREDIXCAN_H5_FOLDER)
assert SPREDIXCAN_H5_FOLDER.is_dir(), "The folder does not exist"

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5')

In [4]:
spredixcan_pkl_output_folder = Path(SPREDIXCAN_H5_FOLDER.parent, "pkl").resolve()
spredixcan_pkl_output_folder.mkdir(exist_ok=True)
display(spredixcan_pkl_output_folder)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/pkl')

In [5]:
spredixcan_tsv_output_folder = Path(SPREDIXCAN_H5_FOLDER.parent, "tsv").resolve()
spredixcan_tsv_output_folder.mkdir(exist_ok=True)
display(spredixcan_tsv_output_folder)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/tsv')

# Read S-PrediXcan results

## Get list of files

In [6]:
from glob import glob

In [7]:
spredixcan_files = list(SPREDIXCAN_H5_FOLDER.glob("*.h5"))

In [8]:
display(spredixcan_files[:5])
assert len(spredixcan_files) == 49

[PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Vagina-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Ovary-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Cells_Cultured_fibroblasts-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Pituitary-zscore.h5'),
 PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/spredixcan/hdf5/spredixcan-Spleen-zscore.h5')]

## Get tissue name from file list

In [9]:
import re

In [10]:
# some testing
match = re.search(HDF5_FILE_PATTERN, "spredixcan-Esophagus_Muscularis-zscore.h5")
assert match.group("tissue") == "Esophagus_Muscularis"

match = re.search(
    HDF5_FILE_PATTERN, "spredixcan-Brain_Anterior_cingulate_cortex_BA24-zscore.h5"
)
assert match.group("tissue") == "Brain_Anterior_cingulate_cortex_BA24"

# Load S-PrediXcan results

## Get all PhenomeXcan trait full codes

In [11]:
from entity import Trait

In [12]:
all_phenomexcan_traits = {
    trait_fullcode
    for trait_fullcode in read_data(
        conf.PHENOMEXCAN["SMULTIXCAN_MASHR_ZSCORES_FILE"]
    ).columns
}

In [13]:
display(len(all_phenomexcan_traits))
assert len(all_phenomexcan_traits) == 4091

4091

## Read all results

In [14]:
for f_idx, f in enumerate(spredixcan_files):
    f_tissue = re.search(HDF5_FILE_PATTERN, f.name).group("tissue")
    print(f"{f_idx}. {f.name}")

    with pd.HDFStore(f, mode="r") as store:
        traits_keys = list(store.keys())
        assert len(traits_keys) == len(all_phenomexcan_traits)

        store_data = {}

        for trait_fullcode in all_phenomexcan_traits:
            trait_hdf5 = simplify_trait_fullcode(trait_fullcode)

            trait_data = store[trait_hdf5].rename_axis("gene_id")

            store_data[trait_fullcode] = trait_data

        df = pd.DataFrame(store_data)
        assert df.shape[1] == len(all_phenomexcan_traits)
        assert df.index.is_unique

        # output filename
        output_filename_prefix = f"spredixcan-mashr-zscores-{f_tissue}"

        # Save pickle
        df.to_pickle(
            Path(spredixcan_pkl_output_folder, f"{output_filename_prefix}.pkl")
        )

        # Save tsv
        df.to_csv(
            Path(spredixcan_tsv_output_folder, f"{output_filename_prefix}.tsv.gz"),
            sep="\t",
            float_format="%.5e",
        )

0. spredixcan-Vagina-zscore.h5


1. spredixcan-Ovary-zscore.h5


2. spredixcan-Cells_Cultured_fibroblasts-zscore.h5


3. spredixcan-Pituitary-zscore.h5


4. spredixcan-Spleen-zscore.h5


5. spredixcan-Heart_Left_Ventricle-zscore.h5


6. spredixcan-Brain_Substantia_nigra-zscore.h5


7. spredixcan-Brain_Hypothalamus-zscore.h5


8. spredixcan-Artery_Coronary-zscore.h5


9. spredixcan-Liver-zscore.h5


10. spredixcan-Muscle_Skeletal-zscore.h5


11. spredixcan-Artery_Tibial-zscore.h5


12. spredixcan-Esophagus_Gastroesophageal_Junction-zscore.h5


13. spredixcan-Adipose_Visceral_Omentum-zscore.h5


14. spredixcan-Skin_Sun_Exposed_Lower_leg-zscore.h5


15. spredixcan-Stomach-zscore.h5


16. spredixcan-Brain_Spinal_cord_cervical_c-1-zscore.h5


17. spredixcan-Artery_Aorta-zscore.h5


18. spredixcan-Esophagus_Muscularis-zscore.h5


19. spredixcan-Esophagus_Mucosa-zscore.h5


20. spredixcan-Brain_Anterior_cingulate_cortex_BA24-zscore.h5


21. spredixcan-Brain_Cerebellar_Hemisphere-zscore.h5


22. spredixcan-Adipose_Subcutaneous-zscore.h5


23. spredixcan-Brain_Cortex-zscore.h5


24. spredixcan-Pancreas-zscore.h5


25. spredixcan-Brain_Cerebellum-zscore.h5


26. spredixcan-Brain_Nucleus_accumbens_basal_ganglia-zscore.h5


27. spredixcan-Testis-zscore.h5


28. spredixcan-Brain_Hippocampus-zscore.h5


29. spredixcan-Adrenal_Gland-zscore.h5


30. spredixcan-Prostate-zscore.h5


31. spredixcan-Brain_Frontal_Cortex_BA9-zscore.h5


32. spredixcan-Lung-zscore.h5


33. spredixcan-Whole_Blood-zscore.h5


34. spredixcan-Skin_Not_Sun_Exposed_Suprapubic-zscore.h5


35. spredixcan-Colon_Transverse-zscore.h5


36. spredixcan-Brain_Putamen_basal_ganglia-zscore.h5


37. spredixcan-Heart_Atrial_Appendage-zscore.h5


38. spredixcan-Uterus-zscore.h5


39. spredixcan-Nerve_Tibial-zscore.h5


40. spredixcan-Brain_Amygdala-zscore.h5


41. spredixcan-Breast_Mammary_Tissue-zscore.h5


42. spredixcan-Cells_EBV-transformed_lymphocytes-zscore.h5


43. spredixcan-Colon_Sigmoid-zscore.h5


44. spredixcan-Small_Intestine_Terminal_Ileum-zscore.h5


45. spredixcan-Thyroid-zscore.h5


46. spredixcan-Minor_Salivary_Gland-zscore.h5


47. spredixcan-Brain_Caudate_basal_ganglia-zscore.h5


48. spredixcan-Kidney_Cortex-zscore.h5


# Testing

## List of traits match those in S-MultiXcan

In [15]:
_phenomexcan_trait_fullcodes = pd.Index(all_phenomexcan_traits)
display(_phenomexcan_trait_fullcodes)
assert _phenomexcan_trait_fullcodes.is_unique

Index(['22604_1-Work_hours_lumped_category_15_to_lessthan20_hours',
       '22601_54342896-Job_coding_chef_cook_caterer',
       '21001_raw-Body_mass_index_BMI',
       '2654_6-Nonbutter_spread_type_details_Olive_oil_based_spread_eg_Bertolli',
       '40001_C259-Underlying_primary_cause_of_death_ICD10_C259_Pancreas_unspecified',
       '4803_11-Tinnitus_Yes_now_most_or_all_of_the_time',
       '20090_372-Type_of_fatoil_used_in_cooking_Very_low_fat_polyunsaturated_margarine',
       '20002_1625-Noncancer_illness_code_selfreported_cellulitis',
       'G57-Diagnoses_main_ICD10_G57_Mononeuropathies_of_lower_limb',
       '22601_53142540-Job_coding_heating_engineer_gas_service_engineer_air_conditioning_engineer',
       ...
       '6152_7-Blood_clot_DVT_bronchitis_emphysema_asthma_rhinitis_eczema_allergy_diagnosed_by_doctor_Blood_clot_in_the_lung',
       '5101_raw-6mm_weak_meridian_angle_right',
       '22601_41423445-Job_coding_emergency_services_control_roomradio_operatorofficer',
      

### pickle

In [16]:
output_filename_prefix = f"spredixcan-mashr-zscores-Adipose_Subcutaneous"
filepath = Path(spredixcan_pkl_output_folder, f"{output_filename_prefix}.pkl")
_spredixcan_traits = pd.read_pickle(filepath).columns

In [17]:
_spredixcan_traits

Index(['22604_1-Work_hours_lumped_category_15_to_lessthan20_hours',
       '22601_54342896-Job_coding_chef_cook_caterer',
       '21001_raw-Body_mass_index_BMI',
       '2654_6-Nonbutter_spread_type_details_Olive_oil_based_spread_eg_Bertolli',
       '40001_C259-Underlying_primary_cause_of_death_ICD10_C259_Pancreas_unspecified',
       '4803_11-Tinnitus_Yes_now_most_or_all_of_the_time',
       '20090_372-Type_of_fatoil_used_in_cooking_Very_low_fat_polyunsaturated_margarine',
       '20002_1625-Noncancer_illness_code_selfreported_cellulitis',
       'G57-Diagnoses_main_ICD10_G57_Mononeuropathies_of_lower_limb',
       '22601_53142540-Job_coding_heating_engineer_gas_service_engineer_air_conditioning_engineer',
       ...
       '6152_7-Blood_clot_DVT_bronchitis_emphysema_asthma_rhinitis_eczema_allergy_diagnosed_by_doctor_Blood_clot_in_the_lung',
       '5101_raw-6mm_weak_meridian_angle_right',
       '22601_41423445-Job_coding_emergency_services_control_roomradio_operatorofficer',
      

In [18]:
assert _spredixcan_traits.is_unique

In [19]:
_tmp = _phenomexcan_trait_fullcodes.intersection(_spredixcan_traits)
display(_tmp)
assert _tmp.shape[0] == _phenomexcan_trait_fullcodes.shape[0]

Index(['22604_1-Work_hours_lumped_category_15_to_lessthan20_hours',
       '22601_54342896-Job_coding_chef_cook_caterer',
       '21001_raw-Body_mass_index_BMI',
       '2654_6-Nonbutter_spread_type_details_Olive_oil_based_spread_eg_Bertolli',
       '40001_C259-Underlying_primary_cause_of_death_ICD10_C259_Pancreas_unspecified',
       '4803_11-Tinnitus_Yes_now_most_or_all_of_the_time',
       '20090_372-Type_of_fatoil_used_in_cooking_Very_low_fat_polyunsaturated_margarine',
       '20002_1625-Noncancer_illness_code_selfreported_cellulitis',
       'G57-Diagnoses_main_ICD10_G57_Mononeuropathies_of_lower_limb',
       '22601_53142540-Job_coding_heating_engineer_gas_service_engineer_air_conditioning_engineer',
       ...
       '6152_7-Blood_clot_DVT_bronchitis_emphysema_asthma_rhinitis_eczema_allergy_diagnosed_by_doctor_Blood_clot_in_the_lung',
       '5101_raw-6mm_weak_meridian_angle_right',
       '22601_41423445-Job_coding_emergency_services_control_roomradio_operatorofficer',
      

### tsv.gz

In [20]:
output_filename_prefix = f"spredixcan-mashr-zscores-Adipose_Visceral_Omentum"
filepath = Path(spredixcan_tsv_output_folder, f"{output_filename_prefix}.tsv.gz")
_spredixcan_traits = pd.read_csv(filepath, sep="\t", index_col="gene_id").columns

In [21]:
_spredixcan_traits

Index(['22604_1-Work_hours_lumped_category_15_to_lessthan20_hours',
       '22601_54342896-Job_coding_chef_cook_caterer',
       '21001_raw-Body_mass_index_BMI',
       '2654_6-Nonbutter_spread_type_details_Olive_oil_based_spread_eg_Bertolli',
       '40001_C259-Underlying_primary_cause_of_death_ICD10_C259_Pancreas_unspecified',
       '4803_11-Tinnitus_Yes_now_most_or_all_of_the_time',
       '20090_372-Type_of_fatoil_used_in_cooking_Very_low_fat_polyunsaturated_margarine',
       '20002_1625-Noncancer_illness_code_selfreported_cellulitis',
       'G57-Diagnoses_main_ICD10_G57_Mononeuropathies_of_lower_limb',
       '22601_53142540-Job_coding_heating_engineer_gas_service_engineer_air_conditioning_engineer',
       ...
       '6152_7-Blood_clot_DVT_bronchitis_emphysema_asthma_rhinitis_eczema_allergy_diagnosed_by_doctor_Blood_clot_in_the_lung',
       '5101_raw-6mm_weak_meridian_angle_right',
       '22601_41423445-Job_coding_emergency_services_control_roomradio_operatorofficer',
      

In [22]:
assert _spredixcan_traits.is_unique

In [23]:
_tmp = _phenomexcan_trait_fullcodes.intersection(_spredixcan_traits)
display(_tmp)
assert _tmp.shape[0] == _phenomexcan_trait_fullcodes.shape[0]

Index(['22604_1-Work_hours_lumped_category_15_to_lessthan20_hours',
       '22601_54342896-Job_coding_chef_cook_caterer',
       '21001_raw-Body_mass_index_BMI',
       '2654_6-Nonbutter_spread_type_details_Olive_oil_based_spread_eg_Bertolli',
       '40001_C259-Underlying_primary_cause_of_death_ICD10_C259_Pancreas_unspecified',
       '4803_11-Tinnitus_Yes_now_most_or_all_of_the_time',
       '20090_372-Type_of_fatoil_used_in_cooking_Very_low_fat_polyunsaturated_margarine',
       '20002_1625-Noncancer_illness_code_selfreported_cellulitis',
       'G57-Diagnoses_main_ICD10_G57_Mononeuropathies_of_lower_limb',
       '22601_53142540-Job_coding_heating_engineer_gas_service_engineer_air_conditioning_engineer',
       ...
       '6152_7-Blood_clot_DVT_bronchitis_emphysema_asthma_rhinitis_eczema_allergy_diagnosed_by_doctor_Blood_clot_in_the_lung',
       '5101_raw-6mm_weak_meridian_angle_right',
       '22601_41423445-Job_coding_emergency_services_control_roomradio_operatorofficer',
      

## Values

Tests taken from: https://github.com/hakyimlab/phenomexcan/blob/master/scripts/100_postprocessing/05_spredixcan.ipynb

In [24]:
# pkl
output_filename_prefix = f"spredixcan-mashr-zscores-Thyroid"
filepath = Path(spredixcan_pkl_output_folder, f"{output_filename_prefix}.pkl")
df = pd.read_pickle(filepath)[
    "N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria"
]

assert df.shape[0] == 15289
assert df.loc["ENSG00000213965"] == -3.6753054157625686
assert pd.isnull(df.loc["ENSG00000198670"])
assert df.loc["ENSG00000177025"] == 4.316259089446458

In [25]:
# tsv.gz
output_filename_prefix = f"spredixcan-mashr-zscores-Thyroid"
filepath = Path(spredixcan_tsv_output_folder, f"{output_filename_prefix}.tsv.gz")
df = pd.read_csv(filepath, sep="\t", index_col="gene_id")[
    "N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria"
]

assert df.shape[0] == 15289
assert df.loc["ENSG00000213965"].round(5) == -3.67531
assert pd.isnull(df.loc["ENSG00000198670"])
assert df.loc["ENSG00000177025"].round(5) == 4.31626

Check if small values in tsv.gz are correctly saved:

In [26]:
# tsv.gz
output_filename_prefix = f"spredixcan-mashr-zscores-Adipose_Subcutaneous"
filepath = Path(spredixcan_tsv_output_folder, f"{output_filename_prefix}.tsv.gz")
df = pd.read_csv(filepath, sep="\t", index_col="gene_id")

assert (
    df.loc[
        "ENSG00000002746",
        "20003_1141153242-Treatmentmedication_code_balsalazide_disodium",
    ].round(5)
    == 0.00327
)
assert df.loc["ENSG00000074706", "MAGNETIC_HDL.C"] == 0.00
assert (
    np.format_float_scientific(
        df.loc[
            "ENSG00000164112",
            "N13-Diagnoses_main_ICD10_N13_Obstructive_and_reflux_uropathy",
        ],
        5,
    )
    == "-1.80052e-07"
)
assert (
    np.format_float_scientific(
        df.loc[
            "ENSG00000257467",
            "20411_0-Ever_been_injured_or_injured_someone_else_through_drinking_alcohol_No",
        ],
        5,
    )
    == "-3.89826e-09"
)

More tests taken from the webapp:

Standing height

In [27]:
output_filename_prefix = f"spredixcan-mashr-zscores-Whole_Blood"
filepath = Path(spredixcan_pkl_output_folder, f"{output_filename_prefix}.pkl")
_tmp = pd.read_pickle(filepath)["50_raw-Standing_height"]
assert _tmp.shape == (12610,)

In [28]:
assert _tmp.loc["ENSG00000101019"].round(3) == -34.024

In [29]:
assert _tmp.loc["ENSG00000109805"].round(3) == -22.855

In [30]:
assert _tmp.loc["ENSG00000177311"].round(3) == 33.819

In [31]:
output_filename_prefix = f"spredixcan-mashr-zscores-Whole_Blood"
filepath = Path(spredixcan_tsv_output_folder, f"{output_filename_prefix}.tsv.gz")
_tmp = pd.read_csv(filepath, sep="\t", index_col="gene_id")["50_raw-Standing_height"]
assert _tmp.shape == (12610,)

In [32]:
assert _tmp.loc["ENSG00000101019"].round(3) == -34.024

In [33]:
assert _tmp.loc["ENSG00000109805"].round(3) == -22.855

In [34]:
assert _tmp.loc["ENSG00000177311"].round(3) == 33.819

Schizophrenia

In [35]:
output_filename_prefix = f"spredixcan-mashr-zscores-Prostate"
filepath = Path(spredixcan_pkl_output_folder, f"{output_filename_prefix}.pkl")
_tmp = pd.read_pickle(filepath)["pgc.scz2"]

In [36]:
assert _tmp.loc["ENSG00000233822"].round(3) == 10.752

In [37]:
assert _tmp.loc["ENSG00000137312"].round(3) == -8.827

In [38]:
assert _tmp.loc["ENSG00000204257"].round(3) == -7.965

In [39]:
output_filename_prefix = f"spredixcan-mashr-zscores-Prostate"
filepath = Path(spredixcan_tsv_output_folder, f"{output_filename_prefix}.tsv.gz")
_tmp = pd.read_csv(filepath, sep="\t", index_col="gene_id")["pgc.scz2"]

In [40]:
assert _tmp.loc["ENSG00000233822"].round(3) == 10.752

In [41]:
assert _tmp.loc["ENSG00000137312"].round(3) == -8.827

In [42]:
assert _tmp.loc["ENSG00000204257"].round(3) == -7.965