# Description

This notebook projects the PhenomeXcan S-MultiXcan results (EFO version) into the MultiPLIER latent space.

# Modules loading

In [1]:
# Reload imported modules automatically before executing each cell, so edits
# to the project modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from IPython.display import display
import pandas as pd

import conf
from entity import Gene
from data.cache import read_data
from multiplier import MultiplierProjection

# Settings

In [3]:
# Directory that receives the projection file written at the end of this
# notebook; it is created (including parents) if it does not exist yet.
RESULTS_PROJ_OUTPUT_DIR = Path(conf.RESULTS["PROJECTIONS_DIR"])
RESULTS_PROJ_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_PROJ_OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections')

# Load PhenomeXcan data (S-MultiXcan)

In [4]:
# Path to the S-MultiXcan z-scores file (EFO version), as defined in the
# project configuration.
smultixcan_results_filename = conf.PHENOMEXCAN[
    "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
]

display(smultixcan_results_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

In [5]:
# File name without its extension; reused below to name the output file.
results_filename_stem = smultixcan_results_filename.stem
display(results_filename_stem)

'smultixcan-efo_partial-mashr-zscores'

In [6]:
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source (here it is a path taken from the project configuration).
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

In [7]:
# (rows, columns): genes are in rows and traits are in columns.
smultixcan_results.shape

(22515, 3752)

In [8]:
# Peek at the first rows; at this point genes are identified by Ensembl IDs.
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
ENSG00000000457,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
ENSG00000000460,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
ENSG00000000938,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
ENSG00000000971,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


## Gene IDs to Gene names

In [9]:
# Replace gene IDs in the index with gene names. `rename` leaves labels that
# are not keys of the mapping unchanged.
# NOTE(review): confirm that Gene.GENE_ID_TO_NAME_MAP covers every gene ID.
smultixcan_results = smultixcan_results.rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [10]:
# Renaming index labels must not change the dimensions.
smultixcan_results.shape

(22515, 3752)

In [11]:
# The index should now show gene names instead of gene IDs.
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


## Remove duplicated gene entries

In [12]:
# Gene names that occur more than once in the index; with keep="first" only
# the second and later occurrences are flagged.
smultixcan_results.index[smultixcan_results.index.duplicated(keep="first")]

Index(['SPATA13', 'LINC01422', 'LINC00484', 'MAL2', 'GOLGA8M', 'LINC01115',
       'LYNX1'],
      dtype='object', name='gene_name')

In [13]:
# Keep only the first row found for every gene name.
_first_occurrence = ~smultixcan_results.index.duplicated(keep="first")
smultixcan_results = smultixcan_results.loc[_first_occurrence]

In [14]:
# Shape after dropping the repeated gene names.
smultixcan_results.shape

(22508, 3752)

## Some checks

In [15]:
# the data should have no NaN values; `isna()` only builds a boolean mask,
# whereas comparing shapes against `dropna()` copies the whole matrix
assert not smultixcan_results.isna().any(
    axis=None
), "S-MultiXcan results contain NaN values"

# Standardize S-MultiXcan results

Here we adjust for highly polygenic traits (see notebook `005_00-data_analysis.ipynb`): we penalize those traits that have large effect sizes across several genes, such as anthropometric traits.

In [16]:
# Scale every trait (column) by its total across all genes, so that the
# values of each trait sum to one.
_tmp = smultixcan_results.apply(
    lambda trait_values: trait_values / trait_values.sum(), axis=0
)

In [17]:
# Dimensions of the standardized data.
_tmp.shape

(22508, 3752)

In [18]:
# Standardization must preserve the dimensions of the data.
assert _tmp.shape == smultixcan_results.shape

In [19]:
# some testing: spot-check a few (gene, trait) pairs against the value
# recomputed directly from the non-standardized data
_checks = [
    ("SCYL3", "body height"),
    ("DPM1", "100001_raw-Food_weight"),
    ("CFH", "estrogen-receptor negative breast cancer"),
    ("C1orf112", "asthma"),
]

for _gene, _trait in _checks:
    _expected = (
        smultixcan_results.loc[_gene, _trait] / smultixcan_results[_trait].sum()
    )
    assert _tmp.loc[_gene, _trait] == _expected

In [20]:
# From here on, `smultixcan_results` holds the standardized values.
smultixcan_results = _tmp

In [21]:
# First rows of the standardized data.
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,6e-05,3.9e-05,5e-06,1.6e-05,6.1e-05,7.5e-05,4e-06,4.254814e-05,3.1e-05,1.7e-05,...,2e-05,7.5e-05,6.4e-05,4.7e-05,6.6e-05,6.6e-05,6.9e-05,5.8e-05,6.250232e-05,3.4e-05
SCYL3,3.3e-05,5.5e-05,0.000119,4.1e-05,5e-05,1.1e-05,5.9e-05,4.103114e-05,2e-05,3.6e-05,...,0.000118,7e-06,3e-05,7e-06,2.9e-05,0.000126,2.7e-05,0.000114,1.421837e-05,7e-05
C1orf112,2.7e-05,2.2e-05,6.7e-05,2.3e-05,2.2e-05,1.4e-05,6.7e-05,4.432186e-05,3e-05,4.1e-05,...,9.8e-05,5.5e-05,3e-05,2.3e-05,8.4e-05,6.8e-05,1.8e-05,2.4e-05,9.338414e-06,5.6e-05
FGR,1.5e-05,1.4e-05,4.7e-05,1.9e-05,3e-06,1e-05,8e-06,5.350929e-07,1.9e-05,2e-05,...,3.6e-05,0.000114,0.000152,6e-06,5.7e-05,3.8e-05,3.5e-05,5e-05,1.969101e-05,5.8e-05
CFH,2.9e-05,2.1e-05,3.9e-05,6.3e-05,1.7e-05,1.8e-05,8.8e-05,1.668206e-05,1.7e-05,9.5e-05,...,1.4e-05,3.6e-05,6e-05,3.8e-05,8.3e-05,1.5e-05,5e-06,8.1e-05,8.895497e-11,3.6e-05


# Project S-MultiXcan data into MultiPLIER latent space

In [22]:
# Projector from the project-local `multiplier` module.
# NOTE(review): presumably loads the MultiPLIER model on construction — confirm.
mproj = MultiplierProjection()

In [23]:
# Project the standardized genes x traits matrix; the result (displayed below)
# has latent variables (LVs) in rows and traits in columns.
smultixcan_into_multiplier = mproj.transform(smultixcan_results)

In [24]:
# (rows, columns) of the projected data.
smultixcan_into_multiplier.shape

(987, 3752)

In [25]:
# First rows of the projected data.
smultixcan_into_multiplier.head()

Unnamed: 0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
LV1,-0.021292,-0.046815,-0.021585,-0.030324,-0.017773,-0.007844,-0.015529,-0.011609,-0.051342,-0.036813,...,-0.001614,0.002102,-0.005318,-0.003041,0.012873,-0.015951,-0.018904,0.009576,0.019755,0.010027
LV2,0.056061,-0.009864,-0.028888,-0.053573,0.006951,-0.054597,-0.030748,0.014519,-0.026933,0.009024,...,-0.029614,-0.033636,0.020768,0.010656,-0.013305,0.050738,0.012085,-0.032372,-0.02454,-0.002616
LV3,0.00165,-0.004262,0.019211,0.007484,0.004548,0.006548,0.006741,-0.005634,-0.022129,-0.017256,...,0.010539,-0.019591,-0.013391,-0.044369,0.020142,0.006052,0.020869,0.043931,0.058049,-0.013144
LV4,0.026265,-0.007235,0.004304,-0.042464,-0.001093,-0.041208,-0.051179,0.00906,-0.003999,-0.005874,...,-0.018162,0.012136,-0.046232,-0.030949,-0.040147,0.044545,0.009865,-0.049828,-0.013725,0.024988
LV5,-0.015616,0.010799,0.017792,0.010591,0.001972,0.040024,-0.027612,-0.00154,0.033072,0.009477,...,0.030632,-0.023253,0.015853,-0.002222,0.019385,-0.015501,-0.031477,-0.002554,-0.03262,0.017731


# Quick analysis

In [26]:
# Top 20 traits for LV603, from highest to lowest projected value.
smultixcan_into_multiplier.loc["LV603"].sort_values(ascending=False).iloc[:20]

30220_raw-Basophill_percentage                                   0.181608
30130_raw-Monocyte_count                                         0.158166
30000_raw-White_blood_cell_leukocyte_count                       0.146061
myeloid white cell count                                         0.133998
leukocyte count                                                  0.131486
sum of neutrophil and eosinophil counts                          0.128405
granulocyte count                                                0.128316
sum of basophil and neutrophil counts                            0.126586
neutrophil count                                                 0.125868
30160-Basophill_count                                            0.122953
30180_raw-Lymphocyte_percentage                                  0.119818
30140_raw-Neutrophill_count                                      0.113055
monocyte count                                                   0.113054
20003_1140875420-Treatmentmedication_c

In [27]:
# Top 20 traits for LV136, from highest to lowest projected value.
smultixcan_into_multiplier.loc["LV136"].sort_values(ascending=False).iloc[:20]

coronary artery disease                                           0.210545
5132_raw-3mm_strong_meridian_right                                0.179597
5134_raw-6mm_strong_meridian_left                                 0.167916
5099_raw-3mm_weak_meridian_right                                  0.161459
5133_raw-6mm_strong_meridian_right                                0.160688
5098_raw-6mm_weak_meridian_right                                  0.157502
I9_CORATHER-Coronary_atherosclerosis                              0.154131
5097_raw-6mm_weak_meridian_left                                   0.150595
5096_raw-3mm_weak_meridian_left                                   0.150400
5135_raw-3mm_strong_meridian_left                                 0.149901
I9_IHD-Ischaemic_heart_disease_wide_definition                    0.148814
I9_MI_STRICT-Myocardial_infarction_strict                         0.125965
I9_MI-Myocardial_infarction                                       0.125965
acute myocardial infarcti

In [28]:
# Top 20 traits for LV844, from highest to lowest projected value.
smultixcan_into_multiplier.loc["LV844"].sort_values(ascending=False).iloc[:20]

celiac disease                                                                                  0.714810
rheumatoid arthritis                                                                            0.600279
K11_COELIAC-Coeliac_disease                                                                     0.600040
malabsorption syndrome                                                                          0.570880
2986-Started_insulin_within_one_year_diagnosis_of_diabetes                                      0.350327
6144_3-Never_eat_eggs_dairy_wheat_sugar_Wheat_products                                          0.338204
systemic lupus erythematosus                                                                    0.327493
hyperthyroidism AND thyrotoxicosis                                                              0.309382
20003_1140883066-Treatmentmedication_code_insulin_product                                       0.289095
psoriasis                                              

# Save

In [29]:
# Absolute path of the output file, named after the input file's stem.
_output_name = f"projection-{results_filename_stem}.pkl"
output_file = (RESULTS_PROJ_OUTPUT_DIR / _output_name).resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [30]:
# Write the projected data to `output_file` in pickle format.
smultixcan_into_multiplier.to_pickle(output_file)