# Description

For this projection, this notebook takes **ALL non-zero gene loadings** from each latent variable (LV) in the MultiPLIER model.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pickle

import numpy as np
import pandas as pd

import conf
from data.cache import read_data

# Settings

In [3]:
# Folder that receives the projection written by this notebook; its location
# is read from the project configuration.
RESULTS_PROJ_OUTPUT_DIR = Path(conf.RESULTS['PROJECTIONS_DIR'])

# Create the folder together with any missing parents; nothing happens when it
# is already there.
RESULTS_PROJ_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

display(RESULTS_PROJ_OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections')

# Read gene mappings

In [4]:
# Gene-identifier lookup tables, loaded with the project's `read_data` helper
# from the locations configured under `conf.PHENOMEXCAN`.
# The ID-to-name table is later passed to `DataFrame.rename(index=...)`, so it
# is presumably a dict-like mapping of Ensembl gene IDs to gene names — TODO confirm.
GENE_ID_TO_NAME_MAP = read_data(conf.PHENOMEXCAN["GENE_MAP_ID_TO_NAME"])
# NOTE(review): the reverse table is not referenced again in the visible cells;
# confirm it is still needed here.
GENE_NAME_TO_ID_MAP = read_data(conf.PHENOMEXCAN["GENE_MAP_NAME_TO_ID"])

# Load PhenomeXcan data (S-MultiXcan)

In [5]:
# Location of the S-MultiXcan results to project, taken from the project
# configuration; displayed so the executed notebook records which file was used.
smultixcan_results_filename = conf.PHENOMEXCAN["SMULTIXCAN_MASHR_ZSCORES_FILE"]

display(smultixcan_results_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-mashr-zscores.pkl')

In [6]:
# Keep the input file name without its extension; it is reused to build the
# name of the output file in the "Save" section.
results_filename_stem = smultixcan_results_filename.stem
display(results_filename_stem)

'smultixcan-mashr-zscores'

In [7]:
# Load the results matrix (genes as rows, traits as columns, as the preview
# cells below show).
# NOTE(review): unpickling can execute arbitrary code, so this path must only
# ever point at a trusted, project-generated file.
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

In [8]:
# Sanity check on the loaded matrix: (number of genes, number of traits).
smultixcan_results.shape

(22515, 4091)

In [9]:
# Preview of the raw data: rows are still labelled with Ensembl gene IDs here.
smultixcan_results.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


## Gene IDs to Gene names

In [10]:
# Relabel the row index with the ID-to-name table. Assuming the table is a
# dict/Series, `rename` leaves any label missing from it unchanged, and
# distinct IDs that share a name end up as repeated index labels (these are
# handled in the "Remove duplicated gene entries" section).
smultixcan_results = smultixcan_results.rename(index=GENE_ID_TO_NAME_MAP)

In [11]:
# Renaming only relabels rows, so the shape should match the one displayed
# before the rename.
smultixcan_results.shape

(22515, 4091)

In [12]:
# Preview after the rename: the index should now show gene names.
smultixcan_results.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
SCYL3,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
C1orf112,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
FGR,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
CFH,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


## Remove duplicated gene entries

In [13]:
# List the gene names that occur more than once in the index. With
# keep='first' only the second and later occurrences are flagged, so the first
# row of each repeated name is not part of this listing.
smultixcan_results.index[smultixcan_results.index.duplicated(keep='first')]

Index(['SPATA13', 'LINC01422', 'LINC00484', 'MAL2', 'GOLGA8M', 'LINC01115',
       'LYNX1'],
      dtype='object', name='gene_name')

In [14]:
# Boolean row filter: keep the first row seen for each gene name and discard
# every later repetition of that name.
smultixcan_results = smultixcan_results[
    ~smultixcan_results.index.duplicated(keep="first")
]

In [15]:
# Row count after de-duplication; only rows were filtered, so the number of
# trait columns should be unchanged.
smultixcan_results.shape

(22508, 4091)

## Remove NaN values

**TODO**: it might be better to try to impute these values rather than dropping the affected genes.

In [16]:
# Keep only the genes (rows) that have a value for every trait: one missing
# entry is enough to discard the whole row.
smultixcan_results = smultixcan_results.dropna(axis="index", how="any")

In [17]:
# Row count after discarding genes with missing values; this is the matrix
# that gets projected in the next section.
smultixcan_results.shape

(22145, 4091)

# Project S-MultiXcan data into MultiPLIER latent space

In [18]:
# NOTE(review): every other import lives in the imports cell at the top of the
# notebook; consider moving this one there so all dependencies are declared in
# a single place.
from multiplier import MultiplierProjection

In [19]:
# Projection helper created with its default configuration (no arguments).
mproj = MultiplierProjection()

In [20]:
# Project the gene x trait matrix into the MultiPLIER latent space. Judging by
# the result displayed below, the output is indexed by latent variable
# (LV1, LV2, ...) and keeps the traits as columns.
# NOTE(review): how `transform` matches input genes to the model's genes is not
# visible from this notebook — confirm in the `multiplier` module.
smultixcan_into_multiplier = mproj.transform(smultixcan_results)

In [21]:
# Sanity check on the projection: (number of latent variables, number of traits).
smultixcan_into_multiplier.shape

(987, 4091)

In [22]:
# Preview of the projected values for the first latent variables.
smultixcan_into_multiplier.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
LV1,-0.026255,0.006045,0.007276,-0.02974,-0.001949,0.009288,0.006488,0.003354,-0.021751,-0.039159,...,0.036648,0.094062,0.047505,-0.035089,-0.0366,0.185595,-0.031841,-0.011883,0.033163,0.061374
LV2,-0.006253,-0.023996,0.037932,-1.8e-05,-0.016128,-0.033736,-0.017215,-0.004558,0.079234,-0.040277,...,0.085747,-0.064254,0.022124,-0.008441,-0.049594,0.074838,-0.057865,-0.007585,0.00358,-0.025216
LV3,-0.002729,0.051637,-0.013182,-0.001663,0.035221,-0.016019,0.000484,-0.046965,-0.000498,-0.041794,...,0.068618,-0.034395,0.145881,0.031539,-0.03174,0.015501,-0.028148,-0.048226,0.030093,-0.026938
LV4,0.042319,-0.023236,0.063016,-0.002628,-0.046487,0.018004,-0.029841,-0.023949,-0.026485,0.022759,...,-0.070986,-0.02896,0.015369,0.051671,0.014078,-0.147863,0.034195,0.068829,-0.023213,-0.089619
LV5,-0.025975,0.002167,-0.022646,0.004559,0.004649,0.02844,-0.024379,-0.028536,0.019263,-0.026291,...,0.008842,-0.000932,0.004195,0.029759,-0.04376,0.057031,-0.002836,-0.046215,0.099309,0.075663


# Quick analysis

In [23]:
# The 20 traits with the highest projected value on LV603, largest first.
smultixcan_into_multiplier.loc['LV603'].sort_values(ascending=False).iloc[:20]

30000_raw-White_blood_cell_leukocyte_count           0.347896
30130_raw-Monocyte_count                             0.317362
50_raw-Standing_height                               0.305480
30100_raw-Mean_platelet_thrombocyte_volume           0.300201
30180_raw-Lymphocyte_percentage                      0.299121
Astle_et_al_2016_White_blood_cell_count              0.292459
Astle_et_al_2016_Monocyte_count                      0.291476
30140_raw-Neutrophill_count                          0.291155
Astle_et_al_2016_Myeloid_white_cell_count            0.283702
Astle_et_al_2016_Granulocyte_count                   0.265380
30200_raw-Neutrophill_percentage                     0.265349
Astle_et_al_2016_Sum_neutrophil_eosinophil_counts    0.264932
Astle_et_al_2016_Sum_basophil_neutrophil_counts      0.259895
Astle_et_al_2016_Neutrophil_count                    0.258384
30150-Eosinophill_count                              0.252839
30090_raw-Platelet_crit                              0.233135
30080_ra

In [24]:
# The 20 traits with the highest projected value on LV136, largest first.
smultixcan_into_multiplier.loc['LV136'].sort_values(ascending=False).iloc[:20]

5132_raw-3mm_strong_meridian_right                                0.270901
5134_raw-6mm_strong_meridian_left                                 0.252899
5099_raw-3mm_weak_meridian_right                                  0.249057
5133_raw-6mm_strong_meridian_right                                0.240017
5098_raw-6mm_weak_meridian_right                                  0.237263
5096_raw-3mm_weak_meridian_left                                   0.234509
5097_raw-6mm_weak_meridian_left                                   0.231357
5135_raw-3mm_strong_meridian_left                                 0.228634
CARDIoGRAM_C4D_CAD_ADDITIVE                                       0.223093
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease      0.175861
I9_CORATHER-Coronary_atherosclerosis                              0.175774
I9_IHD-Ischaemic_heart_disease_wide_definition                    0.174624
4079_raw-Diastolic_blood_pressure_automated_reading               0.173653
I9_MI_STRICT-Myocardial_i

# Save

In [25]:
# Absolute path of the output pickle: it lives in the projections folder and
# is named after the input file's stem.
output_file = (
    RESULTS_PROJ_OUTPUT_DIR / f'projection-{results_filename_stem}.pkl'
).resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-smultixcan-mashr-zscores.pkl')

In [26]:
# Persist the projected matrix as a pickle; an existing file at this path is
# overwritten.
smultixcan_into_multiplier.to_pickle(output_file)