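# Description

This notebook projects the PhenomeXcan gene-trait associations (S-MultiXcan z-scores) into the MultiPLIER latent space. It loads the S-MultiXcan results, maps gene IDs to gene names, removes duplicated gene entries and genes with missing values, runs the projection, and saves the resulting latent variable-by-trait matrix as a pickle file.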

# Modules loading

In [1]:
# Automatically reload imported modules before running each cell, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pickle

import numpy as np
import pandas as pd

import conf
from data.cache import read_data

# Settings

In [3]:
# The percentile name indicates the top percentage of genes retained.
# NOTE(review): 'pALL' presumably means that all genes are kept (no filtering);
# this constant is not referenced again in the cells shown here, so confirm it
# is still needed.
PERCENTILE_NAME = 'pALL'

display(PERCENTILE_NAME)

'pALL'

In [4]:
# Output directory for the projected data, taken from the project settings.
# It is created, together with any missing parents, if it does not exist yet.
RESULTS_PROJ_OUTPUT_DIR = Path(conf.RESULTS['PROJECTIONS_DIR'])
RESULTS_PROJ_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

display(RESULTS_PROJ_OUTPUT_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/projections')

# Read gene mappings

In [5]:
# Load both gene mappings (gene ID -> gene name, and gene name -> gene ID) from
# the locations configured in the project settings.
GENE_ID_TO_NAME_MAP, GENE_NAME_TO_ID_MAP = (
    read_data(conf.PHENOMEXCAN[mapping_key])
    for mapping_key in ("GENE_MAP_ID_TO_NAME", "GENE_MAP_NAME_TO_ID")
)

# Load PhenomeXcan data (S-MultiXcan)

In [6]:
# Path to the S-MultiXcan z-scores file, as configured in the project settings.
smultixcan_results_filename = conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]

display(smultixcan_results_filename)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

In [7]:
# File name without directory and extension; reused below to name the output file.
results_filename_stem = smultixcan_results_filename.stem
display(results_filename_stem)

'smultixcan-efo_partial-mashr-zscores'

In [8]:
# Load the S-MultiXcan results from the pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from trusted sources.
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

In [9]:
# Dimensions of the loaded data (rows are genes, columns are traits).
smultixcan_results.shape

(22515, 3749)

In [10]:
# Preview the first rows; at this point the index holds gene IDs (ENSG...).
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
ENSG00000000457,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
ENSG00000000460,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
ENSG00000000938,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
ENSG00000000971,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


## Gene IDs to Gene names

In [11]:
# Translate the row labels from gene IDs to gene names using the loaded mapping.
smultixcan_results = smultixcan_results.rename(GENE_ID_TO_NAME_MAP, axis='index')

In [12]:
# Sanity check: relabeling the index should leave the dimensions unchanged.
smultixcan_results.shape

(22515, 3749)

In [13]:
# Preview again: the index now shows gene names instead of gene IDs.
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


## Remove duplicated gene entries

In [14]:
# Gene names that occur more than once in the index (every occurrence after the first).
smultixcan_results.index[smultixcan_results.index.duplicated(keep='first')]

Index(['SPATA13', 'LINC01422', 'LINC00484', 'MAL2', 'GOLGA8M', 'LINC01115',
       'LYNX1'],
      dtype='object', name='gene_name')

In [15]:
# Keep only the first row of each repeated gene name; later occurrences are discarded.
smultixcan_results = smultixcan_results.loc[~smultixcan_results.index.duplicated(keep='first')]

In [16]:
# Dimensions after removing the duplicated gene entries.
smultixcan_results.shape

(22508, 3749)

## Remove NaN values

**TODO**: it might be better to try to impute these values

In [17]:
# Drop every gene (row) that has a missing value for at least one trait.
smultixcan_results = smultixcan_results.dropna(how='any')

In [18]:
# Dimensions after dropping rows with missing values.
smultixcan_results.shape

(22508, 3749)

# Project S-MultiXcan data into MultiPLIER latent space

In [19]:
from multiplier import MultiplierProjection

In [20]:
# Projection object, created with its default arguments.
mproj = MultiplierProjection()

In [21]:
# Project the gene-by-trait matrix; as the outputs below show, the result has
# one row per latent variable (LV) and one column per trait.
smultixcan_into_multiplier = mproj.transform(smultixcan_results)

In [22]:
# Dimensions of the projected data (latent variables x traits).
smultixcan_into_multiplier.shape

(987, 3749)

In [23]:
# Preview the projected values for the first latent variables.
smultixcan_into_multiplier.head()

Unnamed: 0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
LV1,-0.018452,-0.043782,-0.021514,-0.030454,-0.017428,-0.012313,-0.018044,-0.008047,-0.049581,-0.033719,...,-0.006604,-0.003207,-0.010638,-0.005853,0.001435,-0.013369,-0.005603,0.005034,0.045065,0.040257
LV2,0.052938,-0.012041,-0.028537,-0.052542,0.003757,-0.05468,-0.032025,0.009933,-0.030161,0.006869,...,-0.030526,-0.033616,0.018583,0.004988,-0.013814,0.052914,0.03417,-0.032019,-0.013778,0.022792
LV3,-0.003629,-0.011772,0.009441,0.000459,-0.003708,2.1e-05,-0.001102,-0.013368,-0.024807,-0.020284,...,-8.3e-05,-0.022389,-0.019574,-0.045773,0.00688,0.007325,0.048046,0.030989,0.088343,0.02185
LV4,0.028359,-0.006148,0.007808,-0.039613,-0.000929,-0.039796,-0.043357,0.007231,-0.002575,-0.003986,...,-0.018537,0.010687,-0.043556,-0.030884,-0.037816,0.043915,0.025911,-0.04774,-0.00655,0.054932
LV5,-0.0155,0.007011,0.012707,0.006191,-0.000647,0.032319,-0.028891,-0.002337,0.029445,0.008233,...,0.023084,-0.023192,0.010425,-0.006992,0.010299,-0.015184,-0.019313,-0.007507,-0.02618,0.049838


# Quick analysis

In [24]:
# The 20 traits with the largest projected values on latent variable LV603.
smultixcan_into_multiplier.loc['LV603'].sort_values(ascending=False).iloc[:20]

eosinophil count                              0.372362
neutrophil count                              0.350661
30000_raw-White_blood_cell_leukocyte_count    0.340048
30130_raw-Monocyte_count                      0.309905
50_raw-Standing_height                        0.298986
30100_raw-Mean_platelet_thrombocyte_volume    0.293252
30180_raw-Lymphocyte_percentage               0.291835
30140_raw-Neutrophill_count                   0.284210
body height                                   0.260484
30200_raw-Neutrophill_percentage              0.258957
leukocyte count                               0.255563
30150-Eosinophill_count                       0.247597
myeloid white cell count                      0.239829
monocyte count                                0.238397
granulocyte count                             0.226945
30090_raw-Platelet_crit                       0.226716
30080_raw-Platelet_count                      0.225761
30110_raw-Platelet_distribution_width         0.219328
30030_raw-

In [25]:
# The 20 traits with the largest projected values on latent variable LV136.
smultixcan_into_multiplier.loc['LV136'].sort_values(ascending=False).iloc[:20]

5132_raw-3mm_strong_meridian_right                        0.267596
5134_raw-6mm_strong_meridian_left                         0.250199
5099_raw-3mm_weak_meridian_right                          0.246629
coronary artery disease                                   0.241942
5133_raw-6mm_strong_meridian_right                        0.237817
5098_raw-6mm_weak_meridian_right                          0.234233
5096_raw-3mm_weak_meridian_left                           0.231873
5097_raw-6mm_weak_meridian_left                           0.228746
5135_raw-3mm_strong_meridian_left                         0.225632
I9_CORATHER-Coronary_atherosclerosis                      0.173045
I9_IHD-Ischaemic_heart_disease_wide_definition            0.172836
4079_raw-Diastolic_blood_pressure_automated_reading       0.172334
hearing loss                                              0.146640
I9_MI_STRICT-Myocardial_infarction_strict                 0.129612
I9_MI-Myocardial_infarction                               0.12

# Save

In [26]:
# Absolute path of the pickle file that will hold the projected data; its name
# is derived from the stem of the input file.
output_filename = f'projection-{results_filename_stem}.pkl'
output_file = (RESULTS_PROJ_OUTPUT_DIR / output_filename).resolve()

display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [27]:
# Write the projected data to disk as a pickle file.
smultixcan_into_multiplier.to_pickle(output_file)