# Description

This notebook projects the eMERGE S-MultiXcan results into the MultiPLIER latent space and saves the projected matrix as a pickle file.

# Modules loading

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads modules before each cell
# is executed, so edited modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from IPython.display import display
import pandas as pd

import conf
from data.cache import read_data
from multiplier import MultiplierProjection
from entity import Gene

# Settings

In [3]:
# Directory where the projection is written at the end of this notebook; its
# location is read from the project configuration (conf.RESULTS).
RESULTS_PROJ_OUTPUT_DIR = Path(conf.RESULTS["PROJECTIONS_DIR"])

# Create the directory, including missing parents; no error if it already exists.
RESULTS_PROJ_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_PROJ_OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections')

# Load eMERGE data (S-MultiXcan)

In [4]:
# Path of the eMERGE S-MultiXcan results file, read from the project
# configuration (conf.EMERGE).
smultixcan_results_filename = conf.EMERGE["SMULTIXCAN_MASHR_ZSCORES_FILE"]

display(smultixcan_results_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/gene_assoc/emerge-smultixcan-mashr-zscores.pkl')

In [5]:
# File name without directory or extension; it is reused when building the
# name of the output file in the "Save" section.
results_filename_stem = smultixcan_results_filename.stem
display(results_filename_stem)

'emerge-smultixcan-mashr-zscores'

In [6]:
# Load the pickled results table.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

In [7]:
# Dimensions (rows, columns) of the table as loaded from disk.
smultixcan_results.shape

(22198, 309)

In [8]:
# Preview the first rows; at this point the index (named gene_name) still holds
# gene IDs rather than gene names.
smultixcan_results.head()

Unnamed: 0_level_0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.043436,1.134028,1.595979,0.399451,0.7251,1.655468,1.803598,0.12525,1.053218,0.922592,...,0.364789,0.453873,0.759558,1.173671,0.298944,1.029646,0.826238,0.685711,2.069701,1.544766
ENSG00000000457,1.236752,0.119837,0.22412,0.20061,1.30419,0.541478,0.646474,0.493759,0.18968,1.563273,...,1.5195,0.20129,1.020996,0.592006,0.632661,0.995453,0.069362,0.460979,0.217046,1.350475
ENSG00000000460,0.497108,0.00569,0.31771,0.184918,1.136574,1.708245,0.288962,1.070026,0.40351,1.106321,...,0.678253,1.166204,1.138126,0.608565,0.449551,0.096689,0.887467,0.143815,0.236647,1.048859
ENSG00000000938,0.374442,1.099899,1.975795,0.107667,0.90352,0.673819,0.855749,0.499088,0.375406,0.815757,...,1.585687,0.584143,0.751384,2.75823,0.572712,0.141023,1.437022,0.288943,0.665727,1.225229
ENSG00000000971,1.108762,1.133163,0.227188,0.222948,0.072709,0.256124,0.540911,0.606947,0.229133,1.557433,...,2.349373,1.122458,0.155346,0.990123,0.326914,1.10021,1.099537,2.511358,0.950262,1.467305


## Gene IDs to Gene names

In [9]:
# Relabel the index with Gene.GENE_ID_TO_NAME_MAP (gene ID -> gene name).
# Note: DataFrame.rename leaves labels that are absent from the mapping as-is,
# so unmapped gene IDs are kept silently.
smultixcan_results = smultixcan_results.rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [10]:
# Relabeling the index must not change the dimensions of the table.
smultixcan_results.shape

(22198, 309)

In [11]:
# Preview again: the index should now show gene names instead of gene IDs.
smultixcan_results.head()

Unnamed: 0_level_0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.043436,1.134028,1.595979,0.399451,0.7251,1.655468,1.803598,0.12525,1.053218,0.922592,...,0.364789,0.453873,0.759558,1.173671,0.298944,1.029646,0.826238,0.685711,2.069701,1.544766
SCYL3,1.236752,0.119837,0.22412,0.20061,1.30419,0.541478,0.646474,0.493759,0.18968,1.563273,...,1.5195,0.20129,1.020996,0.592006,0.632661,0.995453,0.069362,0.460979,0.217046,1.350475
C1orf112,0.497108,0.00569,0.31771,0.184918,1.136574,1.708245,0.288962,1.070026,0.40351,1.106321,...,0.678253,1.166204,1.138126,0.608565,0.449551,0.096689,0.887467,0.143815,0.236647,1.048859
FGR,0.374442,1.099899,1.975795,0.107667,0.90352,0.673819,0.855749,0.499088,0.375406,0.815757,...,1.585687,0.584143,0.751384,2.75823,0.572712,0.141023,1.437022,0.288943,0.665727,1.225229
CFH,1.108762,1.133163,0.227188,0.222948,0.072709,0.256124,0.540911,0.606947,0.229133,1.557433,...,2.349373,1.122458,0.155346,0.990123,0.326914,1.10021,1.099537,2.511358,0.950262,1.467305


## Remove duplicated gene entries

In [12]:
# List index labels that occur more than once. With keep="first" the first
# occurrence of each label is not flagged; only the later repeats are shown.
smultixcan_results.index[smultixcan_results.index.duplicated(keep="first")]

Index(['SPATA13', 'LINC01422', 'LINC00484', 'MAL2', 'LINC01115', 'LYNX1'], dtype='object', name='gene_name')

In [13]:
# Flag every repeated gene name except its first occurrence, then keep only the
# rows that were not flagged.
duplicated_gene_mask = smultixcan_results.index.duplicated(keep="first")
smultixcan_results = smultixcan_results.loc[~duplicated_gene_mask]

In [14]:
# Dimensions after removing the repeated gene entries.
smultixcan_results.shape

(22192, 309)

## Remove NaN values

**TODO**: rows (genes) with a NaN value in any column are currently dropped entirely; it might be better to impute these values instead.

In [15]:
# Drop every row that contains at least one missing value.
smultixcan_results = smultixcan_results.dropna(how="any")

In [16]:
# Dimensions after dropping the rows with missing values.
smultixcan_results.shape

(22107, 309)

# Project S-MultiXcan data into MultiPLIER latent space

In [17]:
# Instantiate the projector provided by the project's multiplier module.
mproj = MultiplierProjection()

In [18]:
# Project the cleaned results table with MultiplierProjection.transform; the
# rows of the result are later addressed by latent variable label (e.g. "LV603").
smultixcan_into_multiplier = mproj.transform(smultixcan_results)

In [19]:
# Dimensions (rows, columns) of the projected matrix.
smultixcan_into_multiplier.shape

(987, 309)

In [20]:
# Preview the first rows of the projected matrix.
smultixcan_into_multiplier.head()

Unnamed: 0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
LV1,0.054491,-0.020097,0.022097,0.011815,-0.027529,-0.044248,0.022966,-0.052994,0.049973,0.00023,...,-0.03238,0.012214,0.006572,-0.024007,-0.006435,0.05631,0.072363,0.039763,-0.022529,0.03441
LV2,-0.045586,-0.009249,0.037598,-0.017011,-0.033393,-0.018336,0.007075,-0.011952,-0.002291,0.029972,...,0.018297,0.037166,-0.011099,-0.012876,-0.055898,0.002411,0.021665,0.011047,0.003468,0.035322
LV3,0.024822,-0.060301,-0.029168,-0.034559,0.006261,0.022122,-0.006224,0.000662,0.036817,-7.7e-05,...,0.03856,-0.038163,0.003248,-0.041152,0.014292,-0.03949,0.009368,0.013476,0.044817,0.004777
LV4,-0.039762,0.012785,-0.0037,0.028548,-0.029743,-0.00373,0.0356,-0.022699,0.005748,0.037,...,-0.029203,0.07504,0.042023,-0.027431,0.004113,0.028081,0.008579,0.017368,-0.073883,-0.020724
LV5,-0.012542,-0.003956,-0.026013,0.012478,0.007327,0.024376,-0.000232,0.063066,0.044968,-0.002194,...,0.030944,-0.035498,-0.013057,-0.006033,-0.035562,-0.009096,-0.006855,-0.036068,0.002163,0.006373


# Quick analysis

In [21]:
# Top 20 entries (column labels of the projected matrix) with the largest
# projected values for latent variable LV603.
lv603_values = smultixcan_into_multiplier.loc["LV603"]
lv603_values.sort_values(ascending=False).iloc[:20]

244.2     0.074562
743.1     0.065800
112       0.065030
272       0.061693
743.12    0.061463
272.1     0.059703
327.7     0.056143
279       0.054503
426.32    0.054381
571.8     0.053887
250.22    0.052988
571       0.052454
442.11    0.048040
429.1     0.047737
244       0.047282
743.11    0.047024
599.3     0.042783
427.21    0.042456
599.2     0.041790
272.11    0.041357
Name: LV603, dtype: float64

In [22]:
# Top 20 entries (column labels of the projected matrix) with the largest
# projected values for latent variable LV136.
lv136_values = smultixcan_into_multiplier.loc["LV136"]
lv136_values.sort_values(ascending=False).iloc[:20]

411.4     0.075308
740.1     0.074490
426.3     0.070966
747.1     0.069081
740.11    0.067678
433.21    0.064578
496       0.063015
411.2     0.062356
433.2     0.061120
443       0.060725
274       0.059065
571       0.056349
426.2     0.055841
426.31    0.055215
274.1     0.054139
480       0.053241
571.51    0.053201
741       0.051006
411       0.049525
208       0.049270
Name: LV136, dtype: float64

# Save

In [23]:
# Output path: "projection-<input file stem>.pkl" inside the projections
# directory, resolved to an absolute path.
output_file = (
    RESULTS_PROJ_OUTPUT_DIR / f"projection-{results_filename_stem}.pkl"
).resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-emerge-smultixcan-mashr-zscores.pkl')

In [24]:
# Persist the projected matrix as a pickle file at output_file.
smultixcan_into_multiplier.to_pickle(output_file)