# Description

This notebook standardizes (z-scores) the raw S-MultiXcan results, without projecting them into the latent space. Only the genes that are also present in the MultiPLIER models are kept.

# Modules loading

In [1]:
# Reload imported modules automatically before running each cell, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

## Input data

In [3]:
# Location of the S-MultiXcan z-scores file, taken from the project configuration.
smultixcan_zscores_file = conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
INPUT_FILEPATH = Path(smultixcan_zscores_file).resolve()

# The stem (file name without extension) is reused later to name the output file.
input_filepath_stem = INPUT_FILEPATH.stem

display(INPUT_FILEPATH)
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

'smultixcan-efo_partial-mashr-zscores'

## Output folder

In [4]:
# Everything this notebook writes goes under <DATA_TRANSFORMATIONS_DIR>/z_score_std;
# the folder (and any missing parent) is created if it does not exist yet.
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'z_score_std').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std')

# Load input file

In [5]:
# Load the S-MultiXcan results and transpose them so that rows are traits and
# columns are genes (the layout expected by the column-wise scaling below).
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted source.
data = pd.read_pickle(INPUT_FILEPATH).T

In [6]:
# (number of traits, number of genes) after the transpose.
display(data.shape)

(3749, 22515)

In [7]:
# Preview the first traits (rows) and their per-gene values (columns).
display(data.head())

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
100001_raw-Food_weight,1.145442,0.618066,0.515724,0.280781,0.548127,0.045587,0.329995,0.109494,1.356741,1.474255,...,1.49041,0.230329,0.596503,0.519733,2.285074,0.12498,1.587903,1.522281,0.150938,1.010143
100002_raw-Energy,0.724557,1.028131,0.403596,0.25391,0.389877,0.197393,0.669649,0.04101,0.83212,0.954183,...,1.165679,0.111142,0.084263,1.229913,0.39019,0.505316,0.975901,1.817369,0.756393,0.729526
100003_raw-Protein,0.090876,2.21842,1.251359,0.879148,0.723469,0.777974,0.207873,0.536609,0.453969,1.286942,...,0.54137,0.734872,0.634674,1.31675,0.761859,1.276888,0.160988,0.346794,0.609476,0.222126
100004_raw-Fat,0.298165,0.762584,0.433091,0.352705,1.16725,0.578435,0.738983,0.565245,0.397189,0.192279,...,0.867217,0.540941,0.284347,1.661131,0.404078,1.248959,0.799771,1.443097,0.814969,0.545356
100005_raw-Carbohydrate,1.134347,0.934418,0.413466,0.051846,0.315952,0.046237,1.113674,0.319842,0.965217,0.919779,...,1.747265,0.496178,0.144053,0.701817,0.827677,0.587188,1.089338,2.001502,1.362716,1.49003


# Keep only genes in the MultiPLIER model

In [8]:
from entity import Gene
from multiplier import MultiplierProjection

In [9]:
# Gene identifiers used by the MultiPLIER model: read the model's Z matrix and
# translate its row labels with Gene.GENE_NAME_TO_ID_MAP (presumably gene
# symbols -> Ensembl IDs; TODO confirm against the `entity` module).
multiplier_model_z = MultiplierProjection._read_model_z()
multiplier_genes = multiplier_model_z.rename(index=Gene.GENE_NAME_TO_ID_MAP).index

In [10]:
# Genes present both in the MultiPLIER model and in the S-MultiXcan results
# (the columns of `data`).
common_genes = multiplier_genes.intersection(data.columns)

In [11]:
# Show the shared gene IDs and how many there are.
display(common_genes)

Index(['ENSG00000183087', 'ENSG00000157227', 'ENSG00000096696',
       'ENSG00000175130', 'ENSG00000113140', 'ENSG00000117984',
       'ENSG00000116016', 'ENSG00000129116', 'ENSG00000134686',
       'ENSG00000108679',
       ...
       'ENSG00000111716', 'ENSG00000166796', 'ENSG00000114331',
       'ENSG00000131584', 'ENSG00000165410', 'ENSG00000172757',
       'ENSG00000147862', 'ENSG00000008323', 'ENSG00000167083',
       'ENSG00000149257'],
      dtype='object', length=6452)

In [12]:
# Restrict the results to the shared genes. NOTE: this rebinds `data`, so from
# here on it no longer holds the full set of genes loaded from the input file.
data = data[common_genes]

In [13]:
# The number of columns should now match the number of common genes.
display(data.shape)

(3749, 6452)

In [14]:
# Fail early if any trait/gene value is missing: the check covers every cell
# of the filtered matrix before it is standardized.
assert not data.isna().any().any()

# z-score standardization

In [15]:
# Summary statistics of the first ten genes before scaling, kept as a
# reference to compare against the standardized values.
first_genes_raw = data.iloc[:, :10]
data_stats = first_genes_raw.describe()
display(data_stats)

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679
count,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0
mean,0.869024,0.850735,0.816185,0.814119,0.852497,0.808036,0.877966,0.890302,0.954453,0.840276
std,0.715009,0.74543,0.634833,0.61101,0.651678,0.608549,0.855598,0.673139,0.976718,0.633642
min,0.000168,2.3e-05,0.000261,6.8e-05,0.0003,0.000113,0.000297,3.1e-05,0.001149,0.000416
25%,0.331437,0.317697,0.337163,0.330703,0.328534,0.326538,0.348574,0.360625,0.345557,0.352286
50%,0.715198,0.689134,0.675318,0.689476,0.709992,0.670569,0.723682,0.749636,0.753889,0.697201
75%,1.242802,1.193661,1.164237,1.160527,1.238855,1.165987,1.248958,1.288451,1.26823,1.204855
max,7.466818,7.004718,8.472557,4.231093,4.498332,4.371566,19.356913,6.605218,10.519923,4.731186


In [16]:
# Standardize each gene (column) across traits, then wrap the result in a
# DataFrame that carries copies of the original row and column labels.
scaled_values = scale(data)
scaled_data = pd.DataFrame(
    scaled_values,
    index=data.index.copy(),
    columns=data.columns.copy(),
)

In [17]:
# Scaling must not change the (traits, genes) shape.
display(scaled_data.shape)

(3749, 6452)

In [18]:
# Preview the standardized values for the first traits.
display(scaled_data.head())

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
100001_raw-Food_weight,0.834672,-0.969366,-0.788966,-1.236243,-1.19291,-0.859681,-0.324518,0.880799,-0.152357,-1.109631,...,-0.847934,-1.246339,-1.225673,-0.270551,-0.251339,-0.348636,-0.283083,0.54104,0.412878,0.21218
100002_raw-Energy,-0.893351,-0.906648,-1.108427,-0.633443,-1.257057,-0.975325,0.100801,-0.814622,0.9937,0.709155,...,1.039015,-0.097355,0.613387,1.297104,-0.646984,-0.642464,0.254638,-0.235577,0.390156,0.87505
100003_raw-Protein,0.724878,0.367317,-1.100323,-1.304925,-0.735474,-1.178054,0.002718,-0.151997,0.730952,0.949561,...,0.668253,0.381954,0.883388,0.028914,-1.262583,-0.547308,-0.369882,0.725386,0.969339,0.582554
100004_raw-Fat,-0.273941,-1.113389,-1.176575,-0.461884,-0.947444,-0.213364,-0.169319,-1.092854,0.7522,1.207424,...,0.724525,-0.569374,0.590451,1.656459,-0.081229,-0.290437,0.9289,-0.661822,0.709848,0.600761
100005_raw-Carbohydrate,-0.925286,-0.186918,-0.488066,-0.682023,-0.746643,-0.262771,-0.261208,-0.513372,0.385347,-0.681057,...,0.692922,-0.029716,0.655211,1.326645,-0.897059,-0.569916,1.393707,-0.189605,-0.369507,0.257153


In [19]:
# Summary statistics of the same first ten genes after scaling; the mean
# should be ~0 and the standard deviation ~1 for each of them.
first_genes_scaled = scaled_data.iloc[:, :10]
scaled_data_stats = first_genes_scaled.describe()
display(scaled_data_stats)

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679
count,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0
mean,-1.645108e-15,-4.10519e-15,-1.281213e-15,6.481878e-16,-1.35892e-15,6.254444e-16,1.440417e-16,1.800522e-16,-4.302299e-16,2.687516e-15
std,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133
min,-1.21533,-1.141391,-1.285428,-1.332483,-1.30787,-1.327799,-1.025934,-1.322744,-0.9761577,-1.325626
25%,-0.7519615,-0.7151707,-0.7546637,-0.7912807,-0.8041282,-0.7913295,-0.6188225,-0.7869824,-0.6234933,-0.7702381
50%,-0.2151678,-0.2168181,-0.221925,-0.204023,-0.2187022,-0.2259242,-0.1803475,-0.2089995,-0.2053719,-0.2258285
75%,0.5228289,0.460099,0.5483305,0.5670197,0.5929466,0.588282,0.4336633,0.5915595,0.3212999,0.5754475
max,9.228796,8.256719,12.06206,5.593088,5.59528,5.85656,21.60059,8.491086,9.794785,6.141369


## Testing

In [20]:
# Every gene (column) must be centered at zero after scaling. The check runs
# over all columns of `scaled_data`, not only the ten genes summarized in
# `scaled_data_stats`, so a problem in any gene is detected.
_gene_means = scaled_data.mean(axis=0)
assert np.allclose(_gene_means, 0.0), (
    f"{int((~np.isclose(_gene_means, 0.0)).sum())} genes are not centered at zero"
)

In [21]:
# Every gene (column) must have unit standard deviation after scaling. The
# check runs over all columns of `scaled_data`, not only the ten genes
# summarized in `scaled_data_stats`. The population standard deviation
# (ddof=0) is used because that is what the scaler normalizes by; the sample
# standard deviation reported by `describe()` is slightly above 1 for this
# reason. A gene with zero variance cannot be rescaled and fails this check.
_gene_stds = scaled_data.std(axis=0, ddof=0)
assert np.allclose(_gene_stds, 1.0, atol=1e-03), (
    f"{int((~np.isclose(_gene_stds, 1.0, atol=1e-03)).sum())} genes do not have unit variance"
)

# Save

In [22]:
# Output file: same stem as the input file, prefixed with the transformation
# name, stored inside this notebook's results folder.
output_filename = f'z_score_std-{input_filepath_stem}.pkl'
output_file = (RESULTS_DIR / output_filename).resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-smultixcan-efo_partial-mashr-zscores.pkl')

In [23]:
# Persist the standardized traits x genes matrix as a pickle file.
scaled_data.to_pickle(output_file)