# Description

This notebook standardizes (z-scores) the S-MultiXcan results projected into the MultiPLIER latent space.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

In [3]:
# Seed NumPy's global random number generator with a fixed value so that any
# NumPy-based randomness in this session is repeatable across runs.
np.random.seed(0)

## Input data

In [4]:
# Absolute path of the pickled projection file, located under the configured
# projections directory.
INPUT_FILEPATH = (
    Path(conf.RESULTS["PROJECTIONS_DIR"])
    / "projection-smultixcan-efo_partial-mashr-zscores.pkl"
).resolve()
display(INPUT_FILEPATH)

# File name without its extension; kept so the output file can be named after
# the input.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

'projection-smultixcan-efo_partial-mashr-zscores'

## Output folder

In [5]:
# Directory where this notebook writes its results; it is created (together
# with any missing parents) if it does not exist yet.
RESULTS_DIR = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / "z_score_std"
).resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std')

# Load input file

In [6]:
# Load the pickled projection and transpose it. As the previews below show,
# the transposed frame has row labels such as `100001_raw-Food_weight` and one
# column per latent variable (LV1, LV2, ...).
# NOTE: `pd.read_pickle` should only be used on trusted files.
data = pd.read_pickle(INPUT_FILEPATH).T

In [7]:
# Dimensions of the loaded data as (rows, columns).
display(data.shape)

(3752, 987)

In [8]:
# Preview the first rows to check the orientation (row labels vs. LV columns).
display(data.head())

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.021292,0.056061,0.00165,0.026265,-0.015616,0.040333,-0.01205,0.027985,0.021028,0.027118,...,0.033575,0.051033,-0.038899,0.084371,0.027174,-0.032639,0.015128,-0.01843,0.016253,-0.021666
100002_raw-Energy,-0.046815,-0.009864,-0.004262,-0.007235,0.010799,0.02154,0.008009,0.025354,0.009816,0.04123,...,0.006675,0.022421,-0.014042,0.002723,0.066739,-0.031207,-0.00104,0.050533,-0.007979,-0.017742
100003_raw-Protein,-0.021585,-0.028888,0.019211,0.004304,0.017792,0.023804,-0.001246,0.023492,0.007045,-0.002453,...,0.029808,0.030413,-0.065487,0.000143,0.001568,-0.027622,-0.006422,0.040084,-0.008175,-0.014154
100004_raw-Fat,-0.030324,-0.053573,0.007484,-0.042464,0.010591,0.031777,0.007824,0.018422,-0.027632,0.028237,...,0.017412,0.018589,0.003284,-0.004189,0.040693,-0.036318,0.019393,0.029495,-0.015966,-0.00216
100005_raw-Carbohydrate,-0.017773,0.006951,0.004548,-0.001093,0.001972,-0.005518,0.033477,0.017333,0.040532,0.029409,...,-0.007403,-0.000781,0.006414,0.009671,0.034106,-0.008757,0.022819,0.055545,0.005864,-0.055875


# z-score standardization

In [9]:
# Summary statistics of the first 10 columns *before* scaling; this is the
# baseline to compare against the same summary computed after scaling.
data_stats = data.iloc[:, :10].describe()
display(data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0
mean,-2.603935e-18,7.693443e-18,-3.8467219999999997e-19,-9.468852999999998e-19,3.5508199999999997e-19,2.603935e-18,2.130492e-18,-3.077377e-18,-4.379345e-18,-1.982541e-18
std,0.03063955,0.02856884,0.02861655,0.02989334,0.02892348,0.02723226,0.0304022,0.02567312,0.02770053,0.02911987
min,-0.1146518,-0.1072292,-0.09168592,-0.1452817,-0.08667826,-0.1060733,-0.09353592,-0.09165651,-0.09488038,-0.09749692
25%,-0.01986454,-0.01919182,-0.01899159,-0.01957128,-0.01969083,-0.01800094,-0.02064351,-0.01773037,-0.0183389,-0.01983394
50%,-0.0006645821,-0.0001334593,-0.0009511898,0.0004470145,-0.0008506998,-0.0005344013,-0.000892687,-0.000648481,-0.001445903,-0.0004857271
75%,0.01917887,0.01823343,0.01815307,0.01962423,0.01728434,0.01804522,0.01889462,0.01715813,0.01649573,0.01884798
max,0.2157845,0.1309168,0.1280702,0.1004022,0.1592952,0.1157216,0.1802857,0.09777436,0.1913541,0.1211129


In [10]:
# Standardize the data with scikit-learn's `scale` (default arguments) and
# wrap the resulting array in a DataFrame that carries copies of the original
# row and column labels.
# `scale` divides by the population standard deviation (ddof=0), whereas
# pandas' `describe()` reports the sample standard deviation (ddof=1); this is
# why the std shown after scaling is slightly above 1.
scaled_data = pd.DataFrame(
    scale(data),
    index=data.index.copy(),
    columns=data.columns.copy(),
)

In [11]:
# Dimensions of the scaled data; expected to match those of the input data.
display(scaled_data.shape)

(3752, 987)

In [12]:
# Preview the first rows of the standardized values.
display(scaled_data.head())

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.695006,1.962565,0.057683,0.878731,-0.539977,1.481272,-0.396422,1.09018,0.759223,0.931395,...,1.129784,1.752343,-1.411403,2.823863,0.931116,-1.054519,0.432982,-0.633597,0.554279,-0.642479
100002_raw-Energy,-1.528127,-0.345309,-0.148953,-0.24206,0.373427,0.791092,0.263477,0.987702,0.354391,1.416059,...,0.224604,0.769882,-0.509482,0.091153,2.286789,-1.008256,-0.029764,1.737229,-0.272107,-0.526125
100003_raw-Protein,-0.704572,-1.011299,0.67142,0.143991,0.615212,0.874212,-0.040998,0.91517,0.254369,-0.084237,...,1.003019,1.044314,-2.376108,0.004778,0.053714,-0.892447,-0.1838,1.377991,-0.278794,-0.419733
100004_raw-Fat,-0.989832,-1.87549,0.261555,-1.420719,0.366238,1.167049,0.257387,0.717674,-0.997664,0.969825,...,0.585913,0.638314,0.119139,-0.140204,1.394326,-1.173402,0.555058,1.013982,-0.544506,-0.064061
100005_raw-Carbohydrate,-0.580143,0.243335,0.158966,-0.036558,0.068176,-0.202639,1.101281,0.675227,1.463432,1.010078,...,-0.249108,-0.026814,0.232713,0.323682,1.168642,-0.282935,0.653105,1.909526,0.199997,-1.656894


In [13]:
# Summary statistics of the first 10 columns *after* scaling; the mean should
# be approximately 0 and the std approximately 1 for each column.
scaled_data_stats = scaled_data.iloc[:, :10].describe()
display(scaled_data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0
mean,-7.575082e-18,1.3256390000000002e-17,-1.1362620000000001e-17,-7.575082e-18,9.468853e-18,1.1362620000000001e-17,1.5150160000000002e-17,-3.030033e-17,1.1362620000000001e-17,-1.1362620000000001e-17
std,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133
min,-3.742454,-3.753861,-3.204374,-4.860651,-2.997212,-3.895654,-3.077027,-3.57061,-3.425675,-3.34857
25%,-0.6484164,-0.671864,-0.6637458,-0.6547908,-0.6808811,-0.6611033,-0.6791044,-0.6907119,-0.6621298,-0.6812042
50%,-0.02169323,-0.004672121,-0.03324358,0.01495564,-0.029416,-0.01962645,-0.0293665,-0.02526251,-0.05220462,-0.01668249
75%,0.6260347,0.6383127,0.63444,0.6565627,0.5976681,0.6627296,0.6215715,0.6684196,0.5955815,0.6473411
max,7.043617,4.583115,4.475987,3.359129,5.508202,4.249999,5.930812,3.80894,6.908878,4.159668


## Testing

In [14]:
# Every standardized column must have a (numerically) zero mean.
# The check runs over ALL columns of `scaled_data`, not only the first ten
# that are summarized in `scaled_data_stats`, so a problem in any LV is caught.
# `np.allclose` uses the same default tolerances as `np.isclose`.
assert np.allclose(
    scaled_data.mean().to_numpy(), 0.0
), "Some columns do not have zero mean after z-score standardization"

In [15]:
# Every standardized column must have unit standard deviation.
# The check runs over ALL columns of `scaled_data`, not only the first ten
# that are summarized in `scaled_data_stats`.
# `ddof=0` (population std) matches the convention used by scikit-learn's
# `scale`; with pandas' default `ddof=1` the result is inflated by
# sqrt(n / (n - 1)) and would only fit within the tolerance for large n.
# A constant input column (zero variance) ends up with std 0 after scaling and
# is intentionally reported as a failure here.
assert np.allclose(
    scaled_data.std(ddof=0).to_numpy(), 1.0, atol=1e-03
), "Some columns do not have unit standard deviation after z-score standardization"

# Save

In [16]:
# Destination of the standardized data: the input file stem prefixed with the
# name of this transformation, inside this notebook's results directory.
output_file = (RESULTS_DIR / f"z_score_std-{input_filepath_stem}.pkl").resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [17]:
# Persist the standardized DataFrame as a pickle file at `output_file`.
scaled_data.to_pickle(output_file)