# Description

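This notebook standardizes the features (columns) of an input file so that each one has zero mean and unit variance. For the input file used here, the features are genes (Ensembl gene IDs) rather than latent variables.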

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

In [3]:
# Stem identifying which input to standardize; the next cell wraps it with the
# 'diseases_only-' prefix and '.pkl' suffix to build the input file name.
INPUT_FILEPATH_STEM = 'smultixcan-efo_partial-mashr-zscores'

In [4]:
# Absolute path of the input pickle inside the 'traits_selections' folder of
# the data-transformations results directory.
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'traits_selections'
    / f'diseases_only-{INPUT_FILEPATH_STEM}.pkl'
).resolve()
display(INPUT_FILEPATH)

# File name without its '.pkl' suffix; reused later to name the output file.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/traits_selections/diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

'diseases_only-smultixcan-efo_partial-mashr-zscores'

In [5]:
# Folder that receives this notebook's output; created (with any missing
# parents) if it does not exist yet.
RESULTS_DIR = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'z_score_std'
).resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std')

# Load input file

In [6]:
# Read the pickled input table into `data`. Per the outputs below, rows are
# labeled by trait/disease names and columns by Ensembl gene IDs.
# NOTE(review): unpickling can execute arbitrary code, so INPUT_FILEPATH must
# only ever point to trusted, project-generated files.
data = pd.read_pickle(INPUT_FILEPATH)

In [7]:
# Dimensions of the input as (rows, columns).
data.shape

(538, 6452)

In [8]:
# Preview the first rows to check the index and column labels.
data.head()

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
vascular disease AND intestinal disease,1.508862,1.621723,0.363885,0.853025,0.672102,1.661944,0.065144,0.852891,1.070464,1.646857,...,0.338839,1.045194,0.337591,0.727816,0.919068,1.028998,0.957071,1.032144,0.372147,1.101383
"osteoarthritis, knee",0.293745,2.495255,0.898634,1.097025,0.092686,1.35323,2.32963,0.875087,1.932763,0.830047,...,0.31505,1.740132,0.659764,0.585959,0.745088,0.859034,0.844895,0.577415,1.093594,1.46107
carpal tunnel syndrome,3.269874,1.031169,1.854991,0.849646,1.193563,0.963425,1.081281,0.583741,0.370495,0.987732,...,1.695482,0.469553,0.525354,1.462587,0.730298,1.266646,0.680878,2.512643,1.943234,0.574996
gastritis,0.731628,0.92258,0.20744,1.049796,0.237018,0.081444,1.578283,0.63575,1.193168,2.471475,...,1.927164,0.906981,0.535501,1.265527,0.412329,0.47471,0.454023,2.122722,0.842579,1.56191
neoplasm,3.09046,0.749946,0.479871,1.374612,0.164061,0.385841,0.960421,0.579108,0.311806,0.340309,...,1.405575,1.277434,1.423244,1.400503,0.210456,1.100135,1.864806,1.084138,1.477129,0.92596


# Data preprocessing for clustering

In [9]:
# Summary statistics of the raw values, restricted to the first 10 columns to
# keep the table readable; the same columns are summarized again after scaling.
data_stats = data.iloc[:, :10].describe()
display(data_stats)

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679
count,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0
mean,0.935764,0.903561,0.863292,0.897959,0.868305,0.853082,0.890784,0.951402,0.929203,0.85944
std,0.676888,0.712305,0.599326,0.637146,0.595733,0.604609,0.6273,0.643526,0.679505,0.603726
min,0.001795,0.00186,0.018206,6.8e-05,0.003543,0.006888,0.002766,0.000109,0.002864,0.001376
25%,0.40765,0.368081,0.406369,0.403494,0.376657,0.349378,0.391706,0.436109,0.40957,0.431575
50%,0.837084,0.754471,0.744194,0.828386,0.751487,0.731933,0.807094,0.853946,0.780332,0.711309
75%,1.293354,1.258527,1.184544,1.284796,1.245304,1.230868,1.24843,1.324895,1.296129,1.213644
max,5.088178,6.790776,3.22225,3.552356,3.378371,2.873694,3.815878,3.436513,4.479128,3.048902


## Standardize

In [10]:
# Standardize each column with sklearn's `scale`, then wrap the resulting
# array in a DataFrame that carries copies of the original row/column labels.
scaled_data = pd.DataFrame(
    scale(data),
    index=data.index.copy(),
    columns=data.columns.copy(),
)

In [11]:
# Scaling must not change the dimensions; compare with `data.shape` above.
scaled_data.shape

(538, 6452)

In [12]:
# Preview the standardized values; labels should match the input preview.
scaled_data.head()

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
vascular disease AND intestinal disease,0.847454,1.009161,-0.834057,-0.070589,-0.329654,1.339072,-1.317406,-0.153223,0.20808,1.305476,...,-0.862693,0.178475,-0.867851,-0.318883,0.023148,0.121395,0.106954,0.282147,-0.811827,0.413489
"osteoarthritis, knee",-0.949368,2.236648,0.059025,0.312724,-1.30317,0.827997,2.295848,-0.1187,1.478273,-0.048731,...,-0.900876,1.2399,-0.357637,-0.5102,-0.233842,-0.130737,-0.066291,-0.47085,0.133876,0.992046
carpal tunnel syndrome,3.451503,0.179316,1.656231,-0.075898,0.546487,0.182674,0.303961,-0.571854,-0.822994,0.212698,...,1.314737,-0.700739,-0.570497,0.672079,-0.255689,0.473933,-0.319597,2.733735,1.24762,-0.433203
gastritis,-0.301861,0.026726,-1.095334,0.238529,-1.060668,-1.277446,1.096986,-0.490961,0.388828,2.672629,...,1.68659,-0.032626,-0.554427,0.406311,-0.72537,-0.700858,-0.669952,2.088057,-0.195165,1.154248
neoplasm,3.1862,-0.215859,-0.64035,0.748803,-1.183247,-0.773517,0.111114,-0.57906,-0.909444,-0.860679,...,0.849432,0.53319,0.851463,0.588349,-1.023563,0.226923,1.508859,0.368243,0.63663,0.131323


In [13]:
# Summary statistics after scaling for the same first 10 columns shown before
# scaling. Note that describe() reports the sample std (ddof=1), so the 'std'
# row is slightly above 1 even though `scale` produced unit population std.
scaled_data_stats = scaled_data.iloc[:,:10].describe()
display(scaled_data_stats)

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679
count,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0
mean,-2.674441e-16,6.075272e-16,9.046873e-16,-2.83953e-16,9.971371e-16,3.43385e-16,-4.820597e-16,2.773494e-16,-1.452783e-16,-1.591457e-15
std,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931
min,-1.381081,-1.267069,-1.411373,-1.410552,-1.452944,-1.400873,-1.416936,-1.479628,-1.364526,-1.422604
25%,-0.7809334,-0.752455,-0.7631052,-0.7767848,-0.8260515,-0.833881,-0.7963371,-0.8014798,-0.7654346,-0.7093673
50%,-0.1459199,-0.2094999,-0.1989051,-0.1092965,-0.1962739,-0.2005615,-0.1335364,-0.1515823,-0.2192914,-0.2455904
75%,0.5287771,0.498799,0.5365205,0.6077054,0.6334212,0.625425,0.5706673,0.580926,0.5404921,0.587242
max,6.140272,8.27271,3.939682,4.169953,4.217329,3.345125,4.667331,3.865306,5.229143,3.629959


## Testing

In [14]:
# Every column must be centered at zero after scaling, up to floating-point
# error. The check runs on the full `scaled_data` frame rather than on the
# 10-column summary, so no column goes unverified.
assert np.allclose(scaled_data.mean(), 0.0), (
    "some standardized columns do not have zero mean"
)

In [15]:
# Every column must have unit standard deviation after scaling. `scale`
# normalizes the population std (ddof=0), whereas describe() reports the sample
# std (ddof=1), which is larger by sqrt(n / (n - 1)) and would make this check
# depend on the number of rows. Using ddof=0 on the full frame verifies all
# columns without needing a loosened tolerance.
# A zero-variance input column cannot reach unit variance (sklearn leaves it
# unscaled), so it fails here and surfaces the problem.
assert np.allclose(scaled_data.std(ddof=0), 1.0), (
    "some standardized columns do not have unit (population) standard deviation"
)

# Save

In [16]:
# Output path: the input's stem prefixed with the name of this transformation,
# placed inside this notebook's results folder.
output_file = (RESULTS_DIR / f'z_score_std-{input_filepath_stem}.pkl').resolve()

display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std/z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

In [17]:
# Persist the standardized DataFrame as a pickle at `output_file`.
scaled_data.to_pickle(output_file)