# Description

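This notebook standardizes (z-scores) the features (latent variables) of an input file, so that each latent variable has zero mean and unit variance, and saves the result.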

# Modules loading

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to project code (e.g. `conf`)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

In [3]:
# Absolute path of the input file to standardize
INPUT_FILEPATH = (
    Path(conf.RESULTS['PROJECTIONS_DIR'])
    / 'projection-smultixcan-efo_partial-mashr-zscores.pkl'
).resolve()

# File name without its extension
input_filepath_stem = INPUT_FILEPATH.stem

display(INPUT_FILEPATH, input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

'projection-smultixcan-efo_partial-mashr-zscores'

In [4]:
# Directory where this notebook writes its output; created (with any missing
# parents) if it does not exist yet
RESULTS_DIR = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'z_score_std'
).resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std')

# Load input file

In [5]:
# Load the input matrix from the pickle file at INPUT_FILEPATH.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by trusted steps of this project.
data = pd.read_pickle(INPUT_FILEPATH)

In [6]:
# (n_rows, n_columns) of the loaded input matrix
data.shape

(987, 3749)

In [7]:
# Preview the first rows of the input matrix
data.head()

Unnamed: 0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
LV1,-0.018452,-0.043782,-0.021514,-0.030454,-0.017428,-0.012313,-0.018044,-0.008047,-0.049581,-0.033719,...,-0.006604,-0.003207,-0.010638,-0.005853,0.001435,-0.013369,-0.005603,0.005034,0.045065,0.040257
LV2,0.052938,-0.012041,-0.028537,-0.052542,0.003757,-0.05468,-0.032025,0.009933,-0.030161,0.006869,...,-0.030526,-0.033616,0.018583,0.004988,-0.013814,0.052914,0.03417,-0.032019,-0.013778,0.022792
LV3,-0.003629,-0.011772,0.009441,0.000459,-0.003708,2.1e-05,-0.001102,-0.013368,-0.024807,-0.020284,...,-8.3e-05,-0.022389,-0.019574,-0.045773,0.00688,0.007325,0.048046,0.030989,0.088343,0.02185
LV4,0.028359,-0.006148,0.007808,-0.039613,-0.000929,-0.039796,-0.043357,0.007231,-0.002575,-0.003986,...,-0.018537,0.010687,-0.043556,-0.030884,-0.037816,0.043915,0.025911,-0.04774,-0.00655,0.054932
LV5,-0.0155,0.007011,0.012707,0.006191,-0.000647,0.032319,-0.028891,-0.002337,0.029445,0.008233,...,0.023084,-0.023192,0.010425,-0.006992,0.010299,-0.015184,-0.019313,-0.007507,-0.02618,0.049838


# Data preprocessing for clustering

In [8]:
# Summary statistics, before scaling, for the first ten rows of `data`
# (each row becomes a column after transposing). Taking the ten rows first and
# then transposing yields the same frame as transposing everything and
# selecting the first ten columns.
data_stats = data.iloc[:10].T.describe()
display(data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0
mean,1.184554e-18,-7.818055e-18,4.975126e-18,-7.699599999999999e-19,-2.369108e-19,-1.1253260000000001e-18,-1.658375e-18,-9.476429999999999e-19,4.5013040000000004e-18,3.316751e-18
std,0.0333267,0.02860283,0.03791407,0.03044476,0.03008916,0.02615014,0.03737188,0.0315345,0.03179361,0.02975571
min,-0.1829498,-0.09410261,-0.08970296,-0.2330088,-0.08127525,-0.2021169,-0.08845736,-0.08158941,-0.08224399,-0.122525
25%,-0.02004467,-0.01854991,-0.02237111,-0.01877114,-0.0197989,-0.01670325,-0.02203061,-0.01942657,-0.01838957,-0.01980324
50%,-0.002603228,-0.00135958,-0.005313336,-0.0002637242,-0.00222582,-0.0001166629,-0.004656835,-0.003405316,-0.003455237,-0.001833973
75%,0.01645519,0.0167071,0.01344975,0.01793625,0.01509677,0.01648301,0.01536048,0.01436112,0.01348235,0.01782587
max,0.3179871,0.2425872,0.3298316,0.1857484,0.3089461,0.1050431,0.3650229,0.2703459,0.3605632,0.2193655


## Standardize

In [9]:
# `scale` standardizes each column, so transpose first: the rows of `data`
# become the columns that get centered and scaled.
data_t = data.transpose()

# `scale` returns a bare array; wrap it back into a DataFrame carrying the
# same row/column labels as the transposed input.
scaled_data = pd.DataFrame(
    scale(data_t),
    index=data_t.index.copy(),
    columns=data_t.columns.copy(),
)

In [10]:
# Shape of the standardized matrix; it was built from the transposed input,
# so rows and columns are swapped relative to `data`
scaled_data.shape

(3749, 987)

In [11]:
# Preview the first rows of the standardized matrix
scaled_data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.553743,1.85106,-0.095721,0.931603,-0.515196,1.371264,-0.413165,0.729668,0.546338,0.901153,...,1.148776,1.649018,-1.375952,2.203063,0.810199,-0.987048,0.27771,-0.607613,0.572376,-0.660846
100002_raw-Energy,-1.313905,-0.421015,-0.310524,-0.201978,0.233028,0.693838,0.084129,0.572439,0.217866,1.296973,...,0.16543,0.799276,-0.361989,-0.008253,2.06378,-0.930765,-0.147007,1.601938,-0.273553,-0.560822
100003_raw-Protein,-0.645622,-0.997845,0.249039,0.256499,0.422357,0.829189,-0.169004,0.51155,0.002389,-0.0572,...,1.016791,1.019488,-2.148844,-0.06513,0.041052,-0.836118,-0.320571,1.325519,-0.341494,-0.495786
100004_raw-Fat,-0.913924,-1.837183,0.012095,-1.301315,0.205794,1.129114,0.019985,0.37666,-0.810272,0.843611,...,0.544266,0.576714,0.281731,-0.218963,1.258957,-1.131924,0.378545,0.842047,-0.622194,-0.160058
100005_raw-Carbohydrate,-0.523004,0.131379,-0.097802,-0.030526,-0.0215,-0.219116,0.668229,0.3657,1.102347,0.845625,...,-0.34474,0.079274,0.253427,0.236474,1.033895,-0.229105,0.364793,1.776714,0.190531,-1.459641


In [12]:
# Summary statistics for the first ten columns of the standardized matrix
# (only a subset is summarized, for display)
scaled_data_stats = scaled_data.iloc[:,:10].describe()
display(scaled_data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0
mean,1.516229e-17,-7.581144e-18,3.790572e-18,-1.516229e-17,0.0,-1.9900500000000003e-17,-3.790572e-17,-1.895286e-17,7.581144e-18,1.895286e-17
std,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133
min,-5.490319,-3.290415,-2.36627,-7.654516,-2.701507,-7.730123,-2.367266,-2.587652,-2.587154,-4.118246
25%,-0.60154,-0.6486207,-0.5901263,-0.6166463,-0.658095,-0.6388294,-0.5895756,-0.6161241,-0.5784817,-0.665616
50%,-0.0781228,-0.04753941,-0.1401602,-0.008663541,-0.073984,-0.004461865,-0.1246246,-0.1080014,-0.1086916,-0.06164252
75%,0.49382,0.5841844,0.3547903,0.5892194,0.501801,0.630406,0.4110718,0.4554706,0.424115,0.5991536
max,9.542784,8.482362,8.700611,6.101977,10.269057,4.017459,9.768617,8.574164,11.34226,7.373197


In [13]:
# Every standardized column must have (numerically) zero mean. Validate all
# columns of `scaled_data`, not only the ten columns summarized in
# `scaled_data_stats`, so a problem in any column is caught before saving.
# Tolerances are np.allclose defaults, i.e. the same ones np.isclose uses.
assert np.allclose(scaled_data.mean(), 0.0), (
    "Some standardized columns do not have zero mean"
)

In [14]:
# Every standardized column must have unit standard deviation. Validate all
# columns of `scaled_data`, not only the ten columns summarized in
# `scaled_data_stats`. sklearn's `scale` divides by the population standard
# deviation, so compare with ddof=0: the sample std reported by `describe()`
# is larger by sqrt(n / (n - 1)), which makes the check depend on sample size.
assert np.allclose(scaled_data.std(ddof=0), 1.0, atol=1e-03), (
    "Some standardized columns do not have unit standard deviation"
)

# Save

In [15]:
# Output path: the input file's stem prefixed with the transformation name,
# placed inside this notebook's results directory
output_file = (
    RESULTS_DIR / f'z_score_standardized-{input_filepath_stem}.pkl'
).resolve()

display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std/z_score_standardized-projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [16]:
# Write the standardized matrix to `output_file` in pickle format
scaled_data.to_pickle(output_file)