# Description

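This notebook standardizes (z-scores) the features (latent variables, LVs) of an input file: the data is transposed so that traits are rows and LVs are columns, each LV is centered to zero mean and scaled to unit variance across traits, and the standardized matrix is saved as a new pickle file.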

# Modules loading

In [1]:
# Automatically reload imported modules before executing each cell, so edits to
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

In [3]:
# Input matrix to standardize, located in the configured projections directory.
INPUT_FILEPATH = (
    Path(conf.RESULTS['PROJECTIONS_DIR']) / 'projection-smultixcan-mashr-zscores.pkl'
).resolve()
display(INPUT_FILEPATH)

# File name without its extension; reused later to name the output file.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/projections/projection-smultixcan-mashr-zscores.pkl')

'projection-smultixcan-mashr-zscores'

In [4]:
# Everything this notebook writes goes under this directory; create it
# (including missing parents) if it does not exist yet.
RESULTS_DIR = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'z_score_std'
).resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std')

# Load input file

In [5]:
# Load the matrix to standardize.
# NOTE(review): unpickling can execute arbitrary code, so INPUT_FILEPATH must
# point to a trusted file (here it is expected to be produced by this project).
data = pd.read_pickle(INPUT_FILEPATH)

In [6]:
# (rows, columns) of the loaded matrix; as loaded, rows are latent variables
# (LV1, LV2, ...) and columns are traits.
data.shape

(987, 4091)

In [7]:
# Preview the first rows (latent variables) of the raw input.
data.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
LV1,-0.026255,0.006045,0.007276,-0.02974,-0.001949,0.009288,0.006488,0.003354,-0.021751,-0.039159,...,0.036648,0.094062,0.047505,-0.035089,-0.0366,0.185595,-0.031841,-0.011883,0.033163,0.061374
LV2,-0.006253,-0.023996,0.037932,-1.8e-05,-0.016128,-0.033736,-0.017215,-0.004558,0.079234,-0.040277,...,0.085747,-0.064254,0.022124,-0.008441,-0.049594,0.074838,-0.057865,-0.007585,0.00358,-0.025216
LV3,-0.002729,0.051637,-0.013182,-0.001663,0.035221,-0.016019,0.000484,-0.046965,-0.000498,-0.041794,...,0.068618,-0.034395,0.145881,0.031539,-0.03174,0.015501,-0.028148,-0.048226,0.030093,-0.026938
LV4,0.042319,-0.023236,0.063016,-0.002628,-0.046487,0.018004,-0.029841,-0.023949,-0.026485,0.022759,...,-0.070986,-0.02896,0.015369,0.051671,0.014078,-0.147863,0.034195,0.068829,-0.023213,-0.089619
LV5,-0.025975,0.002167,-0.022646,0.004559,0.004649,0.02844,-0.024379,-0.028536,0.019263,-0.026291,...,0.008842,-0.000932,0.004195,0.029759,-0.04376,0.057031,-0.002836,-0.046215,0.099309,0.075663


# Data preprocessing for clustering

In [8]:
# Summary statistics, before standardization, of the first ten latent variables.
# Taking the first ten rows and then transposing gives the same frame as
# transposing everything and keeping the first ten columns.
data_stats = data.iloc[:10].T.describe()
display(data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0
mean,-1.79112e-18,-3.785776e-18,3.039476e-18,-3.365135e-18,4.722044e-18,-4.342109e-19,4.233556e-18,-9.226982e-19,-3.7993449999999996e-19,-7.598690999999999e-19
std,0.03326184,0.02835837,0.03706188,0.03046931,0.03007658,0.02642815,0.03693083,0.03091832,0.03174964,0.02954993
min,-0.1800493,-0.09502399,-0.08809295,-0.2374265,-0.08089705,-0.2076498,-0.0877385,-0.08628219,-0.08309656,-0.1209542
25%,-0.02013433,-0.01830758,-0.02164229,-0.018931,-0.02037183,-0.01703704,-0.02199054,-0.01916786,-0.0182465,-0.01945359
50%,-0.002566123,-0.001081795,-0.004677365,-0.0001997775,-0.002374624,-0.000258297,-0.004157257,-0.003233265,-0.002972257,-0.001338313
75%,0.01693591,0.01698605,0.01363002,0.01819961,0.01563033,0.01674753,0.01502715,0.01409039,0.01354585,0.01774809
max,0.3304012,0.2484342,0.3385425,0.1884026,0.3158933,0.1094037,0.377263,0.2799561,0.3681587,0.2264011


## Standardize

In [9]:
# Put traits in rows and latent variables (LVs) in columns, so that each LV is
# standardized across traits.
data_t = data.transpose()

# `scale` standardizes along axis 0 (each column independently); its result is
# wrapped back into a DataFrame carrying the original trait/LV labels.
scaled_data = pd.DataFrame(
    scale(data_t, axis=0),
    index=data_t.index.copy(),
    columns=data_t.columns.copy(),
)

In [10]:
# The standardized matrix is transposed relative to the input:
# rows are traits and columns are latent variables.
scaled_data.shape

(4091, 987)

In [11]:
# Preview the first rows (traits) of the standardized data.
scaled_data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
20096_1-Size_of_red_wine_glass_drunk_small_125ml,-0.789441,-0.220511,-0.073647,1.389075,-0.863722,0.837183,0.115059,-0.489714,-0.060237,-0.73997,...,-0.828369,-0.766238,-0.795901,-0.060845,-0.650635,0.198545,-1.046268,-0.907707,0.11911,0.090268
2345-Ever_had_bowel_cancer_screening,0.181777,-0.846275,1.393447,-0.762712,0.072048,-0.374755,-0.977721,0.283071,0.149015,1.216908,...,1.036607,0.533858,0.900654,1.301938,0.098157,0.247145,1.076262,-1.307636,1.550653,0.036361
N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,0.218786,1.337743,-0.355718,2.068429,-0.75305,0.519698,0.603722,-0.656698,-0.183306,-0.889235,...,0.070416,-0.226306,-0.140882,0.557099,-1.195313,1.047936,-0.235122,0.735247,0.199132,-0.103762
100011_raw-Iron,-0.894239,-0.000629,-0.044876,-0.086266,0.151583,0.996284,0.00904,1.871274,-0.357051,1.240719,...,0.594991,0.923486,-0.72301,0.527064,0.44943,-0.741877,0.345336,1.168521,-1.740056,-0.91604
5221-Index_of_best_refractometry_result_right,-0.058612,-0.568793,0.950456,-1.525888,0.154586,-1.224318,0.066917,0.211487,0.044214,1.328202,...,1.298788,-0.204407,-0.427908,0.42953,-0.359803,-0.872641,-0.516132,-0.88126,0.404037,0.480434


In [12]:
# Summary statistics of the first ten standardized latent variables; used for
# a visual sanity check and by the assertions that follow.
scaled_data_stats = (
    scaled_data
    .iloc[:, :10]
    .describe()
)
display(scaled_data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0
mean,3.4736870000000004e-18,1.867107e-17,1.389475e-17,-2.2578970000000002e-17,0.0,-8.684218000000001e-18,-5.2105310000000004e-18,-1.259212e-17,-1.736844e-18,-1.302633e-17
std,1.000122,1.000122,1.000122,1.000122,1.000122,1.000122,1.000122,1.000122,1.000122,1.000122
min,-5.413752,-3.351237,-2.377206,-7.79327,-2.690031,-7.858105,-2.376042,-2.79099,-2.617564,-4.093716
25%,-0.6054023,-0.6456582,-0.5840214,-0.6213897,-0.677415,-0.6447337,-0.5955249,-0.6200274,-0.5747697,-0.6584099
50%,-0.07715859,-0.03815195,-0.1262196,-0.006557481,-0.078962,-0.009774751,-0.1125825,-0.1045872,-0.0936269,-0.04529544
75%,0.5092317,0.5990515,0.3678088,0.5973826,0.519748,0.6337779,0.4069496,0.4557852,0.426698,0.6006871
max,9.934556,8.761596,9.135635,6.184114,10.504248,4.140171,10.21664,9.055807,11.5971,7.662582


In [13]:
# Validate every standardized latent variable, not only the ten columns that
# were summarized for display: each column mean must be numerically zero.
# `np.allclose` applies the same default tolerances as `np.isclose`.
assert np.allclose(scaled_data.mean(axis=0), 0.0), (
    "Standardization check failed: some latent variables do not have mean ~0"
)

In [14]:
# Validate every standardized latent variable, not only the ten columns that
# were summarized for display: each column must have unit standard deviation.
# pandas reports the sample standard deviation (ddof=1) while `scale` divides
# by the population one, so values are sqrt(n / (n - 1)) rather than exactly 1
# (1.000122 for the 4091 rows here); `atol` absorbs that difference.
# NOTE(review): a constant latent variable would presumably come out of `scale`
# with std 0 and fail this check — confirm that is the desired behavior.
assert np.allclose(scaled_data.std(axis=0), 1.0, atol=1e-03), (
    "Standardization check failed: some latent variables do not have std ~1"
)

# Save

In [15]:
# The output keeps the input file's stem so the source of the standardized
# data can be recognized from its name.
output_file = (
    RESULTS_DIR / f'z_score_standardized-{input_filepath_stem}.pkl'
).resolve()

display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std/z_score_standardized-projection-smultixcan-mashr-zscores.pkl')

In [16]:
# Persist the standardized matrix (traits x latent variables) as a pickle file.
scaled_data.to_pickle(output_file)