# Description

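This notebook analyzes the properties of the different data versions (`z_score_std`, `pca` and `umap`) used to cluster traits, and performs some sanity checks on each of them (infinite/missing values, summary statistics and duplicated rows).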

# Environment variables

In [1]:
from IPython.display import display

import conf

# Value read from the project configuration; it is used in the next cell to
# set the *_NUM_THREADS environment variables. Displayed so the value in
# effect is recorded in the notebook output.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [2]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules loading

In [3]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

from utils import generate_result_set_name

# Settings

In [5]:
# Presumably the base seed for any randomized step.
# NOTE(review): not referenced in the visible part of this notebook — confirm
# it is still needed.
INITIAL_RANDOM_STATE = 12345

# Z-score standardized data

In [7]:
# Data version to load; used below both as the subdirectory name and as the
# file name prefix of the input file.
INPUT_SUBSET = "z_score_std"

In [8]:
# Base name of the input file; combined with INPUT_SUBSET below to build the
# full file name.
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [9]:
# Absolute path of the input file:
#   <DATA_TRANSFORMATIONS_DIR>/<INPUT_SUBSET>/<INPUT_SUBSET>-<INPUT_STEM>.pkl
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
).resolve()
display(input_filepath)

# Stop here if the expected input is missing.
assert input_filepath.exists(), "Input file does not exist"

# File name without its extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/data_transformations/z_score_std/z_score_std-projection-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-projection-smultixcan-efo_partial-mashr-zscores'

In [10]:
# Load the pickled z-score standardized data into `data`.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were generated by this project's own pipeline.
data = pd.read_pickle(input_filepath)

In [11]:
data.shape

(3749, 987)

In [12]:
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.553743,1.85106,-0.095721,0.931603,-0.515196,1.371264,-0.413165,0.729668,0.546338,0.901153,...,1.148776,1.649018,-1.375952,2.203063,0.810199,-0.987048,0.27771,-0.607613,0.572376,-0.660846
100002_raw-Energy,-1.313905,-0.421015,-0.310524,-0.201978,0.233028,0.693838,0.084129,0.572439,0.217866,1.296973,...,0.16543,0.799276,-0.361989,-0.008253,2.06378,-0.930765,-0.147007,1.601938,-0.273553,-0.560822
100003_raw-Protein,-0.645622,-0.997845,0.249039,0.256499,0.422357,0.829189,-0.169004,0.51155,0.002389,-0.0572,...,1.016791,1.019488,-2.148844,-0.06513,0.041052,-0.836118,-0.320571,1.325519,-0.341494,-0.495786
100004_raw-Fat,-0.913924,-1.837183,0.012095,-1.301315,0.205794,1.129114,0.019985,0.37666,-0.810272,0.843611,...,0.544266,0.576714,0.281731,-0.218963,1.258957,-1.131924,0.378545,0.842047,-0.622194,-0.160058
100005_raw-Carbohydrate,-0.523004,0.131379,-0.097802,-0.030526,-0.0215,-0.219116,0.668229,0.3657,1.102347,0.845625,...,-0.34474,0.079274,0.253427,0.236474,1.033895,-0.229105,0.364793,1.776714,0.190531,-1.459641


## Data stats

In [13]:
# Overall (minimum, maximum) of the data: reduce per column first, then across
# the per-column results.
data.min().min(), data.max().max()

(-9.173162070658442, 27.660674759843893)

In [14]:
# The data must not contain any infinite value.
assert not np.isinf(data.to_numpy()).any()

In [15]:
# Every cell of the data must be non-missing.
assert data.notna().all().all()

In [16]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data_stats = data.describe()

In [17]:
data_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LV1,3749.0,7.581144e-18,1.000133,-5.490319,-0.601540,-0.078123,0.493820,9.542784
LV2,3749.0,-7.581144e-18,1.000133,-3.290415,-0.648621,-0.047539,0.584184,8.482362
LV3,3749.0,7.581144e-18,1.000133,-2.366270,-0.590126,-0.140160,0.354790,8.700611
LV4,3749.0,-2.274343e-17,1.000133,-7.654516,-0.616646,-0.008664,0.589219,6.101977
LV5,3749.0,7.581144e-18,1.000133,-2.701507,-0.658095,-0.073984,0.501801,10.269057
...,...,...,...,...,...,...,...,...
LV983,3749.0,-3.411515e-17,1.000133,-4.204381,-0.645346,-0.079892,0.561935,13.183630
LV984,3749.0,-1.895286e-17,1.000133,-2.525851,-0.560073,-0.103051,0.370580,13.149904
LV985,3749.0,1.137172e-17,1.000133,-3.463111,-0.668643,-0.021685,0.613902,8.720886
LV986,3749.0,-3.790572e-18,1.000133,-5.625046,-0.621761,-0.009071,0.636834,5.718949


In [18]:
# None of the summary statistics may be infinite.
assert not np.isinf(data_stats.to_numpy()).any()

In [19]:
# Every summary statistic must be defined (no NaN).
assert data_stats.notna().all().all()

## Check duplicated values

In [20]:
# Boolean mask of rows that repeat an earlier row. With keep="first" (the
# pandas default, spelled out here) the first occurrence of each repeated row
# is NOT flagged, only its later copies.
data_dups = data.duplicated(keep="first")

In [21]:
data_dups.any()

True

In [22]:
data_dups.value_counts()

False    3725
True       24
dtype: int64

In [23]:
# Labels (traits) of the rows flagged as duplicates in `z_score_std`; kept in
# their own variable so the later sections can compare against them.
data_dups_labels = data_dups[data_dups].index
display(data_dups_labels)

Index(['C_DIGESTIVE_ORGANS', 'C_ENDOCRINE', 'C_EYE_BRAIN_NEURO',
       'C_FEMALE_GENITAL', 'C_LIP_ORAL_PHARYNX', 'C_MALE_GENITAL',
       'C_MESTOTHEL_SOFTTISSUE', 'C_RESPIRATORY_INTRATHORACIC', 'C_SKIN',
       'C_UNCERTAIN_SECONDARY', 'C_URINARY_TRACT',
       'H7_BCKRNDRETINOPAT-Background_retinopathy_and_retinal_vascular_changes',
       'I9_CHD_NOREV-Major_coronary_heart_disease_event_excluding_revascularizations',
       'I9_HEARTFAIL-Heart_failurestrict',
       'I9_HEARTFAIL_NS-Heart_failure_not_strict',
       'I9_HYPTENSPREG-Hypertension_complicating_pregnancy_childbirth_and_the_puerperium',
       'I9_MI_STRICT-Myocardial_infarction_strict',
       'ILD_DIFF_DG-ILD_differential_diagnosis', 'J10_ASTHMA-Asthma',
       'KRA_PSY_MOOD-Mood_disorders',
       'KRA_PSY_SCHIZODEL-Schizophrenia_or_delusion',
       'PNEUMONIA-Pneumonias_AsthmaCOPD_comorbidities',
       'PULM_ANXIETY-Anxiety_asthmarelated_comorbidities',
       'X_RESPIRATORY-Diseases_of_the_respiratory_system'],
 

These duplicated traits should be taken into account when interpreting any results derived from the data (such as cluster analysis).

# PCA

In [24]:
# Switch to the PCA data version (rebinds the INPUT_SUBSET used in the
# previous section).
INPUT_SUBSET = "pca"

In [25]:
# Base name of the input file; here it is the stem of the z-score standardized
# file loaded in the previous section.
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [26]:
# Dimensionality-reduction options identifying the PCA result set to load;
# they are passed to generate_result_set_name below to build the file name.
DR_OPTIONS = dict(
    n_components=50,
    svd_solver="full",
    random_state=0,
)

In [27]:
# Absolute path of the PCA input file. Its name is produced by
# generate_result_set_name from DR_OPTIONS, with "<INPUT_SUBSET>-<INPUT_STEM>-"
# as prefix and ".pkl" as suffix.
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / generate_result_set_name(
        DR_OPTIONS,
        prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
        suffix=".pkl",
    )
).resolve()
display(input_filepath)

# Stop here if the expected input is missing.
assert input_filepath.exists(), "Input file does not exist"

# File name without its extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/data_transformations/pca/pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full.pkl')

'pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full'

In [28]:
# Load the pickled PCA data. This rebinds `data`, so the z-score standardized
# frame from the previous section is no longer reachable under that name.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were generated by this project's own pipeline.
data = pd.read_pickle(input_filepath)

In [29]:
data.shape

(3749, 50)

In [30]:
data.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA41,PCA42,PCA43,PCA44,PCA45,PCA46,PCA47,PCA48,PCA49,PCA50
100001_raw-Food_weight,-0.547515,-0.00725,0.245361,0.132424,1.201447,-0.113158,0.048859,1.031679,0.711208,0.1261,...,0.113685,-0.109753,0.178202,-0.487868,1.80592,-0.342611,-0.663712,-1.313987,-1.481707,0.39641
100002_raw-Energy,-1.570593,-0.618992,-1.223395,1.627321,-0.56386,0.224922,-1.500327,2.045633,-1.665998,-0.180432,...,0.69788,1.412676,2.103988,0.385625,2.386495,-2.939998,-0.978275,-1.893307,-0.259497,0.423264
100003_raw-Protein,-0.601212,-0.948824,-1.022209,2.438433,-2.062235,0.356202,-1.538038,2.314675,-1.108991,0.318034,...,0.171992,1.351979,0.054404,-0.725132,2.22652,-3.481674,-0.445294,-0.477452,0.237157,1.440867
100004_raw-Fat,-1.648214,-0.342042,-0.270189,1.683268,-1.222401,0.018349,-0.032824,1.671161,-0.908965,0.2241,...,-0.358812,0.696836,1.072624,0.627951,1.802,0.174704,0.826319,-0.826992,0.596113,0.62005
100005_raw-Carbohydrate,-1.867586,-0.006412,-0.30322,0.770958,-0.116822,0.460946,-2.407732,1.534273,-1.930577,0.064621,...,1.332852,1.978032,2.158456,0.831123,1.093202,-3.539033,-0.831862,-0.842298,-0.723072,0.359438


## Data stats

In [31]:
# Overall (minimum, maximum) of the PCA data: reduce per column first, then
# across the per-column results.
data.min().min(), data.max().max()

(-49.816726369808826, 100.40836100619943)

In [32]:
# The PCA data must not contain any infinite value.
assert not np.isinf(data.to_numpy()).any()

In [33]:
# Every cell of the PCA data must be non-missing.
assert data.notna().all().all()

In [34]:
# Per-component summary statistics (count, mean, std, min, quartiles, max);
# rebinds `data_stats` from the previous section.
data_stats = data.describe()

In [35]:
data_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PCA1,3749.0,3.638949e-16,8.485766,-5.74371,-3.131198,-2.434309,-0.815791,100.408361
PCA2,3749.0,-1.212983e-16,4.105379,-35.62376,-0.840904,-0.174942,0.552516,48.919206
PCA3,3749.0,7.581144e-18,3.294152,-38.575629,-0.664815,0.114516,0.863103,42.121356
PCA4,3749.0,1.516229e-17,3.094793,-49.816726,-0.61088,0.111067,0.803579,53.66026
PCA5,3749.0,0.0,3.013435,-24.146123,-0.924471,-0.189415,0.621831,28.486623
PCA6,3749.0,3.790572e-17,2.813783,-25.778786,-0.67263,0.08106,0.728776,90.377913
PCA7,3749.0,7.581144e-17,2.625236,-18.09253,-0.711175,0.057197,0.815881,37.031625
PCA8,3749.0,2.2743430000000003e-17,2.388907,-28.191215,-0.756626,0.012539,0.772162,41.857488
PCA9,3749.0,-7.581144e-18,2.381241,-35.214992,-0.732368,0.039457,0.784176,35.850059
PCA10,3749.0,-7.581144e-18,2.297545,-25.22973,-0.690982,0.152595,0.919323,25.529039


In [36]:
# None of the summary statistics may be infinite.
assert not np.isinf(data_stats.to_numpy()).any()

In [37]:
# Every summary statistic must be defined (no NaN).
assert data_stats.notna().all().all()

## Check duplicated values

In [38]:
# Boolean mask of PCA rows that repeat an earlier row. With keep="first" (the
# pandas default, spelled out here) the first occurrence of each repeated row
# is NOT flagged, only its later copies.
data_dups = data.duplicated(keep="first")

In [39]:
data_dups.any()

True

In [40]:
data_dups.value_counts()

False    3725
True       24
dtype: int64

In [41]:
# Labels (traits) of the rows flagged as duplicates in the PCA data.
data_dups[data_dups].index

Index(['C_DIGESTIVE_ORGANS', 'C_ENDOCRINE', 'C_EYE_BRAIN_NEURO',
       'C_FEMALE_GENITAL', 'C_LIP_ORAL_PHARYNX', 'C_MALE_GENITAL',
       'C_MESTOTHEL_SOFTTISSUE', 'C_RESPIRATORY_INTRATHORACIC', 'C_SKIN',
       'C_UNCERTAIN_SECONDARY', 'C_URINARY_TRACT',
       'H7_BCKRNDRETINOPAT-Background_retinopathy_and_retinal_vascular_changes',
       'I9_CHD_NOREV-Major_coronary_heart_disease_event_excluding_revascularizations',
       'I9_HEARTFAIL-Heart_failurestrict',
       'I9_HEARTFAIL_NS-Heart_failure_not_strict',
       'I9_HYPTENSPREG-Hypertension_complicating_pregnancy_childbirth_and_the_puerperium',
       'I9_MI_STRICT-Myocardial_infarction_strict',
       'ILD_DIFF_DG-ILD_differential_diagnosis', 'J10_ASTHMA-Asthma',
       'KRA_PSY_MOOD-Mood_disorders',
       'KRA_PSY_SCHIZODEL-Schizophrenia_or_delusion',
       'PNEUMONIA-Pneumonias_AsthmaCOPD_comorbidities',
       'PULM_ANXIETY-Anxiety_asthmarelated_comorbidities',
       'X_RESPIRATORY-Diseases_of_the_respiratory_system'],
 

In [42]:
# The traits flagged as duplicates in the PCA data must be exactly the ones
# flagged in `z_score_std` (stored in `data_dups_labels`), in the same order.
assert data_dups[data_dups].index.equals(data_dups_labels)

# UMAP

In [43]:
# Switch to the UMAP data version (rebinds the INPUT_SUBSET used in the
# previous section).
INPUT_SUBSET = "umap"

In [44]:
# Base name of the input file; same value as in the PCA section.
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [45]:
# Dimensionality-reduction options identifying the UMAP result set to load;
# they are passed to generate_result_set_name below to build the file name.
DR_OPTIONS = dict(
    n_components=50,
    metric="euclidean",
    n_neighbors=15,
    random_state=0,
)

In [46]:
# Absolute path of the UMAP input file. Its name is produced by
# generate_result_set_name from DR_OPTIONS, with "<INPUT_SUBSET>-<INPUT_STEM>-"
# as prefix and ".pkl" as suffix.
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / generate_result_set_name(
        DR_OPTIONS,
        prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
        suffix=".pkl",
    )
).resolve()
display(input_filepath)

# Stop here if the expected input is missing.
assert input_filepath.exists(), "Input file does not exist"

# File name without its extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/data_transformations/umap/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')

'umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0'

In [47]:
# Load the pickled UMAP data. This rebinds `data`, so the PCA frame from the
# previous section is no longer reachable under that name.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were generated by this project's own pipeline.
data = pd.read_pickle(input_filepath)

In [48]:
data.shape

(3749, 50)

In [49]:
data.head()

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,UMAP7,UMAP8,UMAP9,UMAP10,...,UMAP41,UMAP42,UMAP43,UMAP44,UMAP45,UMAP46,UMAP47,UMAP48,UMAP49,UMAP50
100001_raw-Food_weight,11.53159,10.77339,10.5515,5.689103,12.646474,8.795625,2.365859,-3.382425,1.458605,14.236823,...,2.214731,9.304909,3.359118,5.211676,1.265622,2.058685,1.291174,1.565621,7.317275,9.037535
100002_raw-Energy,11.554761,10.753816,10.548004,5.7108,12.674987,8.787647,2.379787,-3.339913,1.442655,14.196218,...,2.237087,9.270699,3.38344,5.185328,1.257113,2.057439,1.290686,1.557209,7.297037,9.009866
100003_raw-Protein,11.519615,10.786408,10.540249,5.687959,12.628188,8.789925,2.36837,-3.390937,1.464806,14.233623,...,2.216162,9.314158,3.350708,5.226166,1.267382,2.051692,1.305047,1.558942,7.307035,9.042575
100004_raw-Fat,11.515584,10.756584,10.570711,5.697041,12.661936,8.78733,2.396913,-3.351611,1.435632,14.221588,...,2.214707,9.273252,3.383349,5.220068,1.270067,2.065669,1.281447,1.578588,7.332527,9.005523
100005_raw-Carbohydrate,11.528723,10.762026,10.566773,5.698555,12.671127,8.791903,2.394764,-3.35437,1.442259,14.226301,...,2.216362,9.275563,3.383139,5.216354,1.269187,2.063429,1.28002,1.577227,7.335059,9.009464


## Data stats

In [50]:
# Overall (minimum, maximum) of the UMAP data: reduce per column first, then
# across the per-column results.
data.min().min(), data.max().max()

(-3.4087222, 14.702299)

In [51]:
# The UMAP data must not contain any infinite value.
assert not np.isinf(data.to_numpy()).any()

In [52]:
# Every cell of the UMAP data must be non-missing.
assert data.notna().all().all()

In [53]:
# Per-component summary statistics (count, mean, std, min, quartiles, max);
# rebinds `data_stats` from the previous section.
data_stats = data.describe()

In [54]:
data_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UMAP1,3749.0,11.81947,0.591982,6.864138,11.726042,11.911532,12.061273,12.532558
UMAP2,3749.0,10.51436,0.462244,1.230025,10.438283,10.543786,10.644863,11.035397
UMAP3,3749.0,10.790124,0.357213,5.346579,10.718997,10.828536,10.921763,11.457878
UMAP4,3749.0,5.80606,0.170279,5.047084,5.724452,5.815688,5.908139,6.249172
UMAP5,3749.0,13.429363,0.603651,7.840955,13.357506,13.531915,13.656499,14.083682
UMAP6,3749.0,8.863518,0.242541,7.562965,8.759696,8.87627,9.020582,9.584191
UMAP7,3749.0,2.696704,0.571861,1.989095,2.497714,2.643137,2.778179,7.133248
UMAP8,3749.0,-2.644093,0.493672,-3.408722,-2.848704,-2.709177,-2.564153,1.060233
UMAP9,3749.0,1.227889,0.178356,0.536676,1.118166,1.245491,1.35719,1.682636
UMAP10,3749.0,14.017711,0.616001,9.559341,13.998945,14.163577,14.272578,14.702299


In [55]:
# None of the summary statistics may be infinite.
assert not np.isinf(data_stats.to_numpy()).any()

In [56]:
# Every summary statistic must be defined (no NaN).
assert data_stats.notna().all().all()

## Check duplicated values

In [57]:
# Boolean mask of UMAP rows that repeat an earlier row. With keep="first" (the
# pandas default, spelled out here) the first occurrence of each repeated row
# is NOT flagged, only its later copies.
data_dups = data.duplicated(keep="first")

In [58]:
data_dups.any()

False

There are no exact duplicates in the UMAP data, but the traits that are duplicated in `z_score_std` and `pca` are still very close to each other in the UMAP representation.

In [59]:
data_dups_labels

Index(['C_DIGESTIVE_ORGANS', 'C_ENDOCRINE', 'C_EYE_BRAIN_NEURO',
       'C_FEMALE_GENITAL', 'C_LIP_ORAL_PHARYNX', 'C_MALE_GENITAL',
       'C_MESTOTHEL_SOFTTISSUE', 'C_RESPIRATORY_INTRATHORACIC', 'C_SKIN',
       'C_UNCERTAIN_SECONDARY', 'C_URINARY_TRACT',
       'H7_BCKRNDRETINOPAT-Background_retinopathy_and_retinal_vascular_changes',
       'I9_CHD_NOREV-Major_coronary_heart_disease_event_excluding_revascularizations',
       'I9_HEARTFAIL-Heart_failurestrict',
       'I9_HEARTFAIL_NS-Heart_failure_not_strict',
       'I9_HYPTENSPREG-Hypertension_complicating_pregnancy_childbirth_and_the_puerperium',
       'I9_MI_STRICT-Myocardial_infarction_strict',
       'ILD_DIFF_DG-ILD_differential_diagnosis', 'J10_ASTHMA-Asthma',
       'KRA_PSY_MOOD-Mood_disorders',
       'KRA_PSY_SCHIZODEL-Schizophrenia_or_delusion',
       'PNEUMONIA-Pneumonias_AsthmaCOPD_comorbidities',
       'PULM_ANXIETY-Anxiety_asthmarelated_comorbidities',
       'X_RESPIRATORY-Diseases_of_the_respiratory_system'],
 

In [60]:
# UMAP coordinates of the traits that were flagged as duplicates in
# `z_score_std` (`data_dups_labels` was computed in that section).
# NOTE(review): with the default `duplicated()` behavior the first occurrence
# of each duplicated row is not part of `data_dups_labels`, so the rows they
# duplicate are not shown here.
data.loc[data_dups_labels]

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,UMAP7,UMAP8,UMAP9,UMAP10,...,UMAP41,UMAP42,UMAP43,UMAP44,UMAP45,UMAP46,UMAP47,UMAP48,UMAP49,UMAP50
C_DIGESTIVE_ORGANS,11.612365,10.408484,10.353334,5.613502,13.362354,9.219149,2.527786,-2.30675,1.209678,13.640932,...,2.074721,8.442904,4.221562,4.628087,1.017156,2.044081,1.021398,1.998282,7.111103,8.560739
C_ENDOCRINE,12.137884,10.709962,10.721994,5.848867,13.750921,8.800262,2.918092,-2.526018,0.905364,13.636403,...,1.814478,9.044935,4.339827,5.307078,1.192611,2.278971,0.715732,1.996412,7.025422,8.308843
C_EYE_BRAIN_NEURO,11.652816,10.658445,10.483661,5.938981,13.310802,9.099992,2.593069,-2.399594,1.1631,13.655129,...,2.158016,8.535105,4.060854,4.709802,0.919469,2.009145,0.841735,2.066127,7.111288,8.53238
C_FEMALE_GENITAL,12.22818,10.550727,10.673268,5.710598,13.620814,8.635132,2.26909,-2.936968,1.228203,14.406964,...,2.228101,8.654643,3.908278,4.689437,1.068443,2.346099,0.95463,1.548952,7.813228,8.712458
C_LIP_ORAL_PHARYNX,11.652957,10.419183,10.772189,5.849023,13.352536,8.758797,2.832381,-2.635704,1.403345,14.092684,...,2.270365,8.677272,4.090956,5.015478,1.255675,2.376117,0.848027,2.022413,7.777277,8.707595
C_MALE_GENITAL,12.402927,10.493404,11.089517,5.803801,13.572761,9.222039,2.582552,-2.961198,0.804954,14.376616,...,2.042169,8.882956,3.865058,4.625568,1.609759,2.094713,0.896443,2.229554,7.335732,8.870277
C_MESTOTHEL_SOFTTISSUE,11.68089,10.510612,10.93915,6.006436,13.66467,8.557766,2.498705,-2.701117,1.508273,13.942077,...,2.505704,8.595994,3.95052,4.675672,1.053082,2.720984,0.769591,2.044732,7.660796,8.664788
C_RESPIRATORY_INTRATHORACIC,11.564771,10.351174,10.896262,5.8026,12.975153,8.636232,2.693226,-2.943238,1.244875,13.527085,...,1.658707,7.979597,3.79029,5.591244,1.089208,2.398131,0.649549,1.73346,7.615727,8.401855
C_SKIN,11.958974,10.904602,10.629015,6.000162,13.052731,8.752838,2.491469,-2.432939,1.250108,13.84939,...,2.133202,8.532611,4.061694,4.724177,1.118692,2.155363,0.770637,2.137494,7.283073,8.762411
C_UNCERTAIN_SECONDARY,12.375943,10.576038,10.714533,5.496879,13.773856,8.953336,2.86008,-2.884468,1.435327,14.179268,...,2.080451,8.768971,4.103535,4.684254,1.042092,2.269248,0.905113,2.094592,8.14906,8.566664
