# Description

This notebook analyzes the properties of the different versions of the data (`z_score_std`, `pca` and `umap`) used to cluster traits, and performs some sanity checks on each of them.

# Environment variables

In [1]:
from IPython.display import display

import conf

# Number of parallel jobs/threads, read from the project configuration; it is
# reused in the next cell to cap the threads of the numerical libraries.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

2

In [2]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=2
env: OPEN_BLAS_NUM_THREADS=2
env: NUMEXPR_NUM_THREADS=2
env: OMP_NUM_THREADS=2


# Modules loading

In [3]:
# Reload imported modules automatically before running each cell, so edits to
# the project's modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

from utils import generate_result_set_name

# Settings

In [5]:
# NOTE(review): INITIAL_RANDOM_STATE is not referenced by any cell visible in
# this notebook — confirm whether it is still needed.
INITIAL_RANDOM_STATE = 12345

# Z-score standardized data

In [6]:
INPUT_SUBSET = "z_score_std"

In [7]:
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [8]:
# Locate the pickle with the z-score standardized data:
#   <DATA_TRANSFORMATIONS_DIR>/<subset>/<subset>-<stem>.pkl
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
).resolve()
display(input_filepath)

# Stop here if the expected input has not been generated yet.
assert input_filepath.exists(), "Input file does not exist"

# File name without the extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-projection-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-projection-smultixcan-efo_partial-mashr-zscores'

In [9]:
# Load the z-score standardized data from the pickle file located above.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project's own pipeline.
data = pd.read_pickle(input_filepath)

In [10]:
data.shape

(3752, 987)

In [11]:
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.695006,1.962565,0.057683,0.878731,-0.539977,1.481272,-0.396422,1.09018,0.759223,0.931395,...,1.129784,1.752343,-1.411403,2.823863,0.931116,-1.054519,0.432982,-0.633597,0.554279,-0.642479
100002_raw-Energy,-1.528127,-0.345309,-0.148953,-0.24206,0.373427,0.791092,0.263477,0.987702,0.354391,1.416059,...,0.224604,0.769882,-0.509482,0.091153,2.286789,-1.008256,-0.029764,1.737229,-0.272107,-0.526125
100003_raw-Protein,-0.704572,-1.011299,0.67142,0.143991,0.615212,0.874212,-0.040998,0.91517,0.254369,-0.084237,...,1.003019,1.044314,-2.376108,0.004778,0.053714,-0.892447,-0.1838,1.377991,-0.278794,-0.419733
100004_raw-Fat,-0.989832,-1.87549,0.261555,-1.420719,0.366238,1.167049,0.257387,0.717674,-0.997664,0.969825,...,0.585913,0.638314,0.119139,-0.140204,1.394326,-1.173402,0.555058,1.013982,-0.544506,-0.064061
100005_raw-Carbohydrate,-0.580143,0.243335,0.158966,-0.036558,0.068176,-0.202639,1.101281,0.675227,1.463432,1.010078,...,-0.249108,-0.026814,0.232713,0.323682,1.168642,-0.282935,0.653105,1.909526,0.199997,-1.656894


## Data stats

In [12]:
data.min().min(), data.max().max()

(-5.591963893267397, 20.911849300098996)

In [13]:
assert not np.isinf(data).any().any()

In [14]:
assert not data.isna().any().any()

In [15]:
data_stats = data.describe()

In [16]:
data_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LV1,3752.0,-7.575082e-18,1.000133,-3.742454,-0.648416,-0.021693,0.626035,7.043617
LV2,3752.0,1.325639e-17,1.000133,-3.753861,-0.671864,-0.004672,0.638313,4.583115
LV3,3752.0,-1.136262e-17,1.000133,-3.204374,-0.663746,-0.033244,0.634440,4.475987
LV4,3752.0,-7.575082e-18,1.000133,-4.860651,-0.654791,0.014956,0.656563,3.359129
LV5,3752.0,9.468853e-18,1.000133,-2.997212,-0.680881,-0.029416,0.597668,5.508202
...,...,...,...,...,...,...,...,...
LV983,3752.0,3.787541e-18,1.000133,-2.834860,-0.678074,-0.075179,0.595309,9.993240
LV984,3752.0,-1.893771e-17,1.000133,-3.399541,-0.656353,-0.045135,0.555113,7.991445
LV985,3752.0,3.219410e-17,1.000133,-3.584552,-0.687056,-0.015667,0.652319,4.674100
LV986,3752.0,-9.468853e-19,1.000133,-3.407572,-0.640318,0.001397,0.664050,3.535360


In [17]:
assert not np.isinf(data_stats).any().any()

In [18]:
assert not data_stats.isna().any().any()

## Check duplicated values

In [19]:
# Flag rows (traits) whose values, once rounded to 5 decimals, are identical to
# those of another row; keep=False marks every member of a duplicated group,
# not only the repeated occurrences.
data_dups = data.round(5).duplicated(keep=False)

In [20]:
# Show the duplicated rows sorted by LV1, which places rows with identical
# values next to each other; the display limits only apply inside this block.
with pd.option_context("display.max_rows", 100, "display.max_columns", 10):
    display(data.loc[data_dups].sort_values("LV1"))

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,...,LV983,LV984,LV985,LV986,LV987
otitis externa,-1.685662,-0.329982,1.429659,-1.897124,0.419681,...,1.652075,-0.556176,-1.022885,-0.381484,1.044769
H8_EXTOTITIS-Otitis_externa,-1.685662,-0.329982,1.429659,-1.897124,0.419681,...,1.652075,-0.556176,-1.022885,-0.381484,1.044769
L12_EPIDERMALTHICKOTH-Other_epidermal_thickening,-1.372594,-2.840009,0.088276,-0.428396,0.834243,...,-1.901749,0.412013,0.249073,-0.238723,-1.680163
epidermal thickening,-1.372594,-2.840009,0.088276,-0.428396,0.834243,...,-1.901749,0.412013,0.249073,-0.238723,-1.680163
C_UNCERTAIN_SECONDARY,-1.317363,-0.348156,2.119712,-1.572985,-1.306667,...,0.962995,0.754257,-0.258054,0.017627,-0.220039
C3_UNCERTAIN_SECONDARY-Secondary_uncertain_malignant_neoplasm,-1.317363,-0.348156,2.119712,-1.572985,-1.306667,...,0.962995,0.754257,-0.258054,0.017627,-0.220039
H7_STRABOTH-Other_strabismus,-1.179963,0.228374,-1.986832,-1.245763,1.471153,...,-1.133495,0.601951,-0.251008,0.118861,-0.523905
strabismus,-1.179963,0.228374,-1.986832,-1.245763,1.471153,...,-1.133495,0.601951,-0.251008,0.118861,-0.523905
C_URINARY_TRACT,-1.018225,-3.3e-05,-0.01086,1.020034,1.089526,...,-0.0823,-0.656353,-0.736378,0.377882,0.022299
C3_URINARY_TRACT-Malignant_neoplasm_of_urinary_organs,-1.018225,-3.3e-05,-0.01086,1.020034,1.089526,...,-0.0823,-0.656353,-0.736378,0.377882,0.022299


In [21]:
data_dups.any()

True

In [22]:
data_dups.value_counts()

False    3663
True       89
dtype: int64

In [23]:
# Keep the labels of the duplicated traits (ordered by their LV1 value); they
# are reused in the PCA and UMAP sections below to compare against this set.
data_dups_labels = data.loc[data_dups].sort_values("LV1").index
display(data_dups_labels[:10])

Index(['otitis externa', 'H8_EXTOTITIS-Otitis_externa',
       'L12_EPIDERMALTHICKOTH-Other_epidermal_thickening',
       'epidermal thickening', 'C_UNCERTAIN_SECONDARY',
       'C3_UNCERTAIN_SECONDARY-Secondary_uncertain_malignant_neoplasm',
       'H7_STRABOTH-Other_strabismus', 'strabismus', 'C_URINARY_TRACT',
       'C3_URINARY_TRACT-Malignant_neoplasm_of_urinary_organs'],
      dtype='object')

These duplicated traits should be taken into account when interpreting any results derived from the data (such as cluster analysis).

# PCA

In [24]:
INPUT_SUBSET = "pca"

In [25]:
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [26]:
# Options identifying the PCA result set to load; `generate_result_set_name`
# encodes them into the input file name in the next cell.
# NOTE(review): presumably these must match the options used when the PCA
# data was generated — verify against the notebook that produced the file.
DR_OPTIONS = {
    "n_components": 50,
    "svd_solver": "full",
    "random_state": 0,
}

In [27]:
# Locate the pickle with the PCA version of the data; its file name is built
# from the subset, the input stem and the options in DR_OPTIONS.
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / generate_result_set_name(
        DR_OPTIONS,
        prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
        suffix=".pkl",
    )
).resolve()
display(input_filepath)

# Stop here if the expected input has not been generated yet.
assert input_filepath.exists(), "Input file does not exist"

# File name without the extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/pca/pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full.pkl')

'pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full'

In [28]:
# Replace `data` with the PCA version; from here on only `data_dups_labels`
# still refers to the `z_score_std` data.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project's own pipeline.
data = pd.read_pickle(input_filepath)

In [29]:
data.shape

(3752, 50)

In [30]:
data.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA41,PCA42,PCA43,PCA44,PCA45,PCA46,PCA47,PCA48,PCA49,PCA50
100001_raw-Food_weight,0.805216,-0.86539,0.69948,-0.065976,0.999617,-0.418645,1.993288,-0.346875,-1.522978,-2.382791,...,0.777932,0.507896,0.693668,1.02426,0.189606,-1.661826,0.281234,-0.096792,-0.339508,0.041431
100002_raw-Energy,0.588507,-1.491772,1.75634,-3.593295,2.100607,0.36491,1.254815,2.028484,-0.357607,-4.832726,...,-0.780215,-2.285033,-0.607663,-2.727018,1.711667,-0.535025,2.342068,0.197519,1.636996,-1.11441
100003_raw-Protein,1.91016,-1.873687,1.876677,-3.832557,1.240704,2.941375,2.293978,0.802473,0.66538,-4.375319,...,-0.547161,0.514721,0.034145,-0.423218,-0.324825,-0.000958,0.294579,1.186937,0.226722,0.169931
100004_raw-Fat,0.750799,-0.294733,1.31771,-1.346081,2.006403,0.533509,-0.752916,0.937515,-0.572191,-2.451612,...,0.060359,-0.878309,-0.774943,-1.91489,2.559763,1.547267,1.85111,-0.310431,3.495624,-1.595785
100005_raw-Carbohydrate,-0.530044,-0.007398,0.611418,-3.604094,2.227872,0.051271,0.001135,2.303819,-0.387759,-5.686184,...,-1.110273,-3.032541,-1.223873,-0.945626,2.150276,-1.798716,2.526678,0.716987,0.300972,-0.893854


## Data stats

In [31]:
data.min().min(), data.max().max()

(-23.802797110319943, 33.2732448375466)

In [32]:
assert not np.isinf(data).any().any()

In [33]:
assert not data.isna().any().any()

In [34]:
data_stats = data.describe()

In [35]:
data_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PCA1,3752.0,-3.030033e-17,4.289205,-5.622337,-2.10775,-1.171765,0.226506,27.018533
PCA2,3752.0,6.060066e-17,2.848039,-11.867184,-0.942571,-0.086275,0.788981,32.2223
PCA3,3752.0,0.0,2.337725,-23.802797,-0.846508,0.012899,0.819,27.152855
PCA4,3752.0,-2.2725250000000003e-17,2.31023,-11.730295,-0.897644,0.024376,0.89684,26.519926
PCA5,3752.0,-5.4919350000000007e-17,2.188628,-18.438041,-0.860214,-0.007901,0.882066,25.823172
PCA6,3752.0,0.0,2.142986,-15.863555,-0.916041,-4.9e-05,0.905827,21.399116
PCA7,3752.0,3.787541e-18,2.068004,-11.423329,-0.9702,-0.139725,0.790585,33.273245
PCA8,3752.0,0.0,1.985777,-13.519333,-0.961497,-0.070582,0.799552,17.145322
PCA9,3752.0,-1.5150160000000002e-17,1.956795,-14.609073,-0.88004,0.023499,0.855638,16.686925
PCA10,3752.0,5.681312000000001e-17,1.894428,-12.112828,-0.995019,-0.024213,0.915382,14.491414


In [36]:
assert not np.isinf(data_stats).any().any()

In [37]:
assert not data_stats.isna().any().any()

## Check duplicated values

In [38]:
data_dups = data.round(5).duplicated(keep=False)

In [39]:
data_dups.any()

True

In [40]:
data_dups.value_counts()

False    3663
True       89
dtype: int64

In [41]:
data.index[data_dups][:10]

Index(['ASTHMA_CHILD-Childhood_asthma_age16', 'ASTHMA_MODE-Asthma_mode',
       'ASTHMA_PNEUMONIA-Asthmarelated_pneumonia',
       'C3_DIGESTIVE_ORGANS-Malignant_neoplasm_of_digestive_organs',
       'C3_ENDOCRINE-Malignant_neoplasm_of_endocrine_gland',
       'C3_EYE_BRAIN_NEURO-Malignant_neoplasm_of_eye_brain_and_central_nervous_system',
       'C3_FEMALE_GENITAL-malignant_neoplasm_of_female_genital_organs',
       'C3_LIP_ORAL_PHARYNX-Malignant_neoplasm_of_lip_oral_cavity_and_pharynx',
       'C3_MALE_GENITAL-malignant_neoplasm_of_male_genital_organs',
       'C3_MESTOTHEL_SOFTTISSUE-Melignant_neoplasm_of_mesothelium_and_soft_tissue'],
      dtype='object')

In [42]:
# The traits flagged as duplicates in the PCA data must be exactly the same
# set as those found in `z_score_std` (sets are compared, so order is ignored).
assert set(data.index[data_dups]) == set(data_dups_labels)

# UMAP

In [43]:
INPUT_SUBSET = "umap"

In [44]:
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [45]:
# Options identifying the UMAP result set to load; `generate_result_set_name`
# encodes them into the input file name in the next cell.
# NOTE(review): presumably these must match the options used when the UMAP
# data was generated — verify against the notebook that produced the file.
DR_OPTIONS = {
    "n_components": 50,
    "metric": "euclidean",
    "n_neighbors": 15,
    "random_state": 0,
}

In [46]:
# Locate the pickle with the UMAP version of the data; its file name is built
# from the subset, the input stem and the options in DR_OPTIONS.
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / generate_result_set_name(
        DR_OPTIONS,
        prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
        suffix=".pkl",
    )
).resolve()
display(input_filepath)

# Stop here if the expected input has not been generated yet.
assert input_filepath.exists(), "Input file does not exist"

# File name without the extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')

'umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0'

In [47]:
# Replace `data` with the UMAP version of the data.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project's own pipeline.
data = pd.read_pickle(input_filepath)

In [48]:
data.shape

(3752, 50)

In [49]:
data.head()

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,UMAP7,UMAP8,UMAP9,UMAP10,...,UMAP41,UMAP42,UMAP43,UMAP44,UMAP45,UMAP46,UMAP47,UMAP48,UMAP49,UMAP50
100001_raw-Food_weight,9.58728,8.050184,9.340894,8.884522,4.562011,9.261327,9.078164,8.009141,9.288872,9.827831,...,9.753575,0.440851,-0.077164,-0.184829,9.345852,0.622542,-0.051136,4.266558,8.047482,-0.317385
100002_raw-Energy,9.910512,8.372274,9.605316,9.808598,4.64197,9.360196,8.848985,7.768594,9.231921,9.328465,...,9.647689,0.581584,0.54793,0.600128,8.976416,1.249645,0.234843,4.636343,7.173085,0.137119
100003_raw-Protein,9.923088,8.38064,9.609899,9.836457,4.647365,9.366775,8.835996,7.767549,9.230281,9.314386,...,9.642637,0.580362,0.565898,0.621073,8.967297,1.271435,0.24396,4.648671,7.149967,0.148192
100004_raw-Fat,9.898531,8.375697,9.600443,9.785713,4.639249,9.360721,8.886237,7.783376,9.237095,9.345434,...,9.64827,0.573073,0.521042,0.576926,8.979161,1.227988,0.231848,4.622777,7.205517,0.147385
100005_raw-Carbohydrate,9.895807,8.376662,9.605841,9.791338,4.636903,9.352356,8.867312,7.774787,9.233479,9.344456,...,9.652949,0.575161,0.529565,0.579919,8.988125,1.231771,0.226778,4.62436,7.194818,0.123104


## Data stats

In [50]:
data.min().min(), data.max().max()

(-2.0432608, 10.968822)

In [51]:
assert not np.isinf(data).any().any()

In [52]:
assert not data.isna().any().any()

In [53]:
data_stats = data.describe()

In [54]:
data_stats.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UMAP1,3752.0,9.667269,0.568371,-2.043261,9.544245,9.650231,9.848108,10.205195
UMAP2,3752.0,8.582644,0.853518,2.013011,8.261819,8.671461,9.033814,10.047072
UMAP3,3752.0,9.524095,0.467407,1.812973,9.424377,9.561857,9.642713,10.305815
UMAP4,3752.0,9.668339,0.694791,7.830413,9.046152,9.609067,10.339871,10.968822
UMAP5,3752.0,4.678472,0.130449,3.99431,4.609011,4.679538,4.747692,5.178999
UMAP6,3752.0,9.467794,0.127408,9.123776,9.363022,9.496684,9.570718,10.07568
UMAP7,3752.0,9.488421,0.5808,8.005753,9.078334,9.42958,10.062101,10.655107
UMAP8,3752.0,8.157632,0.207703,7.523412,7.984377,8.170478,8.304098,8.829289
UMAP9,3752.0,9.376164,0.115023,9.057338,9.307835,9.35227,9.428299,9.899257
UMAP10,3752.0,9.696962,0.280207,9.285474,9.431865,9.699163,9.917829,10.549976


In [55]:
assert not np.isinf(data_stats).any().any()

In [56]:
assert not data_stats.isna().any().any()

## Check duplicated values

In [57]:
data_dups = data.round(5).duplicated(keep=False)

In [58]:
data_dups.any()

False

There are no exact duplicates in the UMAP data (after rounding to 5 decimals), but the traits that are duplicated in `z_score_std` and `pca` lie very close to each other in the UMAP representation, as the table below shows.

In [59]:
data_dups_labels[:10]

Index(['otitis externa', 'H8_EXTOTITIS-Otitis_externa',
       'L12_EPIDERMALTHICKOTH-Other_epidermal_thickening',
       'epidermal thickening', 'C_UNCERTAIN_SECONDARY',
       'C3_UNCERTAIN_SECONDARY-Secondary_uncertain_malignant_neoplasm',
       'H7_STRABOTH-Other_strabismus', 'strabismus', 'C_URINARY_TRACT',
       'C3_URINARY_TRACT-Malignant_neoplasm_of_urinary_organs'],
      dtype='object')

In [60]:
# Look up, in the UMAP data, the traits that were flagged as duplicates in
# `z_score_std` (labels are ordered by their LV1 value there) so that their
# UMAP coordinates can be compared row by row.
data.loc[data_dups_labels]

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,UMAP7,UMAP8,UMAP9,UMAP10,...,UMAP41,UMAP42,UMAP43,UMAP44,UMAP45,UMAP46,UMAP47,UMAP48,UMAP49,UMAP50
otitis externa,9.551093,7.981320,9.493232,8.834431,4.417214,9.290780,9.715325,8.066138,9.332725,9.903297,...,9.853333,0.379089,-0.085457,-0.357628,9.076071,0.613556,-0.006733,4.352067,8.101846,0.056318
H8_EXTOTITIS-Otitis_externa,9.554805,7.974651,9.502875,8.838317,4.403702,9.288329,9.727035,8.072176,9.328022,9.902865,...,9.855843,0.370807,-0.083122,-0.367169,9.070306,0.624112,-0.025003,4.349858,8.111155,0.057873
L12_EPIDERMALTHICKOTH-Other_epidermal_thickening,9.464342,7.988772,9.293400,8.659856,4.610901,9.360639,9.786700,8.078593,9.521825,9.986540,...,9.891342,0.488052,-0.245327,-0.357983,9.052238,0.307899,0.195046,4.346837,8.079193,0.175529
epidermal thickening,9.473613,8.007787,9.308480,8.676417,4.611607,9.365240,9.777965,8.085194,9.508198,9.973901,...,9.874878,0.478733,-0.237961,-0.344338,9.054968,0.326440,0.187872,4.341372,8.079620,0.177535
C_UNCERTAIN_SECONDARY,9.563827,9.050376,9.215671,9.605934,4.867174,9.588847,10.361145,8.478359,9.608009,10.063944,...,9.573668,0.304981,-0.273114,-0.005607,8.728317,0.573103,1.018113,4.702545,7.902545,1.610409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3_RESPIRATORY_INTRATHORACIC-Malignant_neoplasm_of_respiratory_system_and_intrathoracic_organs,10.031796,8.721112,9.705954,8.929115,4.361479,9.682940,9.310378,8.168248,9.327873,9.603079,...,9.533340,0.235098,-0.006092,0.506889,8.984775,1.350959,0.285813,4.360876,7.957686,0.425512
hyperthyroidism,9.607764,8.166314,9.437190,8.824901,4.690758,9.265266,9.712802,8.029313,9.322843,9.928528,...,9.878122,0.467593,-0.048978,-0.260045,9.070807,0.459282,0.307388,4.515699,7.866353,-0.027778
THYROTOXICOSIS-Thyrotoxicosis,9.605329,8.163507,9.431808,8.820052,4.692393,9.264345,9.714269,8.028500,9.326065,9.930358,...,9.880958,0.468656,-0.051107,-0.263472,9.071269,0.454347,0.310960,4.515286,7.866503,-0.028039
C_SKIN,9.733206,8.669869,9.625696,9.309209,4.724639,9.384936,9.723085,8.227638,9.368038,9.849195,...,9.646558,0.404172,-0.040124,-0.189650,9.056011,0.784218,0.152937,4.503841,7.969868,0.334979
