# Description

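This notebook prepares the data needed to create a clustering tree visualization (using the R package `clustree`): it combines the PCA and UMAP projections of the traits with the selected consensus partitions and a set of manually curated trait labels, and saves the result as a TSV file.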

# Modules loading

In [1]:
# Reload imported modules automatically before each cell executes, so that
# edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display
from pathlib import Path

import numpy as np
import pandas as pd

from utils import generate_result_set_name
import conf

# Settings

In [3]:
# Directory with the consensus clustering results: the selected partitions are
# read from it and the clustering tree table is written to it.
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering')

# Load data

## PCA

In [4]:
# Name of the dimensionality-reduction method; used below both as the results
# subdirectory and as the prefix of the result file name.
INPUT_SUBSET = "pca"

In [5]:
# Identifier of the input data set; it is embedded in the result file name
# that is built below.
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [6]:
# Options that get encoded into the name of the PCA result file.
# NOTE(review): presumably these are the parameters the PCA was fitted with;
# confirm against the notebook that produced the file.
DR_OPTIONS = dict(
    n_components=50,
    svd_solver="full",
    random_state=0,
)

In [7]:
# Rebuild the name of the PCA result file from the method, the input data set
# and the options, then locate it inside the data transformations directory.
_pca_result_name = generate_result_set_name(
    DR_OPTIONS,
    prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
    suffix=".pkl",
)
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / INPUT_SUBSET / _pca_result_name
).resolve()
display(input_filepath)

# Fail early, before trying to read a file that is not there.
assert input_filepath.exists(), "Input file does not exist"

input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/pca/pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full.pkl')

'pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full'

In [8]:
# Load the PCA projection and keep only its first five columns (PCA1..PCA5 in
# the output below), i.e. as many columns as the UMAP projection loaded later.
data_pca = pd.read_pickle(input_filepath).iloc[:, :5]

In [9]:
data_pca.shape

(3752, 5)

In [10]:
data_pca.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5
100001_raw-Food_weight,0.805216,-0.86539,0.69948,-0.065976,0.999617
100002_raw-Energy,0.588507,-1.491772,1.75634,-3.593295,2.100607
100003_raw-Protein,1.91016,-1.873687,1.876677,-3.832557,1.240704
100004_raw-Fat,0.750799,-0.294733,1.31771,-1.346081,2.006403
100005_raw-Carbohydrate,-0.530044,-0.007398,0.611418,-3.604094,2.227872


## UMAP

In [11]:
# Switch to the UMAP results; as with PCA, the value is used below as the
# results subdirectory and as the prefix of the result file name.
INPUT_SUBSET = "umap"

In [12]:
# Same input data set identifier as the one used for the PCA projection.
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [13]:
# Options that get encoded into the name of the UMAP result file.
# NOTE(review): presumably these are the parameters UMAP was run with; confirm
# against the notebook that produced the file.
DR_OPTIONS = dict(
    n_components=5,
    metric="euclidean",
    n_neighbors=15,
    random_state=0,
)

In [14]:
# Rebuild the name of the UMAP result file from the method, the input data set
# and the options, then locate it inside the data transformations directory.
_umap_result_name = generate_result_set_name(
    DR_OPTIONS,
    prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
    suffix=".pkl",
)
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / INPUT_SUBSET / _umap_result_name
).resolve()
display(input_filepath)

# Fail early, before trying to read a file that is not there.
assert input_filepath.exists(), "Input file does not exist"

input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')

'umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0'

In [15]:
# Load the UMAP projection as stored; every column is kept (the output below
# shows five: UMAP1..UMAP5).
data_umap = pd.read_pickle(input_filepath)

In [16]:
data_umap.shape

(3752, 5)

In [17]:
data_umap.head()

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
100001_raw-Food_weight,0.426554,0.670532,7.363805,1.171837,6.297295
100002_raw-Energy,-1.605179,0.815699,8.288521,0.990394,6.817351
100003_raw-Protein,-1.656178,0.788297,8.355906,1.017072,6.845651
100004_raw-Fat,-1.508325,0.802536,8.328274,1.033939,6.709319
100005_raw-Carbohydrate,-1.617872,0.812711,8.307973,1.020575,6.825944


# Load selected best partitions

In [18]:
# File with the partitions chosen for each number of clusters (k).
input_file = (CONSENSUS_CLUSTERING_DIR / "best_partitions_by_k.pkl").resolve()
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering/best_partitions_by_k.pkl')

In [19]:
# Table indexed by k; the columns used further down are `partition` (the
# cluster assignment of every trait) and `selected` (whether to export it).
best_partitions = pd.read_pickle(input_file)

In [20]:
best_partitions.shape

(59, 4)

In [21]:
best_partitions.head()

Unnamed: 0_level_0,method,partition,ami_mean,selected
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,eac_average_coassoc_matrix,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 11, 25, 2...",0.286678,True
41,eac_average_coassoc_matrix,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 11, 25, 2...",0.286357,True
40,eac_average_coassoc_matrix,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 25, 25...",0.286107,True
48,eac_average_coassoc_matrix,"[25, 25, 25, 25, 25, 25, 25, 25, 25, 6, 25, 25...",0.286006,True
17,scc_020,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.285774,True


# Prepare data for clustree

In [22]:
# Put the PCA and UMAP coordinates side by side; the inner join keeps only the
# traits that appear in both projections.
clustrees_df = pd.concat([data_pca, data_umap], axis=1, join="inner")

In [23]:
# The join must not drop any trait, and the result must carry all the PCA
# columns followed by all the UMAP columns.
n_traits, n_pca_columns = data_pca.shape
n_umap_columns = data_umap.shape[1]

display(clustrees_df.shape)
assert clustrees_df.shape == (n_traits, n_pca_columns + n_umap_columns)

(3752, 10)

In [24]:
clustrees_df.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
100001_raw-Food_weight,0.805216,-0.86539,0.69948,-0.065976,0.999617,0.426554,0.670532,7.363805,1.171837,6.297295
100002_raw-Energy,0.588507,-1.491772,1.75634,-3.593295,2.100607,-1.605179,0.815699,8.288521,0.990394,6.817351
100003_raw-Protein,1.91016,-1.873687,1.876677,-3.832557,1.240704,-1.656178,0.788297,8.355906,1.017072,6.845651
100004_raw-Fat,0.750799,-0.294733,1.31771,-1.346081,2.006403,-1.508325,0.802536,8.328274,1.033939,6.709319
100005_raw-Carbohydrate,-0.530044,-0.007398,0.611418,-3.604094,2.227872,-1.617872,0.812711,8.307973,1.020575,6.825944


## Add partitions

In [25]:
# Every stored partition must have the same shape, with one entry per trait:
# collect the distinct partition shapes and require exactly one, equal to the
# number of rows of both projections.
# The column is iterated directly rather than looked up with `.loc[k, ...]`
# once per index value, so each partition is inspected individually even if a
# k value were ever repeated in the index.
_partition_sizes = np.unique(
    [partition.shape for partition in best_partitions["partition"]]
)
display(_partition_sizes)
assert _partition_sizes.shape[0] == 1
assert _partition_sizes[0] == data_umap.shape[0] == data_pca.shape[0]

array([3752])

In [26]:
# No cell of the partitions table may be missing.
assert best_partitions.notna().all().all()

In [27]:
# Add one column per selected partition, named "k<index value>".
# NOTE(review): the partitions look like plain arrays (they expose `.shape`),
# in which case `assign` attaches them by position; this presumes they were
# computed with the traits in the same row order as `clustrees_df` - confirm
# against the notebook that generated them.
_selected_partitions = best_partitions.loc[
    best_partitions["selected"].astype(bool), "partition"
]
clustrees_df = clustrees_df.assign(
    **{f"k{k}": partition for k, partition in _selected_partitions.items()}
)

In [28]:
# Name the index "trait" so the column gets a header in the exported file.
# `rename_axis` returns a new frame with its own index object; renaming the
# index in place could also rename the index of any other frame sharing the
# same Index object (e.g. the projections this table was built from).
clustrees_df = clustrees_df.rename_axis("trait")

In [29]:
clustrees_df.shape

(3752, 25)

In [30]:
clustrees_df.head()

Unnamed: 0_level_0,PCA1,PCA2,PCA3,PCA4,PCA5,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,...,k47,k45,k46,k44,k43,k16,k49,k12,k54,k52
trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001_raw-Food_weight,0.805216,-0.86539,0.69948,-0.065976,0.999617,0.426554,0.670532,7.363805,1.171837,6.297295,...,25,25,25,25,25,1,25,0,25,12
100002_raw-Energy,0.588507,-1.491772,1.75634,-3.593295,2.100607,-1.605179,0.815699,8.288521,0.990394,6.817351,...,25,25,25,25,25,1,25,0,25,12
100003_raw-Protein,1.91016,-1.873687,1.876677,-3.832557,1.240704,-1.656178,0.788297,8.355906,1.017072,6.845651,...,25,25,25,25,25,1,25,0,25,12
100004_raw-Fat,0.750799,-0.294733,1.31771,-1.346081,2.006403,-1.508325,0.802536,8.328274,1.033939,6.709319,...,25,25,25,25,25,1,25,0,25,12
100005_raw-Carbohydrate,-0.530044,-0.007398,0.611418,-3.604094,2.227872,-1.617872,0.812711,8.307973,1.020575,6.825944,...,25,25,25,25,25,1,25,0,25,12


In [31]:
# make sure partitions were assigned correctly
assert (
    np.unique(
        [
            clustrees_df[f"{k}"].value_counts().sum()
            for k in clustrees_df.columns[
                clustrees_df.columns.str.contains("^k[0-9]+$", regex=True)
            ]
        ]
    )[0]
    == data_pca.shape[0]
)

# Assign labels

In [32]:
# Start with every trait unlabeled (None); labels are filled in below.
trait_labels = pd.Series(dict.fromkeys(clustrees_df.index))

In [33]:
trait_labels.head()

100001_raw-Food_weight     None
100002_raw-Energy          None
100003_raw-Protein         None
100004_raw-Fat             None
100005_raw-Carbohydrate    None
dtype: object

In [34]:
# Manually curated groups of traits: label -> traits that receive that label.
# Traits not listed in any group keep their initial value (None).
trait_groups = {
    "Spirometry": [
        "20153_raw-Forced_expiratory_volume_in_1second_FEV1_predicted",
        "20150_raw-Forced_expiratory_volume_in_1second_FEV1_Best_measure",
        "20151_raw-Forced_vital_capacity_FVC_Best_measure",
        "3062_raw-Forced_vital_capacity_FVC",
        "3063_raw-Forced_expiratory_volume_in_1second_FEV1",
    ],
    "Platelet": [
        "30080_raw-Platelet_count",
        "30090_raw-Platelet_crit",
        "30100_raw-Mean_platelet_thrombocyte_volume",
        "30110_raw-Platelet_distribution_width",
        "platelet count",
    ],
    "Impedance": [
        "23106_raw-Impedance_of_whole_body",
        "23107_raw-Impedance_of_leg_right",
        "23108_raw-Impedance_of_leg_left",
        "23109_raw-Impedance_of_arm_right",
        "23110_raw-Impedance_of_arm_left",
    ],
    "Lung cancer": [
        "40001_C349-Underlying_primary_cause_of_death_ICD10_C349_Bronchus_or_lung_unspecified",
        "C3_RESPIRATORY_INTRATHORACIC-Malignant_neoplasm_of_respiratory_system_and_intrathoracic_organs",
        "C_BRONCHUS_LUNG-Malignant_neoplasm_of_bronchus_and_lung",
        "C_RESPIRATORY_INTRATHORACIC",
        "LUNG_CANCER_MESOT-Lung_cancer_and_mesothelioma",
        "lung carcinoma",
    ],
    # From https://biobank.ndph.ox.ac.uk/showcase/label.cgi?id=100014
    "Refractometry": [
        "5086_raw-Cylindrical_power_left",
        "5087_raw-Cylindrical_power_right",
        "5116_raw-3mm_cylindrical_power_right",
        "5117_raw-6mm_cylindrical_power_right",
        "5118_raw-6mm_cylindrical_power_left",
        "5119_raw-3mm_cylindrical_power_left",
    ],
    "Keratometry": [
        "5096_raw-3mm_weak_meridian_left",
        "5097_raw-6mm_weak_meridian_left",
        "5098_raw-6mm_weak_meridian_right",
        "5099_raw-3mm_weak_meridian_right",
        "5132_raw-3mm_strong_meridian_right",
        "5133_raw-6mm_strong_meridian_right",
        "5134_raw-6mm_strong_meridian_left",
        "5135_raw-3mm_strong_meridian_left",
    ],
    "Heel bone": [
        "3144_raw-Heel_Broadband_ultrasound_attenuation_direct_entry",
        "3147_raw-Heel_quantitative_ultrasound_index_QUI_direct_entry",
        "3148_raw-Heel_bone_mineral_density_BMD",
        "4101_raw-Heel_broadband_ultrasound_attenuation_left",
        "4104_raw-Heel_quantitative_ultrasound_index_QUI_direct_entry_left",
        "4105_raw-Heel_bone_mineral_density_BMD_left",
        "4106_raw-Heel_bone_mineral_density_BMD_Tscore_automated_left",
        "4120_raw-Heel_broadband_ultrasound_attenuation_right",
        "4123_raw-Heel_quantitative_ultrasound_index_QUI_direct_entry_right",
        "4124_raw-Heel_bone_mineral_density_BMD_right",
        "4125_raw-Heel_bone_mineral_density_BMD_Tscore_automated_right",
        "78_raw-Heel_bone_mineral_density_BMD_Tscore_automated",
    ],
    "ECG": [
        "22617_3319-Job_SOC_coding_Protective_service_associate_professionals_nec",
        "5983_raw-ECG_heart_rate",
        "5984_raw-ECG_load",
        "5986_raw-ECG_phase_time",
        "5992-ECG_phase_duration",
        "5993-ECG_number_of_stages_in_a_phase",
        "6020_1-Completion_status_of_test_Fully_completed",
        "6020_31-Completion_status_of_test_Participant_wanted_to_stop_early",
        "6020_33-Completion_status_of_test_Heart_rate_reached_safety_level",
        "6032_raw-Maximum_workload_during_fitness_test",
        "6033_raw-Maximum_heart_rate_during_fitness_test",
        "6038_raw-Number_of_trend_entries",
        "6039-Duration_of_fitness_test",
        "ability to walk or cycle unaided for 10 minutes, self-reported",
        "achievement of target heart rate, self-reported",
    ],
    "White blood cells": [
        "30000_raw-White_blood_cell_leukocyte_count",
        "30120_raw-Lymphocyte_count",
        "30130_raw-Monocyte_count",
        "30140_raw-Neutrophill_count",
        "30150-Eosinophill_count",
        "30180_raw-Lymphocyte_percentage",
        "30190_raw-Monocyte_percentage",
        "30200_raw-Neutrophill_percentage",
        "30210_raw-Eosinophill_percentage",
        "eosinophil count",
        "granulocyte count",
        "leukocyte count",
        "lymphocyte count",
        "monocyte count",
        "myeloid white cell count",
        "neutrophil count",
    ],
    "Red blood cells": [
        "30010_raw-Red_blood_cell_erythrocyte_count",
        "30020_raw-Haemoglobin_concentration",
        "30030_raw-Haematocrit_percentage",
        "30040_raw-Mean_corpuscular_volume",
        "30050_raw-Mean_corpuscular_haemoglobin",
        "30060_raw-Mean_corpuscular_haemoglobin_concentration",
        "30070_raw-Red_blood_cell_erythrocyte_distribution_width",
        "30240_raw-Reticulocyte_percentage",
        "30250_raw-Reticulocyte_count",
        "30260_raw-Mean_reticulocyte_volume",
        "30270_raw-Mean_sphered_cell_volume",
        "30280_raw-Immature_reticulocyte_fraction",
        "30290_raw-High_light_scatter_reticulocyte_percentage",
        "30300_raw-High_light_scatter_reticulocyte_count",
        "erythrocyte count",
        "reticulocyte count",
    ],
    "Anthropometry": [
        "3143_raw-Ankle_spacing_width",
        "20015_raw-Sitting_height",
        "21001_raw-Body_mass_index_BMI",
        "21002_raw-Weight",
        "23098_raw-Weight",
        "23099_raw-Body_fat_percentage",
        "23100_raw-Whole_body_fat_mass",
        "23101_raw-Whole_body_fatfree_mass",
        "23102_raw-Whole_body_water_mass",
        "23104_raw-Body_mass_index_BMI",
        "23105_raw-Basal_metabolic_rate",
        "23111_raw-Leg_fat_percentage_right",
        "23112_raw-Leg_fat_mass_right",
        "23113_raw-Leg_fatfree_mass_right",
        "23114_raw-Leg_predicted_mass_right",
        "23115_raw-Leg_fat_percentage_left",
        "23116_raw-Leg_fat_mass_left",
        "23117_raw-Leg_fatfree_mass_left",
        "23118_raw-Leg_predicted_mass_left",
        "23119_raw-Arm_fat_percentage_right",
        "23120_raw-Arm_fat_mass_right",
        "23121_raw-Arm_fatfree_mass_right",
        "23122_raw-Arm_predicted_mass_right",
        "23123_raw-Arm_fat_percentage_left",
        "23124_raw-Arm_fat_mass_left",
        "23125_raw-Arm_fatfree_mass_left",
        "23126_raw-Arm_predicted_mass_left",
        "23127_raw-Trunk_fat_percentage",
        "23128_raw-Trunk_fat_mass",
        "23129_raw-Trunk_fatfree_mass",
        "23130_raw-Trunk_predicted_mass",
        "48_raw-Waist_circumference",
        "49_raw-Hip_circumference",
        "50_raw-Standing_height",
        "body height",
    ],
    "Cardiovascular": [
        "20003_1140861958-Treatmentmedication_code_simvastatin",
        "20003_1140868226-Treatmentmedication_code_aspirin",
        "20003_1140879802-Treatmentmedication_code_amlodipine",
        "20003_1141194794-Treatmentmedication_code_bendroflumethiazide",
        "4079_raw-Diastolic_blood_pressure_automated_reading",
        "4080_raw-Systolic_blood_pressure_automated_reading",
        "6150_1-Vascularheart_problems_diagnosed_by_doctor_Heart_attack",
        "6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above",
        "6150_2-Vascularheart_problems_diagnosed_by_doctor_Angina",
        "6150_4-Vascularheart_problems_diagnosed_by_doctor_High_blood_pressure",
        "6153_1-Medication_for_cholesterol_blood_pressure_diabetes_or_take_exogenous_hormones_Cholesterol_lowering_medication",
        "6153_100-Medication_for_cholesterol_blood_pressure_diabetes_or_take_exogenous_hormones_None_of_the_above",
        "6153_2-Medication_for_cholesterol_blood_pressure_diabetes_or_take_exogenous_hormones_Blood_pressure_medication",
        "6154_1-Medication_for_pain_relief_constipation_heartburn_Aspirin",
        "6177_1-Medication_for_cholesterol_blood_pressure_or_diabetes_Cholesterol_lowering_medication",
        "6177_100-Medication_for_cholesterol_blood_pressure_or_diabetes_None_of_the_above",
        "6177_2-Medication_for_cholesterol_blood_pressure_or_diabetes_Blood_pressure_medication",
        "I9_CHD-Major_coronary_heart_disease_event",
        "I9_CHD_NOREV-Major_coronary_heart_disease_event_excluding_revascularizations",
        "I9_CORATHER-Coronary_atherosclerosis",
        "I9_IHD-Ischaemic_heart_disease_wide_definition",
        "I9_MI-Myocardial_infarction",
        "I9_MI_STRICT-Myocardial_infarction_strict",
        "I9_UAP-Unstable_angina_pectoris",
        "IX_CIRCULATORY-Diseases_of_the_circulatory_system",
        "acute myocardial infarction",
        "angina pectoris",
        "coronary artery disease",
        "hypercholesterolemia",
        "hypertension",
        "myocardial infarction",
    ],
    "Skin/hair": [
        "1717-Skin_colour",
        "1727-Ease_of_skin_tanning",
        "1737-Childhood_sunburn_occasions",
        "1747_1-Hair_colour_natural_before_greying_Blonde",
        "1747_2-Hair_colour_natural_before_greying_Red",
        "1747_3-Hair_colour_natural_before_greying_Light_brown",
        "1747_4-Hair_colour_natural_before_greying_Dark_brown",
        "1747_5-Hair_colour_natural_before_greying_Black",
        "2267-Use_of_sunuv_protection",
        "C3_SKIN-Malignant_neoplasm_of_skin",
        "C_MELANOMA_SKIN-Malignant_melanoma_of_skin",
        "C_OTHER_SKIN-Other_malignant_neoplasms_of_skin",
        "C_SKIN",
        "basal cell carcinoma",
        "skin neoplasm",
    ],
}

# Fail loudly if any listed trait is absent from the data. Without this check,
# assigning to a missing label through a scalar `.loc[...]` key would silently
# append a new entry to `trait_labels` instead of raising an error.
_unknown_traits = sorted(
    {trait for traits in trait_groups.values() for trait in traits}
    - set(trait_labels.index)
)
assert not _unknown_traits, f"Traits not found in the data: {_unknown_traits}"

for _label, _traits in trait_groups.items():
    trait_labels.loc[_traits] = _label

In [35]:
trait_labels

100001_raw-Food_weight                           None
100002_raw-Energy                                None
100003_raw-Protein                               None
100004_raw-Fat                                   None
100005_raw-Carbohydrate                          None
                                                 ... 
wellbeing measurement AND family relationship    None
wheezing                                         None
whooping cough                                   None
worry measurement                                None
wrist fracture                                   None
Length: 3752, dtype: object

In [36]:
# Attach the labels as a new `labels` column; `trait_labels` is a Series, so
# pandas matches its values to the rows by index (trait name).
clustrees_df = clustrees_df.assign(labels=trait_labels)

# Save

In [37]:
# Destination of the table that the clustering tree visualization will read.
output_file = (CONSENSUS_CLUSTERING_DIR / "clustering_tree_data.tsv").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering/clustering_tree_data.tsv')

In [38]:
# Write the table as tab-separated text, index (trait) included. Missing
# labels come out as empty fields (pandas' default `na_rep` is "").
clustrees_df.to_csv(output_file, sep="\t")