# Description

This notebook checks which traits move between partitions and clusters across the clustering tree.

# Modules loading

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display
from pathlib import Path

import pandas as pd

from data.cache import read_data
from utils import generate_result_set_name
import conf

# Settings

# Load data

## S-MultiXcan projection (`z_score_std`)

In [4]:
# Name of the data subset; used both as the subdirectory and as the
# file-name prefix when the input path is built.
INPUT_SUBSET = "z_score_std"

In [5]:
# Stem of the input file name; it follows the subset prefix in the file name.
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [6]:
# Absolute path of the input file: <transformations dir>/<subset>/<subset>-<stem>.pkl
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
).resolve()
display(input_filepath)

# Fail early if the file is missing.
assert input_filepath.exists(), "Input file does not exist"

# File name without its final suffix.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/data_transformations/z_score_std/z_score_std-projection-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-projection-smultixcan-efo_partial-mashr-zscores'

In [7]:
# Load the input file through the project's read_data helper.
# NOTE(review): the input is a .pkl file; presumably read_data unpickles it,
# so only load files from a trusted source.
data = read_data(input_filepath)

In [8]:
# Dimensions of the loaded data as (rows, columns).
data.shape

(3752, 987)

In [9]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.695006,1.962565,0.057683,0.878731,-0.539977,1.481272,-0.396422,1.09018,0.759223,0.931395,...,1.129784,1.752343,-1.411403,2.823863,0.931116,-1.054519,0.432982,-0.633597,0.554279,-0.642479
100002_raw-Energy,-1.528127,-0.345309,-0.148953,-0.24206,0.373427,0.791092,0.263477,0.987702,0.354391,1.416059,...,0.224604,0.769882,-0.509482,0.091153,2.286789,-1.008256,-0.029764,1.737229,-0.272107,-0.526125
100003_raw-Protein,-0.704572,-1.011299,0.67142,0.143991,0.615212,0.874212,-0.040998,0.91517,0.254369,-0.084237,...,1.003019,1.044314,-2.376108,0.004778,0.053714,-0.892447,-0.1838,1.377991,-0.278794,-0.419733
100004_raw-Fat,-0.989832,-1.87549,0.261555,-1.420719,0.366238,1.167049,0.257387,0.717674,-0.997664,0.969825,...,0.585913,0.638314,0.119139,-0.140204,1.394326,-1.173402,0.555058,1.013982,-0.544506,-0.064061
100005_raw-Carbohydrate,-0.580143,0.243335,0.158966,-0.036558,0.068176,-0.202639,1.101281,0.675227,1.463432,1.010078,...,-0.249108,-0.026814,0.232713,0.323682,1.168642,-0.282935,0.653105,1.909526,0.199997,-1.656894


## Clustering results

In [10]:
# Absolute path of the "consensus_clustering" folder inside the clustering
# results directory.
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()
display(CONSENSUS_CLUSTERING_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/clustering/consensus_clustering')

In [11]:
# Absolute path of the file with the best partitions by k.
input_file = (CONSENSUS_CLUSTERING_DIR / "best_partitions_by_k.pkl").resolve()
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/clustering/consensus_clustering/best_partitions_by_k.pkl')

In [12]:
# Load the table of best partitions through the project's read_data helper.
# NOTE(review): the input is a .pkl file; presumably read_data unpickles it,
# so only load files from a trusted source.
best_partitions = read_data(input_file)

In [13]:
# Keep only the rows whose "selected" flag is true; the name is rebound, so
# the unfiltered table is no longer available after this cell.
best_partitions = best_partitions.loc[best_partitions["selected"]]

In [14]:
# (rows, columns) of the table after keeping selected partitions only.
best_partitions.shape

(15, 4)

In [15]:
# Preview the first selected partitions.
best_partitions.head()

Unnamed: 0_level_0,method,partition,ari_median,selected
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.090117,True
22,scc_020,"[13, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18, 1...",0.0901,True
13,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.08992,True
12,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089894,True
11,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.089616,True


# Within the "complex" branch

## part15 k14 to part16 k14

In [16]:
# Partitions stored under k=15 and k=16 in the best-partitions table.
part_one = best_partitions["partition"].loc[15]
part_two = best_partitions["partition"].loc[16]

In [17]:
# Row labels of `data` whose label is 14 in each partition.
# Assumes each partition is aligned row-by-row with `data` — TODO confirm.
part_one_clus = data.index[part_one == 14]
part_two_clus = data.index[part_two == 14]

In [18]:
# Members of cluster 14 in the k=15 partition.
part_one_clus

Index(['1021-Duration_of_light_DIY', '102_raw-Pulse_rate_automated_reading',
       '1031-Frequency_of_friendfamily_visits',
       '1050-Time_spend_outdoors_in_summer',
       '1060-Time_spent_outdoors_in_winter',
       '1070-Time_spent_watching_television_TV',
       '1080-Time_spent_using_computer', '1090-Time_spent_driving',
       '1100-Drive_faster_than_motorway_speed_limit',
       '110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance',
       ...
       'self reported educational attainment', 'sleep duration',
       'smoking behavior', 'smoking cessation', 'snoring measurement',
       'tiredness measurement', 'type i diabetes mellitus', 'varicose veins',
       'wheezing', 'worry measurement'],
      dtype='object', length=326)

In [19]:
# Members of cluster 14 in the k=16 partition.
part_two_clus

Index(['20003_1140883066-Treatmentmedication_code_insulin_product',
       '20003_1141191044-Treatmentmedication_code_levothyroxine_sodium',
       '2986-Started_insulin_within_one_year_diagnosis_of_diabetes',
       '6144_3-Never_eat_eggs_dairy_wheat_sugar_Wheat_products',
       'hyperthyroidism AND thyrotoxicosis', 'hypothyroidism AND myxedema',
       'psoriasis'],
      dtype='object')

In [20]:
# Members common to both clusters.
part_one_clus.intersection(part_two_clus)

Index(['20003_1140883066-Treatmentmedication_code_insulin_product', '2986-Started_insulin_within_one_year_diagnosis_of_diabetes'], dtype='object')

## part25 k24 to part26 k15

In [21]:
# Partitions stored under k=25 and k=26 in the best-partitions table.
part_one = best_partitions["partition"].loc[25]
part_two = best_partitions["partition"].loc[26]

In [22]:
# Row labels of `data` labelled 24 in the first partition and 15 in the second.
# Assumes each partition is aligned row-by-row with `data` — TODO confirm.
part_one_clus = data.index[part_one == 24]
part_two_clus = data.index[part_two == 15]

In [23]:
# Members of cluster 24 in the k=25 partition.
part_one_clus

Index(['1021-Duration_of_light_DIY', '102_raw-Pulse_rate_automated_reading',
       '1031-Frequency_of_friendfamily_visits',
       '1050-Time_spend_outdoors_in_summer',
       '1060-Time_spent_outdoors_in_winter',
       '1070-Time_spent_watching_television_TV',
       '1080-Time_spent_using_computer', '1090-Time_spent_driving',
       '1100-Drive_faster_than_motorway_speed_limit',
       '110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance',
       ...
       'self reported educational attainment', 'sleep duration',
       'smoking behavior', 'smoking cessation', 'snoring measurement',
       'tiredness measurement', 'type i diabetes mellitus', 'varicose veins',
       'wheezing', 'worry measurement'],
      dtype='object', length=154)

In [24]:
# Members of cluster 15 in the k=26 partition.
part_two_clus

Index(['20003_1140883066-Treatmentmedication_code_insulin_product',
       '20003_1141191044-Treatmentmedication_code_levothyroxine_sodium',
       '2976_raw-Age_diabetes_diagnosed',
       '2986-Started_insulin_within_one_year_diagnosis_of_diabetes',
       '6144_3-Never_eat_eggs_dairy_wheat_sugar_Wheat_products',
       '6153_3-Medication_for_cholesterol_blood_pressure_diabetes_or_take_exogenous_hormones_Insulin',
       '6177_3-Medication_for_cholesterol_blood_pressure_or_diabetes_Insulin',
       'hyperthyroidism AND thyrotoxicosis', 'hypothyroidism AND myxedema',
       'psoriasis'],
      dtype='object')

In [25]:
# Members common to both clusters.
part_one_clus.intersection(part_two_clus)

Index(['2976_raw-Age_diabetes_diagnosed',
       '6153_3-Medication_for_cholesterol_blood_pressure_diabetes_or_take_exogenous_hormones_Insulin',
       '6177_3-Medication_for_cholesterol_blood_pressure_or_diabetes_Insulin'],
      dtype='object')

## part26 k13 to part29 k21

In [26]:
# Partitions stored under k=26 and k=29 in the best-partitions table.
part_one = best_partitions["partition"].loc[26]
part_two = best_partitions["partition"].loc[29]

In [27]:
# Row labels of `data` labelled 13 in the first partition and 21 in the second.
# Assumes each partition is aligned row-by-row with `data` — TODO confirm.
part_one_clus = data.index[part_one == 13]
part_two_clus = data.index[part_two == 21]

In [28]:
# Members of cluster 13 in the k=26 partition.
part_one_clus

Index(['EAGLE_Eczema', 'Jones_et_al_2016_Chronotype', 'MAGNETIC_CH2.DB.ratio',
       'MAGNETIC_HDL.C', 'MAGNETIC_IDL.TG', 'MAGNETIC_LDL.C',
       'alzheimer's disease', 'attention deficit hyperactivity disorder',
       'coronary artery disease', 'estrogen-receptor negative breast cancer',
       'estrogen-receptor positive breast cancer',
       'family history of breast cancer', 'fasting blood glucose measurement',
       'fasting blood insulin measurement', 'inflammatory bowel disease'],
      dtype='object')

In [29]:
# Members of cluster 21 in the k=29 partition.
part_two_clus

Index(['EAGLE_Eczema', 'SSGAC_Education_Years_Pooled',
       'attention deficit hyperactivity disorder', 'bone density',
       'inflammatory bowel disease', 'intracranial volume measurement',
       'schizophrenia'],
      dtype='object')

In [30]:
# Members common to both clusters.
part_one_clus.intersection(part_two_clus)

Index(['EAGLE_Eczema', 'attention deficit hyperactivity disorder',
       'inflammatory bowel disease'],
      dtype='object')

# Between branches

## part22 k13 to part25 k23

In [31]:
# Partitions stored under k=22 and k=25 in the best-partitions table.
part_one = best_partitions["partition"].loc[22]
part_two = best_partitions["partition"].loc[25]

In [32]:
# Row labels of `data` labelled 13 in the first partition and 23 in the second.
# Assumes each partition is aligned row-by-row with `data` — TODO confirm.
part_one_clus = data.index[part_one == 13]
part_two_clus = data.index[part_two == 23]

In [33]:
# Members of cluster 13 in the k=22 partition.
part_one_clus

Index(['100001_raw-Food_weight', '100010-Portion_size',
       '100013_raw-Vitamin_B12', '100015_raw-Vitamin_C', '100018_raw-Retinol',
       '100019_raw-Carotene', '100021_raw-Vitamin_D', '100022-Alcohol',
       '100023_raw-Starch', '1001-Duration_of_strenuous_sports',
       ...
       'viral human hepatitis infection', 'viral meningitis',
       'visual impairment', 'vitiligo', 'vitreous body disease',
       'vocal cord polyp', 'voice disorders',
       'wellbeing measurement AND family relationship', 'whooping cough',
       'wrist fracture'],
      dtype='object', length=3319)

In [34]:
# Members of cluster 23 in the k=25 partition.
part_two_clus

Index(['41248_1000-Destinations_on_discharge_from_hospital_recoded_Usual_Place_of_residence',
       '4194_raw-Pulse_rate', '4260-Round_of_numeric_memory_test',
       '4282-Maximum_digits_remembered_correctly',
       '4283-Number_of_rounds_of_numeric_memory_test_performed',
       '4290_raw-Duration_screen_displayed', '4291-Number_of_attempts',
       '46_raw-Hand_grip_strength_left', '47_raw-Hand_grip_strength_right',
       '4825-Noisy_workplace',
       ...
       '864-Number_of_daysweek_walked_10_minutes', '874_raw-Duration_of_walks',
       '884-Number_of_daysweek_of_moderate_physical_activity_10_minutes',
       '894-Duration_of_moderate_activity',
       '904-Number_of_daysweek_of_vigorous_physical_activity_10_minutes',
       '924-Usual_walking_pace',
       '943-Frequency_of_stair_climbing_in_last_4_weeks',
       '971-Frequency_of_walking_for_pleasure_in_last_4_weeks',
       'ICDMAIN_ANY_ENTRY-Any_ICDMAIN_event_in_hilmo_or_causes_of_death',
       'episodic memory'],
     

In [35]:
# Members common to both clusters.
part_one_clus.intersection(part_two_clus)

Index(['41248_1000-Destinations_on_discharge_from_hospital_recoded_Usual_Place_of_residence',
       '4290_raw-Duration_screen_displayed',
       '5262_raw-Intraocular_pressure_cornealcompensated_left',
       '5263_raw-Intraocular_pressure_Goldmanncorrelated_left',
       '6138_4-Qualifications_CSEs_or_equivalent',
       '6141_1-How_are_people_in_household_related_to_participant_Husband_wife_or_partner',
       '6143_3-Transport_type_for_commuting_to_job_workplace_Public_transport',
       '6146_1-Attendancedisabilitymobility_allowance_Attendance_allowance',
       '6147_1-Reason_for_glassescontact_lenses_For_shortsightedness_ie_only_or_mainly_for_distance_viewing_such_as_driving_cinema_etc_called_myopia',
       '6148_100-Eye_problemsdisorders_None_of_the_above',
       '6152_5-Blood_clot_DVT_bronchitis_emphysema_asthma_rhinitis_eczema_allergy_diagnosed_by_doctor_Blood_clot_in_the_leg_DVT',
       '6154_6-Medication_for_pain_relief_constipation_heartburn_Laxatives_eg_Dulcolax_Senoko

## part26 k0 to part29 k21

In [36]:
# Partitions stored under k=26 and k=29 in the best-partitions table.
part_one = best_partitions["partition"].loc[26]
part_two = best_partitions["partition"].loc[29]

In [37]:
# Row labels of `data` labelled 0 in the first partition and 21 in the second.
# Assumes each partition is aligned row-by-row with `data` — TODO confirm.
part_one_clus = data.index[part_one == 0]
part_two_clus = data.index[part_two == 21]

In [38]:
# Members of cluster 0 in the k=26 partition.
part_one_clus

Index(['100001_raw-Food_weight', '100010-Portion_size',
       '100013_raw-Vitamin_B12', '100015_raw-Vitamin_C', '100018_raw-Retinol',
       '100019_raw-Carotene', '100021_raw-Vitamin_D', '100022-Alcohol',
       '100023_raw-Starch', '1001-Duration_of_strenuous_sports',
       ...
       'viral human hepatitis infection', 'viral meningitis',
       'visual impairment', 'vitiligo', 'vitreous body disease',
       'vocal cord polyp', 'voice disorders',
       'wellbeing measurement AND family relationship', 'whooping cough',
       'wrist fracture'],
      dtype='object', length=3318)

In [39]:
# Members of cluster 21 in the k=29 partition.
part_two_clus

Index(['EAGLE_Eczema', 'SSGAC_Education_Years_Pooled',
       'attention deficit hyperactivity disorder', 'bone density',
       'inflammatory bowel disease', 'intracranial volume measurement',
       'schizophrenia'],
      dtype='object')

In [40]:
# Members common to both clusters.
part_one_clus.intersection(part_two_clus)

Index(['bone density'], dtype='object')