# Description

This notebook implements one of the two strategies used to generate a null distribution for the clustering results.
This strategy is referred to as "Null #2" in the manuscript.
See notebook `../00-shuffle_genes/00_00-shuffle_genes.ipynb` for introductory details.

This strategy shuffles the latent space instead of the input data. For this, it projects the input matrix **M** (genes x traits) into the latent space, and then it shuffles LVs for each trait in the projected matrix. Finally, this shuffled projected matrix is used in the clustering pipeline (rest of the notebooks in this folder) to get the results.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from IPython.display import display
import numpy as np
from scipy import stats
import pandas as pd
import pytest

import conf

# Settings

In [3]:
# Root folder for the outputs of this null strategy (LV shuffling).
NULL_DIR = conf.RESULTS["CLUSTERING_NULL_DIR"] / "shuffle_lvs"

In [4]:
# Folder holding the projections; the input of this notebook is read from it
# further below.
RESULTS_PROJ_OUTPUT_DIR = Path(conf.RESULTS["PROJECTIONS_DIR"])
# Created if missing (no error when it already exists).
RESULTS_PROJ_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_PROJ_OUTPUT_DIR)

PosixPath('/opt/data/results/projections')

In [5]:
# Shuffled projections are written under <NULL_DIR>/projections; the folder
# (and any missing parent) is created up front.
OUTPUT_DIR = (Path(NULL_DIR) / "projections").resolve()
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(OUTPUT_DIR)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/projections')

In [6]:
# Seeded random state passed to the shuffling step so that it is reproducible.
rs = np.random.RandomState(0)

# Load projection of S-MultiXcan into LV space

In [7]:
# Path to the S-MultiXcan results file; its stem is used below to derive the
# name of the projection file.
smultixcan_results_filename = conf.PHENOMEXCAN[
    "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
]

display(smultixcan_results_filename)
assert smultixcan_results_filename.exists()

PosixPath('/opt/data/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

In [8]:
# File name without extension; reused for both the input and the output
# projection file names.
results_filename_stem = smultixcan_results_filename.stem
display(results_filename_stem)

'smultixcan-efo_partial-mashr-zscores'

In [9]:
# Projection file (already in LV space) that is loaded and shuffled below.
input_file = Path(
    RESULTS_PROJ_OUTPUT_DIR, f"projection-{results_filename_stem}.pkl"
).resolve()

display(input_file)
# Fail early, with the offending path in the message, if the projection is
# missing; this is the file actually read by this notebook.
assert input_file.exists(), f"Projection file not found: {input_file}"

PosixPath('/opt/data/results/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [10]:
# Projected matrix: LVs as rows and traits as columns (see the preview below).
projected_data = pd.read_pickle(input_file)

In [11]:
# (number of LVs, number of traits)
projected_data.shape

(987, 3752)

In [12]:
# Preview of the first LVs (rows) before shuffling.
projected_data.head()

Unnamed: 0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
LV1,-0.021292,-0.046815,-0.021585,-0.030324,-0.017773,-0.007844,-0.015529,-0.011609,-0.051342,-0.036813,...,-0.001614,0.002102,-0.005318,-0.003041,0.012873,-0.015951,-0.018904,0.009576,0.019755,0.010027
LV2,0.056061,-0.009864,-0.028888,-0.053573,0.006951,-0.054597,-0.030748,0.014519,-0.026933,0.009024,...,-0.029614,-0.033636,0.020768,0.010656,-0.013305,0.050738,0.012085,-0.032372,-0.02454,-0.002616
LV3,0.00165,-0.004262,0.019211,0.007484,0.004548,0.006548,0.006741,-0.005634,-0.022129,-0.017256,...,0.010539,-0.019591,-0.013391,-0.044369,0.020142,0.006052,0.020869,0.043931,0.058049,-0.013144
LV4,0.026265,-0.007235,0.004304,-0.042464,-0.001093,-0.041208,-0.051179,0.00906,-0.003999,-0.005874,...,-0.018162,0.012136,-0.046232,-0.030949,-0.040147,0.044545,0.009865,-0.049828,-0.013725,0.024988
LV5,-0.015616,0.010799,0.017792,0.010591,0.001972,0.040024,-0.027612,-0.00154,0.033072,0.009477,...,0.030632,-0.023253,0.015853,-0.002222,0.019385,-0.015501,-0.031477,-0.002554,-0.03262,0.017731


# Shuffle projected data

In [13]:
def _shuffle_lvs(trait_column):
    """Return the values of one trait (column) in a random order.

    The permuted values are returned as a plain array on purpose: returning
    the sampled Series would let pandas realign it on the LV labels and put
    every value back in its original row.
    """
    permuted = trait_column.sample(frac=1, random_state=rs)
    return permuted.to_numpy()


# Each trait (column) is permuted independently; the shared random state `rs`
# advances between columns, so every trait gets a different permutation.
shuffled_projected_data = projected_data.apply(_shuffle_lvs)

In [14]:
# Sanity checks: shuffling must keep the matrix layout (same LVs as rows, same
# traits as columns) and, within each trait, only reorder the values.
assert shuffled_projected_data.shape == projected_data.shape
assert shuffled_projected_data.index.equals(projected_data.index)
assert shuffled_projected_data.columns.equals(projected_data.columns)
# Sorting every column removes the ordering, so both matrices must then match.
np.testing.assert_array_equal(
    np.sort(shuffled_projected_data.to_numpy(), axis=0),
    np.sort(projected_data.to_numpy(), axis=0),
)

shuffled_projected_data.shape

(987, 3752)

In [15]:
# Preview of the first rows after shuffling; row labels are unchanged but the
# values in each column have been permuted.
shuffled_projected_data.head()

Unnamed: 0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
LV1,-0.073679,-0.015465,-0.008602,-0.012982,-0.045931,-0.040631,0.001114,-0.013283,-0.051335,-0.006559,...,0.017411,-0.014492,-0.00525,0.028588,0.022616,-0.009072,-0.012076,-0.016664,-0.001391,0.003459
LV2,-0.052828,0.016444,-0.009284,-0.00228,-0.032513,-0.007894,0.017441,-0.031007,-0.041056,-0.029158,...,-0.044056,-0.015335,-0.011027,-0.033919,0.024128,-0.000428,0.014698,-0.031238,0.008222,-0.011381
LV3,0.041971,-0.023914,-0.042691,-0.010464,0.016177,0.01419,0.013297,0.015347,-0.011235,-0.015889,...,-0.027326,-0.015616,0.033039,-0.026169,-0.036931,0.008826,0.049195,0.003056,0.022983,-0.003958
LV4,0.004956,-0.002509,0.037642,-0.037028,0.01015,-0.002645,-0.038279,0.006125,0.017921,-0.03442,...,0.006024,-0.016856,0.013144,0.012766,-0.040705,0.002743,0.017473,-0.039891,-0.02596,0.017731
LV5,-0.007681,-0.00076,0.010101,-0.008053,-0.038093,-0.064487,0.019368,0.025036,-0.02485,-0.036107,...,0.01051,0.000881,-0.003383,-0.029238,0.043541,0.016166,-0.056732,0.010405,0.02995,0.002559


## Testing

In [16]:
# Baseline: the LV1 row correlated with itself must give r = 1, in the original
# matrix first and then in the shuffled one.
for _frame in (projected_data, shuffled_projected_data):
    _lv1 = _frame.loc["LV1"]
    _r = stats.pearsonr(_lv1, _lv1)[0]
    assert _r == pytest.approx(1.0)

In [17]:
# Across traits, the LV1 row of the shuffled matrix should be uncorrelated with
# the LV1 row of the original matrix.
_tmp = stats.pearsonr(shuffled_projected_data.loc["LV1"], projected_data.loc["LV1"])
display(_tmp)
# Only the correlation coefficient (first element) is checked; it must be
# within 0.01 of zero.
assert _tmp[0] == pytest.approx(0.0, rel=0, abs=0.01)

(-0.007643915919422077, 0.6397359812853393)

In [18]:
# Baseline: a trait column correlated with itself must give r = 1, in the
# original matrix first and then in the shuffled one.
for _frame in (projected_data, shuffled_projected_data):
    _trait = _frame["100001_raw-Food_weight"]
    _r = stats.pearsonr(_trait, _trait)[0]
    assert _r == pytest.approx(1.0)

In [19]:
# Across LVs, the shuffled values of a trait should be uncorrelated with the
# original values of the same trait.
_tmp = stats.pearsonr(
    shuffled_projected_data["100001_raw-Food_weight"],
    projected_data["100001_raw-Food_weight"],
)
display(_tmp)
# The property under test is "no correlation", so the expected value is zero
# rather than the coefficient observed with one particular seed. With roughly
# 1,000 LVs the standard error of r is about 1/sqrt(n) ~= 0.03, hence a
# tolerance of about three standard errors around zero.
assert _tmp[0] == pytest.approx(0.0, rel=0, abs=0.1)

(0.021171323526471955, 0.5064588516293915)

# Quick analysis

Ensure that shuffling broke known relationships between LVs and traits.

In [20]:
# Traits with the 20 largest values on LV603 in the shuffled matrix.
(
    shuffled_projected_data.loc["LV603"]
    .sort_values(ascending=False)
    .iloc[:20]
)

inflammatory bowel disease                                                                      0.195017
family history of breast cancer                                                                 0.142886
K11_COELIAC-Coeliac_disease                                                                     0.131584
6177_1-Medication_for_cholesterol_blood_pressure_or_diabetes_Cholesterol_lowering_medication    0.115584
30210_raw-Eosinophill_percentage                                                                0.105227
6158_3-Why_reduced_smoking_Health_precaution                                                    0.104209
3148_raw-Heel_bone_mineral_density_BMD                                                          0.100769
throat disease                                                                                  0.099966
49_raw-Hip_circumference                                                                        0.098809
20003_1140865416-Treatmentmedication_code_colpermin_02m

In [21]:
# Traits with the 20 largest values on LV136 in the shuffled matrix.
(
    shuffled_projected_data.loc["LV136"]
    .sort_values(ascending=False)
    .iloc[:20]
)

22601_41313093-Job_coding_ward_clerk_medical_records_clerk                                                                  0.141402
30070_raw-Red_blood_cell_erythrocyte_distribution_width                                                                     0.129152
rheumatoid arthritis                                                                                                        0.113252
fever of unknown origin                                                                                                     0.111836
K11_APPENDOTH-Other_appendicitis                                                                                            0.111691
5496-Leg_pain_when_walking_ever_disappears_while_walking                                                                    0.105609
22601_24213281-Job_coding_charteredcertified_accountant_auditor_company_accountant_articledaudit_clerk_official_receiver    0.103579
30050_raw-Mean_corpuscular_haemoglobin                               

In [22]:
# Traits with the 20 largest values on LV844 in the shuffled matrix.
(
    shuffled_projected_data.loc["LV844"]
    .sort_values(ascending=False)
    .iloc[:20]
)

50_raw-Standing_height                                                                                                               0.160837
22601_52442947-Job_coding_television_or_radio_engineer_video_engineer                                                                0.153703
22601_31152695-Job_coding_quality_assurance_techniciancoordinator                                                                    0.135301
monocyte count                                                                                                                       0.126518
30250_raw-Reticulocyte_count                                                                                                         0.107709
20095_1-Size_of_white_wine_glass_drunk_small_125ml                                                                                   0.104462
22617_9244-Job_SOC_coding_School_midday_assistants                                                                                   0.100867
6140_2

In [23]:
# Traits with the 20 largest values on LV246 in the shuffled matrix.
(
    shuffled_projected_data.loc["LV246"]
    .sort_values(ascending=False)
    .iloc[:20]
)

hypothyroidism AND myxedema                                                                                                   0.309303
30100_raw-Mean_platelet_thrombocyte_volume                                                                                    0.215921
2986-Started_insulin_within_one_year_diagnosis_of_diabetes                                                                    0.158714
sum of neutrophil and eosinophil counts                                                                                       0.151475
20003_1140882728-Treatmentmedication_code_otomize_ear_spray                                                                   0.128635
22660_102-Gap_coding_Unpaid_or_voluntary_work                                                                                 0.120179
20003_1140876146-Treatmentmedication_code_rhinocort_50micrograms_nasal_spray                                                  0.114425
erythrocyte count                                      

# Save

In [24]:
# The shuffled projection keeps the file name of the input projection, but is
# stored under the output folder of this null strategy.
output_filename = f"projection-{results_filename_stem}.pkl"
output_file = (Path(OUTPUT_DIR) / output_filename).resolve()

display(output_file)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [25]:
# Write the shuffled projection to disk as a pickle file at `output_file`.
shuffled_projected_data.to_pickle(output_file)