# Description

This notebook implements one of the two strategies used to generate a null distribution for the clustering results.
This strategy is referred to as "Null #1" in the manuscript. It takes the input data for clustering (matrix **M** of gene-trait associations, with p-values converted to z-scores, for about 22,500 genes and 3,750 traits), standardizes it, and then shuffles the gene values independently within each trait. This shuffled version of matrix **M** is then projected into the latent space. Finally, the projected matrix is used as input to the clustering pipeline (the rest of the notebooks in this folder) to obtain the null results.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from IPython.display import display
import numpy as np
from scipy import stats
import pandas as pd
import pytest

import conf
from entity import Gene

from multiplier import MultiplierProjection

# Settings

In [3]:
NULL_DIR = conf.RESULTS["CLUSTERING_NULL_DIR"] / "shuffle_genes"

In [4]:
# Folder that receives the projection of the shuffled (null) data; it is created
# here so that the directory already exists when the result is saved.
RESULTS_PROJ_OUTPUT_DIR = (Path(NULL_DIR) / "projections").resolve()
RESULTS_PROJ_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

display(RESULTS_PROJ_OUTPUT_DIR)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/projections')

In [5]:
# Fixed-seed random state: it is handed to the per-trait shuffle so that the
# generated null data is reproducible on a fresh run.
rs = np.random.RandomState(seed=0)

# Load PhenomeXcan data (S-MultiXcan)

In [6]:
smultixcan_results_filename = conf.PHENOMEXCAN[
    "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
]

display(smultixcan_results_filename)

PosixPath('/opt/data/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

In [7]:
results_filename_stem = smultixcan_results_filename.stem
display(results_filename_stem)

'smultixcan-efo_partial-mashr-zscores'

In [8]:
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source (here it is a path taken from the project configuration).
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

In [9]:
smultixcan_results.shape

(22515, 3752)

In [10]:
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
ENSG00000000457,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
ENSG00000000460,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
ENSG00000000938,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
ENSG00000000971,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


## Gene IDs to Gene names

In [11]:
# Replace the index labels (ENSG gene IDs) with gene names. `rename` leaves labels
# that are missing from the mapping untouched, and the renamed index can contain
# repeated gene names (they are listed and removed in the next section).
smultixcan_results = smultixcan_results.rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [12]:
smultixcan_results.shape

(22515, 3752)

In [13]:
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


## Remove duplicated gene entries

In [14]:
smultixcan_results.index[smultixcan_results.index.duplicated(keep="first")]

Index(['SPATA13', 'LINC01422', 'LINC00484', 'MAL2', 'GOLGA8M', 'LINC01115',
       'LYNX1'],
      dtype='object', name='gene_name')

In [15]:
# Keep only the first row of every gene name that appears more than once.
_first_occurrence = ~smultixcan_results.index.duplicated(keep="first")
smultixcan_results = smultixcan_results.loc[_first_occurrence]

In [16]:
smultixcan_results.shape

(22508, 3752)

## Some checks

In [17]:
# sanity check: not a single gene-trait entry may be missing
assert not smultixcan_results.isna().to_numpy().any()

# Standardize S-MultiXcan results

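Here we adjust for highly polygenic traits (see notebook `005_00-data_analysis.ipynb`): we penalize those traits that have large effect sizes across several genes, such as anthropometric traits. To do so, the values of each trait (column) are divided by the sum of that trait's values across all genes.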

In [18]:
def _divide_by_column_total(trait_values):
    """Return the values of one trait (column) divided by their sum."""
    return trait_values / trait_values.sum()


_tmp = smultixcan_results.apply(_divide_by_column_total)

In [19]:
_tmp.shape

(22508, 3752)

In [20]:
assert _tmp.shape == smultixcan_results.shape

In [21]:
# some testing: spot-check a few (gene, trait) entries of the standardized matrix
# against a manual computation (original value divided by the trait's total)
_checked_pairs = (
    ("SCYL3", "body height"),
    ("DPM1", "100001_raw-Food_weight"),
    ("CFH", "estrogen-receptor negative breast cancer"),
    ("C1orf112", "asthma"),
)

for _gene, _trait in _checked_pairs:
    _expected = (
        smultixcan_results.loc[_gene, _trait] / smultixcan_results[_trait].sum()
    )
    assert _tmp.loc[_gene, _trait] == _expected

In [22]:
smultixcan_results = _tmp

In [23]:
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,6e-05,3.9e-05,5e-06,1.6e-05,6.1e-05,7.5e-05,4e-06,4.254814e-05,3.1e-05,1.7e-05,...,2e-05,7.5e-05,6.4e-05,4.7e-05,6.6e-05,6.6e-05,6.9e-05,5.8e-05,6.250232e-05,3.4e-05
SCYL3,3.3e-05,5.5e-05,0.000119,4.1e-05,5e-05,1.1e-05,5.9e-05,4.103114e-05,2e-05,3.6e-05,...,0.000118,7e-06,3e-05,7e-06,2.9e-05,0.000126,2.7e-05,0.000114,1.421837e-05,7e-05
C1orf112,2.7e-05,2.2e-05,6.7e-05,2.3e-05,2.2e-05,1.4e-05,6.7e-05,4.432186e-05,3e-05,4.1e-05,...,9.8e-05,5.5e-05,3e-05,2.3e-05,8.4e-05,6.8e-05,1.8e-05,2.4e-05,9.338414e-06,5.6e-05
FGR,1.5e-05,1.4e-05,4.7e-05,1.9e-05,3e-06,1e-05,8e-06,5.350929e-07,1.9e-05,2e-05,...,3.6e-05,0.000114,0.000152,6e-06,5.7e-05,3.8e-05,3.5e-05,5e-05,1.969101e-05,5.8e-05
CFH,2.9e-05,2.1e-05,3.9e-05,6.3e-05,1.7e-05,1.8e-05,8.8e-05,1.668206e-05,1.7e-05,9.5e-05,...,1.4e-05,3.6e-05,6e-05,3.8e-05,8.3e-05,1.5e-05,5e-06,8.1e-05,8.895497e-11,3.6e-05


# Shuffle genes within traits

In [24]:
# Shuffle every trait (column) independently: `apply` calls the lambda once per
# column and each call draws from the shared `rs` state, so re-running this cell
# without re-creating `rs` produces a different shuffle.
# `.to_numpy()` discards the shuffled index labels; presumably this keeps pandas
# from aligning the values back to their original genes when the result is
# assembled -- verify before removing it.
shuffled_smultixcan_results = smultixcan_results.apply(
    lambda x: x.sample(frac=1, random_state=rs).to_numpy()
)

In [25]:
shuffled_smultixcan_results.shape

(22508, 3752)

In [26]:
shuffled_smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,5.1e-05,3.1e-05,6.5e-05,0.000138,4.1e-05,5e-06,2e-06,3.3e-05,2.2e-05,3.3e-05,...,0.000111,1.2e-05,2.9e-05,3.9e-05,4.9e-05,6.1e-05,1.4e-05,6.922541e-05,5.6e-05,5e-05
SCYL3,6e-05,6.5e-05,4.5e-05,3.9e-05,3e-05,5.6e-05,0.0001,5.2e-05,6.7e-05,5.4e-05,...,5.7e-05,5.3e-05,0.000107,6e-06,3.5e-05,4.4e-05,8.3e-05,3.265571e-07,4e-06,4.8e-05
C1orf112,6.5e-05,4e-05,4e-06,0.000119,4e-06,2.8e-05,3.4e-05,1.5e-05,2.2e-05,1.4e-05,...,8e-06,8e-06,8.3e-05,4.1e-05,3.8e-05,2e-06,6.4e-05,9.720101e-05,2.3e-05,0.000101
FGR,8e-06,4.5e-05,2.1e-05,5e-06,2e-05,3.6e-05,3e-06,2.7e-05,1.6e-05,4.9e-05,...,5.5e-05,6.1e-05,1e-05,6.6e-05,5.3e-05,2.5e-05,4.8e-05,2.945239e-05,7.8e-05,3.6e-05
CFH,3.9e-05,3e-05,9e-06,3e-06,2.3e-05,1.1e-05,0.000132,5.5e-05,4.2e-05,3.2e-05,...,2e-05,3e-06,6.7e-05,7.8e-05,7e-06,0.000126,5e-06,3.174253e-05,3.9e-05,8.3e-05


## Testing

In [27]:
# a gene's row must be perfectly correlated with itself, both in the original and
# in the shuffled matrix
for _matrix in (smultixcan_results, shuffled_smultixcan_results):
    _gene_values = _matrix.loc["DPM1"]
    _self_corr = stats.pearsonr(_gene_values, _gene_values)[0]
    assert _self_corr == pytest.approx(1.0)

In [28]:
# across traits, the original and the shuffled values of a gene should be
# (close to) uncorrelated
_original_gene = smultixcan_results.loc["DPM1"]
_shuffled_gene = shuffled_smultixcan_results.loc["DPM1"]

_tmp = stats.pearsonr(_original_gene, _shuffled_gene)
display(_tmp)
assert _tmp[0] == pytest.approx(0.0, rel=0, abs=0.02)

(0.015004459012860445, 0.3581885908075809)

In [29]:
# a trait's column must be perfectly correlated with itself, both in the original
# and in the shuffled matrix
for _matrix in (smultixcan_results, shuffled_smultixcan_results):
    _trait_values = _matrix["100001_raw-Food_weight"]
    _self_corr = stats.pearsonr(_trait_values, _trait_values)[0]
    assert _self_corr == pytest.approx(1.0)

In [30]:
_original_trait = smultixcan_results["100001_raw-Food_weight"]
_shuffled_trait = shuffled_smultixcan_results["100001_raw-Food_weight"]

# The shuffle must only reassign values to genes: gene and trait labels stay the
# same and the trait keeps exactly the same collection of values. A near-zero
# correlation alone would also pass if values had been lost or duplicated.
assert shuffled_smultixcan_results.index.equals(smultixcan_results.index)
assert shuffled_smultixcan_results.columns.equals(smultixcan_results.columns)
assert np.array_equal(
    np.sort(_original_trait.to_numpy()), np.sort(_shuffled_trait.to_numpy())
)

# across genes, the original and the shuffled values of a trait should be
# (close to) uncorrelated
_tmp = stats.pearsonr(_original_trait, _shuffled_trait)
display(_tmp)
assert _tmp[0] == pytest.approx(0.0, rel=0, abs=0.02)

(-0.004639833221574795, 0.48638888315567747)

In [31]:
# Drop the reference to the unshuffled matrix: only the shuffled version is used
# from here on.
del smultixcan_results

# Project S-MultiXcan data into MultiPLIER latent space

In [32]:
mproj = MultiplierProjection()

In [33]:
# Project the shuffled gene-by-trait matrix into the latent space; the result is
# indexed by latent variable (LV1, LV2, ...) and keeps one column per trait.
smultixcan_into_multiplier = mproj.transform(shuffled_smultixcan_results)

In [34]:
smultixcan_into_multiplier.shape

(987, 3752)

In [35]:
smultixcan_into_multiplier.head()

Unnamed: 0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
LV1,-0.004991,0.015146,0.011558,-0.004454,-0.004064,0.019181,-0.006774,-0.000993,-0.051895,0.006165,...,0.011411,0.007752,-0.007232,-0.012486,-0.019609,0.001056,-0.068064,-0.067764,0.027634,0.009072
LV2,-0.02016,0.025381,0.026267,0.025022,-0.014816,0.001336,-0.025144,-0.006939,0.031557,-0.006765,...,0.045494,0.054857,0.014009,0.010356,-0.028176,0.014216,0.051512,-0.012968,-0.009068,-0.022579
LV3,-0.02603,0.03099,-0.00883,0.010287,0.027811,-0.024533,0.028973,-0.004228,-0.022456,-0.009028,...,-0.022074,0.001223,-0.04034,0.033154,0.015559,-0.027344,0.028957,0.047436,-0.013414,-0.014697
LV4,0.020068,-0.004168,0.023151,-0.045471,0.015872,-0.016416,-0.041916,-0.042642,0.039436,-0.029249,...,-0.001791,0.013281,0.072869,0.075342,0.01981,-0.009894,-0.037994,-0.013222,0.012071,-0.00558
LV5,0.011753,0.006254,-0.0147,0.035624,-0.012119,0.001753,0.01645,0.00587,-0.026259,-0.032612,...,-0.048864,0.002043,-0.020228,-0.016124,0.019393,0.04204,0.060709,-0.04139,-0.039641,0.016831


# Quick analysis

In [36]:
# the 20 traits with the largest projected values for LV603
smultixcan_into_multiplier.loc["LV603"].sort_values(ascending=False).head(20)

5099_raw-3mm_weak_meridian_right                                                           0.156829
30130_raw-Monocyte_count                                                                   0.133407
J10_ASTHMA-Asthma                                                                          0.103145
20003_1141194386-Treatmentmedication_code_telfast_30_tablet                                0.088224
23124_raw-Arm_fat_mass_left                                                                0.082922
20003_1140888366-Treatmentmedication_code_thiamine_preparation                             0.081478
leukocyte count                                                                            0.080619
20003_1140888092-Treatmentmedication_code_elocon_cream                                     0.079712
pancreatitis                                                                               0.076367
22174-Recent_medication_for_bronchiectasis                                                 0.076257


In [37]:
# the 20 traits with the largest projected values for LV136
smultixcan_into_multiplier.loc["LV136"].sort_values(ascending=False).head(20)

1747_1-Hair_colour_natural_before_greying_Blonde                                                               0.293540
celiac disease                                                                                                 0.156183
fallopian tube disease                                                                                         0.100879
30020_raw-Haemoglobin_concentration                                                                            0.097562
1488_raw-Tea_intake                                                                                            0.095025
20548_1-Manifestations_of_mania_or_irritability_I_was_more_talkative_than_usual                                0.094591
22170-Recent_medication_for_COPD_Chronic_Obstructive_Pulmonary_Disease                                         0.093176
22601_35633214-Job_coding_vocational_or_industrial_trainerinstructor_craft_instructor_apprentice_instructor    0.092978
22601_52443062-Job_coding_television_vid

In [38]:
# the 20 traits with the largest projected values for LV844
smultixcan_into_multiplier.loc["LV844"].sort_values(ascending=False).head(20)

type i diabetes mellitus                                                                                                     0.193646
30040_raw-Mean_corpuscular_volume                                                                                            0.185107
K11_COELIAC-Coeliac_disease                                                                                                  0.145886
20406-Ever_addicted_to_alcohol                                                                                               0.134645
22601_41363318-Job_coding_database_assistantclerk_computer_clerk_data_entry_clerkprocessor_book_keepingaccounts_machinist    0.133810
22617_9149-Job_SOC_coding_Other_goods_handling_and_storage_occupations_nec                                                   0.122899
22611_1-Workplace_had_a_lot_of_cigarette_smoke_from_other_people_smoking_Sometimes                                           0.116214
22617_1136-Job_SOC_coding_Information_and_communication_techno

In [39]:
# the 20 traits with the largest projected values for LV246
smultixcan_into_multiplier.loc["LV246"].sort_values(ascending=False).head(20)

6144_3-Never_eat_eggs_dairy_wheat_sugar_Wheat_products                                           0.340213
20003_1140883066-Treatmentmedication_code_insulin_product                                        0.233553
hyperthyroidism AND thyrotoxicosis                                                               0.193569
30150-Eosinophill_count                                                                          0.147097
K11_GASTRODUOULC-Gastroduodenal_ulcer                                                            0.119671
30040_raw-Mean_corpuscular_volume                                                                0.115193
5182_1-Both_eyes_present_Yes                                                                     0.113313
20003_1140888092-Treatmentmedication_code_elocon_cream                                           0.112417
2267-Use_of_sunuv_protection                                                                     0.110457
20407-Frequency_of_failure_to_fulfil_normal_ex

# Save

In [40]:
# the output file is named after the stem of the input results file
_output_name = f"projection-{results_filename_stem}.pkl"
output_file = (RESULTS_PROJ_OUTPUT_DIR / _output_name).resolve()

display(output_file)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [41]:
smultixcan_into_multiplier.to_pickle(output_file)