# Description

This notebook projects the PhenomeXcan data (S-MultiXcan results) into the MultiPLIER latent space. For each latent variable, it keeps only the top 1% of genes among those with a nonzero loading.

*Technical debt:* the notebook is usable and does its job, but will be refactored in the future, since other configurations are also useful (for instance, take different percentages of top genes to make the projection). Ideas for future refactoring:

1. Move projection code into libs.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import settings as conf
from metadata import GENE_ID_TO_NAME_MAP, GENE_NAME_TO_ID_MAP

# Read MultiPLIER model metadata

In [3]:
# The model metadata pickle is looked up in the same directory as the full MultiPLIER model file.
metadata_dir = os.path.dirname(conf.MULTIPLIER_SETTINGS['RECOUNT2_FULL_MODEL_FILE'])
metadata_file = os.path.join(metadata_dir, 'multiplier_model_metadata.pkl')
display(metadata_file)

'/media/miltondp/Elements/projects/multiplier/recount2_PLIER_data/multiplier_model_metadata.pkl'

In [4]:
# NOTE: pickle.load can execute arbitrary code; only load files that come from a trusted source.
with open(metadata_file, 'rb') as handle:
    multiplier_model_metadata = pickle.load(handle)

In [5]:
multiplier_model_metadata

{'L1': 120.5660870071812, 'L2': 241.1321740143624, 'L3': 0.012696760842460974}

# Read MultiPLIER Z (loadings)

In [6]:
# Z matrix of loadings: one row per gene (gene symbol) and one column per latent variable (LV).
multiplier_model_z = pd.read_pickle(os.path.join(conf.MULTIPLIER_SETTINGS['RECOUNT2_DATA_DIR'], 'multiplier_model_z.pkl'))

In [7]:
multiplier_model_z.shape

(6750, 987)

In [8]:
multiplier_model_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


# Load PhenomeXcan data for S-MultiXcan results

In [9]:
smultixcan_results_filename = os.path.join(conf.PHENOMEXCAN_SETTINGS['GENE_ASSOC_DIR'], 'smultixcan-mashr-zscores.pkl')
display(smultixcan_results_filename)

'/media/miltondp/Elements/projects/phenomexcan/base/gene_assoc/smultixcan-mashr-zscores.pkl'

In [10]:
# Gene-trait association matrix: one row per gene (Ensembl ID) and one column per trait.
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

In [11]:
smultixcan_results.shape

(22515, 4091)

In [12]:
smultixcan_results.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


# Get genes in common

In [13]:
# Collect, in the row order of the Z matrix, the IDs of the genes that
# (1) have a known symbol -> ID mapping and (2) are present in the S-MultiXcan results.
lvs_gene_ids = []
for gene_name in multiplier_model_z.index:
    if gene_name not in GENE_NAME_TO_ID_MAP:
        continue

    gene_id = GENE_NAME_TO_ID_MAP[gene_name]
    if gene_id in smultixcan_results.index:
        lvs_gene_ids.append(gene_id)

In [14]:
len(lvs_gene_ids)

6452

In [15]:
# Map the common gene IDs back to gene symbols (same order as lvs_gene_ids).
# NOTE(review): this assumes GENE_ID_TO_NAME_MAP is the exact inverse of GENE_NAME_TO_ID_MAP
# for these genes; otherwise a symbol here could differ from the one used in the Z matrix -- confirm.
lvs_gene_names = [GENE_ID_TO_NAME_MAP[g] for g in lvs_gene_ids]

In [16]:
len(lvs_gene_names)

6452

In [17]:
assert len(lvs_gene_ids) == len(lvs_gene_names)

## MultiPLIER Z: select common genes and rename them to IDs

In [18]:
# Select the common genes by symbol (in the order of lvs_gene_names) and relabel the rows with gene IDs.
multiplier_model_z_common = multiplier_model_z.loc[lvs_gene_names].rename(index=GENE_NAME_TO_ID_MAP)

In [19]:
multiplier_model_z_common.shape

(6452, 987)

In [20]:
multiplier_model_z_common.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
ENSG00000183087,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
ENSG00000157227,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
ENSG00000096696,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
ENSG00000175130,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
ENSG00000113140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## Quick look at LV603 genes

In [21]:
t = multiplier_model_z_common['LV603']

In [22]:
t.quantile(0.90)

0.38182870346382053

In [23]:
t.sort_values(ascending=False).head()

ENSG00000180871    5.320459
ENSG00000162747    5.128372
ENSG00000173535    5.035457
ENSG00000112303    4.680865
ENSG00000204160    4.495976
Name: LV603, dtype: float64

In [24]:
t = t[t > 0]

In [25]:
t.sort_values(ascending=False).rename(index=GENE_ID_TO_NAME_MAP).head(10)

CXCR2        5.320459
FCGR3B       5.128372
TNFRSF10C    5.035457
VNN2         4.680865
ZDHHC18      4.495976
MNDA         4.488505
CXCR1        4.442062
P2RY13       4.404405
VNN3         4.253184
FPR2         4.187560
Name: LV603, dtype: float64

# MultiPLIER Z: keep only those genes with high weight

In [26]:
# Quantile used as the per-LV cutoff (computed over strictly positive loadings only);
# 0.99 means that loadings below the 99th percentile are zeroed, i.e. the top 1% is kept.
PERCENTILE_SELECTED = 0.99

In [27]:
def zero_nonimportant_genes(x, perc=PERCENTILE_SELECTED):
    """Keep only the highest loadings of one latent variable, zeroing the rest.

    The cutoff is the `perc` quantile computed over the strictly positive
    values of `x`; every value below that cutoff is replaced by 0.0. Values
    equal to or above the cutoff are left untouched.

    Args:
        x: pandas Series with the gene loadings of a single latent variable.
        perc: quantile (between 0 and 1) used as the cutoff. Defaults to
            PERCENTILE_SELECTED.

    Returns:
        A new pandas Series with the same index as `x`. The input Series is
        never modified.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply (or
    # by any other caller) is not supported by pandas and would silently
    # change the caller's data.
    x = x.copy()

    positive_values = x[x > 0]
    if positive_values.empty:
        # No positive loading means there is no cutoff to compute; return the
        # values unchanged (a NaN quantile would not zero anything either).
        return x

    cutoff = positive_values.quantile(perc)
    x[x < cutoff] = 0.0
    return x

In [28]:
# Apply the filter column by column (one LV at a time). The explicit copy() protects
# multiplier_model_z_common from any in-place modification made while filtering.
multiplier_model_z_common_important_genes = multiplier_model_z_common.copy().apply(zero_nonimportant_genes)

In [29]:
multiplier_model_z_common_important_genes.shape

(6452, 987)

In [30]:
multiplier_model_z_common_important_genes.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
ENSG00000183087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000157227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000096696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000175130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000113140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
multiplier_model_z_common_important_genes.describe()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
count,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,...,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0,6452.0
mean,0.000463,0.01018,0.014661,0.011473,0.005909,0.004569,0.011694,0.01708,0.016936,0.01262,...,0.006355,0.007879,0.006136,0.007588,0.005837,0.006172,0.015327,0.012379,0.009219,0.010752
std,0.037218,0.196579,0.232178,0.173251,0.125828,0.081948,0.207472,0.284127,0.228894,0.236841,...,0.10757,0.13558,0.104844,0.148526,0.131026,0.128982,0.185517,0.184729,0.151214,0.136551
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.989508,5.695377,4.631069,3.820836,4.68236,2.431865,5.125221,6.110712,4.178663,7.058765,...,4.597933,6.467211,3.18578,6.472468,7.383183,6.872569,3.52363,4.717425,4.486173,2.537906


### Some testing on LV603

In [32]:
_lv_prev = multiplier_model_z_common['LV603']

In [33]:
_lv_prev.shape

(6452,)

In [34]:
_lv_prev_nonzero = _lv_prev[_lv_prev > 0.0]

In [35]:
_lv_prev_nonzero.shape

(2179,)

In [36]:
_lv_prev_q = _lv_prev_nonzero.quantile(PERCENTILE_SELECTED)
display(_lv_prev_q)

3.5563536352038447

In [37]:
_tmp = _lv_prev[_lv_prev > _lv_prev_q].shape
display(_tmp)

(22,)

In [38]:
assert _tmp[0] == 22

How many genes remain nonzero in this LV after filtering? There should be about 22 (1% of the 2,179 genes with nonzero loadings in LV603).

In [39]:
_any_lv = multiplier_model_z_common_important_genes['LV603']

In [40]:
_any_lv.shape

(6452,)

In [41]:
_any_lv.head()

ENSG00000183087    0.0
ENSG00000157227    0.0
ENSG00000096696    0.0
ENSG00000175130    0.0
ENSG00000113140    0.0
Name: LV603, dtype: float64

In [42]:
_any_lv[_any_lv > 0].shape

(22,)

In [43]:
assert _tmp[0] == _any_lv[_any_lv > 0].shape[0]

### Some testing on LV136

In [44]:
_any_lv = multiplier_model_z_common_important_genes['LV136']

In [45]:
_any_lv.shape

(6452,)

In [46]:
_any_lv.head()

ENSG00000183087    0.0
ENSG00000157227    0.0
ENSG00000096696    0.0
ENSG00000175130    0.0
ENSG00000113140    0.0
Name: LV136, dtype: float64

In [47]:
_any_lv[_any_lv > 0].shape

(20,)

### What's the coverage of important genes?

In [48]:
# Union, over all LVs, of the genes that still have a nonzero loading after filtering.
genes_covered = set()

for lv_col, lv_values in multiplier_model_z_common_important_genes.items():
    nonzero_genes = lv_values[lv_values > 0.0].index
    genes_covered |= set(nonzero_genes)

In [49]:
len(genes_covered)

6376

In [50]:
len(genes_covered) / multiplier_model_z_common_important_genes.shape[0]

0.9882207067575945

Even when keeping only the top 1% of genes in each LV, most of the genes considered still belong to at least one LV.

## How many genes, on average, are nonzero for each LV?

In [51]:
multiplier_model_z_common_important_genes.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
ENSG00000183087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000157227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000096696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000175130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000113140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Number of genes with a strictly positive loading in each LV (one count per column).
_tmp = (multiplier_model_z_common_important_genes > 0).sum()

In [53]:
_tmp.describe()

count    987.000000
mean      27.256332
std        5.074710
min        1.000000
25%       25.000000
50%       27.000000
75%       30.000000
max       53.000000
dtype: float64

## Save MultiPLIER Z with only important genes selected

In [54]:
# Filename-friendly tag for the selected percentile: the decimal point is replaced by an underscore.
suffix = f'{PERCENTILE_SELECTED}'.replace('.', '_')
display(suffix)

'0_99'

In [55]:
output_filename = os.path.join(conf.LVS_MULTIPLIER_PHENOMEXCAN_SETTINGS['LVS_PROJECTIONS'], f'multiplier_z_important_genes_{suffix}.pkl')
display(output_filename)

'/media/miltondp/Elements/projects/k99/base/results/lvs_x_phenomexcan/projections/multiplier_z_important_genes_0_99.pkl'

In [56]:
# Make sure the output directory exists before writing: on a fresh run nothing
# has created it yet at this point of the notebook, and to_pickle would fail.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
multiplier_model_z_common_important_genes.to_pickle(output_filename)

## S-MultiXcan: keep common genes

In [57]:
# Keep only the common genes, in exactly the same row order as the filtered Z matrix,
# so that both matrices are aligned by gene for the matrix product below.
smultixcan_results_common = smultixcan_results.loc[multiplier_model_z_common_important_genes.index]

In [58]:
smultixcan_results_common.shape

(6452, 4091)

In [59]:
smultixcan_results_common.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
ENSG00000183087,0.322338,0.914191,0.257946,0.275937,1.14454,0.160631,2.45603,0.415369,0.398203,0.576533,...,0.118405,1.020714,1.071265,0.627512,0.016345,5.190657,0.142748,0.696664,0.359041,0.536451
ENSG00000157227,0.419139,0.968943,0.124378,0.237572,0.615571,0.379223,1.740254,0.639452,2.27427,1.293075,...,1.06105,0.318303,0.134067,0.291673,1.863238,1.134948,0.571293,1.031406,2.132095,1.700149
ENSG00000096696,1.12842,0.348985,0.65341,0.063173,0.413422,0.716294,0.076358,2.326302,0.544141,0.466012,...,0.530676,2.686939,0.122267,0.671125,1.188796,1.630909,1.007901,0.381546,1.364992,1.143065
ENSG00000175130,1.002128,0.02187,1.295205,0.29061,1.654534,0.64726,1.392347,0.903704,0.903001,0.262853,...,1.836999,0.37713,0.796055,0.42338,0.699479,1.114185,0.195246,1.577039,0.441571,0.141089
ENSG00000113140,0.981641,0.73406,0.058307,0.423182,2.159288,0.158809,1.230338,0.804094,1.148724,0.52607,...,0.272666,0.235158,1.421488,3.378371,0.010467,1.11032,0.71416,2.142983,0.893404,1.301364


# Multiply to obtain LV associations with all traits

In [60]:
from numpy.linalg import pinv

In [61]:
# smultixcan_results_common_final = smultixcan_results_common_stded
smultixcan_results_common_final = smultixcan_results_common

In [62]:
# Standardize every gene (row) across traits: z = (value - row mean) / row std.
# Missing values, if any, are filled in a later cell.
_row_means = smultixcan_results_common_final.mean(axis=1)
_row_stds = smultixcan_results_common_final.std(axis=1)

smultixcan_results_common_final = (
    smultixcan_results_common_final
    .sub(_row_means, axis=0)
    .div(_row_stds, axis=0)
)

In [63]:
# # row minmax standardization (disabled alternative to the row z-score above)
# smultixcan_results_common_final = \
# (
#         # subtract the row minimum
#         smultixcan_results_common_final.sub(smultixcan_results_common_final.min(1), axis=0)
#         # divide by the row range (max - min)
#         .div(smultixcan_results_common_final.max(1) - smultixcan_results_common_final.min(1), axis=0)
# )#.fillna(0)

In [64]:
# # row total (sum) normalization (disabled alternative to the row z-score above)
# smultixcan_results_common_final = \
# (
#         # divide each row by its sum
#         smultixcan_results_common_final.div(smultixcan_results_common_final.sum(1), axis=0)
# )#.fillna(0)

In [65]:
smultixcan_results_common_final.head(2)

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
ENSG00000183087,-0.745878,0.096334,-0.837508,-0.811906,0.424123,-0.975987,2.290384,-0.613494,-0.637921,-0.384155,...,-1.036076,0.247918,0.319853,-0.311612,-1.181309,6.181782,-1.001435,-0.213208,-0.693649,-0.441192
ENSG00000157227,-0.568076,0.186235,-0.972477,-0.817179,-0.298579,-0.62284,1.244446,-0.265815,1.977097,0.630932,...,0.312603,-0.70642,-0.959185,-0.742955,1.413176,0.413988,-0.359327,0.271932,1.782038,1.189423


In [66]:
_tmp = smultixcan_results_common_final.T.describe()

In [67]:
_tmp

Unnamed: 0,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
count,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,...,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0,4091.0
mean,2.4315810000000003e-17,2.475002e-17,-1.862765e-16,7.207901000000001e-17,8.684218000000001e-18,-7.468428e-17,3.99474e-17,-1.215791e-17,-8.944745000000001e-17,-1.476317e-17,...,1.693423e-16,1.623949e-16,-3.300003e-17,-1.042106e-17,2.4315810000000003e-17,7.642112e-17,2.561844e-17,1.728159e-16,-1.102896e-16,-4.3421090000000004e-18
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.204328,-1.143089,-1.271972,-1.317103,-1.283498,-1.320244,-1.029331,-1.316081,-0.9829798,-1.310916,...,-1.319291,-1.333852,-1.301824,-1.156864,-1.264905,-1.088179,-1.238807,-1.290232,-1.113326,-1.122255
25%,-0.7486444,-0.7226303,-0.7495273,-0.7889164,-0.8041394,-0.7906524,-0.6222058,-0.7859982,-0.6238235,-0.7756578,...,-0.7778394,-0.7976366,-0.7892986,-0.7073513,-0.7579923,-0.6767003,-0.7742235,-0.7664349,-0.6765824,-0.6792223
50%,-0.2169492,-0.2222643,-0.2228226,-0.2030086,-0.2208637,-0.220064,-0.1828274,-0.2021253,-0.2080361,-0.2239812,...,-0.2341722,-0.1955173,-0.1880682,-0.193614,-0.2172526,-0.2327133,-0.208408,-0.2023133,-0.2157647,-0.2226477
75%,0.5211107,0.4730266,0.5390974,0.5539437,0.5801046,0.5850646,0.4406687,0.5864414,0.3281706,0.5576957,...,0.5651602,0.624457,0.5414168,0.4674506,0.5245787,0.417193,0.4930067,0.5398479,0.4115888,0.4289178
max,9.420778,8.4671,12.13591,5.666027,5.654056,5.942638,22.17474,8.660557,10.08603,6.200707,...,4.992684,4.89975,5.761907,12.18858,5.769775,10.18035,9.914657,11.73137,9.979559,19.87825


In [68]:
# After row standardization every gene must have mean ~0 and std ~1 across traits.
# The absolute value is required: without it any negative mean, however large,
# would satisfy the `< 1e-10` check.
assert (_tmp.loc['mean'].abs() < 1e-10).all()
assert np.allclose(_tmp.loc['std'].values, 1.0)

In [69]:
# Replace any remaining NaN with 0 before the matrix product.
# NOTE(review): presumably this guards against NaNs from missing scores or zero-variance rows -- confirm whether any exist.
smultixcan_results_common_final = smultixcan_results_common_final.fillna(0)

In [70]:
multiplier_model_metadata['L2']

241.1321740143624

In [71]:
# Ridge-regularized Gram matrix of the filtered loadings: Z^T Z + L2 * I,
# where Z is genes x LVs and L2 is the penalty read from the model metadata.
prev_mat = \
(
    multiplier_model_z_common_important_genes.T.dot(multiplier_model_z_common_important_genes)
    + (multiplier_model_metadata['L2'] * np.identity(multiplier_model_z_common_important_genes.shape[1]))
)

In [72]:
prev_mat.shape

(987, 987)

In [73]:
# Replace prev_mat by its pseudo-inverse, keeping the LV labels on both axes.
# NOTE: this rebinds prev_mat, so re-running this cell on its own would invert the
# already-inverted matrix; re-run the cell that builds prev_mat first.
prev_mat = pd.DataFrame(pinv(prev_mat), index=prev_mat.index.copy(), columns=prev_mat.columns.copy())

In [74]:
# Project the traits into the LV space: (Z^T Z + L2 * I)^-1 · Z^T · Y, where prev_mat already
# holds the inverted matrix and Y is the standardized genes x traits matrix.
# The result has one row per LV and one column per trait.
lvs_traits_df = \
(
    prev_mat.dot(multiplier_model_z_common_important_genes.T).dot(smultixcan_results_common_final)
)

In [75]:
lvs_traits_df.shape

(987, 4091)

In [76]:
lvs_traits_df.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
LV1,-0.011428,-0.003349,0.004548,0.00134,-0.008522,-0.01288,-0.010695,-0.00846,0.007192,0.000369,...,0.011257,-0.004408,-0.010043,-0.004443,-0.01403,-0.012685,0.025607,-0.011756,-0.009284,0.002057
LV2,-0.029025,-0.031933,0.035768,0.014905,-0.02803,-0.018171,-0.048591,-0.028944,0.03502,-0.022982,...,0.178994,-0.052794,0.08073,-0.010466,-0.030267,0.108304,-0.077986,-0.036163,0.086786,0.075796
LV3,-0.022593,0.041738,-0.052368,0.022981,-0.006197,-0.013814,0.007206,-0.01882,0.015699,-0.017102,...,0.083437,0.057528,0.113129,-0.027433,-0.043856,0.045791,-0.045977,-0.014493,0.014274,0.042264
LV4,0.032312,0.017544,0.049875,-0.007209,-0.008422,-0.003078,-0.012704,-0.003348,-0.011122,0.027604,...,0.05929,0.007098,0.082854,0.012517,-0.02701,0.08219,-0.000811,0.025551,-0.007174,0.052896
LV5,-0.020553,-0.031544,-0.029994,0.002896,0.023773,-0.018088,-0.009549,-0.033009,0.008574,-0.010971,...,-0.000531,-0.002261,0.002403,0.009198,-0.051971,0.077328,-0.010673,-0.036497,0.099145,0.047


# Quick analysis

In [77]:
lvs_traits_df.loc['LV603'].sort_values(ascending=False).head(20)

30140_raw-Neutrophill_count                          0.552626
30000_raw-White_blood_cell_leukocyte_count           0.508006
30180_raw-Lymphocyte_percentage                      0.451405
30200_raw-Neutrophill_percentage                     0.430967
30100_raw-Mean_platelet_thrombocyte_volume           0.403399
Astle_et_al_2016_Myeloid_white_cell_count            0.402407
Astle_et_al_2016_Sum_neutrophil_eosinophil_counts    0.398964
Astle_et_al_2016_Granulocyte_count                   0.398326
30090_raw-Platelet_crit                              0.397491
Astle_et_al_2016_Sum_basophil_neutrophil_counts      0.392550
Astle_et_al_2016_Neutrophil_count                    0.391924
30190_raw-Monocyte_percentage                        0.384196
Astle_et_al_2016_White_blood_cell_count              0.382439
30150-Eosinophill_count                              0.367542
30130_raw-Monocyte_count                             0.360042
50_raw-Standing_height                               0.353690
30220_ra

In [78]:
lvs_traits_df.loc['LV136'].sort_values(ascending=False).head(20)

50_raw-Standing_height                                 0.400824
20015_raw-Sitting_height                               0.350660
5132_raw-3mm_strong_meridian_right                     0.296917
5099_raw-3mm_weak_meridian_right                       0.294336
5096_raw-3mm_weak_meridian_left                        0.293114
4079_raw-Diastolic_blood_pressure_automated_reading    0.288737
5134_raw-6mm_strong_meridian_left                      0.285232
5097_raw-6mm_weak_meridian_left                        0.279296
5098_raw-6mm_weak_meridian_right                       0.273827
5135_raw-3mm_strong_meridian_left                      0.262497
5133_raw-6mm_strong_meridian_right                     0.261074
23107_raw-Impedance_of_leg_right                       0.249909
23106_raw-Impedance_of_whole_body                      0.246306
23110_raw-Impedance_of_arm_left                        0.237503
23109_raw-Impedance_of_arm_right                       0.233388
30280_raw-Immature_reticulocyte_fraction

# Save

In [79]:
lvs_traits_df.shape

(987, 4091)

In [80]:
lvs_traits_df.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
LV1,-0.011428,-0.003349,0.004548,0.00134,-0.008522,-0.01288,-0.010695,-0.00846,0.007192,0.000369,...,0.011257,-0.004408,-0.010043,-0.004443,-0.01403,-0.012685,0.025607,-0.011756,-0.009284,0.002057
LV2,-0.029025,-0.031933,0.035768,0.014905,-0.02803,-0.018171,-0.048591,-0.028944,0.03502,-0.022982,...,0.178994,-0.052794,0.08073,-0.010466,-0.030267,0.108304,-0.077986,-0.036163,0.086786,0.075796
LV3,-0.022593,0.041738,-0.052368,0.022981,-0.006197,-0.013814,0.007206,-0.01882,0.015699,-0.017102,...,0.083437,0.057528,0.113129,-0.027433,-0.043856,0.045791,-0.045977,-0.014493,0.014274,0.042264
LV4,0.032312,0.017544,0.049875,-0.007209,-0.008422,-0.003078,-0.012704,-0.003348,-0.011122,0.027604,...,0.05929,0.007098,0.082854,0.012517,-0.02701,0.08219,-0.000811,0.025551,-0.007174,0.052896
LV5,-0.020553,-0.031544,-0.029994,0.002896,0.023773,-0.018088,-0.009549,-0.033009,0.008574,-0.010971,...,-0.000531,-0.002261,0.002403,0.009198,-0.051971,0.077328,-0.010673,-0.036497,0.099145,0.047


In [81]:
os.makedirs(conf.LVS_MULTIPLIER_PHENOMEXCAN_SETTINGS['LVS_PROJECTIONS'], exist_ok=True)

In [82]:
lvs_filename = os.path.join(conf.LVS_MULTIPLIER_PHENOMEXCAN_SETTINGS['LVS_PROJECTIONS'], 'lvs_x_smultixcan.pkl')
display(lvs_filename)

'/media/miltondp/Elements/projects/k99/base/results/lvs_x_phenomexcan/projections/lvs_x_smultixcan.pkl'

In [83]:
lvs_traits_df.to_pickle(lvs_filename)