# Description

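This notebook keeps only the traits that are categorized as diseases, restricts the genes to those present in the MultiPLIER model, and saves the resulting matrix.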

# Modules loading

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name
from data.cache import read_data

# Settings

In [3]:
# Input file, taken from the PHENOMEXCAN section of the project configuration.
INPUT_FILEPATH = Path(conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]).resolve()

# File name without its extension; reused later to name this notebook's output file.
input_filepath_stem = INPUT_FILEPATH.stem

display(INPUT_FILEPATH)
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

'smultixcan-efo_partial-mashr-zscores'

In [4]:
# Directory where this notebook writes its output; created (with parents) if missing.
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'traits_selections').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/traits_selections')

# Load input file

In [5]:
# Load the pickled matrix and transpose it so that rows are traits and columns
# are genes (see the preview below).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle(INPUT_FILEPATH).transpose()

In [6]:
# Dimensions of the loaded matrix: rows are traits and columns are genes
# (the matrix was transposed at load time).
data.shape

(3749, 22515)

In [7]:
data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
100001_raw-Food_weight,1.145442,0.618066,0.515724,0.280781,0.548127,0.045587,0.329995,0.109494,1.356741,1.474255,...,1.49041,0.230329,0.596503,0.519733,2.285074,0.12498,1.587903,1.522281,0.150938,1.010143
100002_raw-Energy,0.724557,1.028131,0.403596,0.25391,0.389877,0.197393,0.669649,0.04101,0.83212,0.954183,...,1.165679,0.111142,0.084263,1.229913,0.39019,0.505316,0.975901,1.817369,0.756393,0.729526
100003_raw-Protein,0.090876,2.21842,1.251359,0.879148,0.723469,0.777974,0.207873,0.536609,0.453969,1.286942,...,0.54137,0.734872,0.634674,1.31675,0.761859,1.276888,0.160988,0.346794,0.609476,0.222126
100004_raw-Fat,0.298165,0.762584,0.433091,0.352705,1.16725,0.578435,0.738983,0.565245,0.397189,0.192279,...,0.867217,0.540941,0.284347,1.661131,0.404078,1.248959,0.799771,1.443097,0.814969,0.545356
100005_raw-Carbohydrate,1.134347,0.934418,0.413466,0.051846,0.315952,0.046237,1.113674,0.319842,0.965217,0.919779,...,1.747265,0.496178,0.144053,0.701817,0.827677,0.587188,1.089338,2.001502,1.362716,1.49003


# Keep only genes in the MultiPLIER model

In [8]:
from entity import Gene
from multiplier import MultiplierProjection

In [9]:
# Row labels of the MultiPLIER model matrix returned by _read_model_z(), renamed
# through Gene.GENE_NAME_TO_ID_MAP.
# NOTE(review): presumably this maps gene symbols to Ensembl IDs so they are
# comparable with the columns of `data` - confirm against the entity module.
multiplier_genes = (
    MultiplierProjection._read_model_z()
    .rename(index=Gene.GENE_NAME_TO_ID_MAP)
    .index
)

In [10]:
# Genes present both in the MultiPLIER model and in the columns of the input data.
common_genes = multiplier_genes.intersection(data.columns)

In [11]:
common_genes

Index(['ENSG00000183087', 'ENSG00000157227', 'ENSG00000096696',
       'ENSG00000175130', 'ENSG00000113140', 'ENSG00000117984',
       'ENSG00000116016', 'ENSG00000129116', 'ENSG00000134686',
       'ENSG00000108679',
       ...
       'ENSG00000111716', 'ENSG00000166796', 'ENSG00000114331',
       'ENSG00000131584', 'ENSG00000165410', 'ENSG00000172757',
       'ENSG00000147862', 'ENSG00000008323', 'ENSG00000167083',
       'ENSG00000149257'],
      dtype='object', length=6452)

In [12]:
# Keep every trait (row) but only the gene columns shared with the MultiPLIER model.
data = data.loc[:, common_genes]

In [13]:
# The number of columns should now equal the number of genes in common_genes.
data.shape

(3749, 6452)

In [14]:
# Every cell of the gene-filtered matrix must hold a value (no NaN allowed).
assert data.notna().all().all()

# Select diseases only

In [15]:
# Path (from the project configuration) of the file that maps trait full codes
# to EFO terms.
input_file = conf.PHENOMEXCAN["TRAITS_FULLCODE_TO_EFO_MAP_FILE"]
display(input_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/data/phenomexcan/phenomexcan_traits_fullcode_to_efo.tsv')

In [16]:
# Load the trait-to-EFO mapping table with the project's read_data helper.
# The cells below rely on its 'category' and 'current_term_label' columns.
ukb_to_efo_map = read_data(input_file)

In [17]:
ukb_to_efo_map.shape

(1087, 7)

In [18]:
ukb_to_efo_map.head()

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...,vascular disease AND intestinal disease,disease
1,M17,osteoarthritis || knee,EFO:0004616,Broad,M17-Diagnoses_main_ICD10_M17_Gonarthrosis_arth...,"osteoarthritis, knee",disease
2,R30,dysuria,EFO:0003901,? Broad,R30-Diagnoses_main_ICD10_R30_Pain_associated_w...,dysuria,
3,O60,premature birth,EFO:0003917,? Exact,O60-Diagnoses_main_ICD10_O60_Preterm_delivery,premature birth,
4,S64,carpal tunnel syndrome,EFO:0004143,? Narrow,S64-Diagnoses_main_ICD10_S64_Injury_of_nerves_...,carpal tunnel syndrome,disease


In [19]:
# Unique EFO labels of the traits whose category is exactly 'disease'.
# Rows with any other category value (or none) do not match and are left out.
efo_diseases = (
    ukb_to_efo_map
    .loc[ukb_to_efo_map['category'] == 'disease', 'current_term_label']
    .unique()
)

In [20]:
efo_diseases.shape

(538,)

In [21]:
# Keep only the rows (traits) whose label is one of the selected disease labels,
# in the order given by efo_diseases; all gene columns are retained.
data = data.loc[efo_diseases, :]

In [22]:
# One row per selected disease label is expected (compare with efo_diseases.shape).
data.shape

(538, 6452)

In [23]:
data.head()

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
vascular disease AND intestinal disease,1.508862,1.621723,0.363885,0.853025,0.672102,1.661944,0.065144,0.852891,1.070464,1.646857,...,0.338839,1.045194,0.337591,0.727816,0.919068,1.028998,0.957071,1.032144,0.372147,1.101383
"osteoarthritis, knee",0.293745,2.495255,0.898634,1.097025,0.092686,1.35323,2.32963,0.875087,1.932763,0.830047,...,0.31505,1.740132,0.659764,0.585959,0.745088,0.859034,0.844895,0.577415,1.093594,1.46107
carpal tunnel syndrome,3.269874,1.031169,1.854991,0.849646,1.193563,0.963425,1.081281,0.583741,0.370495,0.987732,...,1.695482,0.469553,0.525354,1.462587,0.730298,1.266646,0.680878,2.512643,1.943234,0.574996
gastritis,0.731628,0.92258,0.20744,1.049796,0.237018,0.081444,1.578283,0.63575,1.193168,2.471475,...,1.927164,0.906981,0.535501,1.265527,0.412329,0.47471,0.454023,2.122722,0.842579,1.56191
neoplasm,3.09046,0.749946,0.479871,1.374612,0.164061,0.385841,0.960421,0.579108,0.311806,0.340309,...,1.405575,1.277434,1.423244,1.400503,0.210456,1.100135,1.864806,1.084138,1.477129,0.92596


In [24]:
# The disease-only matrix must not contain missing values before it is saved.
assert data.notna().all().all()

# Save

In [25]:
# Output path: the input file stem prefixed with 'diseases_only-', inside RESULTS_DIR.
output_file = (RESULTS_DIR / f'diseases_only-{input_filepath_stem}.pkl').resolve()

display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/traits_selections/diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

In [26]:
# Persist the filtered matrix (disease traits only, MultiPLIER genes only) as a pickle file.
data.to_pickle(output_file)