# Description

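This notebook keeps only the traits categorized as diseases (according to their EFO mapping) from the gene-trait z-score matrix and saves the resulting subset.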

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name
from data.cache import read_data

# Settings

In [3]:
# Location of the input matrix, read from the project configuration and made absolute.
INPUT_FILEPATH = Path(conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]).resolve()

# File name without its extension; reused later to build this notebook's output file name.
input_filepath_stem = INPUT_FILEPATH.stem

# Show both values (each object is rendered as its own output, in this order).
display(INPUT_FILEPATH, input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

'smultixcan-efo_partial-mashr-zscores'

In [4]:
# output dir for this notebook
# Output directory for this notebook: a 'traits_selections' folder under the
# configured data-transformations results directory (created if missing).
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'traits_selections').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/traits_selections')

# Load input file

In [5]:
# Read the pickled matrix and transpose it so that each row is a trait and each column a gene.
data = pd.read_pickle(INPUT_FILEPATH).transpose()

In [6]:
# dimensions after the transpose: (number of traits, number of genes)
data.shape

(3749, 22515)

In [7]:
# preview the first rows (one row per trait, one column per gene)
data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
100001_raw-Food_weight,1.145442,0.618066,0.515724,0.280781,0.548127,0.045587,0.329995,0.109494,1.356741,1.474255,...,1.49041,0.230329,0.596503,0.519733,2.285074,0.12498,1.587903,1.522281,0.150938,1.010143
100002_raw-Energy,0.724557,1.028131,0.403596,0.25391,0.389877,0.197393,0.669649,0.04101,0.83212,0.954183,...,1.165679,0.111142,0.084263,1.229913,0.39019,0.505316,0.975901,1.817369,0.756393,0.729526
100003_raw-Protein,0.090876,2.21842,1.251359,0.879148,0.723469,0.777974,0.207873,0.536609,0.453969,1.286942,...,0.54137,0.734872,0.634674,1.31675,0.761859,1.276888,0.160988,0.346794,0.609476,0.222126
100004_raw-Fat,0.298165,0.762584,0.433091,0.352705,1.16725,0.578435,0.738983,0.565245,0.397189,0.192279,...,0.867217,0.540941,0.284347,1.661131,0.404078,1.248959,0.799771,1.443097,0.814969,0.545356
100005_raw-Carbohydrate,1.134347,0.934418,0.413466,0.051846,0.315952,0.046237,1.113674,0.319842,0.965217,0.919779,...,1.747265,0.496178,0.144053,0.701817,0.827677,0.587188,1.089338,2.001502,1.362716,1.49003


# Select diseases only

In [8]:
# path (from the project configuration) of the table mapping trait full codes to EFO terms
input_file = conf.PHENOMEXCAN["TRAITS_FULLCODE_TO_EFO_MAP_FILE"]
display(input_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/data/phenomexcan/phenomexcan_traits_fullcode_to_efo.tsv')

In [9]:
# load the trait-to-EFO mapping table through the project's `read_data` helper
# NOTE(review): `read_data` is imported from `data.cache`; presumably it caches the parsed file — confirm
ukb_to_efo_map = read_data(input_file)

In [10]:
# number of mapping rows and columns
ukb_to_efo_map.shape

(1087, 7)

In [11]:
# preview the mapping; the selection below relies on the 'category' and 'current_term_label' columns
ukb_to_efo_map.head()

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...,vascular disease AND intestinal disease,disease
1,M17,osteoarthritis || knee,EFO:0004616,Broad,M17-Diagnoses_main_ICD10_M17_Gonarthrosis_arth...,"osteoarthritis, knee",disease
2,R30,dysuria,EFO:0003901,? Broad,R30-Diagnoses_main_ICD10_R30_Pain_associated_w...,dysuria,
3,O60,premature birth,EFO:0003917,? Exact,O60-Diagnoses_main_ICD10_O60_Preterm_delivery,premature birth,
4,S64,carpal tunnel syndrome,EFO:0004143,? Narrow,S64-Diagnoses_main_ICD10_S64_Injury_of_nerves_...,carpal tunnel syndrome,disease


In [12]:
# Distinct EFO labels of the traits whose category is exactly 'disease'
# (mapping rows with no category are not selected).
# Rows and column are picked in one `.loc` call rather than chained indexing
# (`df[mask][col]`), which avoids building an intermediate filtered copy of
# the whole table and is the indexing form pandas recommends.
efo_diseases = ukb_to_efo_map.loc[
    ukb_to_efo_map['category'] == 'disease',
    'current_term_label',
].unique()

In [13]:
# number of distinct disease labels selected
efo_diseases.shape

(538,)

In [14]:
# Keep only the rows labelled with one of the disease terms; every gene column is retained.
# Note that `data` is rebound here, so from this cell on it holds the disease subset only.
data = data.loc[efo_diseases, :]

In [15]:
# dimensions after the selection: rows are now the selected disease labels, gene columns are unchanged
data.shape

(538, 22515)

In [16]:
# preview the disease-only matrix (rows are indexed by the disease label)
data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
vascular disease AND intestinal disease,1.463849,0.295149,0.862394,0.64243,2.560404,0.276682,1.25269,0.138074,0.253713,0.13149,...,0.238496,1.582408,0.470057,1.212982,1.320907,1.348926,0.925907,0.648023,3.172445,0.658885
"osteoarthritis, knee",1.139573,0.339193,0.063113,0.673883,1.480963,0.029437,0.738344,0.597559,0.310168,0.262036,...,0.144221,0.558024,1.497799,3.391703,0.745621,0.681965,0.577402,0.41246,0.24563,1.40189
carpal tunnel syndrome,0.91841,0.592007,0.221104,1.875068,0.427498,1.564119,1.739466,0.369079,0.795678,0.811066,...,0.326533,1.521576,0.307704,0.155117,1.459384,0.813065,0.526617,0.618006,0.384546,1.114975
gastritis,1.616585,0.675119,0.847838,0.976489,0.275587,0.13709,0.246508,1.27692,0.36882,0.205764,...,2.220743,0.934335,0.858603,0.42243,0.221982,0.083101,0.909691,0.539585,1.158575,0.101605
neoplasm,1.445983,2.245419,1.185771,1.236719,0.298348,0.987865,0.670254,0.563399,1.70164,1.710727,...,1.933531,0.586532,0.324582,0.557693,0.986534,1.205885,1.061329,0.743238,0.591596,0.730569


In [17]:
# Sanity check: every cell of the selected matrix must hold a value (no NaN anywhere).
assert data.notna().all().all()

# Save

In [18]:
# Destination file: 'diseases_only-' + the input file's stem, saved as .pkl inside RESULTS_DIR.
output_file = (RESULTS_DIR / f'diseases_only-{input_filepath_stem}.pkl').resolve()

display(output_file)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/traits_selections/diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

In [19]:
# write the disease-only matrix to `output_file` in pickle format
data.to_pickle(output_file)