# Description

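This notebook takes the projected trait data, keeps only the traits categorized as diseases (according to their EFO mapping), and saves the resulting subset.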

# Modules loading

In [1]:
# Automatically reload imported modules before executing each cell, so that
# edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name
from data.cache import read_data

# Settings

In [3]:
# Absolute path of the projection file used as input by this notebook.
INPUT_FILEPATH = (
    Path(conf.RESULTS["PROJECTIONS_DIR"])
    / "projection-smultixcan-efo_partial-mashr-zscores.pkl"
).resolve()
display(INPUT_FILEPATH)

# File name without its extension; reused later to build the output file name.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

'projection-smultixcan-efo_partial-mashr-zscores'

In [4]:
# Directory where this notebook writes its results; created if it does not exist.
RESULTS_DIR = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / "traits_selections"
).resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/traits_selections')

# Load input file

In [5]:
# Load the projection and transpose it so that traits are in rows and LVs in columns.
data = pd.read_pickle(INPUT_FILEPATH).transpose()

In [6]:
# (number of traits, number of LVs) before any selection
data.shape

(3749, 987)

In [7]:
# Preview the first rows: the index holds trait names, the columns are LV1, LV2, ...
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.018452,0.052938,-0.003629,0.028359,-0.0155,0.035854,-0.015439,0.023007,0.017368,0.026811,...,0.03356,0.047126,-0.036183,0.06875,0.023462,-0.030111,0.011272,-0.017171,0.016078,-0.022283
100002_raw-Energy,-0.043782,-0.012041,-0.011772,-0.006148,0.007011,0.018142,0.003144,0.018049,0.006926,0.038587,...,0.004833,0.022842,-0.009519,-0.000258,0.059764,-0.028394,-0.005967,0.045269,-0.007684,-0.01891
100003_raw-Protein,-0.021514,-0.028537,0.009441,0.007808,0.012707,0.021681,-0.006315,0.016129,7.6e-05,-0.001702,...,0.029704,0.029135,-0.056508,-0.002032,0.001189,-0.025507,-0.013012,0.037458,-0.009592,-0.016718
100004_raw-Fat,-0.030454,-0.052542,0.000459,-0.039613,0.006191,0.029523,0.000747,0.011876,-0.025758,0.025099,...,0.0159,0.016482,0.007409,-0.006833,0.036457,-0.034531,0.015365,0.023796,-0.017477,-0.005397
100005_raw-Carbohydrate,-0.017428,0.003757,-0.003708,-0.000929,-0.000647,-0.005729,0.02497,0.011531,0.035043,0.025159,...,-0.010071,0.002266,0.006664,0.00738,0.02994,-0.006989,0.014807,0.050208,0.005352,-0.049218


# Select diseases only

In [8]:
# Path, taken from the project configuration, of the file that maps trait
# full codes to EFO terms.
input_file = conf.PHENOMEXCAN["TRAITS_FULLCODE_TO_EFO_MAP_FILE"]
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/phenomexcan_traits_fullcode_to_efo.tsv')

In [9]:
# Load the trait-to-EFO mapping as a table.
# NOTE(review): read_data comes from data.cache, so it presumably caches the
# loaded file -- confirm against that module.
ukb_to_efo_map = read_data(input_file)

In [10]:
# (number of mapping entries, number of columns)
ukb_to_efo_map.shape

(1087, 7)

In [11]:
# Preview the mapping; `category` and `current_term_label` are the columns used below.
ukb_to_efo_map.head()

Unnamed: 0,ukb_code,term_label,term_codes,mapping_type,ukb_fullcode,current_term_label,category
0,K55,vascular disease,"EFO:0004264, EFO:0009431",Broad,K55-Diagnoses_main_ICD10_K55_Vascular_disorder...,vascular disease AND intestinal disease,disease
1,M17,osteoarthritis || knee,EFO:0004616,Broad,M17-Diagnoses_main_ICD10_M17_Gonarthrosis_arth...,"osteoarthritis, knee",disease
2,R30,dysuria,EFO:0003901,? Broad,R30-Diagnoses_main_ICD10_R30_Pain_associated_w...,dysuria,
3,O60,premature birth,EFO:0003917,? Exact,O60-Diagnoses_main_ICD10_O60_Preterm_delivery,premature birth,
4,S64,carpal tunnel syndrome,EFO:0004143,? Narrow,S64-Diagnoses_main_ICD10_S64_Injury_of_nerves_...,carpal tunnel syndrome,disease


In [12]:
# Unique term labels of all mapping entries whose category is "disease".
is_disease = ukb_to_efo_map["category"] == "disease"
efo_diseases = ukb_to_efo_map.loc[is_disease, "current_term_label"].unique()

In [13]:
# number of distinct disease labels
efo_diseases.shape

(538,)

In [14]:
# Keep only the rows (traits) labeled with one of the disease terms; all LV
# columns are retained. Note that `data` is overwritten with the subset.
data = data.loc[efo_diseases, :]

In [15]:
# The row count should now equal the number of disease labels selected above.
data.shape

(538, 987)

In [16]:
# Preview the diseases-only subset.
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
vascular disease AND intestinal disease,0.014921,0.007221,-0.021297,-0.013469,-0.004764,0.019406,0.034742,0.015188,-0.054239,-0.057495,...,-6e-06,0.009317,-0.021804,-0.025295,0.004658,-0.007537,-0.038976,-0.000494,-0.046299,0.039424
"osteoarthritis, knee",-0.027514,-0.011319,-0.005507,0.02466,-0.012269,-0.037501,-0.041009,-0.018378,0.05672,-0.035201,...,-0.021507,0.023317,-0.02085,0.025021,0.003739,-0.008428,-0.040916,0.012425,0.03313,0.009771
carpal tunnel syndrome,0.037605,0.026694,-0.004169,-0.009875,-0.038713,-0.019403,-0.035912,0.008916,0.009359,0.034914,...,-0.009627,0.073954,-0.045759,0.030834,-0.013835,-0.016858,0.00382,-0.012988,-0.03613,-0.011489
gastritis,-0.028098,0.006805,0.011964,-0.008576,-0.034687,-0.033875,-0.04308,0.037237,-0.006757,0.003071,...,0.037084,0.039675,0.034724,0.004581,0.021483,-0.023586,0.031696,-0.003781,0.017152,0.041463
neoplasm,0.015709,0.004118,-0.015952,0.016398,0.010938,-0.01717,0.01179,0.055354,-0.020357,0.018452,...,-0.002537,-0.004774,0.006254,-0.00818,-0.014205,0.031993,0.021475,0.015875,0.031616,-0.041767


In [17]:
# Sanity check: the selected subset must not contain any missing value.
assert data.notna().all().all()

# Save

In [18]:
# Output file name: the input file stem prefixed with "diseases_only-".
output_file = (RESULTS_DIR / f"diseases_only-{input_filepath_stem}.pkl").resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/traits_selections/diseases_only-projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [19]:
# Write the diseases-only subset to disk as a pickle file.
data.to_pickle(output_file)