# Description

This notebook uses the PhenomeXcan trait-to-EFO mapping files to group traits that share the same EFO label. It combines only the fastENLOC results (RCPs): for each gene, it takes the maximum RCP across all traits with the same EFO label. Traits without an EFO mapping keep their original PhenomeXcan full code as their label.

# Modules loading

In [1]:
# Reload all imported modules automatically before each cell is executed, so
# edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display
import numpy as np
import pandas as pd

import conf
from data.cache import read_data
from entity import Trait

# Load fastENLOC

In [3]:
# fastENLOC RCPs as a gene x trait matrix: rows are Ensembl gene IDs and
# columns are PhenomeXcan trait codes (see the preview below).
fastenloc_rcps = read_data(conf.PHENOMEXCAN["FASTENLOC_TORUS_RCP_FILE"])

In [4]:
fastenloc_rcps.shape

(38062, 4091)

In [5]:
fastenloc_rcps.head()

Unnamed: 0_level_0,O46-Diagnoses_main_ICD10_O46_Antepartum_haemorrhage_not_elsewhere_classified,K30-Diagnoses_main_ICD10_K30_Dyspepsia,2907-Ever_stopped_smoking_for_6_months,H7_DIPLOPIA-Diplopia,1538_0-Major_dietary_changes_in_the_last_5_years_No,5663-Length_of_longest_manicirritable_episode,20002_1538-Noncancer_illness_code_selfreported_arthritis_nos,S30-Diagnoses_main_ICD10_S30_Superficial_injury_of_abdomen_lower_back_and_pelvis,24010_raw-Inverse_distance_to_the_nearest_road,3143_raw-Ankle_spacing_width,...,2237-Plays_computer_games,20002_1461-Noncancer_illness_code_selfreported_inflammatory_bowel_disease,20002_1508-Noncancer_illness_code_selfreported_jaundice_unknown_cause,20003_1140881882-Treatmentmedication_code_timoptol_025_eye_drops,22601_71253330-Job_coding_merchandiser_window_dresser,23112_raw-Leg_fat_mass_right,20003_1140861778-Treatmentmedication_code_dipyridamole,20003_1199-Treatmentmedication_code_food_supplementplantherbal_extract,1309-Fresh_fruit_intake,100920_2105-Type_milk_consumed_soya_with_calcium
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,0.001213,,,,,,...,,,,,,,,,0.002131,
ENSG00000000457,,,,,0.001873,,,,,0.01772,...,0.068852,,,,,0.003383,,,0.009195,
ENSG00000000460,,,,,0.00262,,,0.000207,,0.000105,...,0.093284,,,,,0.00879,,,0.003105,
ENSG00000000938,,,,,0.002928,,,,0.000762,0.012773,...,0.0043,,,0.000424,,0.000612,,,0.00324,
ENSG00000000971,,,,,0.002858,,,,,0.019304,...,0.005419,,,,,0.007427,,,0.004804,


# Load S-MultiXcan z-scores (EFO-mapped)

In [6]:
# S-MultiXcan z-scores whose traits were already combined by EFO label.
# In this notebook they are used only as the reference for the expected set
# and order of trait columns in the combined fastENLOC results.
smultixcan_zscores_combined = read_data(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
)

In [7]:
smultixcan_zscores_combined.shape

(22515, 3752)

In [8]:
smultixcan_zscores_combined.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
ENSG00000000457,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
ENSG00000000460,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
ENSG00000000938,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
ENSG00000000971,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


# Get PhenomeXcan traits

In [9]:
# Resolve every fastENLOC column name to its Trait object first, then index
# the resolved objects by the full code each one reports.
_resolved_traits = [
    Trait.get_trait(full_code=column_name) for column_name in fastenloc_rcps.columns
]
phenomexcan_fullcode_to_traits = {
    resolved.full_code: resolved for resolved in _resolved_traits
}

In [10]:
len(phenomexcan_fullcode_to_traits)

4091

In [11]:
# The dictionary is keyed by full code, so a smaller size than the number of
# columns would mean that two columns resolved to the same full code.
assert len(phenomexcan_fullcode_to_traits) == fastenloc_rcps.columns.shape[0]

# Change/combine traits in fastENLOC results

## Get a list of EFO labels for PhenomeXcan traits

In [12]:
def _efo_label_or_full_code(trait):
    """Return the trait's EFO label, or its PhenomeXcan full code if unmapped.

    The EFO information is looked up only once per trait.
    """
    efo_info = trait.get_efo_info()
    return efo_info.label if efo_info is not None else trait.full_code


# One label per column of `fastenloc_rcps`, in column order. Traits that map
# to the same EFO term get the same label and are merged in the next step;
# traits with no EFO mapping keep their own full code.
traits_efo_labels = [
    _efo_label_or_full_code(phenomexcan_fullcode_to_traits[c])
    for c in fastenloc_rcps.columns
]

In [13]:
len(traits_efo_labels)

4091

In [14]:
traits_efo_labels[:10]

['antepartum hemorrhage',
 'dyspepsia',
 '2907-Ever_stopped_smoking_for_6_months',
 'H7_DIPLOPIA-Diplopia',
 '1538_0-Major_dietary_changes_in_the_last_5_years_No',
 'manic episode measurement AND irritability measurement',
 'arthritis',
 'abdominal injury',
 '24010_raw-Inverse_distance_to_the_nearest_road',
 '3143_raw-Ankle_spacing_width']

## Get `max(rcp)` for same EFO labels

In [15]:
# Merge columns (traits) that share a label by taking, for each gene, the
# maximum RCP across them; a gene with no value in any merged trait stays NaN.
#
# `DataFrame.groupby(..., axis=1)` is deprecated in recent pandas, so the
# grouping is done on the transposed frame (traits as rows) and the result is
# transposed back to genes x labels. Labels end up sorted, as before.
fastenloc_rcps_combined = fastenloc_rcps.T.groupby(traits_efo_labels).max().T

In [16]:
fastenloc_rcps_combined.shape

(38062, 3752)

In [17]:
fastenloc_rcps_combined.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,,,,,,0.000117,...,,,,,,0.000356,0.00067,,0.000862,
ENSG00000000457,,,,,,,,,,0.000163,...,0.002152,,,,,0.003799,0.002538,,0.001622,0.000241
ENSG00000000460,,,,,,,,0.000104,,0.000156,...,0.000719,,,,,0.000375,0.0028,,0.002147,0.000489
ENSG00000000938,,,,,,,,,,,...,0.000142,0.000301,,,,0.000108,0.001551,,0.001443,
ENSG00000000971,,,,,,,,,,0.000124,...,0.000114,,,,,0.000197,0.003033,,0.00123,0.000158


### Keep the same trait order as in the S-MultiXcan results

In [18]:
# Grouping by EFO label must produce exactly the trait labels present in the
# EFO-mapped S-MultiXcan results; only their order may differ at this point.
assert set(fastenloc_rcps_combined.columns) == set(smultixcan_zscores_combined.columns)

In [19]:
# Put the trait columns in the same order as the S-MultiXcan z-scores.
_reference_columns = smultixcan_zscores_combined.columns
fastenloc_rcps_combined = fastenloc_rcps_combined[_reference_columns]

In [20]:
fastenloc_rcps_combined.shape

(38062, 3752)

### Do we have NaN values?

In [21]:
fastenloc_rcps_combined.isna().any().any()

True

In [22]:
fastenloc_rcps_combined.isna().sum()

100001_raw-Food_weight                           36537
100002_raw-Energy                                37931
100003_raw-Protein                               37977
100004_raw-Fat                                   35639
100005_raw-Carbohydrate                          36108
                                                 ...  
wellbeing measurement AND family relationship    21078
wheezing                                          3903
whooping cough                                   37783
worry measurement                                 5271
wrist fracture                                   17883
Length: 3752, dtype: int64

## Testing

### Stats

In [23]:
# Summary statistics over all gene/trait RCPs flattened into a single series;
# missing gene/trait pairs are not counted.
_stats = fastenloc_rcps_combined.stack().describe()
# Shown as strings, presumably to avoid pandas' rounded float display.
display(_stats.apply(str))

count                24124104.0
mean       0.007321862349188196
std         0.04274180352073264
min                         0.0
25%      0.00028149999999999996
50%                   0.0008847
75%                   0.0028591
max                   2.4696036
dtype: object

In [24]:
# No RCP may be negative.
assert _stats["min"] >= 0.0

In [25]:
# NOTE(review): the data contains RCPs above 1 (observed maximum ~2.47, see
# the stats above), so 2.5 looks like an empirical regression guard rather
# than a theoretical bound - confirm and document where it comes from.
assert _stats["max"] <= 2.5

### Same traits as in z-scores version

In [26]:
# Stricter than a set comparison: same trait labels, in the same order.
assert fastenloc_rcps_combined.columns.equals(smultixcan_zscores_combined.columns)

### EFO label (asthma) that combines three PhenomeXcan traits

In [27]:
# Original PhenomeXcan columns that are expected to be merged under the
# single EFO label "asthma" (checked by the assertions further below).
_asthma_traits = [
    "22127-Doctor_diagnosed_asthma",
    "20002_1111-Noncancer_illness_code_selfreported_asthma",
    "J45-Diagnoses_main_ICD10_J45_Asthma",
]

In [28]:
fastenloc_rcps[_asthma_traits]

Unnamed: 0_level_0,22127-Doctor_diagnosed_asthma,20002_1111-Noncancer_illness_code_selfreported_asthma,J45-Diagnoses_main_ICD10_J45_Asthma
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,,0.001026,
ENSG00000000457,0.000483,0.004945,0.000336
ENSG00000000460,0.000778,0.010204,0.000840
ENSG00000000938,0.000604,0.007698,
ENSG00000000971,0.001110,0.014103,0.000270
...,...,...,...
ENSG00000284543,0.000446,0.000839,
ENSG00000284552,,0.001231,
ENSG00000284574,,,
ENSG00000284595,,0.001164,


In [29]:
fastenloc_rcps[_asthma_traits].isna().sum(axis=1).sort_values(ascending=False).head()

gene_id
ENSG00000253892    3
ENSG00000271705    3
ENSG00000237460    3
ENSG00000214869    3
ENSG00000259395    3
dtype: int64

In [30]:
_trait = "asthma"
_asthma_rcps = fastenloc_rcps_combined[_trait]

# gene -> (rounding decimals, expected rounded maximum RCP across the merged
# asthma traits)
_expected_asthma_rcps = {
    "ENSG00000000419": (3, 0.001),
    "ENSG00000000457": (3, 0.005),
    "ENSG00000284543": (4, 0.0008),
}
for _gene, (_decimals, _expected) in _expected_asthma_rcps.items():
    assert _asthma_rcps.loc[_gene].round(_decimals) == _expected

# This gene is missing in all the merged traits, so the combined value must
# be missing too.
_gene = "ENSG00000253892"
assert pd.isnull(_asthma_rcps.loc[_gene])

### PhenomeXcan trait which has no EFO label.

In [31]:
# A trait without an EFO mapping: its column keeps the PhenomeXcan full code
# as its name, so it can be looked up under the same name in both frames.
_trait = "100001_raw-Food_weight"

In [32]:
fastenloc_rcps[_trait].sort_values(ascending=False)

gene_id
ENSG00000280319    0.360500
ENSG00000225554    0.153813
ENSG00000182901    0.130800
ENSG00000226191    0.029330
ENSG00000196071    0.014940
                     ...   
ENSG00000284543         NaN
ENSG00000284552         NaN
ENSG00000284574         NaN
ENSG00000284595         NaN
ENSG00000284600         NaN
Name: 100001_raw-Food_weight, Length: 38062, dtype: float64

In [33]:
_unmapped_trait_rcps = fastenloc_rcps_combined[_trait]

# The trait was not merged with any other, so the combined values must match
# the original ones (compared after rounding to 3 decimals).
for _gene, _expected in (
    ("ENSG00000280319", 0.360),
    ("ENSG00000196071", 0.015),
):
    assert _unmapped_trait_rcps.loc[_gene].round(3) == _expected

# Missing in the original data, so it must stay missing.
_gene = "ENSG00000284552"
assert pd.isnull(_unmapped_trait_rcps.loc[_gene])

# Save full (all traits, some with EFO, some not)

In [34]:
fastenloc_rcps_combined.shape

(38062, 3752)

In [35]:
fastenloc_rcps_combined.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,,,,,,0.000117,...,,,,,,0.000356,0.00067,,0.000862,
ENSG00000000457,,,,,,,,,,0.000163,...,0.002152,,,,,0.003799,0.002538,,0.001622,0.000241
ENSG00000000460,,,,,,,,0.000104,,0.000156,...,0.000719,,,,,0.000375,0.0028,,0.002147,0.000489
ENSG00000000938,,,,,,,,,,,...,0.000142,0.000301,,,,0.000108,0.001551,,0.001443,
ENSG00000000971,,,,,,,,,,0.000124,...,0.000114,,,,,0.000197,0.003033,,0.00123,0.000158


## Pickle (binary)

In [36]:
# Destination of the combined RCPs, taken from the project configuration;
# displayed so the resolved path is recorded in the notebook output.
output_file = conf.PHENOMEXCAN["FASTENLOC_EFO_PARTIAL_TORUS_RCP_FILE"]
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/fastenloc-efo_partial-torus-rcp.pkl')

In [37]:
# Binary copy of the full combined matrix (all traits, EFO-mapped or not).
fastenloc_rcps_combined.to_pickle(output_file)

## TSV (text)

In [38]:
# Text (TSV) version: same location and base name as the pickle file, with
# its suffix replaced by ".tsv.gz".
output_text_file = output_file.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/fastenloc-efo_partial-torus-rcp.tsv.gz')

In [39]:
# Tab-separated copy that includes the gene index. Values are written in
# scientific notation with 5 decimals ("%.5e"), so this file has lower
# precision than the pickle. Compression is presumably inferred by pandas
# from the ".gz" suffix.
fastenloc_rcps_combined.to_csv(
    output_text_file, sep="\t", index=True, float_format="%.5e"
)