# Description

This notebook uses the PhenomeXcan trait-to-EFO mapping files to group traits that map to the same EFO label. Currently, it only combines the S-MultiXcan results (z-scores), using the [Stouffer method](https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer's_Z-score_method) (implemented in the functions `get_weights` and `_combine_z_scores` below).

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display
import numpy as np
import pandas as pd

import conf
from data.cache import read_data
from entity import Trait

# Load S-MultiXcan results

In [3]:
smultixcan_zscores = read_data(conf.PHENOMEXCAN["SMULTIXCAN_MASHR_ZSCORES_FILE"])

In [4]:
smultixcan_zscores.shape

(22515, 4091)

In [5]:
smultixcan_zscores.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


In [6]:
pd.Series(smultixcan_zscores.values.flatten()).describe().apply(str)

count             91055810.0
mean      0.8548868013440799
std       0.7207745082616361
min                      0.0
25%      0.33468240503293545
50%       0.7071672980795678
75%       1.2090944428758905
max                     40.0
dtype: object

# Get PhenomeXcan traits

In [7]:
# Resolve every S-MultiXcan column name to its Trait object, and index the
# resulting objects by the full code reported by the Trait itself.
_trait_objects = (
    Trait.get_trait(full_code=column_name)
    for column_name in smultixcan_zscores.columns
)
phenomexcan_fullcode_to_traits = {
    trait_obj.full_code: trait_obj for trait_obj in _trait_objects
}

In [8]:
len(phenomexcan_fullcode_to_traits)

4091

In [9]:
assert len(phenomexcan_fullcode_to_traits) == smultixcan_zscores.columns.shape[0]

# Change/combine traits in S-MultiXcan results

In [10]:
# Tabulate, one row per trait, the sample-size attributes exposed by each
# Trait object (n_cases, n_controls and n).
_sample_size_rows = []
for _fullcode, _trait_obj in phenomexcan_fullcode_to_traits.items():
    _sample_size_rows.append(
        dict(
            fullcode=_fullcode,
            n_cases=_trait_obj.n_cases,
            n_controls=_trait_obj.n_controls,
            n=_trait_obj.n,
        )
    )

traits_sample_size = pd.DataFrame(_sample_size_rows)

In [11]:
traits_sample_size.shape

(4091, 4)

In [12]:
traits_sample_size.head()

Unnamed: 0,fullcode,n_cases,n_controls,n
0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,3078.0,48349.0,51427
1,2345-Ever_had_bowel_cancer_screening,114475.0,240878.0,355353
2,N49-Diagnoses_main_ICD10_N49_Inflammatory_diso...,121.0,361073.0,361194
3,100011_raw-Iron,,,51453
4,5221-Index_of_best_refractometry_result_right,,,77983


In [13]:
def get_weights(traits_fullcodes):
    """
    Computes one Stouffer weight per PhenomeXcan trait from its GWAS sample sizes.

    Traits are looked up by full code in the module-level dictionary
    `phenomexcan_fullcode_to_traits`. The weight of a trait is:
        * (n_cases / n_controls) * sqrt(n) when both n_cases and n_controls are
          available (binary traits, i.e. diseases), where n=n_cases+n_controls;
        * sqrt(n) otherwise (continuous traits, such as height).

    Args:
        traits_fullcodes: an iterable with the full codes of the PhenomeXcan
            traits that map to the same EFO label.

    Returns:
        numpy.ndarray with one weight per trait, in the same order as
        traits_fullcodes.
    """
    weights = []

    for fullcode in traits_fullcodes:
        trait = phenomexcan_fullcode_to_traits[fullcode]

        # every trait starts from sqrt(n); binary traits additionally scale it
        # by their case/control ratio
        weight = np.sqrt(trait.n)

        lacks_case_control_counts = pd.isnull(trait.n_cases) or pd.isnull(
            trait.n_controls
        )
        if not lacks_case_control_counts:
            weight = (trait.n_cases / trait.n_controls) * weight

        weights.append(weight)

    return np.array(weights)


def _combine_z_scores(x):
    """
    Collapses the z-scores of several PhenomeXcan traits (all mapping to the same
    EFO label) into a single z-score per gene with the weighted Stouffer's
    Z-score method:
    https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer's_Z-score_method

    For each gene, the combined value is sum(w_i * z_i) / sqrt(sum(w_i ** 2)),
    where the per-trait weights w_i are given by function get_weights.

    Args:
        x: a pandas.DataFrame with genes in the rows and PhenomeXcan traits in the
            columns. Values are z-scores of association in S-MultiXcan.

    Returns:
        pandas.Series with the combined z-scores, indexed by the genes in x and
        named after the first trait (column) in x.
    """
    trait_weights = get_weights(x.columns)

    # weighted sum of z-scores across traits for each gene (row), scaled by
    # the Euclidean norm of the weights
    weights_norm = np.sqrt(np.power(trait_weights, 2).sum())
    combined_zscores = (x * trait_weights).sum(1) / weights_norm

    return pd.Series(
        data=combined_zscores.values,
        index=x.index.copy(),
        name=x.columns[0],
    )

## Get a list of EFO labels for PhenomeXcan traits

In [14]:
# Build, in column order, the label used to group each trait: its EFO label
# when the trait has EFO information, or its own full code otherwise.
traits_efo_labels = []
for _fullcode in smultixcan_zscores.columns:
    _trait_obj = phenomexcan_fullcode_to_traits[_fullcode]

    # look up the EFO information only once per trait
    _efo_info = _trait_obj.get_efo_info()

    traits_efo_labels.append(
        _efo_info.label if _efo_info is not None else _trait_obj.full_code
    )

In [15]:
len(traits_efo_labels)

4091

In [16]:
traits_efo_labels[:10]

['20096_1-Size_of_red_wine_glass_drunk_small_125ml',
 '2345-Ever_had_bowel_cancer_screening',
 'male reproductive system disease',
 '100011_raw-Iron',
 '5221-Index_of_best_refractometry_result_right',
 '20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet',
 'injury',
 '20024_1136-Job_code_deduced_Information_and_communication_technology_managers',
 'food allergy',
 'G6_SLEEPAPNO-Sleep_apnoea']

## Combine z-scores for same EFO labels

In [17]:
# Group traits (columns) by their EFO label and combine the z-scores of each group.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so the data is
# transposed to group traits as rows instead. Each group is transposed back
# before calling _combine_z_scores (which expects genes in rows and traits in
# columns), and the final result is transposed to get genes in rows and EFO
# labels in columns.
smultixcan_zscores_combined = (
    smultixcan_zscores.T.groupby(traits_efo_labels)
    .apply(lambda traits_by_genes: _combine_z_scores(traits_by_genes.T))
    .T
)

In [18]:
smultixcan_zscores_combined.shape

(22515, 3752)

In [19]:
smultixcan_zscores_combined.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
ENSG00000000457,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
ENSG00000000460,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
ENSG00000000938,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
ENSG00000000971,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


In [20]:
assert not smultixcan_zscores_combined.isna().any().any()

## Testing

### Stats

In [21]:
_stats = smultixcan_zscores_combined.stack().describe()
display(_stats.apply(str))

count             84476280.0
mean      0.8638967105401182
std       0.7355093768129249
min                      0.0
25%      0.33519032841499646
50%       0.7165349117038435
75%       1.2250506784524884
max       52.732309168781974
dtype: object

In [22]:
assert _stats["min"] >= 0.0

In [23]:
assert _stats["max"] < 55.0

### EFO label (asthma) that combines three PhenomeXcan traits.

In [24]:
_asthma_traits = [
    "22127-Doctor_diagnosed_asthma",
    "20002_1111-Noncancer_illness_code_selfreported_asthma",
    "J45-Diagnoses_main_ICD10_J45_Asthma",
]

In [25]:
smultixcan_zscores[_asthma_traits]

Unnamed: 0_level_0,22127-Doctor_diagnosed_asthma,20002_1111-Noncancer_illness_code_selfreported_asthma,J45-Diagnoses_main_ICD10_J45_Asthma
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,0.327024,0.707137,0.805021
ENSG00000000457,1.088281,0.700004,1.214683
ENSG00000000460,1.520867,0.695085,1.681098
ENSG00000000938,0.911426,2.321047,0.661209
ENSG00000000971,0.738320,1.788336,0.701912
...,...,...,...
ENSG00000284430,0.965250,0.760152,1.476950
ENSG00000284452,0.479626,0.508688,1.071783
ENSG00000284513,1.150067,1.378157,1.326444
ENSG00000284526,0.302116,0.006106,0.463360


In [26]:
traits_sample_size[traits_sample_size["fullcode"].isin(_asthma_traits)]

Unnamed: 0,fullcode,n_cases,n_controls,n
1213,20002_1111-Noncancer_illness_code_selfreported...,41934.0,319207.0,361141
3485,22127-Doctor_diagnosed_asthma,11717.0,80070.0,91787
3667,J45-Diagnoses_main_ICD10_J45_Asthma,1693.0,359501.0,361194


In [27]:
_trait = "asthma"

# Stouffer weights of the three asthma traits, in the same order as the
# columns of smultixcan_zscores[_asthma_traits] shown above. All three are
# binary traits, so each weight is (n_cases / n_controls) * sqrt(n), using the
# sample sizes listed in traits_sample_size.
_weights = np.array(
    [
        (11717.0 / 80070.0) * np.sqrt(91787),  # 22127-Doctor_diagnosed_asthma
        (41934.0 / 319207.0) * np.sqrt(361141),  # 20002_1111-..._selfreported_asthma
        (1693.0 / 359501.0) * np.sqrt(361194),  # J45-Diagnoses_main_ICD10_J45_Asthma
    ]
)

# original z-scores of each asthma trait (same order as _weights) for two genes
_genes_zscores = {
    "ENSG00000000419": [0.327024, 0.707137, 0.805021],
    "ENSG00000284526": [0.302116, 0.006106, 0.463360],
}

for _gene, _zscores in _genes_zscores.items():
    # weighted Stouffer's method: sum(w_i * z_i) / sqrt(sum(w_i ** 2))
    _expected = np.dot(_weights, _zscores) / np.sqrt(np.sum(_weights**2))

    # compare with an absolute tolerance instead of exact equality of rounded
    # floats, which can fail spuriously when a value lies next to a rounding
    # boundary
    np.testing.assert_allclose(
        smultixcan_zscores_combined.loc[_gene, _trait],
        _expected,
        rtol=0,
        atol=5e-4,
    )

### PhenomeXcan trait which has no EFO label.

In [28]:
_trait = "100001_raw-Food_weight"

In [29]:
traits_sample_size[traits_sample_size["fullcode"].isin((_trait,))]

Unnamed: 0,fullcode,n_cases,n_controls,n
751,100001_raw-Food_weight,,,51453


In [30]:
smultixcan_zscores[_trait]

gene_name
ENSG00000000419    1.145442
ENSG00000000457    0.618066
ENSG00000000460    0.515724
ENSG00000000938    0.280781
ENSG00000000971    0.548127
                     ...   
ENSG00000284430    0.124980
ENSG00000284452    1.587903
ENSG00000284513    1.522281
ENSG00000284526    0.150938
ENSG00000284552    1.010143
Name: 100001_raw-Food_weight, Length: 22515, dtype: float64

In [31]:
# A trait with no EFO label forms a group of its own, so Stouffer's method is
# applied to a single z-score with a single weight. The trait is continuous
# (no cases/controls), hence the weight is sqrt(n).
_weights = np.array(
    [
        np.sqrt(51453),
    ]
)

# original z-scores of the trait for two genes
_genes_zscores = {
    "ENSG00000284513": [1.522281],
    "ENSG00000000971": [0.548127],
}

for _gene, _zscores in _genes_zscores.items():
    # weighted Stouffer's method: sum(w_i * z_i) / sqrt(sum(w_i ** 2))
    _expected = np.dot(_weights, _zscores) / np.sqrt(np.sum(_weights**2))

    # compare with an absolute tolerance instead of exact equality of rounded
    # floats, which can fail spuriously when a value lies next to a rounding
    # boundary
    np.testing.assert_allclose(
        smultixcan_zscores_combined.loc[_gene, _trait],
        _expected,
        rtol=0,
        atol=5e-4,
    )

# Save full (all traits, some with EFO, some not)

In [32]:
smultixcan_zscores_combined.shape

(22515, 3752)

In [33]:
smultixcan_zscores_combined.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
ENSG00000000457,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
ENSG00000000460,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
ENSG00000000938,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
ENSG00000000971,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


## Pickle (binary)

In [34]:
output_file = conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

In [35]:
smultixcan_zscores_combined.to_pickle(output_file)

## TSV (text)

In [36]:
# tsv format
output_text_file = output_file.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.tsv.gz')

In [37]:
# Write the combined z-scores as a tab-separated text file, keeping the gene
# index as the first column; values are formatted in scientific notation with
# five digits after the decimal point.
# NOTE(review): the ".gz" suffix of output_text_file presumably makes pandas
# gzip-compress the output (default compression="infer") — confirm.
smultixcan_zscores_combined.to_csv(
    output_text_file, sep="\t", index=True, float_format="%.5e"
)