# Description

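Generates Manubot (Markdown) tables with the PhenomeXcan and eMERGE trait associations of a given LV, and inserts them into the manuscript's supplementary material file between the LV-specific marker comments. The LV name is the only parameter that needs to be specified (see the Settings section below).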

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
from pathlib import Path

import pandas as pd

from entity import Trait
import conf

# Settings

In [3]:
LV_NAME = "LV136"

In [4]:
# The manuscript location must be configured in `conf` before running this notebook.
assert (
    conf.MANUSCRIPT["BASE_DIR"] is not None
), "The manuscript directory was not configured"

# Markdown file that is read and rewritten by the "Save" cells of this notebook;
# it must already exist.
OUTPUT_FILE_PATH = conf.MANUSCRIPT["CONTENT_DIR"] / "50.00.supplementary_material.md"
display(OUTPUT_FILE_PATH)
assert OUTPUT_FILE_PATH.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier_manuscript/content/50.00.supplementary_material.md')

In [5]:
# result_set is either phenomexcan or emerge
LV_FILE_MARK_TEMPLATE = "<!-- {lv}:{result_set}_traits_assocs:{position} -->"

In [6]:
TABLE_CAPTION = "Table: Significant trait associations of {lv_name} in {result_set_name}. {table_id}"

In [7]:
TABLE_CAPTION_ID = "#tbl:sup:{result_set}_assocs:{lv_name_lower_case}"

In [8]:
RESULT_SET_NAMES = {
    "phenomexcan": "PhenomeXcan",
    "emerge": "eMERGE",
}

# Load data

## PhenomeXcan LV-trait associations

In [9]:
input_filepath = Path(conf.RESULTS["GLS"] / "gls_phenotypes-combined-phenomexcan.pkl")
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/gls/gls_phenotypes-combined-phenomexcan.pkl')

In [10]:
phenomexcan_lv_trait_assocs = pd.read_pickle(input_filepath)

In [11]:
phenomexcan_lv_trait_assocs.shape

(5782, 8)

In [12]:
phenomexcan_lv_trait_assocs.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
0,29,22,100002_raw-Energy,LV246,0.002401,0.424213,0.848425,0.680765
1,29,22,100002_raw-Energy,LV607,-0.006323,0.691499,0.617002,0.87335
2,29,22,100002_raw-Energy,LV612,-0.000822,0.525786,0.948428,0.763843
3,29,22,100002_raw-Energy,LV74,-0.006035,0.685313,0.629373,0.870769
4,29,22,100002_raw-Energy,LV838,0.024454,0.023446,0.046891,0.087403


## eMERGE LV-trait associations

In [13]:
input_filepath = Path(conf.RESULTS["GLS"] / "gls_phenotypes-combined-emerge.pkl")
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/gls/gls_phenotypes-combined-emerge.pkl')

In [14]:
emerge_lv_trait_assocs = pd.read_pickle(input_filepath)

In [15]:
emerge_lv_trait_assocs.shape

(7725, 6)

In [16]:
emerge_lv_trait_assocs.head()

Unnamed: 0,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
0,8,LV246,-0.011008,0.809325,0.38135,0.999418
1,8,LV30,0.022387,0.038699,0.077397,0.483734
2,8,LV57,0.00396,0.379121,0.758243,0.961449
3,8,LV865,-0.008742,0.758337,0.483327,0.999418
4,8,LV847,-0.028246,0.985827,0.028346,0.999418


## eMERGE traits info

In [17]:
input_filepath = conf.EMERGE["DESC_FILE_WITH_SAMPLE_SIZE"]
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/data/emerge/eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt')

In [18]:
# Load only the columns used below from the tab-separated eMERGE description file.
# `phecode` is forced to str so it is kept as text instead of being parsed as a
# number.
emerge_traits_info = pd.read_csv(
    input_filepath,
    sep="\t",
    dtype={"phecode": str},
    usecols=[
        "phecode",
        "phenotype",
        "category",
        "eMERGE_III_EUR_case",
        "eMERGE_III_EUR_control",
    ],
)

In [19]:
emerge_traits_info = emerge_traits_info.set_index("phecode")

In [20]:
emerge_traits_info = emerge_traits_info.rename(
    columns={
        "eMERGE_III_EUR_case": "eur_n_cases",
        "eMERGE_III_EUR_control": "eur_n_controls",
    }
)

In [21]:
emerge_traits_info.shape

(309, 4)

In [22]:
emerge_traits_info.head()

Unnamed: 0_level_0,eur_n_cases,eur_n_controls,phenotype,category
phecode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8.0,1639,57495,Intestinal infection,infectious diseases
8.5,1024,57495,Bacterial enteritis,infectious diseases
8.52,893,57495,Intestinal infection due to C. difficile,infectious diseases
38.0,3172,50610,Septicemia,infectious diseases
38.3,1361,50610,Bacteremia,infectious diseases


In [23]:
assert emerge_traits_info.index.is_unique

# Trait associations

## PhenomeXcan

In [24]:
from traits import SHORT_TRAIT_NAMES

In [25]:
result_set = "phenomexcan"

In [26]:
def get_trait_objs(phenotype_full_code):
    """Return the Trait objects behind a phenotype code, ranked by case fraction.

    If `phenotype_full_code` is an EFO label, all traits returned by
    `Trait.get_traits_from_efo` are used; otherwise the single trait returned
    by `Trait.get_trait` is wrapped in a list.

    The list is ordered by `n_cases / n`, largest first (note: this is the
    proportion of cases, not the raw sample size).
    """
    traits = (
        Trait.get_traits_from_efo(phenotype_full_code)
        if Trait.is_efo_label(phenotype_full_code)
        else [Trait.get_trait(full_code=phenotype_full_code)]
    )

    def case_fraction(trait):
        return trait.n_cases / trait.n

    return sorted(traits, key=case_fraction, reverse=True)


def get_trait_description(phenotype_full_code):
    """Return the description of the first trait from `get_trait_objs`.

    When the description has an entry in `SHORT_TRAIT_NAMES`, that entry is
    returned instead; otherwise the description is returned unchanged.
    """
    top_trait = get_trait_objs(phenotype_full_code)[0]
    description = top_trait.description

    return (
        SHORT_TRAIT_NAMES[description]
        if description in SHORT_TRAIT_NAMES
        else description
    )


def get_trait_n(phenotype_full_code):
    """Return the `n` attribute of the first trait from `get_trait_objs`."""
    return get_trait_objs(phenotype_full_code)[0].n


def get_trait_n_cases(phenotype_full_code):
    """Return the `n_cases` attribute of the first trait from `get_trait_objs`."""
    return get_trait_objs(phenotype_full_code)[0].n_cases


def num_to_int_str(num):
    """Format a number with no decimals and with thousands separators.

    Missing values (anything `pd.isnull` flags, such as None or NaN) are
    rendered as an empty string.
    """
    return "" if pd.isnull(num) else f"{num:,.0f}"


def get_part_clust(row):
    """Build the "<part_k> / <cluster_id>" label from a row's attributes."""
    return " / ".join(f"{value}" for value in (row.part_k, row.cluster_id))

In [27]:
# PhenomeXcan associations of the selected LV with FDR < 0.05, sorted by
# increasing FDR.
lv_assocs = (
    phenomexcan_lv_trait_assocs
    .loc[lambda df: df["lv"].eq(LV_NAME) & df["fdr"].lt(0.05)]
    .sort_values("fdr")
)

In [28]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,part_k,cluster_id,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
4202,29,10,5132_raw-3mm_strong_meridian_right,LV136,0.090448,1.763103e-12,3.526206e-12,6.371414e-11
4245,29,10,5134_raw-6mm_strong_meridian_left,LV136,0.083733,5.117735e-11,1.023547e-10,1.533199e-09
4210,29,10,5133_raw-6mm_strong_meridian_right,LV136,0.08347,5.95874e-11,1.191748e-10,1.77595e-09
4263,29,10,5135_raw-3mm_strong_meridian_left,LV136,0.08006,3.596316e-10,7.192632e-10,9.901857e-09
4921,29,11,coronary artery disease,LV136,0.081309,4.68221e-10,9.36442e-10,1.259188e-08
4184,29,10,5099_raw-3mm_weak_meridian_right,LV136,0.079461,4.704685e-10,9.409371e-10,1.259375e-08
4152,29,10,5098_raw-6mm_weak_meridian_right,LV136,0.078085,9.391214e-10,1.878243e-09,2.413333e-08
4132,29,10,5096_raw-3mm_weak_meridian_left,LV136,0.076922,1.561458e-09,3.122916e-09,3.959802e-08
4135,29,10,5097_raw-6mm_weak_meridian_left,LV136,0.075134,3.782731e-09,7.565462e-09,9.307127e-08
4602,29,14,I9_CORATHER-Coronary_atherosclerosis,LV136,0.066674,1.616015e-07,3.23203e-07,3.043583e-06


In [29]:
# Drop the traits that are considered repeated.
repeated_traits = [
    "I9_MI_STRICT-Myocardial_infarction_strict",
    "I9_CHD_NOREV-Major_coronary_heart_disease_event_excluding_revascularizations",
]
lv_assocs = lv_assocs.loc[~lv_assocs["phenotype"].isin(repeated_traits)]

In [30]:
lv_assocs = lv_assocs.assign(
    phenotype_desc=lv_assocs["phenotype"].apply(get_trait_description)
)

In [31]:
lv_assocs = lv_assocs.assign(n=lv_assocs["phenotype"].apply(get_trait_n))

In [32]:
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["phenotype"].apply(get_trait_n_cases))

In [33]:
lv_assocs = lv_assocs.assign(coef=lv_assocs["coef"].apply(lambda x: f"{x:.3f}"))

In [34]:
lv_assocs = lv_assocs.assign(
    fdr=lv_assocs["fdr"].apply(lambda x: f"{x:.2e}".replace("-", "&#8209;"))
)

In [35]:
lv_assocs = lv_assocs.assign(n=lv_assocs["n"].apply(num_to_int_str))

In [36]:
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["n_cases"].apply(num_to_int_str))

In [37]:
# Build the "partition / cluster" label row by row.
# itertuples() is used instead of DataFrame.apply(axis=1) because apply() on a
# frame with no rows may return an empty DataFrame rather than a Series
# (depending on the pandas version), which cannot be assigned to a single
# column. The list comprehension always yields exactly one label per row, so
# the case with no significant associations (handled by the "Fill empty"
# section below) also works.
lv_assocs = lv_assocs.assign(
    part_clust=[get_part_clust(row) for row in lv_assocs.itertuples(index=False)]
)

In [38]:
lv_assocs = lv_assocs.drop(columns=["phenotype"])

In [39]:
lv_assocs.shape

(16, 11)

In [40]:
lv_assocs = lv_assocs[["phenotype_desc", "n", "n_cases", "part_clust", "fdr"]]

In [41]:
lv_assocs = lv_assocs.rename(
    columns={
        "part_clust": "Partition / cluster",
        "lv": "Latent variable (LV)",
        #         "coef": r"$\beta$",
        "fdr": "FDR",
        "phenotype_desc": "Trait description",
        "n": "Sample size",
        "n_cases": "Cases",
    }
)

In [42]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,Trait description,Sample size,Cases,Partition / cluster,FDR
4202,3mm strong meridian (right),75410,,29 / 10,6.37e&#8209;11
4245,6mm strong meridian (left),65551,,29 / 10,1.53e&#8209;09
4210,6mm strong meridian (right),66256,,29 / 10,1.78e&#8209;09
4263,3mm strong meridian (left),75398,,29 / 10,9.90e&#8209;09
4921,Coronary Artery Disease,184305,60801.0,29 / 11,1.26e&#8209;08
4184,3mm weak meridian (right),75410,,29 / 10,1.26e&#8209;08
4152,6mm weak meridian (right),66256,,29 / 10,2.41e&#8209;08
4132,3mm weak meridian (left),75398,,29 / 10,3.96e&#8209;08
4135,6mm weak meridian (left),65551,,29 / 10,9.31e&#8209;08
4602,Coronary atherosclerosis,361194,14334.0,29 / 14,3.04e&#8209;06


### Fill empty

In [43]:
# If no association passed the filters, add a single placeholder row so the
# generated table is not empty; the other cells of that row are blanked out.
if lv_assocs.shape[0] == 0:
    lv_assocs.loc[0, "Trait description"] = "No significant associations"
    lv_assocs = lv_assocs.fillna("")

### Save

In [44]:
# start
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="start"
)
display(lv_file_mark_start)

# end
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="end"
)
display(lv_file_mark_end)

'<!-- LV136:phenomexcan_traits_assocs:start -->'

'<!-- LV136:phenomexcan_traits_assocs:end -->'

In [45]:
new_content = lv_assocs.to_markdown(index=False, disable_numparse=True)

In [46]:
# add table caption
table_caption = TABLE_CAPTION.format(
    lv_name=LV_NAME,
    result_set_name=RESULT_SET_NAMES[result_set],
    table_id="{"
    + TABLE_CAPTION_ID.format(result_set=result_set, lv_name_lower_case=LV_NAME.lower())
    + "}",
)
display(table_caption)

'Table: Significant trait associations of LV136 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv136}'

In [47]:
new_content += "\n\n" + table_caption

In [48]:
full_new_content = (
    lv_file_mark_start + "\n" + new_content.strip() + "\n" + lv_file_mark_end
)

In [49]:
with open(OUTPUT_FILE_PATH, "r", encoding="utf8") as f:
    file_content = f.read()

In [50]:
# Replace everything between this LV's start/end markers (markers included)
# with the freshly generated section.
# - The markers are escaped so they are always matched literally.
# - The replacement is supplied through a function so it is inserted verbatim;
#   a plain replacement string would have its backslash sequences processed by
#   re.sub (e.g. the "\b" in "\beta" becomes a backspace, and unknown escapes
#   raise an error).
marked_section = re.compile(
    re.escape(lv_file_mark_start) + ".*?" + re.escape(lv_file_mark_end),
    flags=re.DOTALL,
)
new_file_content, n_replaced = marked_section.subn(
    lambda _match: full_new_content, file_content
)

# Without this check a missing marker pair would silently leave the file as is.
assert n_replaced > 0, (
    f"Section markers {lv_file_mark_start!r} ... {lv_file_mark_end!r} "
    f"were not found in {OUTPUT_FILE_PATH}"
)

In [51]:
# Write the updated content back, overwriting the supplementary material file.
# NOTE(review): the disabled `.replace("\beta", r"\beta")` at the end of the
# write call looks like a leftover workaround for "\b" being turned into a
# backspace character -- confirm whether it is still needed before deleting it.
with open(OUTPUT_FILE_PATH, "w", encoding="utf8") as f:
    f.write(new_file_content)  # .replace("\beta", r"\beta"))

## eMERGE

In [52]:
result_set = "emerge"

In [53]:
# NOTE: this rebinds the TABLE_CAPTION template defined in the Settings section
# (the eMERGE caption drops the word "Significant"). Because the name is
# shared, re-running the PhenomeXcan "Save" cells after this cell would use
# this caption; run the notebook top to bottom on a fresh kernel to avoid that.
TABLE_CAPTION = (
    "Table: Trait associations of {lv_name} in {result_set_name}. {table_id}"
)

In [54]:
# eMERGE associations of the selected LV with FDR < 0.20, sorted by increasing
# FDR.
lv_assocs = (
    emerge_lv_trait_assocs
    .loc[lambda df: df["lv"].eq(LV_NAME) & df["fdr"].lt(0.20)]
    .sort_values("fdr")
)

In [55]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
7669,747.1,LV136,0.035432,0.003495,0.00699,0.156964
3191,411.4,LV136,0.035326,0.00361,0.007219,0.15998


In [56]:
lv_assocs = lv_assocs.assign(
    phenotype_desc=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, "phenotype"]
    )
)

In [57]:
# Sample size of each phecode: the sum of its `eur_n_cases` and `eur_n_controls`
# entries, looked up by phecode in `emerge_traits_info`.
lv_assocs = lv_assocs.assign(
    n=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, ["eur_n_cases", "eur_n_controls"]].sum()
    )
)

In [58]:
lv_assocs = lv_assocs.assign(
    n_cases=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, "eur_n_cases"]
    )
)

In [59]:
lv_assocs = lv_assocs.assign(coef=lv_assocs["coef"].apply(lambda x: f"{x:.3f}"))

In [60]:
lv_assocs = lv_assocs.assign(
    fdr=lv_assocs["fdr"].apply(lambda x: f"{x:.2e}".replace("-", "&#8209;"))
)

In [61]:
lv_assocs = lv_assocs.assign(n=lv_assocs["n"].apply(num_to_int_str))

In [62]:
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["n_cases"].apply(num_to_int_str))

In [63]:
lv_assocs = lv_assocs.rename(columns={"phenotype": "phecode"})

In [64]:
lv_assocs.shape

(2, 9)

In [65]:
lv_assocs = lv_assocs[["phecode", "phenotype_desc", "n", "n_cases", "fdr"]]

In [66]:
lv_assocs = lv_assocs.rename(
    columns={
        "lv": "Latent variable (LV)",
        #         "coef": r"$\beta$",
        "fdr": "FDR",
        "phecode": "Phecode",
        "phenotype_desc": "Trait description",
        "n": "Sample size",
        "n_cases": "Cases",
    }
)

In [67]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,Phecode,Trait description,Sample size,Cases,FDR
7669,747.1,Cardiac congenital anomalies,59198,1871,1.57e&#8209;01
3191,411.4,Coronary atherosclerosis,52836,13715,1.60e&#8209;01


### Fill empty

In [68]:
# If no association passed the filters, add a single placeholder row so the
# generated table is not empty; the other cells of that row are blanked out.
if lv_assocs.shape[0] == 0:
    lv_assocs.loc[0, "Phecode"] = "No significant associations"
    lv_assocs = lv_assocs.fillna("")

### Save

In [69]:
# start
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="start"
)
display(lv_file_mark_start)

# end
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="end"
)
display(lv_file_mark_end)

'<!-- LV136:emerge_traits_assocs:start -->'

'<!-- LV136:emerge_traits_assocs:end -->'

In [70]:
new_content = lv_assocs.to_markdown(index=False, disable_numparse=True)

In [71]:
# add table caption
table_caption = TABLE_CAPTION.format(
    lv_name=LV_NAME,
    result_set_name=RESULT_SET_NAMES[result_set],
    table_id="{"
    + TABLE_CAPTION_ID.format(result_set=result_set, lv_name_lower_case=LV_NAME.lower())
    + "}",
)
display(table_caption)

'Table: Trait associations of LV136 in eMERGE. {#tbl:sup:emerge_assocs:lv136}'

In [72]:
new_content += "\n\n" + table_caption

In [73]:
full_new_content = (
    lv_file_mark_start + "\n" + new_content.strip() + "\n" + lv_file_mark_end
)

In [74]:
with open(OUTPUT_FILE_PATH, "r", encoding="utf8") as f:
    file_content = f.read()

In [75]:
# Replace everything between this LV's start/end markers (markers included)
# with the freshly generated section.
# - The markers are escaped so they are always matched literally.
# - The replacement is supplied through a function so it is inserted verbatim;
#   a plain replacement string would have its backslash sequences processed by
#   re.sub (e.g. the "\b" in "\beta" becomes a backspace, and unknown escapes
#   raise an error).
marked_section = re.compile(
    re.escape(lv_file_mark_start) + ".*?" + re.escape(lv_file_mark_end),
    flags=re.DOTALL,
)
new_file_content, n_replaced = marked_section.subn(
    lambda _match: full_new_content, file_content
)

# Without this check a missing marker pair would silently leave the file as is.
assert n_replaced > 0, (
    f"Section markers {lv_file_mark_start!r} ... {lv_file_mark_end!r} "
    f"were not found in {OUTPUT_FILE_PATH}"
)

In [76]:
# Write the updated content back, overwriting the supplementary material file.
# NOTE(review): the disabled `.replace("\beta", r"\beta")` at the end of the
# write call looks like a leftover workaround for "\b" being turned into a
# backspace character -- confirm whether it is still needed before deleting it.
with open(OUTPUT_FILE_PATH, "w", encoding="utf8") as f:
    f.write(new_file_content)  # .replace("\beta", r"\beta"))