# Description

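Generates the Manubot (Markdown) tables of significant PhenomeXcan and eMERGE trait associations for a given LV and writes them into the supplementary material file of the manuscript. The LV name is the only parameter that needs to be specified (see the Settings section below).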

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
from pathlib import Path

import pandas as pd

from entity import Trait
import conf

# Settings

In [3]:
LV_NAME = "LV136"

In [4]:
# Fail early if the manuscript location is not set in the configuration.
assert (
    conf.MANUSCRIPT["BASE_DIR"] is not None
), "The manuscript directory was not configured"

# Markdown file of the manuscript that the "Save" cells below read and then
# rewrite; it must already exist.
OUTPUT_FILE_PATH = conf.MANUSCRIPT["CONTENT_DIR"] / "50.00.supplementary_material.md"
display(OUTPUT_FILE_PATH)
assert OUTPUT_FILE_PATH.exists()

PosixPath('/opt/manuscript/content/50.00.supplementary_material.md')

In [5]:
# Template of the HTML comments that delimit, inside the output file, the table
# of an LV for a given result set:
#   - lv: name of the LV (for instance, LV136)
#   - result_set: either phenomexcan or emerge
#   - position: either start or end
LV_FILE_MARK_TEMPLATE = "<!-- {lv}:{result_set}_traits_assocs:{position} -->"

In [6]:
# Caption line appended below each generated table; {table_id} receives the
# table identifier (built from TABLE_CAPTION_ID) wrapped in curly braces.
TABLE_CAPTION = "Table: Significant trait associations of {lv_name} in {result_set_name}. {table_id}"

In [7]:
# Identifier of each generated table; it is filled with the result set key
# (phenomexcan or emerge) and the lower-cased LV name.
TABLE_CAPTION_ID = "#tbl:sup:{result_set}_assocs:{lv_name_lower_case}"

In [8]:
# Maps each result set key to the name shown in the table captions.
RESULT_SET_NAMES = {
    "phenomexcan": "PhenomeXcan",
    "emerge": "eMERGE",
}

# Load data

## PhenomeXcan LV-trait associations

In [9]:
input_filepath = Path(conf.RESULTS["GLS"] / "gls-summary-phenomexcan.pkl.gz")
display(input_filepath)

PosixPath('/opt/data/results/gls/gls-summary-phenomexcan.pkl.gz')

In [10]:
phenomexcan_lv_trait_assocs = pd.read_pickle(input_filepath)

In [11]:
phenomexcan_lv_trait_assocs.shape

(4037817, 5)

In [12]:
phenomexcan_lv_trait_assocs.head()

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
0,AB1_OTHER_VIRAL,Other viral diseases,LV736,0.004725,0.504339
1,AB1_OTHER_VIRAL,Other viral diseases,LV320,0.004848,0.508291
2,AB1_OTHER_VIRAL,Other viral diseases,LV366,0.005306,0.523691
3,AB1_OTHER_VIRAL,Other viral diseases,LV964,0.006106,0.548143
4,AB1_OTHER_VIRAL,Other viral diseases,LV92,0.006565,0.560048


## eMERGE LV-trait associations

In [13]:
input_filepath = Path(conf.RESULTS["GLS"] / "gls-summary-emerge.pkl.gz")
display(input_filepath)

PosixPath('/opt/data/results/gls/gls-summary-emerge.pkl.gz')

In [14]:
emerge_lv_trait_assocs = pd.read_pickle(input_filepath)

In [15]:
emerge_lv_trait_assocs.shape

(304983, 5)

In [16]:
emerge_lv_trait_assocs.head()

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
0,EUR_440.2,Atherosclerosis of the extremities,LV472,1.033637e-07,0.000658
1,EUR_440.2,Atherosclerosis of the extremities,LV182,3.710244e-07,0.001432
2,EUR_440.2,Atherosclerosis of the extremities,LV348,7.379936e-07,0.002558
3,EUR_440.2,Atherosclerosis of the extremities,LV504,1.534424e-06,0.0045
4,EUR_440.2,Atherosclerosis of the extremities,LV445,2.912525e-06,0.007402


## eMERGE traits info

In [17]:
input_filepath = conf.EMERGE["DESC_FILE_WITH_SAMPLE_SIZE"]
display(input_filepath)

PosixPath('/opt/data/data/emerge/eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt')

In [18]:
# Read the tab-separated eMERGE phecode dictionary, keeping only the columns
# needed below (trait description, category and EUR case/control counts).
emerge_traits_info = pd.read_csv(
    input_filepath,
    sep="\t",
    # read phecodes as text so codes such as "008" or "008.52" are not parsed
    # as numbers
    dtype={"phecode": str},
    usecols=[
        "phecode",
        "phenotype",
        "category",
        "eMERGE_III_EUR_case",
        "eMERGE_III_EUR_control",
    ],
)

In [19]:
# Prefix every phecode with "EUR_" so it matches the phenotype codes used in the
# eMERGE LV-trait associations (for instance, EUR_440.2).
# NOTE: this cell is not idempotent; running it twice adds the prefix twice.
emerge_traits_info["phecode"] = emerge_traits_info["phecode"].map("EUR_{}".format)

In [20]:
emerge_traits_info = emerge_traits_info.set_index("phecode").sort_index()

In [21]:
# Use shorter names for the EUR case/control count columns; these names are the
# ones referenced later when computing sample sizes.
emerge_traits_info = emerge_traits_info.rename(
    columns={
        "eMERGE_III_EUR_case": "eur_n_cases",
        "eMERGE_III_EUR_control": "eur_n_controls",
    }
)

In [22]:
emerge_traits_info.shape

(309, 4)

In [23]:
emerge_traits_info.head()

Unnamed: 0_level_0,eur_n_cases,eur_n_controls,phenotype,category
phecode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EUR_008,1639,57495,Intestinal infection,infectious diseases
EUR_008.5,1024,57495,Bacterial enteritis,infectious diseases
EUR_008.52,893,57495,Intestinal infection due to C. difficile,infectious diseases
EUR_038,3172,50610,Septicemia,infectious diseases
EUR_038.3,1361,50610,Bacteremia,infectious diseases


In [24]:
assert emerge_traits_info.index.is_unique

# Trait associations

## PhenomeXcan

In [25]:
from traits import SHORT_TRAIT_NAMES

In [26]:
result_set = "phenomexcan"

In [27]:
def get_trait_objs(phenotype_full_code):
    """
    Returns the Trait objects associated with a phenotype code.

    If the code is an EFO label (according to Trait.is_efo_label), all the
    traits returned by Trait.get_traits_from_efo are used; otherwise, the
    single trait returned by Trait.get_trait is used.

    Args:
        phenotype_full_code: phenotype code or EFO label.

    Returns:
        A new list of Trait objects sorted in descending order by their
        proportion of cases (n_cases / n). Note that the ordering key is the
        proportion of cases, not the raw sample size.
    """
    traits = (
        Trait.get_traits_from_efo(phenotype_full_code)
        if Trait.is_efo_label(phenotype_full_code)
        else [Trait.get_trait(code=phenotype_full_code)]
    )

    def case_proportion(trait):
        return trait.n_cases / trait.n

    return sorted(traits, key=case_proportion, reverse=True)


def get_trait_description(phenotype_full_code):
    """
    Returns the description of the first trait given by get_trait_objs for a
    phenotype code.

    If that description is a key of SHORT_TRAIT_NAMES, the corresponding
    (shorter) name is returned instead.
    """
    description = get_trait_objs(phenotype_full_code)[0].description

    return (
        SHORT_TRAIT_NAMES[description]
        if description in SHORT_TRAIT_NAMES
        else description
    )


def get_trait_n(phenotype_full_code):
    """
    Returns the sample size (attribute `n`) of the first trait given by
    get_trait_objs for a phenotype code.
    """
    first_trait = get_trait_objs(phenotype_full_code)[0]
    return first_trait.n


def get_trait_n_cases(phenotype_full_code):
    """
    Returns the number of cases (attribute `n_cases`) of the first trait given
    by get_trait_objs for a phenotype code.
    """
    first_trait = get_trait_objs(phenotype_full_code)[0]
    return first_trait.n_cases


def num_to_int_str(num):
    """
    Formats a number as a string with no decimals and with commas as thousands
    separators (for instance, 361194 becomes "361,194").

    Args:
        num: a number, or a null value (None, NaN).

    Returns:
        The formatted number, or an empty string if num is null.
    """
    return "" if pd.isnull(num) else format(num, ",.0f")


def get_part_clust(row):
    """
    Returns a "<part_k> / <cluster_id>" label built from the `part_k` and
    `cluster_id` attributes of the given row.
    """
    return "{} / {}".format(row.part_k, row.cluster_id)

In [28]:
# Keep only the associations of the selected LV that are significant
# (FDR < 0.05), with the most significant ones first.
lv_assocs = phenomexcan_lv_trait_assocs.loc[
    phenomexcan_lv_trait_assocs["lv"].eq(LV_NAME)
    & phenomexcan_lv_trait_assocs["fdr"].lt(0.05)
].sort_values(by="fdr")

In [29]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
2866248,I9_CORATHER,Coronary atherosclerosis,LV136,1.088611e-13,1.839168e-09
1716393,I25,Diagnoses - main ICD10: I25 Chronic ischaemic heart disease,LV136,2.294066e-13,3.522061e-09
2303659,I9_IHD,"Ischaemic heart disease, wide definition",LV136,3.610922e-12,3.951285e-08
4023013,CARDIoGRAM_C4D_CAD_ADDITIVE,Coronary Artery Disease,LV136,3.837747e-12,4.176852e-08
24677,5132_raw,3mm strong meridian (right),LV136,1.117083e-08,5.541251e-05
1129131,5134_raw,6mm strong meridian (left),LV136,3.046658e-08,0.0001348887
2062833,5257_raw,Corneal resistance factor (right),LV136,4.800185e-08,0.0002018986
3757511,5133_raw,6mm strong meridian (right),LV136,6.403355e-08,0.0002575257
788614,6150_1,Vascular/heart problems diagnosed by doctor: Heart attack,LV136,9.804334e-08,0.0003748874
131271,I9_MI,Myocardial infarction,LV136,1.329681e-07,0.0004845677


In [30]:
# # remove repeated traits
# lv_assocs = lv_assocs[
#     ~lv_assocs["phenotype"].isin(
#         (
#             "I9_MI_STRICT",
#             "I9_CHD_NOREV",
#         )
#     )
# ]

In [31]:
# Replace the original descriptions with the ones given by
# get_trait_description, which returns the short name from SHORT_TRAIT_NAMES
# when there is one.
lv_assocs = lv_assocs.assign(
    phenotype_desc=lv_assocs["phenotype"].apply(get_trait_description)
)

In [32]:
lv_assocs = lv_assocs.assign(n=lv_assocs["phenotype"].apply(get_trait_n))

In [33]:
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["phenotype"].apply(get_trait_n_cases))

In [34]:
# lv_assocs = lv_assocs.assign(coef=lv_assocs["coef"].apply(lambda x: f"{x:.3f}"))

In [35]:
# Format the FDR in scientific notation with two decimals, replacing hyphens
# (the minus sign of the exponent) with the HTML entity &#8209; (non-breaking
# hyphen) -- presumably so the value is not split across lines when rendered.
lv_assocs = lv_assocs.assign(
    fdr=lv_assocs["fdr"].apply(lambda x: f"{x:.2e}".replace("-", "&#8209;"))
)

In [36]:
lv_assocs = lv_assocs.assign(n=lv_assocs["n"].apply(num_to_int_str))

In [37]:
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["n_cases"].apply(num_to_int_str))

In [38]:
# lv_assocs = lv_assocs.assign(part_clust="")  # lv_assocs.apply(get_part_clust, axis=1))

In [39]:
lv_assocs = lv_assocs.drop(columns=["phenotype"])

In [40]:
lv_assocs.shape

(26, 6)

In [41]:
lv_assocs = lv_assocs[["phenotype_desc", "n", "n_cases", "fdr"]]

In [42]:
# Rename the columns to the headers shown in the manuscript table.
# At this point the frame only has the columns phenotype_desc, n, n_cases and
# fdr, so the "part_clust" and "lv" entries below have no effect.
lv_assocs = lv_assocs.rename(
    columns={
        "part_clust": "Partition / cluster",
        "lv": "Latent variable (LV)",
        #         "coef": r"$\beta$",
        "fdr": "FDR",
        "phenotype_desc": "Trait description",
        "n": "Sample size",
        "n_cases": "Cases",
    }
)

In [43]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,Trait description,Sample size,Cases,FDR
2866248,Coronary atherosclerosis,361194,14334.0,1.84e&#8209;09
1716393,Chronic ischaemic heart disease (ICD10 I25),361194,12769.0,3.52e&#8209;09
2303659,Ischaemic heart disease (wide definition),361194,20857.0,3.95e&#8209;08
4023013,Coronary Artery Disease,184305,60801.0,4.18e&#8209;08
24677,3mm strong meridian (right),75410,,5.54e&#8209;05
1129131,6mm strong meridian (left),65551,,1.35e&#8209;04
2062833,Corneal resistance factor (right),76630,,2.02e&#8209;04
3757511,6mm strong meridian (right),66256,,2.58e&#8209;04
788614,Heart attack,360420,8288.0,3.75e&#8209;04
131271,Myocardial infarction,361194,7018.0,4.85e&#8209;04


### Fill empty

In [44]:
# If the LV has no significant associations, generate a table with a single
# placeholder row instead of an empty table.
# The placeholder is built on a fresh, string-typed frame (same approach as in
# the eMERGE section) rather than by enlarging the processed frame in place,
# whose columns may have kept a numeric dtype from the original data.
if lv_assocs.shape[0] == 0:
    lv_assocs = pd.DataFrame(columns=lv_assocs.columns.copy()).astype(str)
    lv_assocs.loc[0, "Trait description"] = "No significant associations"
    lv_assocs = lv_assocs.fillna("")

    display(lv_assocs)

### Save

In [45]:
# start
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="start"
)
display(lv_file_mark_start)

# end
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="end"
)
display(lv_file_mark_end)

'<!-- LV136:phenomexcan_traits_assocs:start -->'

'<!-- LV136:phenomexcan_traits_assocs:end -->'

In [46]:
new_content = lv_assocs.to_markdown(index=False, disable_numparse=True)

In [47]:
# add table caption
table_caption = TABLE_CAPTION.format(
    lv_name=LV_NAME,
    result_set_name=RESULT_SET_NAMES[result_set],
    table_id="{"
    + TABLE_CAPTION_ID.format(result_set=result_set, lv_name_lower_case=LV_NAME.lower())
    + "}",
)
display(table_caption)

'Table: Significant trait associations of LV136 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv136}'

In [48]:
new_content += "\n\n" + table_caption

In [49]:
# Final block to insert in the file: start mark, table with caption, end mark
# (one per line).
full_new_content = "\n".join(
    (lv_file_mark_start, new_content.strip(), lv_file_mark_end)
)

In [50]:
# Load the current content of the manuscript file.
file_content = OUTPUT_FILE_PATH.read_text(encoding="utf8")

In [51]:
# Replace everything between the start and end marks of this LV (marks
# included) with the newly generated table.
#   - The marks are escaped so they are always matched literally.
#   - The replacement is given as a function because re.sub processes backslash
#     escapes in a replacement *string* (for instance, the "\b" in "$\beta$"
#     would become a backspace, and unknown escapes raise an error); the value
#     returned by a function is inserted verbatim.
# NOTE(review): if the marks are not present in the file, nothing is replaced
# and the file content is left unchanged.
new_file_content = re.sub(
    re.escape(lv_file_mark_start) + ".*?" + re.escape(lv_file_mark_end),
    lambda _match: full_new_content,
    file_content,
    flags=re.DOTALL,
)

In [52]:
# Overwrite the manuscript file with the updated content.
with OUTPUT_FILE_PATH.open(mode="w", encoding="utf8") as output_file:
    output_file.write(new_file_content)

## eMERGE

In [53]:
result_set = "emerge"

In [54]:
# Keep only the associations of the selected LV that are significant
# (FDR < 0.05), with the most significant ones first.
lv_assocs = emerge_lv_trait_assocs.loc[
    emerge_lv_trait_assocs["lv"].eq(LV_NAME)
    & emerge_lv_trait_assocs["fdr"].lt(0.05)
].sort_values(by="fdr")

In [55]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
220101,EUR_411.4,Coronary atherosclerosis,LV136,3.588278e-07,0.001417


In [56]:
# Take the trait descriptions from the eMERGE traits info (indexed by the
# EUR_-prefixed phecode); a phecode missing from that table raises a KeyError.
lv_assocs = lv_assocs.assign(
    phenotype_desc=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, "phenotype"]
    )
)

In [57]:
# Sample size of each trait: number of EUR cases plus number of EUR controls.
lv_assocs = lv_assocs.assign(
    n=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, ["eur_n_cases", "eur_n_controls"]].sum()
    )
)

In [58]:
# Number of EUR cases of each trait.
lv_assocs = lv_assocs.assign(
    n_cases=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, "eur_n_cases"]
    )
)

In [59]:
# Drop the "EUR_" prefix so only the phecode is left (for instance, "EUR_411.4"
# becomes "411.4"); this is done after the lookups above, which need the prefix.
lv_assocs["phenotype"] = lv_assocs["phenotype"].apply(lambda x: x.split("EUR_")[1])

In [60]:
# lv_assocs = lv_assocs.assign(coef=lv_assocs["coef"].apply(lambda x: f"{x:.3f}"))

In [61]:
# Format the FDR in scientific notation with two decimals, replacing hyphens
# (the minus sign of the exponent) with the HTML entity &#8209; (non-breaking
# hyphen) -- presumably so the value is not split across lines when rendered.
lv_assocs = lv_assocs.assign(
    fdr=lv_assocs["fdr"].apply(lambda x: f"{x:.2e}".replace("-", "&#8209;"))
)

In [62]:
lv_assocs = lv_assocs.assign(n=lv_assocs["n"].apply(num_to_int_str))

In [63]:
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["n_cases"].apply(num_to_int_str))

In [64]:
lv_assocs = lv_assocs.rename(columns={"phenotype": "phecode"})

In [65]:
lv_assocs.shape

(1, 7)

In [66]:
lv_assocs = lv_assocs[["phecode", "phenotype_desc", "n", "n_cases", "fdr"]]

In [67]:
# Rename the columns to the headers shown in the manuscript table.
# At this point the frame only has the columns phecode, phenotype_desc, n,
# n_cases and fdr, so the "lv" entry below has no effect.
lv_assocs = lv_assocs.rename(
    columns={
        "lv": "Latent variable (LV)",
        #         "coef": r"$\beta$",
        "fdr": "FDR",
        "phecode": "Phecode",
        "phenotype_desc": "Trait description",
        "n": "Sample size",
        "n_cases": "Cases",
    }
)

In [68]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,Phecode,Trait description,Sample size,Cases,FDR
220101,411.4,Coronary atherosclerosis,52836,13715,1.42e&#8209;03


### Fill empty

In [69]:
# If the LV has no significant associations, generate a table with a single
# placeholder row instead of an empty table.
if lv_assocs.shape[0] == 0:
    # start from a fresh, string-typed frame with the same columns
    lv_assocs = pd.DataFrame(columns=lv_assocs.columns.copy()).astype(str)
    # adding row 0 leaves the other columns as missing values...
    lv_assocs.loc[0, "Phecode"] = "No significant associations"
    # ...which are then shown as empty cells
    lv_assocs = lv_assocs.fillna("")

    display(lv_assocs)

### Save

In [70]:
# start
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="start"
)
display(lv_file_mark_start)

# end
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="end"
)
display(lv_file_mark_end)

'<!-- LV136:emerge_traits_assocs:start -->'

'<!-- LV136:emerge_traits_assocs:end -->'

In [71]:
new_content = lv_assocs.to_markdown(index=False, disable_numparse=True)

In [72]:
# add table caption
table_caption = TABLE_CAPTION.format(
    lv_name=LV_NAME,
    result_set_name=RESULT_SET_NAMES[result_set],
    table_id="{"
    + TABLE_CAPTION_ID.format(result_set=result_set, lv_name_lower_case=LV_NAME.lower())
    + "}",
)
display(table_caption)

'Table: Significant trait associations of LV136 in eMERGE. {#tbl:sup:emerge_assocs:lv136}'

In [73]:
new_content += "\n\n" + table_caption

In [74]:
# Final block to insert in the file: start mark, table with caption, end mark
# (one per line).
full_new_content = "\n".join(
    (lv_file_mark_start, new_content.strip(), lv_file_mark_end)
)

In [75]:
# Load the current content of the manuscript file (it already includes the
# PhenomeXcan table written above).
file_content = OUTPUT_FILE_PATH.read_text(encoding="utf8")

In [76]:
# Replace everything between the start and end marks of this LV (marks
# included) with the newly generated table.
#   - The marks are escaped so they are always matched literally.
#   - The replacement is given as a function because re.sub processes backslash
#     escapes in a replacement *string* (for instance, the "\b" in "$\beta$"
#     would become a backspace, and unknown escapes raise an error); the value
#     returned by a function is inserted verbatim.
# NOTE(review): if the marks are not present in the file, nothing is replaced
# and the file content is left unchanged.
new_file_content = re.sub(
    re.escape(lv_file_mark_start) + ".*?" + re.escape(lv_file_mark_end),
    lambda _match: full_new_content,
    file_content,
    flags=re.DOTALL,
)

In [77]:
# Overwrite the manuscript file with the updated content.
with OUTPUT_FILE_PATH.open(mode="w", encoding="utf8") as output_file:
    output_file.write(new_file_content)