# Description

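Generates Manubot-formatted Markdown tables with the significant PhenomeXcan and eMERGE trait associations of a given latent variable (LV), and writes them into the manuscript's supplementary material file, replacing the content between that LV's start/end marker comments. The LV name is the only parameter that needs to be specified (see the Settings section below).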

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
from pathlib import Path

import pandas as pd

from entity import Trait
import conf

# Settings

In [3]:
# Name of the latent variable (LV) to generate the tables for. It is used to
# filter the association results and to build the file markers and table captions.
LV_NAME = "LV54"

In [4]:
# Fail early if the manuscript directory was not configured.
assert (
    conf.MANUSCRIPT["BASE_DIR"] is not None
), "The manuscript directory was not configured"

# Manuscript file that is read and rewritten in place by this notebook;
# it must already exist.
OUTPUT_FILE_PATH = conf.MANUSCRIPT["CONTENT_DIR"] / "50.00.supplementary_material.md"
display(OUTPUT_FILE_PATH)
assert OUTPUT_FILE_PATH.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier_manuscript/content/50.00.supplementary_material.md')

In [5]:
# HTML comment used as a marker inside the output file: the text between the
# "start" and "end" markers of an LV/result set is what gets replaced.
# Placeholders: {lv} is the LV name, {result_set} is either "phenomexcan" or
# "emerge", and {position} is either "start" or "end".
LV_FILE_MARK_TEMPLATE = "<!-- {lv}:{result_set}_traits_assocs:{position} -->"

In [6]:
# Caption line appended below each generated table. {table_id} receives the
# table identifier already wrapped in curly braces.
TABLE_CAPTION = "Table: Significant trait associations of {lv_name} in {result_set_name}. {table_id}"

In [7]:
# Template of the table identifier placed in the caption (without the
# surrounding braces); it is filled with the result set key and the lower-cased LV name.
TABLE_CAPTION_ID = "#tbl:sup:{result_set}_assocs:{lv_name_lower_case}"

In [8]:
# Maps the result set key (used in file markers and table identifiers) to the
# display name used in table captions.
RESULT_SET_NAMES = {
    "phenomexcan": "PhenomeXcan",
    "emerge": "eMERGE",
}

# Load data

## PhenomeXcan LV-trait associations

In [9]:
# Pickle file with the combined LV-trait association results for PhenomeXcan.
input_filepath = Path(conf.RESULTS["GLS"] / "gls_phenotypes-combined-phenomexcan.pkl")
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/gls/gls_phenotypes-combined-phenomexcan.pkl')

In [10]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
phenomexcan_lv_trait_assocs = pd.read_pickle(input_filepath)

In [11]:
# Dimensions (rows, columns) of the loaded PhenomeXcan associations.
phenomexcan_lv_trait_assocs.shape

(5782, 8)

In [12]:
# Preview of the first rows to check the available columns.
phenomexcan_lv_trait_assocs.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
0,29,22,100002_raw-Energy,LV246,0.002401,0.424213,0.848425,0.680765
1,29,22,100002_raw-Energy,LV607,-0.006323,0.691499,0.617002,0.87335
2,29,22,100002_raw-Energy,LV612,-0.000822,0.525786,0.948428,0.763843
3,29,22,100002_raw-Energy,LV74,-0.006035,0.685313,0.629373,0.870769
4,29,22,100002_raw-Energy,LV838,0.024454,0.023446,0.046891,0.087403


## eMERGE LV-trait associations

In [13]:
# Pickle file with the combined LV-trait association results for eMERGE.
input_filepath = Path(conf.RESULTS["GLS"] / "gls_phenotypes-combined-emerge.pkl")
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/gls/gls_phenotypes-combined-emerge.pkl')

In [14]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
emerge_lv_trait_assocs = pd.read_pickle(input_filepath)

In [15]:
# Dimensions (rows, columns) of the loaded eMERGE associations.
emerge_lv_trait_assocs.shape

(7725, 6)

In [16]:
# Preview of the first rows to check the available columns.
emerge_lv_trait_assocs.head()

Unnamed: 0,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
0,8,LV246,-0.011008,0.809325,0.38135,0.999418
1,8,LV30,0.022387,0.038699,0.077397,0.483734
2,8,LV57,0.00396,0.379121,0.758243,0.961449
3,8,LV865,-0.008742,0.758337,0.483327,0.999418
4,8,LV847,-0.028246,0.985827,0.028346,0.999418


## eMERGE traits info

In [17]:
# Tab-separated file describing the eMERGE traits (phecode, name, category and
# case/control counts).
input_filepath = conf.EMERGE["DESC_FILE_WITH_SAMPLE_SIZE"]
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/data/emerge/eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt')

In [18]:
# Load only the columns needed to describe each trait in the final table.
emerge_traits_info = pd.read_csv(
    input_filepath,
    sep="\t",
    # read phecodes as text (not numbers): they later become the index used
    # for `.loc` lookups
    dtype={"phecode": str},
    usecols=[
        "phecode",
        "phenotype",
        "category",
        "eMERGE_III_EUR_case",
        "eMERGE_III_EUR_control",
    ],
)

In [19]:
# Index by phecode so trait information can be looked up directly with `.loc`.
emerge_traits_info = emerge_traits_info.set_index("phecode")

In [20]:
# Use shorter names for the case/control count columns.
emerge_traits_info = emerge_traits_info.rename(
    columns={
        "eMERGE_III_EUR_case": "eur_n_cases",
        "eMERGE_III_EUR_control": "eur_n_controls",
    }
)

In [21]:
# Dimensions (rows, columns) of the eMERGE traits information.
emerge_traits_info.shape

(309, 4)

In [22]:
# Preview of the first rows after indexing and renaming.
emerge_traits_info.head()

Unnamed: 0_level_0,eur_n_cases,eur_n_controls,phenotype,category
phecode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8.0,1639,57495,Intestinal infection,infectious diseases
8.5,1024,57495,Bacterial enteritis,infectious diseases
8.52,893,57495,Intestinal infection due to C. difficile,infectious diseases
38.0,3172,50610,Septicemia,infectious diseases
38.3,1361,50610,Bacteremia,infectious diseases


In [23]:
# Phecodes must be unique so that each `.loc[phecode, column]` lookup returns a
# single value.
assert emerge_traits_info.index.is_unique

# Trait associations

## PhenomeXcan

In [24]:
from traits import SHORT_TRAIT_NAMES

In [25]:
# Result set processed in this section; it is used to build the file markers,
# the table caption and the table identifier.
result_set = "phenomexcan"

In [26]:
def get_trait_objs(phenotype_full_code):
    """
    Returns the Trait objects associated with a phenotype code or EFO label.

    Args:
        phenotype_full_code: a trait's full code or an EFO label (as decided by
            Trait.is_efo_label).

    Returns:
        A list of Trait objects. For an EFO label it contains whatever
        Trait.get_traits_from_efo returns; otherwise it has the single trait
        returned by Trait.get_trait. The list is sorted in descending order by
        the proportion of cases (n_cases / n), not by the raw sample size.
    """
    if not Trait.is_efo_label(phenotype_full_code):
        candidates = [Trait.get_trait(full_code=phenotype_full_code)]
    else:
        candidates = Trait.get_traits_from_efo(phenotype_full_code)

    def case_fraction(trait):
        return trait.n_cases / trait.n

    # highest proportion of cases first
    return sorted(candidates, key=case_fraction, reverse=True)


def get_trait_description(phenotype_full_code):
    """
    Returns the description to show for a phenotype code or EFO label.

    The description is taken from the first trait returned by get_trait_objs;
    if SHORT_TRAIT_NAMES has an entry for it, that shorter name is returned
    instead.
    """
    top_trait = get_trait_objs(phenotype_full_code)[0]
    description = top_trait.description

    return (
        SHORT_TRAIT_NAMES[description]
        if description in SHORT_TRAIT_NAMES
        else description
    )


def get_trait_n(phenotype_full_code):
    """
    Returns the sample size (attribute `n`) of the first trait returned by
    get_trait_objs for the given phenotype code or EFO label.
    """
    top_trait = get_trait_objs(phenotype_full_code)[0]
    return top_trait.n


def get_trait_n_cases(phenotype_full_code):
    """
    Returns the number of cases (attribute `n_cases`) of the first trait
    returned by get_trait_objs for the given phenotype code or EFO label.
    """
    top_trait = get_trait_objs(phenotype_full_code)[0]
    return top_trait.n_cases


def num_to_int_str(num):
    """
    Formats a number as a string with no decimals and a comma as thousands
    separator (e.g. 361194 -> "361,194").

    Missing values (as detected by pd.isnull) are returned as an empty string.
    """
    return "" if pd.isnull(num) else format(num, ",.0f")


def get_part_clust(row):
    """
    Returns the string "<part_k> / <cluster_id>" built from the `part_k` and
    `cluster_id` attributes of the given row.
    """
    return "{} / {}".format(row.part_k, row.cluster_id)

In [27]:
# Keep the associations of the selected LV that are significant (FDR < 0.05),
# with the most significant ones first.
lv_assocs = phenomexcan_lv_trait_assocs.loc[
    lambda df: (df["lv"] == LV_NAME) & (df["fdr"] < 0.05)
].sort_values("fdr")

In [28]:
# Show every selected association without truncating rows, columns or cell contents.
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,part_k,cluster_id,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
5189,29,8,malabsorption syndrome,LV54,0.13665,2.4655660000000003e-27,4.9311310000000004e-27,3.099109e-25
4687,29,8,K11_COELIAC-Coeliac_disease,LV54,0.13587,3.915065e-27,7.83013e-27,4.77545e-25
4271,29,13,6144_3-Never_eat_eggs_dairy_wheat_sugar_Wheat_products,LV54,0.130512,3.802965e-25,7.605929e-25,3.9265610000000004e-23
5380,29,26,systemic lupus erythematosus,LV54,0.129808,8.189427e-25,1.637885e-24,8.164011000000001e-23
3796,29,13,2986-Started_insulin_within_one_year_diagnosis_of_diabetes,LV54,0.122129,3.1970570000000003e-22,6.394115e-22,2.567415e-20
5100,29,13,hyperthyroidism AND thyrotoxicosis,LV54,0.117063,8.92404e-21,1.784808e-20,6.701142999999999e-19
2777,29,13,20003_1140883066-Treatmentmedication_code_insulin_product,LV54,0.115292,5.013609e-20,1.0027219999999999e-19,3.6235860000000004e-18
3788,29,13,2976_raw-Age_diabetes_diagnosed,LV54,0.11414,1.28908e-19,2.57816e-19,9.201803e-18
4888,29,8,celiac disease,LV54,0.103769,1.632907e-16,3.265814e-16,9.634153e-15
4498,29,13,6177_3-Medication_for_cholesterol_blood_pressure_or_diabetes_Insulin,LV54,0.100114,1.091992e-15,2.183984e-15,6.013236e-14


In [29]:
# Add the description of each trait.
lv_assocs = lv_assocs.assign(
    phenotype_desc=lv_assocs["phenotype"].apply(get_trait_description)
)

In [30]:
# Add the sample size of each trait.
lv_assocs = lv_assocs.assign(n=lv_assocs["phenotype"].apply(get_trait_n))

In [31]:
# Add the number of cases of each trait.
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["phenotype"].apply(get_trait_n_cases))

In [32]:
# Format the coefficient with three decimals (the column becomes text, so this
# cell cannot be re-run on its own output).
lv_assocs = lv_assocs.assign(coef=lv_assocs["coef"].apply(lambda x: f"{x:.3f}"))

In [33]:
# Format the FDR in scientific notation with two decimals. The "-" of the
# exponent is replaced by the HTML entity of the non-breaking hyphen (&#8209;),
# presumably so the value is not split across lines in the rendered table.
lv_assocs = lv_assocs.assign(
    fdr=lv_assocs["fdr"].apply(lambda x: f"{x:.2e}".replace("-", "&#8209;"))
)

In [34]:
# Format the sample size as text with thousands separators.
lv_assocs = lv_assocs.assign(n=lv_assocs["n"].apply(num_to_int_str))

In [35]:
# Format the number of cases as text with thousands separators (empty if missing).
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["n_cases"].apply(num_to_int_str))

In [36]:
# Combine `part_k` and `cluster_id` into a single "part_k / cluster_id" column.
lv_assocs = lv_assocs.assign(part_clust=lv_assocs.apply(get_part_clust, axis=1))

In [37]:
# The raw phenotype code is not shown in the table (`phenotype_desc` is used instead).
lv_assocs = lv_assocs.drop(columns=["phenotype"])

In [38]:
# Number of significant associations (rows) and columns so far.
lv_assocs.shape

(15, 11)

In [39]:
# Keep only the columns shown in the final table, in display order.
lv_assocs = lv_assocs[["phenotype_desc", "n", "n_cases", "part_clust", "fdr"]]

In [40]:
# Rename columns to the headers used in the manuscript table. Keys that do not
# match an existing column (e.g. "lv", which is not among the columns selected
# for the table) are ignored by `rename`.
lv_assocs = lv_assocs.rename(
    columns={
        "part_clust": "Partition / cluster",
        "lv": "Latent variable (LV)",
        #         "coef": r"$\beta$",
        "fdr": "FDR",
        "phenotype_desc": "Trait description",
        "n": "Sample size",
        "n_cases": "Cases",
    }
)

In [41]:
# Show the final table (untruncated) before it is converted to Markdown.
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,Trait description,Sample size,Cases,Partition / cluster,FDR
5189,Intestinal malabsorption (ICD10 K90),361194,922.0,29 / 8,3.10e&#8209;25
4687,Coeliac disease,361194,842.0,29 / 8,4.78e&#8209;25
4271,Never eat: Wheat products,359777,9573.0,29 / 13,3.93e&#8209;23
5380,Systemic Lupus Erythematosus,23210,7219.0,29 / 26,8.16e&#8209;23
3796,Started insulin within one year diagnosis of diabetes,16415,1999.0,29 / 13,2.57e&#8209;20
5100,hyperthyroidism (self-reported),361141,2730.0,29 / 13,6.70e&#8209;19
2777,Medication: insulin product,361141,3545.0,29 / 13,3.62e&#8209;18
3788,Age diabetes diagnosed,16166,,29 / 13,9.20e&#8209;18
4888,malabsorption/coeliac disease (self-reported),361141,1587.0,29 / 8,9.63e&#8209;15
4498,Insulin medication (males),165340,2248.0,29 / 13,6.01e&#8209;14


### Fill empty

In [42]:
# If there are no significant associations, add a single placeholder row so
# the generated table is not empty; the other columns are left as empty strings.
if lv_assocs.shape[0] == 0:
    lv_assocs.loc[0, "Trait description"] = "No significant associations"
    lv_assocs = lv_assocs.fillna("")

### Save

In [43]:
# Marker that opens the region of the output file to be replaced
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="start"
)
display(lv_file_mark_start)

# Marker that closes the region of the output file to be replaced
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="end"
)
display(lv_file_mark_end)

'<!-- LV54:phenomexcan_traits_assocs:start -->'

'<!-- LV54:phenomexcan_traits_assocs:end -->'

In [44]:
# Render the table as Markdown without the index. `disable_numparse=True` keeps
# the already-formatted text values from being re-parsed as numbers.
new_content = lv_assocs.to_markdown(index=False, disable_numparse=True)

In [45]:
# Build the table caption; the table identifier is wrapped in curly braces and
# uses the lower-cased LV name.
table_caption = TABLE_CAPTION.format(
    lv_name=LV_NAME,
    result_set_name=RESULT_SET_NAMES[result_set],
    table_id="{"
    + TABLE_CAPTION_ID.format(result_set=result_set, lv_name_lower_case=LV_NAME.lower())
    + "}",
)
display(table_caption)

'Table: Significant trait associations of LV54 in PhenomeXcan. {#tbl:sup:phenomexcan_assocs:lv54}'

In [46]:
# Append the caption below the table, separated by a blank line.
# NOTE: re-running only this cell appends the caption again.
new_content += "\n\n" + table_caption

In [47]:
# Wrap the table and caption with the start/end markers so the same region can
# be located and replaced again in later runs.
full_new_content = (
    lv_file_mark_start + "\n" + new_content.strip() + "\n" + lv_file_mark_end
)

In [48]:
# Read the current content of the manuscript file.
with open(OUTPUT_FILE_PATH, "r", encoding="utf8") as f:
    file_content = f.read()

In [49]:
# Replace everything from this LV's start marker to its end marker (inclusive)
# with the freshly generated content. Two details matter here:
#  - the markers are escaped so they are always matched literally;
#  - the replacement is given as a function so `re` inserts it verbatim. A
#    replacement *string* is treated as a template where backslash escapes and
#    group references are processed, which corrupts (or raises an error for)
#    content with backslashes, such as LaTeX like "$\beta$".
# If the markers are not present in the file, the content is left unchanged.
new_file_content = re.sub(
    re.escape(lv_file_mark_start) + ".*?" + re.escape(lv_file_mark_end),
    lambda _match: full_new_content,
    file_content,
    flags=re.DOTALL,
)

In [50]:
# Overwrite the manuscript file with the updated content.
with open(OUTPUT_FILE_PATH, "w", encoding="utf8") as f:
    f.write(new_file_content)  # .replace("\beta", r"\beta"))

## eMERGE

In [51]:
# Result set processed in this section; it is used to build the file markers,
# the table caption and the table identifier.
result_set = "emerge"

In [52]:
# Keep the associations of the selected LV that are significant (FDR < 0.05),
# with the most significant ones first.
lv_assocs = emerge_lv_trait_assocs.loc[
    lambda df: (df["lv"] == LV_NAME) & (df["fdr"] < 0.05)
].sort_values("fdr")

In [53]:
# Show every selected association without truncating rows, columns or cell contents.
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
867,250.1,LV54,0.105369,2.184681e-17,4.3693610000000004e-17,1.687666e-13
727,244.0,LV54,0.069915,1.257013e-08,2.514026e-08,1.008865e-05
788,244.4,LV54,0.069773,1.436572e-08,2.873144e-08,1.008865e-05
6893,695.0,LV54,0.063737,1.548398e-07,3.096795e-07,8.543836e-05
7073,714.0,LV54,0.060096,1.702623e-06,3.405245e-06,0.000657638
4498,440.0,LV54,0.054621,5.418659e-06,1.083732e-05,0.001902688
6255,585.0,LV54,0.049424,3.30692e-05,6.61384e-05,0.007096099
6397,585.33,LV54,0.049063,3.780752e-05,7.561503e-05,0.007488796
6356,585.32,LV54,0.049409,3.737712e-05,7.475423e-05,0.007488796
1002,250.6,LV54,0.048784,4.425391e-05,8.850781e-05,0.008546536


In [54]:
# Add the trait name by looking up each phecode in the eMERGE traits info
# (a phecode missing from that table raises a KeyError).
lv_assocs = lv_assocs.assign(
    phenotype_desc=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, "phenotype"]
    )
)

In [55]:
# Sample size of each trait, computed as `eur_n_cases` + `eur_n_controls`.
lv_assocs = lv_assocs.assign(
    n=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, ["eur_n_cases", "eur_n_controls"]].sum()
    )
)

In [56]:
# Number of cases of each trait (`eur_n_cases`).
lv_assocs = lv_assocs.assign(
    n_cases=lv_assocs["phenotype"].apply(
        lambda x: emerge_traits_info.loc[x, "eur_n_cases"]
    )
)

In [57]:
# Format the coefficient with three decimals (the column becomes text, so this
# cell cannot be re-run on its own output).
lv_assocs = lv_assocs.assign(coef=lv_assocs["coef"].apply(lambda x: f"{x:.3f}"))

In [58]:
# Format the FDR in scientific notation with two decimals. The "-" of the
# exponent is replaced by the HTML entity of the non-breaking hyphen (&#8209;),
# presumably so the value is not split across lines in the rendered table.
lv_assocs = lv_assocs.assign(
    fdr=lv_assocs["fdr"].apply(lambda x: f"{x:.2e}".replace("-", "&#8209;"))
)

In [59]:
# Format the sample size as text with thousands separators.
lv_assocs = lv_assocs.assign(n=lv_assocs["n"].apply(num_to_int_str))

In [60]:
# Format the number of cases as text with thousands separators (empty if missing).
lv_assocs = lv_assocs.assign(n_cases=lv_assocs["n_cases"].apply(num_to_int_str))

In [61]:
# In the eMERGE results the `phenotype` column holds the phecode (it is the key
# used above to look up the traits info), so name it accordingly.
lv_assocs = lv_assocs.rename(columns={"phenotype": "phecode"})

In [62]:
# Number of significant associations (rows) and columns so far.
lv_assocs.shape

(23, 9)

In [63]:
# Keep only the columns shown in the final table, in display order.
lv_assocs = lv_assocs[["phecode", "phenotype_desc", "n", "n_cases", "fdr"]]

In [64]:
# Rename columns to the headers used in the manuscript table. Keys that do not
# match an existing column (e.g. "lv", which is not among the columns selected
# for the table) are ignored by `rename`.
lv_assocs = lv_assocs.rename(
    columns={
        "lv": "Latent variable (LV)",
        #         "coef": r"$\beta$",
        "fdr": "FDR",
        "phecode": "Phecode",
        "phenotype_desc": "Trait description",
        "n": "Sample size",
        "n_cases": "Cases",
    }
)

In [65]:
# Show the final table (untruncated) before it is converted to Markdown.
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
):
    display(lv_assocs)

Unnamed: 0,Phecode,Trait description,Sample size,Cases,FDR
867,250.1,Type 1 diabetes,42723,2450,1.69e&#8209;13
727,244.0,Hypothyroidism,54404,9720,1.01e&#8209;05
788,244.4,Hypothyroidism NOS,53968,9284,1.01e&#8209;05
6893,695.0,Erythematous conditions,48347,4210,8.54e&#8209;05
7073,714.0,Rheumatoid arthritis and other inflammatory polyarthropathies,50215,3303,6.58e&#8209;04
4498,440.0,Atherosclerosis,47471,4993,1.90e&#8209;03
6255,585.0,Renal failure,51437,9970,7.10e&#8209;03
6397,585.33,"Chronic Kidney Disease, Stage III",46279,4812,7.49e&#8209;03
6356,585.32,End stage renal disease,43309,1842,7.49e&#8209;03
1002,250.6,Polyneuropathy in diabetes,41948,1675,8.55e&#8209;03


### Fill empty

In [66]:
# If there are no significant associations, add a single placeholder row so
# the generated table is not empty. The message goes in the "Phecode" column and
# the other columns are left as empty strings.
if lv_assocs.shape[0] == 0:
    lv_assocs.loc[0, "Phecode"] = "No significant associations"
    lv_assocs = lv_assocs.fillna("")

### Save

In [67]:
# Marker that opens the region of the output file to be replaced
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="start"
)
display(lv_file_mark_start)

# Marker that closes the region of the output file to be replaced
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(
    result_set=result_set, lv=LV_NAME, position="end"
)
display(lv_file_mark_end)

'<!-- LV54:emerge_traits_assocs:start -->'

'<!-- LV54:emerge_traits_assocs:end -->'

In [68]:
# Render the table as Markdown without the index. `disable_numparse=True` keeps
# the already-formatted text values (e.g. phecodes such as "244.0") from being
# re-parsed as numbers.
new_content = lv_assocs.to_markdown(index=False, disable_numparse=True)

In [69]:
# Build the table caption; the table identifier is wrapped in curly braces and
# uses the lower-cased LV name.
table_caption = TABLE_CAPTION.format(
    lv_name=LV_NAME,
    result_set_name=RESULT_SET_NAMES[result_set],
    table_id="{"
    + TABLE_CAPTION_ID.format(result_set=result_set, lv_name_lower_case=LV_NAME.lower())
    + "}",
)
display(table_caption)

'Table: Significant trait associations of LV54 in eMERGE. {#tbl:sup:emerge_assocs:lv54}'

In [70]:
# Append the caption below the table, separated by a blank line.
# NOTE: re-running only this cell appends the caption again.
new_content += "\n\n" + table_caption

In [71]:
# Wrap the table and caption with the start/end markers so the same region can
# be located and replaced again in later runs.
full_new_content = (
    lv_file_mark_start + "\n" + new_content.strip() + "\n" + lv_file_mark_end
)

In [72]:
# Read the manuscript file again so the content written by the previous
# section is the one being modified here.
with open(OUTPUT_FILE_PATH, "r", encoding="utf8") as f:
    file_content = f.read()

In [73]:
# Replace everything from this LV's start marker to its end marker (inclusive)
# with the freshly generated content. Two details matter here:
#  - the markers are escaped so they are always matched literally;
#  - the replacement is given as a function so `re` inserts it verbatim. A
#    replacement *string* is treated as a template where backslash escapes and
#    group references are processed, which corrupts (or raises an error for)
#    content with backslashes, such as LaTeX like "$\beta$".
# If the markers are not present in the file, the content is left unchanged.
new_file_content = re.sub(
    re.escape(lv_file_mark_start) + ".*?" + re.escape(lv_file_mark_end),
    lambda _match: full_new_content,
    file_content,
    flags=re.DOTALL,
)

In [74]:
# Overwrite the manuscript file with the updated content.
with open(OUTPUT_FILE_PATH, "w", encoding="utf8") as f:
    f.write(new_file_content)  # .replace("\beta", r"\beta"))