In [100]:
import pandas as pd
import re
from functools import reduce

# STable 1: Independent population results

In [62]:
def get_gene_mask(regenie_id):
    """Split a regenie result ID into a gene name and a readable mask label.

    The ID must start with ``<gene>.<mask>.0.001`` where ``<mask>`` begins
    with ``PTV`` (for example ``"GENE.PTV_Missense_strict.0.001"``).

    Parameters
    ----------
    regenie_id : str
        A value from the ``ID`` column of a results table.

    Returns
    -------
    pandas.Series
        Two entries, ``gene`` and ``mask``; ``mask`` is the label looked up
        in the mapping defined below.

    Raises
    ------
    ValueError
        If ``regenie_id`` does not follow the expected layout.
    KeyError
        If the mask part is not one of the three masks listed below.
    """
    # Raw string: "\." inside a regular string literal is an invalid escape.
    pattern = re.compile(r"(.+)\.(PTV.*)\.0\.001")
    m = re.match(pattern, regenie_id)
    if not m:
        # Fail with the offending ID instead of printing it and then crashing
        # on `None.group(...)`.
        raise ValueError(
            f"Unrecognised regenie ID (expected '<gene>.<PTV mask>.0.001'): {regenie_id!r}"
        )
    gene = m.group(1)
    mask = m.group(2)
    mask_dict = {
        "PTV": "pLoF",
        "PTV_Missense_strict": "pLoF|Missense_strict",
        "PTV_Missense_lenient": "pLoF|Missense_strict|Missense_lenient"
        }
    return pd.Series({"gene": gene, "mask": mask_dict[mask]})


def parse_regenie_output(df, add_columns=["nsamples", "population"]):
    """Reduce a regenie results table to gene, mask and association statistics.

    ``GENE`` and ``MASK`` are derived from the ``ID`` column via
    ``get_gene_mask`` and are written onto the frame passed in. The returned
    frame holds ``GENE``, ``MASK``, ``N``, ``BETA``, ``SE``, ``CHISQ``,
    ``p_value`` and then the columns named in ``add_columns``, in that order.
    """
    df[["GENE", "MASK"]] = df.ID.apply(get_gene_mask)
    id_cols = ["GENE", "MASK"]
    stat_cols = ["N", "BETA", "SE", "CHISQ", "p_value"]
    selected = id_cols + stat_cols + add_columns
    return df.loc[:, selected]

In [10]:
# Load the per-population association results workbook (input for STable 1).
pop_df = pd.read_excel("../data/meta/independent_pop_assoc.xlsx")

In [11]:
# Keep GENE/MASK plus the reported statistics. The parsed frame replaces the
# raw one and has no ID column, so re-running this cell requires re-running
# the load cell first.
pop_df = parse_regenie_output(pop_df)

# STable 2: Risk genes

In [12]:
# Load the risk-gene meta-analysis workbook; it is exported as-is (STable 2).
risk_df = pd.read_excel("../data/meta/risk_meta.xlsx")

# STable 3: Protective genes

In [14]:
# Load the protective-gene meta-analysis workbook; it is exported as-is (STable 3).
protective_df = pd.read_excel("../data/meta/protective_meta.xlsx")

# STable 4: Meta analysis results of discovered genes

In [61]:
def parse_meta_output(df):
    """Tidy a meta-analysis table into gene, mask and labelled statistics.

    ``GENE`` and ``MASK`` are derived from ``ID`` via ``get_gene_mask`` and
    are written onto the frame passed in. Columns are then renamed:
    ``nsamples`` becomes ``n_samples``, and each ``d``/``r``-prefixed
    statistic or sample-count column gets an ``eur_``/``non_eur_`` prefix
    instead (``dbeta`` -> ``eur_beta``, ``rsamples`` -> ``non_eur_samples``).

    The returned frame holds ``GENE``, ``MASK``, the six unprefixed
    statistics, ``n_samples``, then the ``eur_*`` and ``non_eur_*`` blocks.
    """
    overall_stats = ["beta", "se", "ci_low", "ci_high", "z_score", "p_value"]
    prefix_to_label = {"d": "eur", "r": "non_eur"}

    # Per prefix the output order is the six statistics, then the sample count.
    suffixes = overall_stats + ["samples"]
    column_map = {"nsamples": "n_samples"}
    for prefix, label in prefix_to_label.items():
        column_map.update({f"{prefix}{suffix}": f"{label}_{suffix}" for suffix in suffixes})

    df[["GENE", "MASK"]] = df.ID.apply(get_gene_mask)
    renamed = df.rename(columns=column_map)
    # The map's values are already in output order (insertion-ordered dict).
    ordered = ["GENE", "MASK", *overall_stats, *column_map.values()]
    return renamed.loc[:, ordered]

In [15]:
# Load the meta-analysis results workbook for the discovered genes (input for STable 4).
meta_df = pd.read_excel("../data/meta/monogenic_meta.xlsx")

In [23]:
# Tidy the meta-analysis table. The parsed frame replaces the raw one and has
# no ID column, so re-running this cell requires re-running the load cell first.
meta_df = parse_meta_output(meta_df)

# STable 5: Known genes from other RVAS biases

In [37]:
# Load the known-gene results workbook (input for STable 5).
known_df = pd.read_excel("../data/known_genes/known_obesity_genes.xlsx")

In [38]:
# Same tidy-up as the meta-analysis table, then order rows by ascending beta.
# Overwrites the raw frame, so re-running needs the load cell re-run first.
known_df = parse_meta_output(known_df).sort_values("beta")

# STable 6: Inter-population variance of discovered genes

In [60]:
def parse_var_df(df):
    """Return gene, mask, variance and status, ordered by ascending variance.

    ``GENE`` and ``MASK`` are derived from ``ID`` via ``get_gene_mask`` and
    are written onto the frame passed in.
    """
    df[["GENE", "MASK"]] = df.ID.apply(get_gene_mask)
    wanted = ["GENE", "MASK", "variance", "status"]
    subset = df.loc[:, wanted]
    return subset.sort_values("variance")

In [43]:
# Load the association-variance workbook (input for STable 6).
var_df = pd.read_excel("../data/meta/assoc_variance.xlsx")

In [44]:
# Keep gene, mask, variance and status, sorted by variance. Overwrites the raw
# frame (ID column is dropped), so re-running needs the load cell re-run first.
var_df = parse_var_df(var_df)

# STable 7: Obesity clinical category enrichment

In [63]:
def parse_obesity_cat_df(df):
    """Tidy the obesity-category enrichment table.

    ``GENE`` and ``MASK`` are derived from ``ID`` via ``get_gene_mask`` and
    are written onto the frame passed in. The ``comorbidity`` column is
    relabelled ``obesity_category`` in the returned frame, which holds the
    gene, mask, category, odds ratio, p-value and confidence bounds.

    NOTE: this function name is defined a second time later in the notebook
    (for the ICD enrichment table); the later definition replaces this one
    once its cell has run.
    """
    df[["GENE", "MASK"]] = df.ID.apply(get_gene_mask)
    relabelled = df.rename(columns={"comorbidity": "obesity_category"})
    keep = ["GENE", "MASK", "obesity_category", "OR", "p_value", "ci_low", "ci_high"]
    return relabelled.loc[:, keep]

In [64]:
# Load the BMI-category enrichment workbook (input for STable 7); the first
# sheet column is used as the row index.
bmi_cat_enrich_df = pd.read_excel("../data/bmi_cat_enrichment/monogenic_bmi_categories.xlsx", index_col=0)

In [65]:
# Tidy the enrichment table. NOTE: `parse_obesity_cat_df` is redefined further
# down for the ICD table, so this cell must run before that redefinition to get
# the `obesity_category` column. It also overwrites the raw frame (ID dropped).
bmi_cat_enrich_df = parse_obesity_cat_df(bmi_cat_enrich_df)

# STable 8: ICD enrichment

In [71]:
def parse_obesity_cat_df(df):
    """Tidy the ICD comorbidity enrichment table.

    ``GENE`` and ``MASK`` are derived from ``ID`` via ``get_gene_mask`` and
    are written onto the frame passed in. The returned frame holds the gene,
    mask, ``comorbidity``, odds ratio, p-value and confidence bounds.

    NOTE: this reuses the name of the parser defined earlier for the
    obesity-category table and replaces it once this cell has run; unlike
    that version, the ``comorbidity`` column keeps its name here.
    """
    df[["GENE", "MASK"]] = df.ID.apply(get_gene_mask)
    keep = ["GENE", "MASK", "comorbidity", "OR", "p_value", "ci_low", "ci_high"]
    return df.loc[:, keep]

In [69]:
# Load the ICD comorbidity enrichment workbook (input for STable 8); the first
# sheet column is used as the row index.
icd_enrich_df = pd.read_excel("../data/icd_enrichment/monogenic_comorbidities.xlsx", index_col=0)

In [72]:
# Uses the most recent definition of `parse_obesity_cat_df` (the one that keeps
# the `comorbidity` column). Overwrites the raw frame (ID column is dropped).
icd_enrich_df = parse_obesity_cat_df(icd_enrich_df)

# STable 9: Protein model coefficients

In [77]:
def parse_protein_df(df):
    """Return ``df`` with its ``gene`` column relabelled ``protein``.

    The input frame is left untouched; ``rename`` produces a new frame.
    """
    column_map = {"gene": "protein"}
    return df.rename(columns=column_map)

In [78]:
# Load the protein model coefficients (input for STable 9).
protein_df = pd.read_csv("../data/proteomics/protein_model_coefs.csv")

In [80]:
# Relabel the `gene` column as `protein` for the exported table.
protein_df = parse_protein_df(protein_df)

# STable 10: Literature review meta

In [81]:
# Load the manual literature review workbook; exported as-is (STable 10) and
# also the source of the summary table below.
lit_df = pd.read_excel("../data/manual_lit/monogenic_lit_review.xlsx")

# STable 11: Literature review summary

In [85]:
# Summary view of the literature review: the gene plus five evidence columns.
# Relies on the lower-case column names of `lit_df` as loaded.
lit_summary_df = lit_df.loc[:, ["gene", "rvas", "gwas", "CMDKP", "pubmed_hits_max50", "impc"]]

# STable 12: Pubmed hits

In [87]:
# Load the PubMed search results workbook; exported as-is (STable 12).
pubmed_df = pd.read_excel("../data/manual_lit/pubmed_search.xlsx")

# STable 13: PGS interaction model results

In [92]:
def parse_pgs_df(df):
    """Reduce the PGS interaction results to gene, mask and six statistics.

    ``GENE`` and ``MASK`` are derived from ``ID`` via ``get_gene_mask`` and
    are written onto the frame passed in. The returned frame holds ``GENE``,
    ``MASK``, ``beta``, ``se``, ``ci_low``, ``ci_high``, ``z_score`` and
    ``p_value``.
    """
    df[["GENE", "MASK"]] = df.ID.apply(get_gene_mask)
    stat_cols = ["beta", "se", "ci_low", "ci_high", "z_score", "p_value"]
    return df.loc[:, ["GENE", "MASK"] + stat_cols]

In [90]:
# Load the PGS interaction meta-analysis workbook (input for STable 13).
pgs_df = pd.read_excel("../data/pgs_interaction/monogenic_pgs_int_meta.xlsx")

In [94]:
# Keep gene, mask and the interaction statistics. Overwrites the raw frame
# (ID column is dropped), so re-running needs the load cell re-run first.
pgs_df = parse_pgs_df(pgs_df)

# STable 14: Lifestyle factor meta results for pa, smoke, alcohol

In [111]:
def parse_lf_df(dfs, cohorts, lifestyles=["pa", "alcohol", "smoke"]):
    """Combine per-cohort lifestyle results into one wide table.

    Each frame in ``dfs`` is indexed by its ``ID`` column and every remaining
    column is prefixed with the matching entry of ``cohorts`` (``"<cohort>_"``).
    The frames are then joined pairwise on ``ID`` with ``merge``'s default
    join, so only IDs present in every frame survive.

    ``GENE`` and ``MASK`` are derived from ``ID`` via ``get_gene_mask``. The
    returned frame holds ``GENE``, ``MASK`` and, for every cohort and
    lifestyle, the ``<cohort>_<lifestyle>_F`` and ``<cohort>_<lifestyle>_pvalue``
    columns. The input frames are not modified.
    """
    def _join_on_id(left, right):
        # Both sides are indexed by ID at this point.
        return left.merge(right, left_index=True, right_index=True)

    prefixed = []
    for frame, cohort in zip(dfs, cohorts):
        indexed = frame.set_index("ID")
        indexed.columns = [f"{cohort}_{name}" for name in indexed.columns]
        prefixed.append(indexed)

    combined = reduce(_join_on_id, prefixed).reset_index()
    combined[["GENE", "MASK"]] = combined.ID.apply(get_gene_mask)

    stat_cols = []
    for cohort in cohorts:
        for lifestyle in lifestyles:
            for stat in ("F", "pvalue"):
                stat_cols.append(f"{cohort}_{lifestyle}_{stat}")
    return combined.loc[:, ["GENE", "MASK"] + stat_cols]



In [97]:
# Load the gzip-compressed lifestyle result tables, one per cohort (AoU and
# UKB); the UKB frame feeds both STable 14 and STable 15.
aou_lf_bias_df = pd.read_csv("../data/sex_lf_bias/aou_sex_lf_bias.csv.gz")
ukb_lf_bias_df = pd.read_csv("../data/sex_lf_bias/ukb_sex_lf_bias.csv.gz")

In [113]:
# Join the two cohorts on ID; columns are prefixed "ukb_" / "aou_" in the
# order given here, with the default lifestyles (pa, alcohol, smoke).
lf_df = parse_lf_df([ukb_lf_bias_df, aou_lf_bias_df], ["ukb", "aou"])

# STable 15: Lifestyle results in UKB for diet, sleep and sedentary lifestyles

In [122]:
def parse_ukb_lf_df(df, lifestyles=["diet", "sleep", "sedentary"]):
    """Reduce a lifestyle results table to gene, mask and per-lifestyle stats.

    ``GENE`` and ``MASK`` are derived from ``ID`` via ``get_gene_mask`` and
    are written onto the frame passed in. The returned frame holds ``GENE``,
    ``MASK`` and, for each entry of ``lifestyles``, its ``<lifestyle>_F`` and
    ``<lifestyle>_pvalue`` columns.
    """
    df[["GENE", "MASK"]] = df.ID.apply(get_gene_mask)
    stat_cols = []
    for lifestyle in lifestyles:
        stat_cols += [f"{lifestyle}_F", f"{lifestyle}_pvalue"]
    return df.loc[:, ["GENE", "MASK"] + stat_cols]

In [123]:
# UKB-only lifestyles (diet, sleep, sedentary). Side effect: the parser adds
# GENE and MASK columns to `ukb_lf_bias_df` itself.
ukb_lf_df = parse_ukb_lf_df(ukb_lf_bias_df)

# STable 16: Lifestyle selection

In [146]:
# Lifestyle fields selected from each biobank (UKB and AoU). Every row is
# (field_name, field_id, lifestyle, field_type, obesogenic_encoding, biobank).
lf_data = [
    # UKB categorical fields
    ("met", "22036", "pa", "categorical", "No", "ukb"),  # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=22036
    ("alcohol", "1558", "alcohol", "categorical", "Daily or almost daily", "ukb"),  # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=1558
    ("smokecurr", "1239", "smoke", "categorical", "Yes, on most or all days", "ukb"),  # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=1239
    # NOTE(review): same encoding string as field 1239 above — confirm it matches the coding of field 1249.
    ("smokepast", "1249", "smoke", "categorical", "Yes, on most or all days", "ukb"),  # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=1249
    # AoU categorical fields
    ("physicalhealth", "1585723", "pa", "categorical", "General Physical Health: Fair|General Physical Health: Poor", "aou"),
    ("alcoholfrequency", "1586201", "alcohol", "categorical", "Drink Frequency Past Year: 4 or More Per Week", "aou"),
    ("smoking", "1585860", "smoke", "categorical", "Smoke Frequency: Every Day", "aou"),
    # UKB numerical fields
    ("tv", "1070", "sedentary", "numerical", "greater_95_percentile", "ukb"),
    ("computer", "1080", "sedentary", "numerical", "greater_95_percentile", "ukb"),
    ("sleep", "1160", "sleep", "numerical", "greater_95_percentile", "ukb"),
]

# UKB diet fields, numerical. These rows are appended as lists (not tuples).
lf_data.extend(
    [name, field_id, "diet", "numerical", "lower_5_percentile", "ukb"]
    for name, field_id in [
        ("cookedvegetable", "1289"),
        ("salad", "1299"),
        ("freshfruit", "1309"),
        ("driedfruit", "1319"),
    ]
)

# UKB diet fields, categorical. Also appended as lists.
lf_data.extend(
    [name, field_id, "diet", "categorical", "5-6 times a week|Once or more daily", "ukb"]
    for name, field_id in [
        ("oilyfish", "1329"),
        ("nonoilyfish", "1339"),
        ("procmeat", "1349"),
        ("beef", "1369"),
        ("mutton", "1379"),
        ("pork", "1389"),
    ]
)



In [147]:
# Turn the lifestyle field list into a table, one row per field (STable 16).
raw_lf_df = pd.DataFrame(lf_data, columns=["field_name", "field_id", "lifestyle", "field_type", "obesogenic_encoding", "biobank"])

# STable 17: Comorbidity diagnosis

In [132]:
# Load the obesity-associated disease definitions; exported as-is (STable 17).
comorbid_df = pd.read_excel("../data/icd_enrichment/obesity_associated_diseases.xlsx")

In [149]:
# All supplementary tables in sheet order: position i in this list is written
# to the sheet "Supplementary Table {i+1}" by the export cell.
supp_dfs = [
    pop_df, risk_df, protective_df, meta_df, known_df, var_df, # Result 1 (STables 1-6)
    bmi_cat_enrich_df, icd_enrich_df,  # Result 2 (STables 7-8)
    protein_df, lit_df, lit_summary_df, pubmed_df, # Result 3 (STables 9-12)
    pgs_df, lf_df, ukb_lf_df, # Result 4 (STables 13-15)
    raw_lf_df, comorbid_df, # Methods (STables 16-17)
]

In [150]:
# Write every supplementary table to one workbook, one sheet per table
# ("Supplementary Table 1", "Supplementary Table 2", ... in `supp_dfs` order).
with pd.ExcelWriter('../data/tables/Supplementary.xlsx', engine='xlsxwriter', mode="w") as writer:
    for i, df in enumerate(supp_dfs):
        sheet_name = f"Supplementary Table {i+1}"
        # Capitalize all column names on a copy, so the frames in `supp_dfs`
        # keep their original headers and earlier cells (which select columns
        # by their lower-case names) can still be re-run after exporting.
        out_df = df.copy()
        out_df.columns = [c.upper() for c in out_df.columns]
        try:
            out_df.to_excel(writer, sheet_name=sheet_name, index=False)
        except NotImplementedError:
            # Retry with the index included; presumably covers frames that
            # pandas cannot write with index=False — TODO confirm still needed.
            out_df.to_excel(writer, sheet_name=sheet_name, index=True)