In [1]:
import os
import pandas as pd
from scipy.stats.contingency import odds_ratio
import forestplot as fp
from scipy.stats import fisher_exact

In [2]:
obese_case_cont_file = "/data6/deepro/ukb_bmi/1_data_processing/data/british/case_controls.csv"
combo_info_dir = "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/comorbidities"
comorbidities = ["E03.9Hypothyroidism,unspecified", "E11Non-insulin-dependentdiabetesmellitus", "E78.0Purehypercholesterolaemia", "I10Essential(primary)hypertension", "I20-I25Ischaemicheartdiseases", "K80-K87Disordersofgallbladder,biliarytractandpancreas", "M15-M19Arthrosis"]
comorbidity_codes = ["E039", "E11", "E780", "I10", "BlockI20-I25", "BlockK80-K87", "BlockM15-M19"]
combo_info_filename_start = "discovery"

# Annotate obese info

In [3]:
for cf in os.scandir(combo_info_dir):
    print(cf.name)

I10Essential(primary)hypertension
E11Non-insulin-dependentdiabetesmellitus
M15-M19Arthrosis
E78.0Purehypercholesterolaemia
I20-I25Ischaemicheartdiseases
K80-K87Disordersofgallbladder,biliarytractandpancreas
E03.9Hypothyroidism,unspecified


In [4]:
combo_dfs = []
for cm, cmc in zip(comorbidities, comorbidity_codes):
    combo_df = pd.concat([pd.read_csv(cf) for cf in os.scandir(os.path.join(combo_info_dir, cm)) if (cf.name.startswith(combo_info_filename_start)&(cf.name.endswith(".csv")))])
    combo_df["combo_samples"] = combo_df.combo_samples.str.split("|")
    combo_df = combo_df.explode("combo_samples")
    combo_df = combo_df.reset_index(drop=True)
    combo_df["disease"] = cm
    combo_df["code"] = cmc
    combo_dfs.append(combo_df)

all_combo_df = pd.concat(combo_dfs).reset_index(drop=True)

In [5]:
obese_case_cont_df = pd.read_csv(obese_case_cont_file)

In [6]:
obese_samples = set(obese_case_cont_df.loc[obese_case_cont_df.Output_BMI==1, "Sample_Name"].astype(str).values)

In [7]:
all_combo_df["obese_case"] = all_combo_df.combo_samples.isin(obese_samples)

# ICD annotation

In [8]:
import sys

sys.path.append("../src/utils")

import parsing as utpa

In [9]:
icd_raw_dir = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/icd_raw"
icd_codes_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/icd_tree/coding19.tsv"
cohort_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/phenotype/data/bmi_processed/british/train_cohort_bmi.csv.gz"
hes_info_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/hes_info/hes_info.csv.gz"

In [10]:
icd_samples_df = utpa.create_icd_samples_file(icd_raw_dir)
icd_codes_df = pd.read_csv(icd_codes_file, usecols=["coding", "meaning", "node_id", "parent_id"], sep="\t")
icd_codes_df["coding"] = icd_codes_df.coding.str.replace(" ", "")
pheno_tree, root_pheno, c2nodeid_dict = utpa.create_tree(icd_codes_df, icd_samples_df)
hes_info_df = pd.read_csv(hes_info_file, dtype={"sample_names": str, "hes_info": float})
all_icd_samples = set(hes_info_df.loc[hes_info_df.hes_info>0, "sample_names"].values)

In [11]:
all_combo_w_icd_df = all_combo_df.loc[all_combo_df.combo_samples.isin(all_icd_samples)]

In [12]:
all_combo_w_icd_obese_df = all_combo_w_icd_df.loc[all_combo_w_icd_df.obese_case==True]

In [13]:
all_combo_w_icd_nonobese_df = all_combo_w_icd_df.loc[all_combo_w_icd_df.obese_case==False]

In [14]:
def get_table_icd(combo_samples, noncombo_samples, comorbid_samples, field):
    table = [
        [len(combo_samples.intersection(comorbid_samples)), len(combo_samples.difference(comorbid_samples))],
        [len(noncombo_samples.intersection(comorbid_samples)), len(noncombo_samples.difference(comorbid_samples))]
    ]
    df = pd.DataFrame(table, columns=[f"{field}", f"No {field}"], index=["Combo", "Non Combo"])
    return df

In [28]:
def check_icd_enrichment_combo_in_groups(all_combo_w_icd_df, all_samples):
    all_combo_w_icd_dfs = []
    icd_data = []

    for group, gdf in all_combo_w_icd_df.groupby("code"):
        icdc_node = pheno_tree.node_dict[c2nodeid_dict[group]]
        icd_samples = icdc_node.get_samples()
        gdf["icd_case"] = gdf.combo_samples.isin(icd_samples)
        combo_samples = set(gdf.combo_samples.unique())
        non_combo_samples = all_samples.difference(combo_samples)
        df = get_table_icd(combo_samples, non_combo_samples, icd_samples, group)
        res = fisher_exact(df)
        or_study = odds_ratio(df)
        cil, cih = or_study.confidence_interval(confidence_level=0.95)
        icdc_node_data = (group, icdc_node.meaning, df.iloc[0,0], df.iloc[0,1], df.iloc[1,0], df.iloc[1,1], or_study.statistic, res.pvalue, cil, cih)
        icd_data.append(icdc_node_data)
        all_combo_w_icd_dfs.append(gdf)

    all_combo_w_icd_annot_df = pd.concat(all_combo_w_icd_dfs)
    all_combo_w_icd_enrich_df = pd.DataFrame(icd_data, columns=["icd_code", "icd_meaning", "combo_comorbid", "combo_noncomorbid", "noncombo_comorbid", "noncombo_noncomorbid", "odds_ratio", "p_value", "ci_low", "ci_high"])
    return all_combo_w_icd_annot_df, all_combo_w_icd_enrich_df

In [29]:
non_obese_samples = all_icd_samples.difference(obese_samples)

In [30]:
all_combo_w_icd_annot_nonobese_df, all_combo_w_icd_enrich_nonobese_df = check_icd_enrichment_combo_in_groups(all_combo_w_icd_nonobese_df, non_obese_samples)

In [31]:
all_combo_w_icd_annot_obese_df, all_combo_w_icd_enrich_obese_df = check_icd_enrichment_combo_in_groups(all_combo_w_icd_obese_df, obese_samples)

In [32]:
all_combo_w_icd_annot_df, all_combo_w_icd_enrich_df = check_icd_enrichment_combo_in_groups(all_combo_w_icd_df, all_icd_samples)

In [33]:
all_combo_w_icd_enrich_df

Unnamed: 0,icd_code,icd_meaning,combo_comorbid,combo_noncomorbid,noncombo_comorbid,noncombo_noncomorbid,odds_ratio,p_value,ci_low,ci_high
0,BlockI20-I25,I20-I25 Ischaemic heart diseases,3761,10578,53605,381146,2.528044,0.0,2.432416,2.627023
1,BlockK80-K87,"K80-K87 Disorders of gallbladder, biliary trac...",1533,6483,33201,407873,2.904948,5.858715999999999e-240,2.74282,3.075226
2,BlockM15-M19,M15-M19 Arthrosis,7722,11197,85296,344875,2.788431,0.0,2.705804,2.873479
3,E039,"E03.9 Hypothyroidism, unspecified",1277,7085,27774,412954,2.679865,5.202388e-179,2.519783,2.848403
4,E11,E11 Non-insulin-dependent diabetes mellitus,3291,9851,37548,398400,3.544689,0.0,3.40174,3.692976
5,E780,E78.0 Pure hypercholesterolaemia,4918,10928,66070,367174,2.501003,0.0,2.415307,2.589479
6,I10,I10 Essential (primary) hypertension,8576,5970,142399,292145,2.94714,0.0,2.849301,3.048484


In [34]:
all_combo_w_icd_enrich_obese_df

Unnamed: 0,icd_code,icd_meaning,combo_comorbid,combo_noncomorbid,noncombo_comorbid,noncombo_noncomorbid,odds_ratio,p_value,ci_low,ci_high
0,BlockI20-I25,I20-I25 Ischaemic heart diseases,2683,2039,11113,69693,8.251709,0.0,7.760362,8.775136
1,BlockK80-K87,"K80-K87 Disorders of gallbladder, biliary trac...",1191,1230,8904,74203,8.069069,0.0,7.422277,8.77207
2,BlockM15-M19,M15-M19 Arthrosis,5707,2048,19064,58709,8.581341,0.0,8.13601,9.053576
3,E039,"E03.9 Hypothyroidism, unspecified",931,1377,6655,76565,7.778161,0.0,7.123037,8.490951
4,E11,E11 Non-insulin-dependent diabetes mellitus,2840,1940,12764,67984,7.796876,0.0,7.334368,8.289795
5,E780,E78.0 Pure hypercholesterolaemia,3539,2213,13680,66096,7.726326,0.0,7.301526,8.177097
6,I10,I10 Essential (primary) hypertension,6512,988,33104,44924,8.944287,0.0,8.351169,9.58726


In [35]:
all_combo_w_icd_enrich_nonobese_df

Unnamed: 0,icd_code,icd_meaning,combo_comorbid,combo_noncomorbid,noncombo_comorbid,noncombo_noncomorbid,odds_ratio,p_value,ci_low,ci_high
0,BlockI20-I25,I20-I25 Ischaemic heart diseases,1078,8539,42496,317558,0.94338,0.07533913,0.883942,1.006037
1,BlockK80-K87,"K80-K87 Disorders of gallbladder, biliary trac...",342,5253,24297,339779,0.910464,0.09939983,0.813115,1.016718
2,BlockM15-M19,M15-M19 Arthrosis,2015,9149,66236,292271,0.971836,0.2545554,0.92494,1.020738
3,E039,"E03.9 Hypothyroidism, unspecified",346,5708,21121,342496,0.982954,0.7817518,0.878542,1.096837
4,E11,E11 Non-insulin-dependent diabetes mellitus,451,7911,24788,336521,0.773956,5.938921e-08,0.701724,0.851881
5,E780,E78.0 Pure hypercholesterolaemia,1379,8715,52392,307185,0.927752,0.01040434,0.875234,0.982852
6,I10,I10 Essential (primary) hypertension,2064,4982,109305,253320,0.960142,0.1251137,0.911237,1.011398


# Obesity related disorders PRS

Check PRS for obesity, hypertension and t2 diabetes amongst

1. Obesity + comorbid combo carriers vs non-carriers
2. Non obese + comorbid combo carriers vs non-carriers
3. Obese combo carrier w vs w/o comorbidity
4. Obese non-combo carriers w vs w/o comorbidity

In [22]:
prs_obesity_related_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/obesity_related_prs/data/prs_processed/obesity_related.csv.gz"
prs_obesity_related_df = pd.read_csv(prs_obesity_related_file)