In [1]:
import os
import argparse
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact, chi2_contingency
from scipy.stats.contingency import odds_ratio
from scipy import stats
from functools import reduce

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

import sys
sys.path.append("../src")
import utils.parsing as utpa
import utils.plotting as utpl

def get_scaled_bmi(df, categorical_cols, numerical_cols, scaled_numerical_cols):
    """Add standardized BMI and its covariate-adjusted residuals to a phenotype table.

    Categorical covariates are label-encoded, numerical covariates are
    standardized, and a linear regression of standardized BMI on all covariates
    is fitted. The residuals of that fit are stored as ``bmi_residuals``.

    Args:
        df: DataFrame holding a ``bmi`` column plus every covariate column
            named in the three lists below.
        categorical_cols: columns that are integer-encoded with ``LabelEncoder``.
        numerical_cols: columns that are standardized with ``StandardScaler``.
        scaled_numerical_cols: columns used as covariates as-is; no scaling is
            applied to them here.

    Returns:
        A new DataFrame (the input frame is left untouched) containing the
        encoded/scaled covariates and two added columns: ``bmi_scaled`` and
        ``bmi_residuals``.
    """
    # Copy first: the encoding/scaling below overwrites existing columns, and
    # doing that on the caller's frame would silently replace their raw values.
    df = df.copy()
    # Accept any sequence (tuple, Index, ...) while keeping list concatenation valid.
    categorical_cols = list(categorical_cols)
    numerical_cols = list(numerical_cols)
    scaled_numerical_cols = list(scaled_numerical_cols)

    # transform the categorical columns to integer values
    for cat_col in categorical_cols:
        df[cat_col] = LabelEncoder().fit_transform(df[cat_col])
    # scale the numerical columns (skipped when there are none to scale)
    if numerical_cols:
        df[numerical_cols] = StandardScaler().fit_transform(df.loc[:, numerical_cols])
    # scale bmi separately so it gets its own mean/variance
    df["bmi_scaled"] = StandardScaler().fit_transform(df.loc[:, ["bmi"]])

    # regress scaled bmi on the covariates
    X = df.loc[:, categorical_cols + numerical_cols + scaled_numerical_cols]
    y = df.loc[:, "bmi_scaled"]
    model = LinearRegression()
    model.fit(X, y)
    # save the residuals for bmi: observed minus fitted
    df["bmi_residuals"] = y - model.predict(X)
    return df

def create_bmi_res_prs_decile_data(phenotype_df, all_combo_samples):
    """Annotate a phenotype table with BMI-residual / PRS deciles and combo status.

    BMI residuals are obtained from ``get_scaled_bmi`` using genetic sex, age
    and ``genetic_pca1`` .. ``genetic_pca39`` as covariates. Deciles of
    ``bmi_residuals`` and ``bmi_prs`` are then added both as interval
    categories (``*_decile``) and as integer codes 0-9 (``*_decile_num``).

    Args:
        phenotype_df: DataFrame with ``sample_names``, ``bmi``, ``bmi_prs`` and
            the covariate columns listed above.
        all_combo_samples: iterable of sample ids that carry a combo; ids are
            compared to ``sample_names`` as strings.

    Returns:
        DataFrame with the combo rows first and all remaining rows after them,
        carrying a ``description`` column set to ``"Combo"`` or ``"Non Combo"``.
    """
    categorical_cols = ["genetic_sex"]
    numerical_cols = ["age"] + [f"genetic_pca{i}" for i in range(1, 40)]
    scaled_numerical_cols = []  # "bmi_prs" is currently not used as a covariate

    phenotype_df = get_scaled_bmi(phenotype_df, categorical_cols, numerical_cols, scaled_numerical_cols)

    # Same binning for both measures: interval labels plus integer decile codes.
    for source_col, prefix in (("bmi_residuals", "bmi_res"), ("bmi_prs", "bmi_prs")):
        phenotype_df[f"{prefix}_decile"] = pd.qcut(phenotype_df[source_col], q=10)
        phenotype_df[f"{prefix}_decile_num"] = pd.qcut(phenotype_df[source_col], q=10, labels=False)

    # Build the membership mask once, against a set of string ids.
    combo_ids = {str(sample) for sample in all_combo_samples}
    is_combo = phenotype_df["sample_names"].astype(str).isin(combo_ids)

    # .assign returns new frames, so no column is written onto a slice of
    # phenotype_df (which is what raises SettingWithCopyWarning).
    phenotype_combo_samples_df = phenotype_df.loc[is_combo].assign(description="Combo")
    phenotype_other_samples_df = phenotype_df.loc[~is_combo].assign(description="Non Combo")
    return pd.concat((phenotype_combo_samples_df, phenotype_other_samples_df))

In [2]:
# Input locations: the combo info CSVs and the training cohort table that the
# next cell loads.
combo_files = [
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo2.csv",
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo3.csv",
]
cohort_file = (
    "/data6/deepro/ukb_bmi/0_data_preparation_and_download/phenotype/data/bmi_processed/british/train_cohort_bmi.csv.gz"
)


In [3]:
# Read only the columns used below: sample id, covariates, PRS and BMI.
cohort_df = pd.read_csv(
    cohort_file,
    usecols=[
        "sample_names",
        "genetic_sex",
        "age",
        *(f"genetic_pca{i}" for i in range(1, 40)),
        "bmi_prs",
        "bmi",
    ],
)
# Keep sample ids as strings so they can be matched against other string ids later.
cohort_df["sample_names"] = cohort_df["sample_names"].astype(str)
combo_genes, combo_samples = utpa.get_combo_info_from_files(combo_files)

# Attach BMI residuals, deciles and the Combo / Non Combo label to every sample.
phenotype_samples_df = create_bmi_res_prs_decile_data(cohort_df, combo_samples)

In [4]:
# ICD inputs consumed by the next cell: coding table, HES info table and the raw ICD directory.
icd_codes_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/icd_tree/coding19.tsv"
hes_info_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/hes_info/hes_info.csv.gz"
icd_raw_dir = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/icd_raw"

In [5]:
# Load the ICD inputs and build the phenotype tree through the project helpers.
icd_samples_df = utpa.create_icd_samples_file(icd_raw_dir)
icd_codes_df = pd.read_csv(
    icd_codes_file,
    sep="\t",
    usecols=["coding", "meaning", "node_id", "parent_id"],
)
# Strip spaces from the codes before they are handed to the tree builder.
icd_codes_df["coding"] = icd_codes_df["coding"].str.replace(" ", "")
pheno_tree, root_pheno, c2nodeid_dict = utpa.create_tree(icd_codes_df, icd_samples_df)

# Samples with a positive hes_info value are the ones kept for the ICD analysis.
hes_info_df = pd.read_csv(hes_info_file, dtype={"sample_names": str, "hes_info": float})
all_icd_samples = set(hes_info_df.loc[hes_info_df["hes_info"] > 0, "sample_names"].to_numpy())

# Restrict the cohort table, then the combo carriers, to those samples and
# report the sizes before and after each step.
print(f"Samples in cohort {len(phenotype_samples_df)}")
phenotype_samples_df = phenotype_samples_df.loc[phenotype_samples_df["sample_names"].isin(all_icd_samples)]
print(f"Samples with icd {len(phenotype_samples_df)}")
print(f"Samples with combo {len(combo_samples)}")
combo_samples = combo_samples.intersection(all_icd_samples)
print(f"Samples with combo and icd {len(combo_samples)}")


Samples in cohort 427639
Samples with icd 383253
Samples with combo 10193
Samples with combo and icd 9269


In [10]:
def get_table_icd(combo_samples, noncombo_samples, comorbid_samples, field):
    """Build the 2x2 contingency table of combo status versus one diagnosis.

    Rows are ``"Combo"`` and ``"Non Combo"``; columns are ``field`` and
    ``"No <field>"``. Each cell is the number of samples in the row's group
    that are (first column) or are not (second column) in ``comorbid_samples``.
    """
    groups = (("Combo", combo_samples), ("Non Combo", noncombo_samples))
    counts = [
        [len(members.intersection(comorbid_samples)), len(members.difference(comorbid_samples))]
        for _, members in groups
    ]
    row_labels = [label for label, _ in groups]
    return pd.DataFrame(counts, columns=[f"{field}", f"No {field}"], index=row_labels)

def get_icd_enrich(phenotype_samples_df, pheno_tree, icd_codes_df, c2nodeid_dict):
    """Test every ICD code for enrichment among combo carriers.

    For each code in ``icd_codes_df.coding`` the samples attached to the
    matching tree node are intersected with the samples of
    ``phenotype_samples_df``. A 2x2 table (Combo / Non Combo versus
    diagnosed / not diagnosed) is then built and evaluated with Fisher's exact
    test and ``scipy.stats.contingency.odds_ratio`` (95% confidence interval).

    Args:
        phenotype_samples_df: DataFrame with ``sample_names`` and a
            ``description`` column holding ``"Combo"`` / ``"Non Combo"``.
        pheno_tree: object exposing ``node_dict``; each node provides
            ``get_samples()`` and ``meaning``.
        icd_codes_df: DataFrame whose ``coding`` column lists the codes to test.
        c2nodeid_dict: mapping from ICD code to the node id used in
            ``pheno_tree.node_dict``.

    Returns:
        DataFrame with one row per code: the four table counts, odds ratio,
        p-value, confidence bounds and an ``FDR`` column computed with
        ``scipy.stats.false_discovery_control`` over all p-values.

    Raises:
        KeyError: if a code is missing from ``c2nodeid_dict`` or its node id is
            missing from ``pheno_tree.node_dict``.
    """
    is_combo = phenotype_samples_df.description == "Combo"
    is_noncombo = phenotype_samples_df.description == "Non Combo"
    combo_samples = set(phenotype_samples_df.loc[is_combo, "sample_names"].astype(str).values)
    noncombo_samples = set(phenotype_samples_df.loc[is_noncombo, "sample_names"].astype(str).values)
    # Cast to str like the two sets above, so all three sets hold comparable ids.
    all_cohort_samples = set(phenotype_samples_df.sample_names.astype(str).values)

    result_columns = [
        "icd_code", "icd_meaning",
        "combo_comorbid", "combo_noncomorbid", "noncombo_comorbid", "noncombo_noncomorbid",
        "odds_ratio", "p_value", "ci_low", "ci_high",
    ]
    icd_data = []

    for icdc in icd_codes_df.coding.values:
        icdc_node = pheno_tree.node_dict[c2nodeid_dict[icdc]]
        # Only count diagnosed samples that belong to the analysed cohort.
        comorbid_samples = all_cohort_samples.intersection(icdc_node.get_samples())
        df = get_table_icd(combo_samples, noncombo_samples, comorbid_samples, icdc_node.meaning)
        res = fisher_exact(df)
        or_study = odds_ratio(df)
        cil, cih = or_study.confidence_interval(confidence_level=0.95)
        icd_data.append((
            icdc, icdc_node.meaning,
            df.iloc[0, 0], df.iloc[0, 1], df.iloc[1, 0], df.iloc[1, 1],
            or_study.statistic, res.pvalue, cil, cih,
        ))

    icd_df = pd.DataFrame(icd_data, columns=result_columns)
    if icd_df.empty:
        # No codes were tested, so there are no p-values to adjust.
        icd_df["FDR"] = pd.Series(dtype=float)
    else:
        icd_df["FDR"] = stats.false_discovery_control(icd_df.p_value)
    return icd_df

In [11]:
# Codes to test: rows of the filtered enrichment table whose `level` is 2.
icd_codes_of_interest_file = "/data6/deepro/ukb_bmi/3_characterization/data/enrichment/british/discovery/icd_enrichment_filtered.csv"
icd_codes_of_interest_df = pd.read_csv(icd_codes_of_interest_file).loc[
    lambda codes: codes["level"] == 2
]
# Samples to test: decile codes from qcut(labels=False) run 0-9, so 9 is the
# highest BMI-residual decile.
phenotype_samples_of_interest_df = phenotype_samples_df[
    phenotype_samples_df["bmi_res_decile_num"] == 9
]

In [12]:
# Combo vs. non-combo ICD enrichment within the selected samples and codes.
icd_enrich_extreme_df = get_icd_enrich(
    phenotype_samples_df=phenotype_samples_of_interest_df,
    pheno_tree=pheno_tree,
    icd_codes_df=icd_codes_of_interest_df,
    c2nodeid_dict=c2nodeid_dict,
)

In [14]:
# Number of ICD codes that were tested (one row per code).
icd_enrich_extreme_df.shape[0]

38

In [16]:
# Save the enrichment table; with to_csv defaults the DataFrame index is written as the first column.
icd_enrich_extreme_df.to_csv(
    "/data6/deepro/ukb_bmi/3_characterization/data/pilot/extreme_decile_icd_enrich.csv"
)