In [1]:
import numpy as np
import pandas as pd
import os
import tqdm
from scipy.stats import kstest,ttest_ind,fisher_exact
from scipy.stats.contingency import odds_ratio
from scipy import stats
import itertools as it
from collections import Counter
import seaborn as sns
from matplotlib.ticker import MultipleLocator
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams.update({'font.size': 14, 'axes.linewidth': 2, 'xtick.major.width': 1.5, 'xtick.major.size': 7, 'ytick.major.width': 1.5, 'ytick.major.size': 7})
from matplotlib.backends.backend_pdf import PdfPages
from functools import reduce
from scipy.stats import kstest,ttest_ind

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
class Node:
    """
    Each ICD10 diagnosis is stored as a Node object
    """
    def __init__(self, node_id, code, meaning, parent=None, child=None):
        self.node = node_id
        self.parent = parent
        self.child = child
        self.code, self.meaning = code, meaning
        self.samples = set()

    def add_child(self, child_node):
        if self.child:
            self.child.append(child_node)
        else:
            self.child = [child_node]
        return

    def add_parent(self, parent_node):
        if not self.parent:
            self.parent = parent_node
        else:
            assert self.parent == parent_node
        return

    def get_parent(self):
        return self.parent

    def get_child(self):
        return self.child

    def get_info(self):
        return self.code, self.meaning
    
    def get_samples(self):
        return self.samples
    
    def get_samples_number(self):
        return len(self.samples)


class Tree:
    def __init__(self, root_node, coding_df):
        self.root = root_node
        self.node_dict = {self.root.node : self.root}
        self.coding_df = coding_df

    def update_node_dict(self, node_id, node):
        if node_id not in self.node_dict:
            self.node_dict[node_id] = node
        return

    def create_node_from_df_helper(self, node_id):
        c, m, ni, pi =  self.coding_df.loc[self.coding_df.node_id==node_id].values[0]
        n = Node(ni, c, m)
        return n, pi

    def create_node_from_df(self, node_id):
        if node_id in self.node_dict:
            return self.node_dict[node_id]

        # creating a node and providing parent information
        mn, mnpi = self.create_node_from_df_helper(node_id)
        # if parent is not present in the tree
        if mnpi not in self.node_dict:
            # create the parent node and get its parent
            mnp = self.create_node_from_df(mnpi)
            # add that parent info to the created node
            mn.add_parent(mnp)
        else:
            mnp = self.node_dict[mnpi]
            # add that parent info to the created node
            mn.add_parent(mnp)

        # update the node dict with the created node
        self.update_node_dict(node_id, mn)
        # add the created node as a child of the parent node
        mnp.add_child(mn)
        return mn

    def print_node(self, curr_node, node_level, tree_file):
        curr_node_info = curr_node.get_info()
        if tree_file:
            tree_file.write(f"{'-' * node_level}{curr_node.node}\t{curr_node_info[1]}\n")
        else:
            print(f"{'-' * node_level}{curr_node.node}\t{curr_node_info[1]}\n")
        return

    def print_tree(self, curr_node, tree_file="", node_level=0, max_node_level=2):
        if node_level>max_node_level:
            return
        
        if curr_node:
            self.print_node(curr_node, node_level, tree_file)

            if curr_node.child:
                for c in curr_node.child:
                    self.print_tree(c, tree_file, node_level+1, max_node_level)
        return
    
    def add_sample_info(self, node_id, samples):
        curr_node = self.node_dict[node_id]
        curr_node.samples = samples.union(curr_node.samples)
        if curr_node.parent:
            self.add_sample_info(curr_node.parent.node, samples)
        return

In [3]:
def create_tree(icd_codes_df, icd_samples_df):
    # create tree
    # plant the tree
    root_pheno = Node(0, "0", "Root Phenotype")
    pheno_tree = Tree(root_pheno, icd_codes_df)
    # fill the tree with leaves and branches - takes 6 secs
    for ni in icd_codes_df.node_id:
        pheno_tree.create_node_from_df(ni)
    c2nodeid_dict = dict(zip(icd_codes_df.coding, icd_codes_df.node_id))
    # add sample info
    for icd_code, samples in tqdm.tqdm(zip(icd_samples_df.index, icd_samples_df.sample_names)):
        pheno_tree.add_sample_info(c2nodeid_dict[icd_code], set(samples.split("|")))
    return pheno_tree, root_pheno, c2nodeid_dict

In [4]:
def create_icd_samples_file(icd_raw_dir):
    dfs = []
    for file in os.scandir(icd_raw_dir):
        filepath = os.path.join(icd_raw_dir, file)
        df = pd.read_csv(filepath)
        dfs.append(df)
    icd_samples_df = pd.concat(dfs)
    icd_samples_df["icd"] = icd_samples_df.icd.str.split("|")
    icd_samples_df = icd_samples_df.explode("icd").groupby("icd").agg(lambda x: "|".join(map(str,x)))
    return icd_samples_df

In [5]:
icd_raw_dir = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/icd_raw"
icd_codes_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/icd_codes/data/icd_tree/coding19.tsv"

In [6]:
icd_samples_df = create_icd_samples_file(icd_raw_dir)

In [7]:
icd_codes_df = pd.read_csv(icd_codes_file, usecols=["coding", "meaning", "node_id", "parent_id"], sep="\t")

In [8]:
pheno_tree, root_pheno, c2nodeid_dict = create_tree(icd_codes_df, icd_samples_df)

12215it [08:48, 23.12it/s]


In [9]:
phenotype_file = "/data6/deepro/ukb_bmi/0_data_preparation_and_download/phenotype/data/bmi_processed/british/train_cohort_bmi.csv.gz"
cohort_files = [
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo2.csv",
    "/data6/deepro/ukb_bmi/3_characterization/data/combo_info/british/discovery_combo3.csv"
]

phenotype_df = pd.read_csv(phenotype_file)


cohort_df = pd.concat([pd.read_csv(cf) for cf in cohort_files])
all_combo_samples = set("|".join(cohort_df.combo_samples.values).split("|"))

categorical_cols = ["genetic_sex"]
numerical_cols = ["age"] + [f"genetic_pca{i}" for i in range(1, 40)]
scaled_numerical_cols = []#["bmi_prs"]

def get_scaled_bmi(df, categorical_cols, numerical_cols, scaled_numerical_cols):
    # define encoders
    en = LabelEncoder()
    scaler = StandardScaler()
    # select the categorical and numerical columns
    # transform the categorical columns to integer values
    for cat_col in categorical_cols:
        df[cat_col] = en.fit_transform(df[cat_col])
    # scale the numerical columns
    df[numerical_cols] = scaler.fit_transform(df.loc[:, numerical_cols])
    # scale bmi separately
    df["bmi_scaled"] = scaler.fit_transform(df.loc[:, ["bmi"]])
    # Create the target variable (bmi_residuals) using linear regression
    X = df.loc[:, categorical_cols + numerical_cols + scaled_numerical_cols]
    y = df.loc[:, 'bmi_scaled']
    model = LinearRegression()
    model.fit(X, y)
    # save the residuals for bmi
    df['bmi_residuals'] = y - model.predict(X)
    return df

phenotype_df = get_scaled_bmi(phenotype_df, categorical_cols, numerical_cols, scaled_numerical_cols)
phenotype_df["bmi_res_decile"] = pd.qcut(phenotype_df.bmi_residuals, q=10)
phenotype_df["bmi_res_decile_num"] = pd.qcut(phenotype_df.bmi_residuals, q=10, labels=False)
phenotype_combo_samples_df = phenotype_df.loc[phenotype_df.sample_names.astype(str).isin(list(map(str, all_combo_samples)))]
phenotype_other_samples_df = phenotype_df.loc[~phenotype_df.sample_names.astype(str).isin(list(map(str, all_combo_samples)))]

phenotype_other_samples_df["description"] = "Non Combo"
phenotype_combo_samples_df["description"] = "Combo"
phenotype_samples_df = pd.concat((phenotype_combo_samples_df, phenotype_other_samples_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_other_samples_df["description"] = "Non Combo"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_combo_samples_df["description"] = "Combo"


In [10]:
def get_table(combo_samples, noncombo_samples, comorbid_samples, field):
    table = [
        [len(combo_samples.intersection(comorbid_samples)), len(combo_samples.difference(comorbid_samples))],
        [len(noncombo_samples.intersection(comorbid_samples)), len(noncombo_samples.difference(comorbid_samples))]
    ]
    df = pd.DataFrame(table, columns=[f"{field}", f"No {field}"], index=["Combo", "Non Combo"])
    return df

  
def get_icd_enrich_per_decile(decile_df, save_file):
    combo_samples = set(decile_df.loc[decile_df.description=="Combo", "sample_names"].astype(str).values)
    noncombo_samples = set(decile_df.loc[decile_df.description=="Non Combo", "sample_names"].astype(str).values)

    icd_data = []

    for icdc in tqdm.tqdm(icd_codes_df.coding.values):
        icdc_node = pheno_tree.node_dict[c2nodeid_dict[icdc]]
        comorbid_samples = icdc_node.get_samples()
        df = get_table(combo_samples, noncombo_samples, comorbid_samples, icdc_node.meaning)
        res = fisher_exact(df)
        or_study = odds_ratio(df)
        cil, cih = or_study.confidence_interval(confidence_level=0.95)
        icdc_node_data = (icdc, icdc_node.meaning, df.iloc[0,0], df.iloc[0,1], df.iloc[1,0], df.iloc[1,1], or_study.statistic, res.pvalue, cil, cih)
        icd_data.append(icdc_node_data)
    
    icd_df = pd.DataFrame(icd_data, columns=["icd_code", "icd_meaning", "combo_comorbid", "combo_noncomorbid", "noncombo_comorbid", "noncombo_noncomorbid", "odds_ratio", "p_value", "ci_low", "ci_high"])
    icd_df["FDR"] = stats.false_discovery_control(icd_df.p_value)
    icd_df.to_csv(save_file, index=False)
    return

In [11]:
bottom_decile = phenotype_samples_df.loc[phenotype_samples_df.bmi_res_decile_num==0]
bottom_decile_save_file = "/data6/deepro/ukb_bmi/3_characterization/data/pilot/icd_enrich_zeroth_decile.csv"

In [12]:
get_icd_enrich_per_decile(bottom_decile, bottom_decile_save_file)

100%|██████████| 19190/19190 [01:56<00:00, 165.09it/s]


In [16]:
top_decile = phenotype_samples_df.loc[phenotype_samples_df.bmi_res_decile_num==9]
top_decile_save_file = "/data6/deepro/ukb_bmi/3_characterization/data/pilot/icd_enrich_tenth_decile.csv"

other_decile = phenotype_samples_df.loc[phenotype_samples_df.bmi_res_decile_num==8]
other_decile_save_file = "/data6/deepro/ukb_bmi/3_characterization/data/pilot/icd_enrich_ninth_decile.csv"

In [17]:
get_icd_enrich_per_decile(top_decile, top_decile_save_file)

  0%|          | 34/19190 [00:00<02:13, 143.81it/s]

100%|██████████| 19190/19190 [03:12<00:00, 99.47it/s] 


In [18]:
get_icd_enrich_per_decile(other_decile, other_decile_save_file)

  0%|          | 44/19190 [00:00<02:02, 156.20it/s]

100%|██████████| 19190/19190 [02:46<00:00, 115.39it/s]
