In [None]:
import os
import dxpy
import pandas as pd
import numpy as np

# Get consensus data for phenotypes and covariates

In [None]:
def get_consensus(row):
    """Collapse repeated measurements of one field into a single value.

    Parameters
    ----------
    row : pd.Series
        Values recorded for one sample across the repeated-measure columns.

    Returns
    -------
    object
        ``np.nan`` when every entry is null, the single distinct non-null
        value when there is exactly one, and the string "inconsistent" when
        the non-null entries disagree.
    """
    observed = [value for value in row.unique() if pd.notnull(value)]
    if not observed:
        return np.nan
    if len(observed) == 1:
        return observed[0]
    return "inconsistent"

def get_mean(row):
    """Return ``row.mean()``; for a pandas Series this skips NaN entries."""
    average = row.mean()
    return average

def process_sample_info(df: pd.DataFrame, categorical_fields: dict, numerical_fields: dict):
    """Collapse repeated-measure columns into one column per field.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table with one column per repeated measurement. It is NOT
        modified; every derived column is written to a new frame.
    categorical_fields : dict
        Maps output column name -> list of source columns. The sources are
        combined row-wise with ``get_consensus``.
    numerical_fields : dict
        Maps output column name -> list of source columns. The sources are
        combined with a row-wise mean that skips NaN.

    Returns
    -------
    pd.DataFrame
        An independent frame in which the source columns are replaced by the
        derived ones. Rows whose derived numerical columns are all NaN are
        removed; when ``numerical_fields`` is empty no row is removed.
    """
    # Make consensus column for categorical fields.
    # The sources are dropped *before* the derived column is attached:
    # ``drop`` returns a new frame, so the caller's ``df`` never gains a
    # column, and an output name that equals one of its sources survives.
    for field, field_measures in categorical_fields.items():
        measures = list(field_measures)
        consensus = df.loc[:, measures].apply(get_consensus, axis=1)
        df = df.drop(columns=measures)
        df[field] = consensus

    # Get mean value for numerical fields (vectorised row-wise mean)
    for field, field_measures in numerical_fields.items():
        measures = list(field_measures)
        mean_values = df.loc[:, measures].mean(axis=1)
        df = df.drop(columns=measures)
        df[field] = mean_values

    # Filter samples with NaN values for all numerical fields. With no
    # numerical field the "all NaN" test would be vacuously true for every
    # row, so the filter is skipped in that case.
    numeric_columns = list(numerical_fields)
    if numeric_columns:
        has_any_value = df.loc[:, numeric_columns].notna().any(axis=1)
        df = df.loc[has_any_value]

    # Return an independent copy so that callers can add columns to the
    # result without pandas' SettingWithCopyWarning.
    return df.copy()


In [None]:
# Raw phenotype/covariate table; its repeated-measure columns are listed in
# the two field maps below.
pheno_w_cov_file = "/mnt/project/notebooks/bmi/data/bmi_with_cov_raw.csv.gz"

# output column -> repeated-measure columns that are averaged row-wise
numerical_fields = {
    "bmi": ["bmi0", "bmi1", "bmi2", "bmi3"],
    "age": ["age_assessment0", "age_assessment1", "age_assessment2", "age_assessment3"],
    }

# output column -> repeated-measure columns reduced to one consensus value
categorical_fields = {
    "ethnic_background": ["ethnic_background0", "ethnic_background1", "ethnic_background2"]
    }

# Sample IDs are read as strings, like the other tables keyed on sample_names.
df = pd.read_csv(pheno_w_cov_file, dtype={"sample_names": str})
processed_df = process_sample_info(df, categorical_fields, numerical_fields)


In [None]:
# Preview the first five rows of the processed table.
processed_df.head(n=5)

# Add ancestry information

In [None]:
ancestry_df = pd.read_csv(
    "/mnt/project/notebooks/ancestry_inference/data/ancestry_pred.csv.gz",
    dtype={"sample_names": str},
)
# sample ID -> predicted ancestry label
ancestry_dict = ancestry_df.set_index("sample_names")["ancestry_pred"].to_dict()


In [None]:
# `assign` builds a new frame instead of writing a column into `processed_df`
# in place, which avoids pandas' SettingWithCopyWarning when `processed_df`
# is a row-filtered selection of another frame.
# Samples that are absent from `ancestry_dict` get NaN.
processed_df = processed_df.assign(
    ancestry_pred=processed_df["sample_names"].map(ancestry_dict)
)

# Add sample qc filter information

In [None]:
# Only the sample ID (`s`) and the QC `filters` column are read; both are
# renamed to the names used in the phenotype table.
sample_qc_columns = {"s": "sample_names", "filters": "sample_qc_filters"}
sample_qc_df = pd.read_csv(
    "/mnt/project/notebooks/wes/sample_qc/data/flagged_samples.tsv",
    sep="\t",
    usecols=list(sample_qc_columns),
    dtype={"s": str},
)
sample_qc_df = sample_qc_df.rename(columns=sample_qc_columns)


In [None]:
# Inner join: samples without a row in `sample_qc_df` are dropped here.
processed_df = pd.merge(processed_df, sample_qc_df, how="inner", on="sample_names")

In [None]:
# Number of samples left after joining the sample QC table.
processed_df.shape[0]

# Remove samples which
- do not have exome data
- failed qc filter
- do not have BMI
- do not have ancestry information
- have >=10 third-degree relatives

Note: A small amount of missing covariate information (other than genetic sex) can be handled by REGENIE. Therefore, age and the genetic PCs were not checked for missing values.

In [None]:
# One named mask per exclusion criterion.
has_exome = processed_df["exome_release_batch"].notna()
passed_sample_qc = processed_df["sample_qc_filters"].isna()
has_bmi = processed_df["bmi"].notna()
has_ancestry = processed_df["ancestry_pred"].notna()
# `!=` is True for NaN, so samples with a missing kinship value are kept.
not_highly_related = (
    processed_df["genetic_kinship_to_other_participants"]
    != "Ten or more third-degree relatives identified"
)

# `.copy()` makes `filtered_df` an independent frame: later cells add columns
# to it, which would otherwise raise SettingWithCopyWarning on a `.loc` slice.
filtered_df = processed_df.loc[
    has_exome & passed_sample_qc & has_bmi & has_ancestry & not_highly_related
].copy()

In [None]:
# Number of samples that pass every exclusion criterion.
filtered_df.shape[0]

# Add ICD info

In [None]:

############
# ICD tree #
############
class Node:
    """
    Each ICD10 diagnosis is stored as a Node object.

    Attributes set in ``__init__``:
        node    -- the identifier passed as ``node_id``
        parent  -- parent Node, or None until one is attached
        child   -- list of child Nodes, or None until one is attached
        code    -- the coding of the diagnosis
        meaning -- the description of the diagnosis
        samples -- set of samples attached to this node (starts empty)
    """
    def __init__(self, node_id, code, meaning, parent=None, child=None):
        self.node = node_id
        self.code = code
        self.meaning = meaning
        self.parent = parent
        self.child = child
        self.samples = set()

    def add_child(self, child_node):
        """Append ``child_node``; start a new list when none exists yet."""
        if not self.child:
            self.child = [child_node]
            return
        self.child.append(child_node)

    def add_parent(self, parent_node):
        """Attach ``parent_node``; if a parent is already set, assert it is the same one."""
        if self.parent:
            assert self.parent == parent_node
        else:
            self.parent = parent_node

    def get_parent(self):
        """Return the parent Node (None when no parent is attached)."""
        return self.parent

    def get_child(self):
        """Return the list of child Nodes (None when no child is attached)."""
        return self.child

    def get_info(self):
        """Return the ``(code, meaning)`` pair of this node."""
        return self.code, self.meaning

    def get_samples(self):
        """Return the set of samples attached to this node."""
        return self.samples

    def get_samples_number(self):
        """Return how many samples are attached to this node."""
        return len(self.samples)

class Tree:
    """
    Tree of Node objects built on demand from a coding table.

    ``coding_df`` must provide the columns ``coding``, ``meaning``,
    ``node_id`` and ``parent_id``. ``node_dict`` maps the id of every node
    already in the tree to its Node; it starts with the root only.
    """
    def __init__(self, root_node, coding_df):
        self.root = root_node
        self.node_dict = {self.root.node: self.root}
        self.coding_df = coding_df

    def update_node_dict(self, node_id, node):
        """Register ``node`` under ``node_id`` unless that id is already present."""
        if node_id not in self.node_dict:
            self.node_dict[node_id] = node
        return

    def create_node_from_df_helper(self, node_id):
        """Build an unattached Node for ``node_id`` and return ``(node, parent_id)``.

        The four fields are selected by column name, so the result does not
        depend on the column order of ``coding_df``. Raises IndexError when
        no row of ``coding_df`` has this ``node_id``.
        """
        matches = self.coding_df.loc[
            self.coding_df.node_id == node_id,
            ["coding", "meaning", "node_id", "parent_id"],
        ]
        c, m, ni, pi = matches.values[0]
        n = Node(ni, c, m)
        return n, pi

    def create_node_from_df(self, node_id):
        """Return the Node for ``node_id``, creating it and any missing ancestors."""
        if node_id in self.node_dict:
            return self.node_dict[node_id]

        # creating a node and providing parent information
        mn, mnpi = self.create_node_from_df_helper(node_id)
        # The recursive call returns the parent straight from node_dict when
        # it already exists and builds it (with its own ancestors) otherwise.
        mnp = self.create_node_from_df(mnpi)
        # add that parent info to the created node
        mn.add_parent(mnp)

        # update the node dict with the created node
        self.update_node_dict(node_id, mn)
        # add the created node as a child of the parent node
        mnp.add_child(mn)
        return mn

    def print_node(self, curr_node, node_level, tree_file):
        """Emit one line for ``curr_node``: ``node_level`` dashes, node id, tab, meaning.

        The line is written to ``tree_file`` when it is truthy, otherwise it
        is printed to stdout.
        """
        curr_node_info = curr_node.get_info()
        if tree_file:
            tree_file.write(f"{'-' * node_level}{curr_node.node}\t{curr_node_info[1]}\n")
        else:
            print(f"{'-' * node_level}{curr_node.node}\t{curr_node_info[1]}\n")
        return

    def print_tree(self, curr_node, tree_file="", node_level=0, max_node_level=2):
        """Print ``curr_node`` and its descendants depth-first, down to ``max_node_level``."""
        if node_level>max_node_level:
            return

        if curr_node:
            self.print_node(curr_node, node_level, tree_file)

            if curr_node.child:
                for c in curr_node.child:
                    self.print_tree(c, tree_file, node_level+1, max_node_level)
        return

    def add_sample_info(self, node_id, samples):
        """Attach ``samples`` to the node ``node_id`` and to each of its ancestors.

        Each node's own set is updated in place, so one call costs
        O(len(samples)) per ancestor instead of rebuilding the ancestor's
        whole sample set. ``samples`` itself is never modified. Raises
        KeyError when ``node_id`` is not in the tree.
        """
        curr_node = self.node_dict[node_id]
        while True:
            curr_node.samples.update(samples)
            if not curr_node.parent:
                break
            # Ancestors are resolved through node_dict by their id.
            curr_node = self.node_dict[curr_node.parent.node]
        return
    
def create_tree(icd_codes_df, icd_samples_df):
    """Build the ICD tree and attach the samples of every ICD code to it.

    Parameters
    ----------
    icd_codes_df : pd.DataFrame
        Coding table; its ``node_id`` and ``coding`` columns are read here and
        the whole frame is handed to ``Tree``.
    icd_samples_df : pd.DataFrame
        Indexed by ICD code, with a ``sample_names`` column that holds
        "|"-separated sample IDs.

    Returns
    -------
    tuple
        ``(pheno_tree, root_pheno, c2nodeid_dict)``: the Tree, its root Node
        and a dict mapping coding -> node id.
    """
    # plant the tree
    root_pheno = Node(0, "0", "Root Phenotype")
    pheno_tree = Tree(root_pheno, icd_codes_df)

    # fill the tree with leaves and branches (about 6 secs per the original note)
    for node_id in icd_codes_df["node_id"]:
        pheno_tree.create_node_from_df(node_id)

    c2nodeid_dict = {
        coding: node_id
        for coding, node_id in zip(icd_codes_df["coding"], icd_codes_df["node_id"])
    }

    # add sample info: each code's samples are attached to its node and all ancestors
    for icd_code, joined_samples in icd_samples_df["sample_names"].items():
        sample_set = set(joined_samples.split("|"))
        pheno_tree.add_sample_info(c2nodeid_dict[icd_code], sample_set)

    return pheno_tree, root_pheno, c2nodeid_dict

def create_icd_samples_file(icd_file):
    """Read a per-sample ICD table and regroup it per ICD code.

    ``icd_file`` is passed to ``pd.read_csv`` and must contain an ``icd``
    column holding "|"-separated codes. The result has one row per code
    (the index), and every other column holds that code's values joined
    with "|".
    """
    per_sample = pd.read_csv(icd_file).assign(
        icd=lambda frame: frame["icd"].str.split("|")
    )
    grouped_by_code = per_sample.explode("icd").groupby("icd")
    return grouped_by_code.agg(lambda members: "|".join(str(member) for member in members))

In [None]:
# Inputs: the raw ICD table and the coding table that defines the ICD tree.
icd_file = "/mnt/project/notebooks/bmi/data/icd_raw.csv.gz"
icd_codes_file = "/mnt/project/fields/data/phenotype_processing/coding19.tsv"

icd_samples_df = create_icd_samples_file(icd_file)

icd_codes_df = pd.read_csv(
    icd_codes_file,
    sep="\t",
    usecols=["coding", "meaning", "node_id", "parent_id"],
)
# Codings are stored without spaces (presumably so that block codings can be
# looked up as e.g. "BlockI20-I25" -- confirm against the coding file).
icd_codes_df["coding"] = icd_codes_df["coding"].str.replace(" ", "")

pheno_tree, root_pheno, c2nodeid_dict = create_tree(icd_codes_df, icd_samples_df)

In [None]:
# Disease definitions used to derive the 0/1 comorbidity columns.
# Outer key: name of the column added to the phenotype table.
# Inner dict: label -> coding; the coding (the value) is what gets looked up
# in `c2nodeid_dict`, so it must match the space-stripped `coding` column of
# the coding table (note "BlockI20-I25" for the I20-I25 block).
# A coding may appear under several diseases (I26 is listed for both "pe"
# and "vt").
icd_code_map_dict = {
    "cvd": {"G45": "G45", "I20-I25": "BlockI20-I25", "I63": "I63", "I64": "I64"}, # cvd
    "cad": {"I21": "I21", "I22": "I22", "I23": "I23", "I241": "I241", "I252": "I252"}, # cad
    "ht": {"I10": "I10", "I15": "I15"}, # ht
    "t1d": {"E10": "E10"}, # t1d
    "t2d": {"E11": "E11"}, #t2d
    "hf": {"I110": "I110", "I130": "I130", "I132": "I132", "I50": "I50"},
    "af": {"I48": "I48"},
    "pe": {"I26": "I26"},
    "vt": {"I81": "I81", "I82": "I82", "I26": "I26", "O223": "O223", "O871": "O871", "O082": "O082"},
    "avs": {"I350": "I350", "I352": "I352"},
    "grd": {"K219": "K219", "K210": "K210"},
    "cls": {"K80": "K80"},
    "ccs": {"K81": "K81"},
    "cd": {"K50": "K50", "M074": "M074", "M091": "M091"},
    "nfld": {"K760": "K760"},
    "koa": {"M170": "M170", "M171": "M171", "M179": "M179"},
    "ob": {"E65": "E65", "E66": "E66"},
}


In [None]:
# For every disease, pool the samples attached to each mapped ICD node and
# flag membership as 0/1.
disease_flags = {}
for disease, icd_dict in icd_code_map_dict.items():
    comorbid_samples = set()
    # Only the mapped coding (the dict value) is needed for the lookup.
    for icdc in icd_dict.values():
        icdc_node = pheno_tree.node_dict[c2nodeid_dict[icdc]]
        # In-place union on the local set; the node's own set is untouched.
        comorbid_samples |= icdc_node.get_samples()
    disease_flags[disease] = filtered_df["sample_names"].isin(comorbid_samples).astype(int)

# Attach all flags at once on a new frame. Writing columns one by one into
# `filtered_df` raises SettingWithCopyWarning when it is a `.loc` selection
# of another frame.
filtered_df = filtered_df.assign(**disease_flags)


# Add lifestyle factors

In [None]:
def get_consensus(row):
    """Reduce one sample's repeated answers to a single consensus value.

    NOTE: an identical ``get_consensus`` is already defined earlier in this
    notebook; this cell redefines it.

    Returns ``np.nan`` when all entries of ``row`` are null, the only
    distinct non-null value when there is exactly one, and the string
    "inconsistent" when several distinct non-null values are present.
    """
    distinct = row.dropna().unique()
    n_distinct = len(distinct)
    if n_distinct == 0:
        return np.nan
    return distinct[0] if n_distinct == 1 else "inconsistent"

def get_mean(row):
    """Return the mean of ``row`` (``row.mean()``; NaN entries of a Series are skipped).

    NOTE: an identical ``get_mean`` is already defined earlier in this
    notebook; this cell redefines it.
    """
    row_mean = row.mean()
    return row_mean

def process_sample_info(df, categorical_fields, numerical_fields):
    """Collapse columns that share a field-name prefix into one column per field.

    NOTE: this redefines (shadows) the ``process_sample_info`` defined earlier
    in the notebook, which takes dicts of explicit column lists instead.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table with a ``sample_names`` column. It is NOT modified.
    categorical_fields : list of str
        Field names. Every column whose name starts with the field name is
        combined row-wise with ``get_consensus``.
    numerical_fields : list of str
        Field names. Every column whose name starts with the field name is
        converted to float (the text answers below are mapped to 0 or NaN)
        and averaged row-wise, skipping NaN.

    Returns
    -------
    pd.DataFrame
        New frame with the columns
        ``["sample_names"] + numerical_fields + categorical_fields``.

    Columns are matched by prefix only, so a field name that is a prefix of an
    unrelated column name would pull that column in as well.
    """
    # Make consensus column for categorical fields.
    # The source columns are dropped *before* the derived column is attached:
    # ``drop`` returns a new frame, so the caller's ``df`` is never written
    # to, and a source column named exactly like the field no longer causes
    # the derived column to be dropped again.
    for field in categorical_fields:
        field_columns = df.columns[df.columns.str.startswith(field)].tolist()
        consensus = df.loc[:, field_columns].apply(get_consensus, axis=1)
        df = df.drop(columns=field_columns)
        df[field] = consensus

    # Get mean value for numerical fields
    for field in numerical_fields:
        field_columns = df.columns[df.columns.str.startswith(field)].tolist()
        df_fields = df.loc[:, field_columns].replace(["Less than one", "Less than an hour a day", "Do not know", "Prefer not to answer"], [0, 0, np.nan, np.nan]).astype(float)
        # All columns are float here, so the vectorised row mean applies.
        mean_values = df_fields.mean(axis=1)
        df = df.drop(columns=field_columns)
        df[field] = mean_values

    return df.loc[:, ["sample_names"] + list(numerical_fields) + list(categorical_fields)]

def high_dir(df, field, thresh):
    """Return a 0/1 Series: 1 where ``df[field]`` is strictly above ``thresh`` (NaN gives 0)."""
    return (df[field]>thresh).astype(int)

def low_dir(df, field, thresh):
    """Return a 0/1 Series: 1 where ``df[field]`` is strictly below ``thresh`` (NaN gives 0)."""
    return (df[field]<thresh).astype(int)


def binarize_lifestyle(lf_df, cat_encoding_dict, numerical_fields_w_dir_dict, numerical_field_range_dict, combine_field_dict):
    """Turn the processed lifestyle table into 0/1 indicator columns.

    Parameters
    ----------
    lf_df : pd.DataFrame
        Processed lifestyle table; it is copied, not modified.
    cat_encoding_dict : dict
        Column -> regular expression. The column becomes 1 where the whole
        value matches the expression, else 0 (missing values give 0).
    numerical_fields_w_dir_dict : dict
        Column -> "high" or "low". "high" flags values strictly above the
        column's 0.95 quantile, "low" flags values strictly below its 0.05
        quantile. Any other direction raises KeyError.
    numerical_field_range_dict : dict
        Column -> ``(lower, upper)``. The column becomes 1 where a
        non-missing value lies outside the inclusive range.
    combine_field_dict : dict
        ``"smoke"`` and ``"sedentary"`` map to lists of columns that are
        OR-ed together. ``"diet"`` maps to two lists: the result is 1 when
        more than one column of the first list is flagged or at least one
        column of the second list is flagged.

    Returns
    -------
    pd.DataFrame
        Columns ``sample_names, met, alcohol, smoke, sleep, sedentary, diet``.
    """
    df = lf_df.copy()
    for field, pattern in cat_encoding_dict.items():
        df[field] = df[field].str.fullmatch(rf"{pattern}", na=False).astype(int)

    # Text answers that may still be present in numeric columns, and the
    # numeric value each one is mapped to.
    text_answers = ["Less than an hour a day", "Less than one", "Do not know", "Prefer not to answer", "inconsistent"]
    numeric_codes = [0, 0, np.nan, np.nan, np.nan]

    # encode all non numeric values as numeric ones
    # (plain column assignment replaces the columns, so the float dtype sticks)
    dir_fields = list(numerical_fields_w_dir_dict)
    if dir_fields:
        df[dir_fields] = df[dir_fields].replace(text_answers, numeric_codes).astype(float)
    q_dir = {"high": 0.95, "low": 0.05}
    f_dir = {"high": high_dir, "low": low_dir}
    for num_field, num_dir in numerical_fields_w_dir_dict.items():
        num_field_quantile_thresh = df[num_field].quantile(q=q_dir[num_dir])
        df[num_field] = f_dir[num_dir](df, num_field, num_field_quantile_thresh)

    # encode all within normal range fields
    range_fields = list(numerical_field_range_dict)
    if range_fields:
        df[range_fields] = df[range_fields].replace(text_answers, numeric_codes).astype(float)
    for range_field, range_ in numerical_field_range_dict.items():
        values = df[range_field]
        # `between` is False for NaN, so a bare negation would flag every
        # missing value as out of range. Missing values are encoded as 0,
        # like in all other fields.
        df[range_field] = (values.notna() & ~values.between(*range_)).astype(int)

    # combine fields to one numeric value
    # smoke
    df["smoke"] = (df.loc[:, combine_field_dict["smoke"]]==1).any(axis=1).astype(int)
    # sedentary lifestyle
    df["sedentary"] = (df.loc[:, combine_field_dict["sedentary"]]==1).any(axis=1).astype(int)
    # diet
    df["diet"] = (((df.loc[:, combine_field_dict["diet"][0]]).sum(axis=1)>1)|((df.loc[:, combine_field_dict["diet"][1]]).sum(axis=1)>0)).astype(int)
    return df.loc[:, ["sample_names", "met", "alcohol", "smoke", "sleep", "sedentary", "diet"]]


In [None]:
# Field-name prefixes of the raw lifestyle table, grouped by how they are
# summarised: consensus value (binary/categorical) or row-wise mean (integer).
binary_fields = ["met"]

integer_fields = [
    "sleep",
    "tv",
    "computer",
    "cookedvegetable",
    "salad",
    "freshfruit",
    "driedfruit",
]

categorical_fields = [
    "alcohol",
    "smokecurr",
    "smokepast",
    "oilyfish",
    "nonoilyfish",
    "procmeat",
    "poultry",
    "beef",
    "mutton",
    "pork",
]

lf_df = process_sample_info(
    pd.read_csv("/mnt/project/notebooks/bmi/data/lifestyle_raw.csv.gz"),
    categorical_fields + binary_fields,
    integer_fields,
)

# column -> regular expression; a full match is encoded as 1
cat_encoding_dict = {
    "alcohol": "Daily or almost daily",
    "smokecurr": "Yes, on most or all days",
    "smokepast": "Yes, on most or all days",
    "met": "No",
    "oilyfish": "Never|Less than once a week",
    "nonoilyfish": "Never|Less than once a week",
    "procmeat": "5-6 times a week|Once or more daily",
    "beef": "5-6 times a week|Once or more daily",
    "mutton": "5-6 times a week|Once or more daily",
    "pork": "5-6 times a week|Once or more daily",
}

# column -> tail of its distribution that is encoded as 1
numerical_fields_w_dir_dict = {
    "tv": "high",
    "computer": "high",
    **{field: "low" for field in ["cookedvegetable", "salad", "freshfruit", "driedfruit"]},
}

# column -> inclusive range; values outside it are encoded as 1
numerical_field_range_dict = {"sleep": (6, 8)}

# derived column -> source columns ("diet" takes two separate lists)
combine_field_dict = {
    "smoke": ["smokecurr", "smokepast"],
    "sedentary": ["tv", "computer"],
    "diet": [
        ["cookedvegetable", "salad", "freshfruit", "driedfruit", "oilyfish", "nonoilyfish"],
        ["procmeat", "beef", "mutton", "pork"],
    ],
}

binarized_df = binarize_lifestyle(
    lf_df,
    cat_encoding_dict=cat_encoding_dict,
    numerical_fields_w_dir_dict=numerical_fields_w_dir_dict,
    numerical_field_range_dict=numerical_field_range_dict,
    combine_field_dict=combine_field_dict,
).rename(columns={"met": "pa"})

In [None]:
# Cast the sample IDs to str so they can be joined on the string-typed
# `sample_names` of the phenotype table.
binarized_df = binarized_df.astype({"sample_names": str})

In [None]:
# Left join: every filtered sample is kept; samples without lifestyle data
# get NaN in the lifestyle columns.
filtered_df = pd.merge(filtered_df, binarized_df, how="left", on="sample_names")

# Upload phenotype file to project

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload the local file ``filename`` into folder ``proj_dir`` and print a confirmation.

    The upload is delegated to ``dxpy.upload_local_file`` with
    ``parents=True`` (presumably creates missing parent folders -- confirm in
    the dxpy documentation). Returns None.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    message = "*********{} uploaded!!*********".format(filename)
    print(message)

In [None]:
# Write the final phenotype table to a local gzip-compressed CSV (pandas
# infers the compression from the ".gz" suffix), then upload it.
proj_dir = "/notebooks/bmi/data/"
filename = "pheno.csv.gz"

filtered_df.to_csv(filename, index=False)
upload_file_to_project(filename, proj_dir)