In [19]:
import re
import numpy as np
import pandas as pd
from scipy import stats
from functools import reduce

In [38]:
# Input tables for the comorbidity counts computed in the cells below.
# Rows of this table are processed one at a time; each needs an `ID` value of the
# form "<gene>.<PTV mask>.0.001" and may carry an optional `lf` value.
monogenic_meta_df = pd.read_excel("./monogenic_meta.xlsx")
# Variant-level burden table; the helpers below read its `gene`, `annotation`,
# `maf` and `samples` columns (`samples` is split on "," so it is a comma-joined string).
gene_burden_df = pd.read_csv("/mnt/project/notebooks/regenie/data/gene_burden.csv.gz")
# Phenotype table; `sample_names` is forced to str so it compares equal to the
# string sample ids obtained by splitting `gene_burden_df.samples`.
pheno_df = pd.read_csv("/mnt/project/notebooks/regenie/data/pheno.csv.gz", dtype={"sample_names": str})
# Columns of `pheno_df` to tabulate; a sample counts as affected where the column equals 1.
comorbidities = ["cvd", "cad", "ht", "t1d", "t2d", "hf", "af", "pe", "vt", "avs", "grd", "cls", "ccs", "cd", "nfld", "koa", "ob"]


In [41]:
def get_table_icd(gene_samples, nongene_samples, comorbid_samples, field):
    """Build a 2x2 table of sample counts for one condition.

    Args:
        gene_samples: set of samples in the first group (row "Combo").
        nongene_samples: set of samples in the second group (row "Non Combo").
        comorbid_samples: samples that have the condition.
        field: label of the condition; used for the two column names.

    Returns:
        DataFrame with rows "Combo" / "Non Combo" and columns
        "<field>" / "No <field>", each cell holding a sample count.
    """
    rows = []
    for group in (gene_samples, nongene_samples):
        n_with = len(group.intersection(comorbid_samples))
        n_without = len(group.difference(comorbid_samples))
        rows.append([n_with, n_without])
    column_labels = [f"{field}", f"No {field}"]
    return pd.DataFrame(rows, columns=column_labels, index=["Combo", "Non Combo"])

def create_gene_burden_table_helper(burden_df, annotations, maf, lf_samples_df):
    """Collapse qualifying variants into one row per gene with its set of samples.

    Args:
        burden_df: table with `gene`, `annotation`, `maf` and `samples` columns,
            where `samples` is a comma-joined string of sample ids.
        annotations: annotation values to keep.
        maf: inclusive upper bound on `maf`.
        lf_samples_df: extra rows appended unchanged after the per-gene rows.

    Returns:
        DataFrame with a `gene` column and a `samples` column of sets,
        followed by the rows of `lf_samples_df`.
    """
    def _merge_sample_strings(sample_strings):
        # Join first so one split yields every id from every variant of the gene.
        return set(",".join(sample_strings).split(","))

    keep = burden_df.annotation.isin(annotations) & (burden_df.maf <= maf)
    per_gene = burden_df.loc[keep].groupby("gene").agg({"samples": _merge_sample_strings})
    per_gene = per_gene.reset_index()
    return pd.concat([per_gene, lf_samples_df])

def create_gene_burden_tables(burden_df, maf, lf_samples_df):
    """Build one per-gene sample table for each of the three variant masks.

    Args:
        burden_df: variant-level burden table passed through to the helper.
        maf: inclusive frequency cutoff passed through to the helper.
        lf_samples_df: extra rows appended to every mask's table.

    Returns:
        dict keyed by mask name ("PTV", "PTV_Missense_strict",
        "PTV_Missense_lenient") whose values are the helper's tables.
    """
    # Each mask includes every annotation of the previous one plus one more.
    annotations_by_mask = {
        "PTV": ["lof"],
        "PTV_Missense_strict": ["lof", "missense_strict"],
        "PTV_Missense_lenient": ["lof", "missense_strict", "missense_lenient"],
    }
    return {
        mask_name: create_gene_burden_table_helper(burden_df, terms, maf, lf_samples_df)
        for mask_name, terms in annotations_by_mask.items()
    }


def get_samples_helper(combos, genotype_df, cohort_samples):
    """Return cohort samples that appear under every name in `combos`.

    Args:
        combos: list of names to look up in `genotype_df.gene`.
        genotype_df: table with a `gene` column and a `samples` column whose
            values are iterables of sample ids.
        cohort_samples: set of sample ids the result is restricted to.

    Returns:
        set: samples shared by all rows whose `gene` is in `combos`, intersected
        with `cohort_samples`. An empty set is returned when the number of
        distinct `combos` names found in the table differs from `len(combos)`
        (for example when a name is missing).
    """
    known_genes = set(genotype_df.gene.values)
    if len(set(combos).intersection(known_genes)) != len(combos):
        # Always hand back a set: callers invoke .intersection/.difference on
        # the result, which a list does not provide.
        return set()
    samples_per_gene = genotype_df.loc[genotype_df.gene.isin(combos)].samples.values
    shared_samples = reduce(lambda a, b: set(a).intersection(b), samples_per_gene)
    return cohort_samples.intersection(shared_samples)


def get_samples(ser, gene_burden_dict, pop_samples):
    """Look up the cohort samples for the gene (and optional `lf`) named by one row.

    Args:
        ser: row (pandas Series) with an `ID` entry shaped like
            "<gene>.<mask>.0.001", where <mask> starts with "PTV". If the row
            has an `lf` entry, its value is required together with the gene.
        gene_burden_dict: dict mapping mask name to a table with `gene` and
            `samples` columns.
        pop_samples: set of cohort sample ids the result is restricted to.

    Returns:
        The result of `get_samples_helper` for the parsed gene/`lf` combination.

    Raises:
        ValueError: if `ser.ID` does not match the expected pattern.
        KeyError: if the parsed mask is not a key of `gene_burden_dict`.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    pattern = re.compile(r"(.+)\.(PTV.*)\.0\.001")
    m = pattern.match(ser.ID)
    if m is None:
        # Fail with the offending ID instead of an AttributeError on m.group().
        raise ValueError(f"ID {ser.ID!r} does not match '<gene>.<PTV mask>.0.001'")
    gene = m.group(1)
    mask = m.group(2)
    gene_samples_df = gene_burden_dict[mask]

    combos = [gene]
    if "lf" in ser.index:
        combos.append(ser.lf)

    return get_samples_helper(combos, gene_samples_df, pop_samples)


def get_comorbidity_data_helper(gene_samples, pop_samples, pheno_df, comorbidity):
    """Count carriers and non-carriers with and without one comorbidity.

    Args:
        gene_samples: set of carrier sample ids.
        pop_samples: set of all cohort sample ids; non-carriers are
            `pop_samples` minus `gene_samples`.
        pheno_df: table with a `sample_names` column and a column named
            `comorbidity`; a sample is affected where that column equals 1.
        comorbidity: name of the phenotype column to tabulate.

    Returns:
        dict with keys "gene_<c>", "gene_non<c>", "nongene_<c>" and
        "nongene_non<c>" (c = `comorbidity`) holding the four cell counts.
    """
    affected_rows = pheno_df[comorbidity] == 1
    comorbid_samples = set(pheno_df.loc[affected_rows].sample_names)
    noncarriers = pop_samples.difference(gene_samples)
    counts = get_table_icd(gene_samples, noncarriers, comorbid_samples, comorbidity)
    # Row 0 is the carrier group, row 1 the non-carrier group;
    # column 0 is "has the condition", column 1 is "does not".
    return {
        f"gene_{comorbidity}": counts.iloc[0, 0],
        f"gene_non{comorbidity}": counts.iloc[0, 1],
        f"nongene_{comorbidity}": counts.iloc[1, 0],
        f"nongene_non{comorbidity}": counts.iloc[1, 1],
    }


def get_comorbidity_data(ser, gene_burden_dict, pop_samples, comorbidities, pheno_table=None):
    """Compute carrier/non-carrier comorbidity counts for one metadata row.

    Args:
        ser: row (pandas Series) with an `ID` entry, forwarded to `get_samples`.
        gene_burden_dict: dict of per-mask gene tables, forwarded to `get_samples`.
        pop_samples: set of all cohort sample ids.
        comorbidities: iterable of phenotype column names to tabulate.
        pheno_table: phenotype table handed to `get_comorbidity_data_helper`.
            When omitted, the notebook-level `pheno_df` is used, which keeps
            existing four-argument calls working.

    Returns:
        pandas Series holding the four counts produced by
        `get_comorbidity_data_helper` for every comorbidity, plus an `ID`
        entry copied from `ser`.
    """
    if pheno_table is None:
        # Backward-compatible default: fall back to the notebook global.
        pheno_table = pheno_df
    # Coerce to a set: the lookup may yield a non-set empty container when a
    # gene is absent from the burden table, and the count helpers call set
    # methods (.intersection/.difference) on it.
    gene_samples = set(get_samples(ser, gene_burden_dict, pop_samples))
    comorbid_dict = {}
    for comorbidity in comorbidities:
        comorbid_dict.update(
            get_comorbidity_data_helper(gene_samples, pop_samples, pheno_table, comorbidity)
        )
    comorbid_dict["ID"] = ser.ID
    return pd.Series(comorbid_dict)

In [29]:
# Per-mask gene -> sample-set tables at maf <= 0.001. The empty DataFrame is the
# `lf_samples_df` argument, so no extra rows are appended to the tables.
# NOTE(review): get_samples only accepts IDs containing ".0.001" after the mask, so
# this threshold presumably has to stay in sync with the IDs — confirm.
gene_burden_dict = create_gene_burden_tables(gene_burden_df, 0.001, pd.DataFrame())

In [30]:
# Every sample id in the phenotype table, as a set of strings; passed below as
# the cohort that carrier and non-carrier groups are drawn from.
pop_samples = set(pheno_df.sample_names.astype(str))

In [68]:
# Row-wise apply: each metadata row becomes one output row holding the four
# carrier/non-carrier counts per comorbidity plus the row's ID.
monogenic_ukb_comorbid_df = monogenic_meta_df.apply(get_comorbidity_data, axis=1 , args=(gene_burden_dict, pop_samples, comorbidities,))

In [75]:
# Persist the counts with ID as the index column; pandas infers gzip
# compression from the ".gz" suffix of the path.
monogenic_ukb_comorbid_df.set_index("ID").to_csv("./monogenic_ukb_comorbid.csv.gz")