In [None]:
import os
import pandas as pd
from functools import reduce
import dxpy

In [None]:
def read_chrm_regenie_file(filedir, anc, lf, chrm, pheno):
    """Load one per-chromosome regenie results file into a DataFrame.

    The file is read from
    ``<filedir>/<anc>/<chrm>/output<lf>/bmi_quant_<pheno>.regenie``.

    Parameters
    ----------
    filedir : str
        Root directory that contains one sub-directory per ``anc`` value.
    anc : str
        Name of the first-level sub-directory (callers in this notebook
        pass an ancestry label).
    lf : str
        Suffix appended to the ``output`` directory name; an empty string
        selects the plain ``output`` directory.
    chrm : str
        Name of the chromosome sub-directory, e.g. ``"chrm1"``.
    pheno : str
        Phenotype name used to build the file name.

    Returns
    -------
    pandas.DataFrame
        The parsed table; lines (or line tails) starting with ``#`` are
        treated as comments and dropped.
    """
    filepath = os.path.join(filedir, anc, chrm, f"output{lf}", f"bmi_quant_{pheno}.regenie")
    # Raw string: "\s" in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. The regex itself is unchanged (a single
    # whitespace character), and a regex separator requires the python engine.
    df = pd.read_csv(filepath, sep=r"\s", comment="#", engine="python")
    return df

def read_monogenic_file(filedir, anc, pheno):
    """Collect the filtered regenie results for chromosomes 1-22 of one ancestry.

    For each chromosome the file under the plain ``output`` directory is read,
    only rows whose ``ID`` ends in ``"0.001"`` are kept (the "ultrarare"
    filter), and a ``p_value`` column is derived as ``10 ** (-LOG10P)``.

    Parameters
    ----------
    filedir : str
        Root directory passed through to ``read_chrm_regenie_file``.
    anc : str
        Ancestry label; used as a directory name and as the column suffix.
    pheno : str
        Phenotype name passed through to ``read_chrm_regenie_file``.

    Returns
    -------
    pandas.DataFrame
        All chromosomes concatenated (per-file indices are kept as-is).
        Every column except ``ID`` is renamed to ``<column>_<anc>``.
    """
    per_chrom = []
    for chrm in range(1, 23):
        df = read_chrm_regenie_file(filedir, anc, "", f"chrm{chrm}", pheno)
        # filter for ultrarare; .copy() makes the result an independent frame so
        # the column assignment below does not write into a slice of the parsed
        # table (avoids SettingWithCopyWarning).
        df = df.loc[df.ID.str.endswith("0.001")].copy()
        df["p_value"] = 10**(-df.LOG10P)
        per_chrom.append(df)
    mono_df = pd.concat(per_chrom)
    # Suffix everything but the join key so per-ancestry tables can be merged on ID.
    mono_df.columns = [f"{c}_{anc}" if c!="ID" else c for c in mono_df.columns]
    return mono_df

def read_mono_lf_file(filedir, anc, pheno, lf):
    """Collect the filtered regenie results for chromosomes 1-22 of one ancestry and lifestyle factor.

    For each chromosome the file under the ``output<lf>`` directory is read,
    only rows whose ``ID`` ends in ``"0.001"`` are kept (the "ultrarare"
    filter), and a ``p_value`` column is derived as ``10 ** (-LOG10P)``.

    Parameters
    ----------
    filedir : str
        Root directory passed through to ``read_chrm_regenie_file``.
    anc : str
        Ancestry label; used as a directory name and as the column suffix.
    pheno : str
        Phenotype name passed through to ``read_chrm_regenie_file``.
    lf : str
        Lifestyle-factor suffix selecting the ``output<lf>`` directory.

    Returns
    -------
    pandas.DataFrame
        All chromosomes concatenated (per-file indices are kept as-is).
        Every column except ``ID`` is renamed to ``<column>_<anc>``.
    """
    per_chrom = []
    for chrm in range(1, 23):
        df = read_chrm_regenie_file(filedir, anc, lf, f"chrm{chrm}", pheno)
        # filter for ultrarare; .copy() makes the result an independent frame so
        # the column assignment below does not write into a slice of the parsed
        # table (avoids SettingWithCopyWarning).
        df = df.loc[df.ID.str.endswith("0.001")].copy()
        df["p_value"] = 10**(-df.LOG10P)
        per_chrom.append(df)
    mono_df = pd.concat(per_chrom)
    # Suffix everything but the join key so per-ancestry tables can be merged on ID.
    mono_df.columns = [f"{c}_{anc}" if c!="ID" else c for c in mono_df.columns]
    return mono_df

In [None]:
# Root of the regenie data on the mounted project. Every other path in this
# cell is derived from it so the location is spelled out exactly once; the
# resulting strings are identical to the previously hard-coded ones.
pheno_dir = "/mnt/project/notebooks/regenie/data/"
monogenic_dir = os.path.join(pheno_dir, "step2", "monogenic")
mono_lf_dir = os.path.join(pheno_dir, "step2", "mono_lf")

# Labels used to build directory names, file names and column suffixes below.
ancestry = ["british", "nonbritish"]
pheno = ["bmi", "hba1c_df", "hdl", "ldl_sf"]
lifestyle = ["pa", "alcohol", "smoke"]

# Input tables consumed by the burden-table helpers defined further down.
gene_burden_df = pd.read_csv(os.path.join(pheno_dir, "gene_burden.csv.gz"))
lifestyle_df = pd.read_csv(os.path.join(pheno_dir, "pheno.csv.gz"))


In [None]:
def create_gene_burden_table_helper(burden_df, annotations, maf, lf_samples_df):
    """Build one gene -> carrier-set table for a single annotation mask.

    Rows of ``burden_df`` whose ``annotation`` is in ``annotations`` and whose
    ``maf`` is at most ``maf`` are grouped by ``gene``; the comma-separated
    ``samples`` strings of each group are joined and split into a set.
    The rows of ``lf_samples_df`` are then appended underneath.

    Parameters
    ----------
    burden_df : pandas.DataFrame
        Needs the columns ``gene``, ``annotation``, ``maf`` and ``samples``.
    annotations : list
        Annotation values to keep.
    maf : float
        Inclusive upper bound on the ``maf`` column.
    lf_samples_df : pandas.DataFrame
        Extra rows to append after the per-gene rows.

    Returns
    -------
    pandas.DataFrame
        Per-gene rows followed by ``lf_samples_df`` (indices are not reset
        after the concatenation).
    """
    def _to_sample_set(sample_strings):
        # Join first, then split, so every individual sample id ends up in the set.
        return set(",".join(sample_strings).split(","))

    selected = (burden_df.annotation.isin(annotations)) & (burden_df.maf <= maf)
    per_gene = (
        burden_df.loc[selected]
        .groupby("gene")
        .agg({"samples": _to_sample_set})
        .reset_index()
    )
    return pd.concat([per_gene, lf_samples_df])

def create_gene_burden_tables(burden_df, maf, lf_samples_df):
    """Build the gene -> carrier-set table for each of the three masks.

    Parameters
    ----------
    burden_df : pandas.DataFrame
        Passed through to ``create_gene_burden_table_helper``.
    maf : float
        Passed through to ``create_gene_burden_table_helper``.
    lf_samples_df : pandas.DataFrame
        Rows appended to every mask's table.

    Returns
    -------
    dict
        Maps ``"PTV"``, ``"PTV_Missense_strict"`` and
        ``"PTV_Missense_lenient"`` to their tables, in that order.
    """
    # Each mask includes the annotation terms of the previous one plus one more.
    annotations_by_mask = {
        "PTV": ["lof"],
        "PTV_Missense_strict": ["lof", "missense_strict"],
        "PTV_Missense_lenient": ["lof", "missense_strict", "missense_lenient"],
    }
    return {
        mask: create_gene_burden_table_helper(burden_df, terms, maf, lf_samples_df)
        for mask, terms in annotations_by_mask.items()
    }


def get_nsamples_helper(combos, genotype_df, cohort_samples):
    """Return the cohort samples that appear in the sample set of every name in ``combos``.

    Parameters
    ----------
    combos : list
        Values to look up in ``genotype_df.gene``.
    genotype_df : pandas.DataFrame
        Needs a ``gene`` column and a ``samples`` column holding one
        collection of sample ids per row.
    cohort_samples : set
        Sample ids the result is restricted to.

    Returns
    -------
    set
        Intersection of ``cohort_samples`` with the ``samples`` of every row
        matching ``combos``. Empty when ``combos`` is empty or any of its
        entries is missing from ``genotype_df.gene``. The result is always a
        set, so both branches now return the same type.
    """
    required = set(combos)
    # Nothing to intersect, or at least one requested name has no row.
    if not required or not required.issubset(genotype_df.gene.values):
        return set()
    samples_per_gene = genotype_df.loc[genotype_df.gene.isin(combos)].samples.values
    # set.intersection accepts any number of iterables, so all rows are folded
    # into the cohort restriction in a single call.
    return cohort_samples.intersection(*samples_per_gene)

def get_nsamples(ser, gene_burden_dict, pop_samples):
    """Count the samples for one result row.

    The row's ``ID`` is split on ``"."``: the first field is used as the gene
    and the second as the mask that selects a table from ``gene_burden_dict``.
    When the row has an ``lf`` entry, that value is looked up together with
    the gene.

    Parameters
    ----------
    ser : pandas.Series
        One row; must expose ``ID`` and may expose ``lf``.
    gene_burden_dict : dict
        Maps mask name to a table accepted by ``get_nsamples_helper``.
    pop_samples : set
        Sample ids passed to ``get_nsamples_helper`` as the cohort.

    Returns
    -------
    int
        Length of the collection returned by ``get_nsamples_helper``.
    """
    id_fields = ser.ID.split(".")
    gene, mask = id_fields[0], id_fields[1]
    gene_samples_df = gene_burden_dict[mask]

    # Rows that carry an "lf" entry are looked up as a gene + lifestyle pair.
    combos = [gene, ser.lf] if "lf" in ser.index else [gene]
    carriers = get_nsamples_helper(combos, gene_samples_df, pop_samples)
    return len(carriers)

def get_lifestyle_burden(lifestyle_df, lifestyles):
    """Build a table mapping each lifestyle column to the set of samples where it equals 1.

    Parameters
    ----------
    lifestyle_df : pandas.DataFrame
        Needs a ``sample_names`` column plus one column per entry of
        ``lifestyles``.
    lifestyles : list
        Names of the lifestyle columns to include.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene`` (the lifestyle column name) and ``samples`` (a set
        of sample names as strings). A lifestyle with no value equal to 1
        gets no row.
    """
    # Use the parameter, not the notebook-level ``lifestyle`` global, so the
    # function works for whatever columns the caller passes.
    lifestyles = list(lifestyles)
    lifestyle_long = lifestyle_df.loc[:, ["sample_names"] + lifestyles].melt(
        id_vars=["sample_names"], value_vars=lifestyles
    )
    # Sample names are cast to str before the sets are built.
    lifestyle_long["sample_names"] = lifestyle_long.sample_names.astype(str)
    lifestyle_long = lifestyle_long.loc[lifestyle_long.value == 1]
    lifestyle_long = lifestyle_long.groupby("variable").agg({"sample_names": lambda x: set(x)}).reset_index()
    # Column names match the per-gene burden tables so the rows can be appended to them.
    lifestyle_long = lifestyle_long.rename(columns={"variable": "gene", "sample_names": "samples"})
    return lifestyle_long
    

In [None]:
# Lifestyle carrier sets come first because they are appended to every mask's gene table.
lifestyle_burden = get_lifestyle_burden(lifestyle_df=lifestyle_df, lifestyles=lifestyle)
gene_burden_dict = create_gene_burden_tables(
    burden_df=gene_burden_df,
    maf=0.001,
    lf_samples_df=lifestyle_burden,
)

In [None]:
# One result table per ancestry for the first phenotype in `pheno`, each with a
# per-row sample count restricted to that ancestry's phenotype file.
bmi_mono_df = []
for a in ancestry:
    mono_df = read_monogenic_file(monogenic_dir, a, pheno[0])
    mono_df = mono_df.reset_index(drop=True)

    pheno_path = os.path.join(pheno_dir, f"{a}_phenotype.tsv.gz")
    pheno_df = pd.read_csv(pheno_path, sep="\t")
    # IIDs are cast to str before being used as the cohort set.
    pop_samples = set(pheno_df.IID.astype(str))

    mono_df[f"nsamples_{a}"] = mono_df.apply(
        get_nsamples,
        axis=1,
        args=(gene_burden_dict, pop_samples),
    )
    bmi_mono_df.append(mono_df)

In [None]:
# Fold the per-ancestry tables into one wide table, outer-joined on ID.
# Note: this rebinds `bmi_mono_df` from a list to a DataFrame.
bmi_mono_df = reduce(
    lambda left, right: pd.merge(left, right, on="ID", how="outer"),
    bmi_mono_df,
)

In [None]:
# Write the merged monogenic table to the working directory (row index omitted).
bmi_mono_df.to_csv(
    "./bmi_ukb_meta_w_samples.csv.gz",
    index=False,
)

In [None]:
# One stacked table per ancestry: the gene-by-lifestyle results of every
# lifestyle factor for the first phenotype in `pheno`, with per-row sample counts.
bmi_mono_lf_df = []
for a in ancestry:
    print(a)
    # The phenotype file depends only on the ancestry, so it is read once here
    # instead of once per lifestyle factor.
    pheno_df = pd.read_csv(os.path.join(pheno_dir, f"{a}_phenotype.tsv.gz"), sep="\t")
    pop_samples = set(pheno_df.IID.astype(str))

    lf_frames = []
    for lf in lifestyle:
        print(lf)
        mono_lf_df = read_mono_lf_file(mono_lf_dir, a, pheno[0], lf).reset_index(drop=True)
        # "lf" must be set before the apply: get_nsamples looks for it on each row.
        mono_lf_df["lf"] = lf
        mono_lf_df[f"nsamples_{a}"] = mono_lf_df.apply(get_nsamples, axis=1, args=(gene_burden_dict, pop_samples))
        lf_frames.append(mono_lf_df)

    # Concatenate once rather than growing a frame seeded with an empty
    # DataFrame. The list is reversed because each new frame used to be
    # prepended, so the resulting row order is unchanged.
    mld = pd.concat(lf_frames[::-1])
    # Un-suffixed copy of the TEST column, used as a merge key across ancestries.
    mld["TEST"] = mld[f"TEST_{a}"]
    bmi_mono_lf_df.append(mld)

In [None]:
# Fold the per-ancestry tables into one wide table, outer-joined on ID, lf and TEST.
# Note: this rebinds `bmi_mono_lf_df` from a list to a DataFrame.
bmi_mono_lf_df = reduce(
    lambda left, right: pd.merge(left, right, on=["ID", "lf", "TEST"], how="outer"),
    bmi_mono_lf_df,
)


In [None]:
# Keep only rows whose ID ends in "0.001".
bmi_mono_lf_df_short = bmi_mono_lf_df[bmi_mono_lf_df["ID"].str.endswith("0.001")]


In [None]:
# Row counts before and after the ID filter, shown as the cell output.
(bmi_mono_lf_df.shape[0], bmi_mono_lf_df_short.shape[0])

In [None]:
# Write the filtered gene-by-lifestyle table to the working directory (row index omitted).
bmi_mono_lf_df_short.to_csv(
    "./bmi_lf_ukb_meta_w_samples.csv.gz",
    index=False,
)


In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file with dxpy, announce it, then delete the local copy.

    Parameters
    ----------
    filename : str
        Path of the local file to upload and afterwards remove.
    proj_dir : str
        Destination folder, passed to ``dxpy.upload_local_file`` as
        ``folder`` together with ``parents=True``.

    Returns
    -------
    None
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    upload_notice = f"*********{filename} uploaded!!*********"
    print(upload_notice)
    # The local copy is removed only after the upload call has returned.
    os.remove(filename)

In [None]:
# Upload the gene-by-lifestyle results; the helper deletes the local file afterwards.
# NOTE(review): the monogenic table written to ./bmi_ukb_meta_w_samples.csv.gz is
# not uploaded by any visible cell — confirm whether that is intentional.
proj_dir = "/notebooks/regenie/data/meta/"
filename = "bmi_lf_ukb_meta_w_samples.csv.gz"
upload_file_to_project(filename=filename, proj_dir=proj_dir)