In [None]:
import os
import dxpy
import numpy as np
import pandas as pd

# Add gnomAD annotations to the burden table

In [None]:
# Load the burden table and the gnomAD annotation table. Only the join keys
# and the gnomAD popmax MAF column are read from the gnomAD file.
burden_df = pd.read_csv("/mnt/project/notebooks/wes/burden_preparation/data/ukb_burden.tsv.gz", sep="\t")
gnomad_df = pd.read_csv(
    "/mnt/project/notebooks/wes/burden_preparation/data/gnomad_annot.tsv.gz", sep="\t",
    usecols=["locus", "alleles", "maf_gnomad_popmax"]
)
# Inner join (pandas default): burden rows without a matching (locus, alleles)
# entry in the gnomAD table are dropped.
burden_df = burden_df.merge(gnomad_df, on=["locus", "alleles"])
# Per-row maximum of the two MAF columns, computed column-wise rather than with
# a Python-level row apply. Missing values are skipped, so when one column is
# NaN the other column's value is used; the result is NaN only if both are NaN.
burden_df["maf_max"] = burden_df[["maf", "maf_gnomad_popmax"]].max(axis=1)


# Save REGENIE step 2 helper files

In [None]:
def keep_most_del(vals):
    """Return the most deleterious annotation label found in ``vals``.

    Labels are ranked ``lof`` > ``missense_strict`` > ``missense_lenient``.
    When none of the ranked labels is present, the first label in iteration
    order is returned, so the result is reproducible across runs (it does not
    depend on set ordering).

    Args:
        vals: Non-empty iterable of annotation labels (e.g. a list or a
            pandas Series).

    Returns:
        The highest-ranked label present, otherwise the first label of ``vals``.

    Raises:
        IndexError: If ``vals`` is empty.
    """
    # Materialise to a list: ``in`` on a pandas Series would test the index,
    # not the values.
    labels = list(vals)
    for level in ("lof", "missense_strict", "missense_lenient"):
        if level in labels:
            return level
    return labels[0]

def create_regenie_helper_files(burden_df):
    """Derive the annotation, set-list and AAF tables from ``burden_df``.

    Args:
        burden_df: DataFrame containing at least the columns ``variants``,
            ``gene``, ``annotation`` and ``maf_max``.

    Returns:
        A tuple ``(annot_df, set_df, aaf_df)``:

        * ``annot_df`` -- one row per (``variants``, ``gene``) pair with the
          annotation chosen by ``keep_most_del``.
        * ``set_df`` -- one row per gene with columns ``gene``, ``chrm``,
          ``location`` and ``variants`` (comma-joined variant ids). ``chrm``
          and ``location`` are the first two ``:``-separated fields of the
          first variant id in the joined list.
        * ``aaf_df`` -- unique, non-null (``variants``, ``maf_max``) rows.
    """
    # create annotation df
    annot_df = burden_df.loc[:, ["variants", "gene", "annotation"]]
    # this gets rid of duplicates due to transcripts of same gene with same consequence
    annot_df = annot_df.dropna().drop_duplicates()
    # this annotates the same locus for the same gene with the most severe consequence
    annot_df = (
        annot_df.groupby(["variants", "gene"])
        .agg({"annotation": keep_most_del})
        .reset_index()
    )

    # create set list df: one comma-joined variant list per gene
    set_df = annot_df.groupby("gene").agg({"variants": ",".join}).reset_index()
    # Vectorised split of the first variant id of each gene.
    # NOTE(review): assumes variant ids look like "<chrom>:<pos>:..." -- confirm.
    first_variant = set_df["variants"].str.split(",", n=1).str[0]
    # reindex guarantees both columns exist even if no id contains a ":".
    coords = first_variant.str.split(":", n=2, expand=True).reindex(columns=[0, 1])
    set_df["chrm"] = coords[0]
    set_df["location"] = coords[1]
    set_df = set_df.loc[:, ["gene", "chrm", "location", "variants"]]

    # create aaf df
    aaf_df = burden_df.loc[:, ["variants", "maf_max"]]
    aaf_df = aaf_df.dropna().drop_duplicates()
    return annot_df, set_df, aaf_df

def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder and delete the local copy.

    Args:
        filename: Path of the local file to upload; it is removed afterwards.
        proj_dir: Destination folder, forwarded to ``dxpy.upload_local_file``
            as ``folder`` together with ``parents=True``.
    """
    dxpy.upload_local_file(
        filename,
        folder=proj_dir,
        parents=True,
    )
    print(f"*********{filename} uploaded!!*********")
    # Reached only when the upload call above did not raise.
    os.remove(filename)

In [None]:
# Mask definitions: each mask name is paired with the comma-separated
# annotation categories it includes.
mask_names = ["pLoF", "Missense_strict", "Missense_lenient"]
categories = ["lof", "lof,missense_strict", "lof,missense_strict,missense_lenient"]
mask_df = pd.DataFrame({"mask_name": mask_names, "categories": categories})

annot_df, set_df, aaf_df = create_regenie_helper_files(burden_df)

# Local output file names and the project folder they are uploaded to.
annot_df_name = "ukb_annotations.tsv.gz"
set_df_name = "ukb_sets.tsv.gz"
aaf_df_name = "ukb_aafs.tsv.gz"
mask_df_name = "ukb_masks.tsv.gz"
proj_dir = "/notebooks/wes/burden_preparation/data/"

# Map each output file name to the table written into it (insertion order is
# the write/upload order).
outputs = {
    annot_df_name: annot_df,
    set_df_name: set_df,
    aaf_df_name: aaf_df,
    mask_df_name: mask_df,
}
for name, df in outputs.items():
    # Tab-separated, no index and no header row.
    df.to_csv(name, sep="\t", index=False, header=False)
    # upload table to project
    upload_file_to_project(name, proj_dir)
