In [None]:
import os 
import dxpy
import pandas as pd

In [None]:
# Set to True to rebuild the gene burden table from the per-chromosome
# annotation blocks; with False the previously uploaded table is read instead.
RERUN = False

In [None]:
def annotate_variant_consequence(ser):
    """Classify one annotated variant row as a burden-mask category.

    Parameters
    ----------
    ser : pandas.Series
        Row exposing ``consequence`` (``;``-separated consequence terms) and
        ``del_score`` (a number compared against 9 and 6 below).

    Returns
    -------
    str or pandas.NA
        ``"lof"`` when any term is a frameshift, stop-gained, splice-acceptor
        or splice-donor variant. Otherwise, for missense variants,
        ``"missense_strict"`` when ``del_score`` equals 9 and
        ``"missense_lenient"`` when it is greater than 6. ``pandas.NA`` in
        every other case, including a missing ``consequence`` or ``del_score``.
    """
    raw_terms = ser.consequence
    # A missing consequence (NaN/None/NA) has no ``.split``; report the row as
    # unannotated instead of raising.
    if not isinstance(raw_terms, str):
        return pd.NA

    terms = set(raw_terms.split(";"))
    ptv_terms = {
        "frameshift_variant",
        "stop_gained",
        "splice_acceptor_variant",
        "splice_donor_variant",
    }
    # Loss-of-function takes precedence over any missense term on the same row.
    if terms & ptv_terms:
        return "lof"

    if "missense_variant" in terms:
        score = ser.del_score
        # pd.NA cannot be truth-tested, so handle a missing score explicitly.
        if pd.isna(score):
            return pd.NA
        if score == 9:
            return "missense_strict"
        if score > 6:
            return "missense_lenient"
    return pd.NA

def keep_most_del(vals):
    """Return the most deleterious annotation label found in ``vals``.

    Severity order is ``lof`` > ``missense_strict`` > ``missense_lenient``.

    Parameters
    ----------
    vals : iterable of str
        Annotation labels (a list or a pandas Series of labels both work,
        since the values are iterated).

    Returns
    -------
    str
        The highest-ranked recognised label. When none of the three labels is
        present, the first value in input order is returned so the result is
        deterministic.

    Raises
    ------
    IndexError
        If ``vals`` is empty.
    """
    # Materialise first: membership tests on a pandas Series would look at the
    # index rather than the values.
    observed = list(vals)
    present = set(observed)
    for label in ("lof", "missense_strict", "missense_lenient"):
        if label in present:
            return label
    return observed[0]

def create_helper_files(chr_exome_file):
    """Load one annotation block and keep the variants usable for burden masks.

    NOTE(review): a later cell of this notebook defines another function with
    this same name and a different signature; once that cell has run, this
    loader is shadowed. Run the cells top to bottom.

    Parameters
    ----------
    chr_exome_file : str or file-like
        Tab-separated table readable by ``pandas.read_csv`` with the columns
        ``locus``, ``alleles``, ``consequence``, ``del_score``, ``biotype``
        and ``call_rate``.

    Returns
    -------
    pandas.DataFrame
        The input rows, plus ``variants`` and ``annotation`` columns,
        restricted to rows annotated as ``lof``/``missense_strict``/
        ``missense_lenient`` with ``biotype == "protein_coding"`` and
        ``call_rate > 0.5``.
    """
    df = pd.read_csv(chr_exome_file, sep="\t")
    # get the variants in correct format
    # Remove only the literal "chr" prefix: str.lstrip("chr") would strip any
    # leading run of the characters c, h and r instead.
    df["variants"] = (
        df.locus.str.replace(r"^chr", "", regex=True)
        + ":"
        + df.alleles.str.replace("_", ":", regex=False)
    )
    if df.empty:
        # Row-wise apply on an empty frame does not produce a single column
        # that can be assigned, so add the empty column directly.
        df["annotation"] = pd.Series(dtype="object")
        return df
    # add lof and missense annotations
    df["annotation"] = df.apply(annotate_variant_consequence, axis=1)
    # filter by annotations, biotype, and call rate
    keep = (
        df.annotation.isin(["lof", "missense_strict", "missense_lenient"])
        & (df.biotype == "protein_coding")
        & (df.call_rate > 0.5)
    )
    return df.loc[keep]
    

def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder, then delete the local copy.

    Parameters
    ----------
    filename : str
        Path of the local file to upload.
    proj_dir : str
        Destination folder, passed to ``dxpy.upload_local_file`` as ``folder``
        together with ``parents=True`` (presumably so missing parent folders
        are created; confirm against the dxpy documentation).

    The local file is removed only after the upload call has returned; if the
    upload raises, the file is left in place.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    notice = f"*********{filename} uploaded!!*********"
    print(notice)
    os.remove(filename)

In [None]:
if RERUN:
    # Rebuild the burden table from the per-chromosome annotation blocks.
    # NOTE(review): this relies on the single-argument create_helper_files
    # defined above; a later cell redefines that name, so this cell must run
    # before that one.
    exome_annot_path = "/mnt/project/notebooks/exome_annot/"

    # Number of block files (block_0 ... block_{n-1}) read for each chromosome.
    vcfs_per_chrm = {
        "chr1": 97, "chr2": 71, "chr3": 56, "chr4": 39, "chr5": 43, "chr6": 48,
        "chr7": 47, "chr8": 35, "chr9": 42, "chr10": 40, "chr11": 57, "chr12": 52,
        "chr13": 18, "chr14": 30, "chr15": 34, "chr16": 47, "chr17": 56, "chr18": 16,
        "chr19": 65, "chr20": 25, "chr21": 11, "chr22": 23, "chrX": 24,
    }

    def get_burden_dfs(chrm):
        """Return the filtered variant tables of every block of ``chrm``."""
        block_paths = [
            os.path.join(exome_annot_path, chrm, "annot_tables_vep109_v4", f"block_{i}.tsv.gz")
            for i in range(vcfs_per_chrm[chrm])
        ]
        return [create_helper_files(path) for path in block_paths]

    burden_dfs = []
    # The dict keys are chr1..chr22 followed by chrX, so iterating them keeps
    # the chromosome list and the block counts in one place.
    chrms = list(vcfs_per_chrm)
    for chrm in chrms:
        print(chrm, end=" ")
        burden_dfs_chrm = get_burden_dfs(chrm)
        burden_dfs.extend(burden_dfs_chrm)

    # ignore_index: every block starts its own index at 0, so a plain concat
    # would leave duplicated index labels.
    burden_df = pd.concat(burden_dfs, ignore_index=True)
    # index=False: otherwise the row index is written as an extra unnamed
    # column, and the table reloaded in the else-branch differs from the one
    # built here.
    burden_df.to_csv("gene_burden.csv.gz", index=False)
    upload_file_to_project("gene_burden.csv.gz", "/notebooks/regenie/data/")

else:
    # Reuse the table uploaded by a previous RERUN=True execution. Files
    # written before index=False was added carry an additional "Unnamed: 0"
    # column, which the cells below never select.
    burden_df = pd.read_csv("/mnt/project/notebooks/regenie/data/gene_burden.csv.gz")

In [None]:
# Work on a copy so the table built/loaded above is not modified downstream.
gene_burden_df = burden_df.copy()
# Phenotype table (tab-separated); its IID column defines the discovery
# samples used by the filtering below.
pheno_df = pd.read_csv(
    "/mnt/project/notebooks/regenie/data/british_phenotype.tsv.gz",
    sep="\t",
)

In [None]:
def create_helper_files(all_burden_df, terms, pheno_df, lf, min_carriers=30, min_lf_carriers=10):
    """Build the annotation, set-list and allele-frequency tables for one mask.

    NOTE(review): this reuses the name of the single-argument block loader
    defined earlier in the notebook and shadows it once this cell has run.

    Parameters
    ----------
    all_burden_df : pandas.DataFrame
        Variant table with the columns ``variants``, ``gene``, ``annotation``,
        ``samples`` (comma-separated sample IDs) and ``maf``.
    terms : str
        Comma-separated annotation labels to keep, e.g. ``"lof,missense_strict"``.
    pheno_df : pandas.DataFrame
        Phenotype table with an ``IID`` column and, when ``lf`` is given, a
        column named ``lf`` whose value 1 marks the samples to count.
    lf : str
        Lifestyle-factor column of ``pheno_df``; a falsy value (e.g. ``""``)
        disables the lifestyle-factor restriction.
    min_carriers : int, default 30
        Minimum number of discovery-cohort carriers a gene needs when ``lf``
        is falsy.
    min_lf_carriers : int, default 10
        Minimum number of discovery-cohort carriers with ``pheno_df[lf] == 1``
        a gene needs when ``lf`` is given.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(annot_df, set_df, aaf_df)``: variant/gene/annotation rows, one row
        per gene with ``gene``, ``chrm``, ``location`` and the comma-joined
        ``variants``, and the distinct ``variants``/``maf`` pairs.
    """
    term_list = terms.split(",")
    burden_df = all_burden_df.loc[all_burden_df.annotation.isin(term_list)]

    # Samples that count as carriers: the discovery cohort, optionally
    # restricted to those with the lifestyle factor set to 1.
    eligible_samples = set(pheno_df.IID.astype(str))
    if lf:
        eligible_samples &= set(pheno_df.loc[pheno_df[lf] == 1, "IID"].astype(str))
        threshold = min_lf_carriers
    else:
        threshold = min_carriers

    def _count_eligible(sample_lists):
        # Union of all carriers listed for the gene, restricted to eligible samples.
        carriers = set(",".join(sample_lists).split(","))
        return len(carriers & eligible_samples)

    # filter genes by number of eligible carriers
    carriers_per_gene = burden_df.groupby("gene")["samples"].agg(_count_eligible)
    passing_genes = carriers_per_gene[carriers_per_gene >= threshold].index
    burden_df = burden_df.loc[burden_df.gene.isin(passing_genes)]

    # create annotation df
    annot_df = burden_df.loc[:, ["variants", "gene", "annotation"]]
    # this gets rid of duplicates due to transcripts in same gene with same consequence
    annot_df = annot_df.dropna().drop_duplicates()
    # this annotates the same locus for the same gene with the most severe consequence
    annot_df = annot_df.groupby(["variants", "gene"]).agg({"annotation": keep_most_del}).reset_index()

    # create set list df
    set_df = annot_df.groupby("gene").agg({"variants": ",".join})
    # chrm and location come from the first listed variant of each gene
    # ("chrm:location:..."). Vectorised string ops avoid building one Series
    # per gene.
    first_variant_parts = set_df.variants.str.split(",").str[0].str.split(":")
    set_df["chrm"] = first_variant_parts.str[0]
    set_df["location"] = first_variant_parts.str[1]
    set_df = set_df.reset_index().loc[:, ["gene", "chrm", "location", "variants"]]

    # create aaf df
    aaf_df = burden_df.loc[:, ["variants", "maf"]].dropna().drop_duplicates()
    return annot_df, set_df, aaf_df
    

In [None]:
mask_names = ["PTV", "PTV_Missense_strict", "PTV_Missense_lenient"]
categories = ["lof", "lof,missense_strict", "lof,missense_strict,missense_lenient"]
# "" means no lifestyle factor: genes are filtered on the whole discovery cohort.
lfs = ["", "pa", "alcohol", "smoke", "sleep", "sedentary", "diet"]

# Local file names; each file is written, uploaded and removed once per
# (mask, lifestyle factor) combination.
annot_df_name = "ukb_annotations.tsv.gz"
set_df_name = "ukb_sets.tsv.gz"
aaf_df_name = "ukb_aafs.tsv.gz"
mask_df_name = "ukb_masks.tsv.gz"

for mask, cat in zip(mask_names, categories):
    print(cat)
    # The mask definition depends only on the mask, not on the lifestyle factor.
    mask_df = pd.DataFrame({"mask_name": [mask], "categories": [cat]})
    for lf in lfs:
        print(lf)
        print(mask)
        annot_df, set_df, aaf_df = create_helper_files(gene_burden_df, cat, pheno_df, lf)
        # Join only non-empty segments so that lf == "" yields ".../{mask}/"
        # rather than a folder path containing "//".
        proj_dir = "/".join(
            part for part in ("/notebooks/regenie/data/step2", mask, lf) if part
        ) + "/"
        outputs = {
            annot_df_name: annot_df,
            set_df_name: set_df,
            aaf_df_name: aaf_df,
            mask_df_name: mask_df,
        }
        for name, df in outputs.items():
            df.to_csv(name, sep='\t', index=False, header=False)
            # upload table to project
            upload_file_to_project(name, proj_dir)