In [None]:
import os
import dxpy
import numpy as np
import pandas as pd

In [None]:
def annotate_variant_consequence(ser):
    """Classify one variant row by its predicted functional consequence.

    Parameters
    ----------
    ser : row-like object
        Must expose two attributes: ``consequence`` (a ";"-separated string of
        consequence terms) and ``del_score`` (a numeric score). A row produced
        by ``DataFrame.apply(..., axis=1)`` satisfies this.

    Returns
    -------
    str or pd.NA
        * ``"lof"`` if any term is a frameshift, stop-gained, splice-acceptor
          or splice-donor variant;
        * ``"missense_strict"`` for a missense variant with ``del_score == 9``;
        * ``"missense_lenient"`` for a missense variant with ``del_score > 6``;
        * ``pd.NA`` otherwise, including when ``consequence`` is missing.
    """
    raw_consequence = ser.consequence
    # A missing consequence comes back from read_csv as NaN (a float), which has
    # no .split(); leave such rows unannotated instead of raising AttributeError.
    if not isinstance(raw_consequence, str):
        return pd.NA

    consequence = set(raw_consequence.split(";"))
    ptv_terms = {
        "frameshift_variant",
        "stop_gained",
        "splice_acceptor_variant",
        "splice_donor_variant",
    }
    # LoF takes precedence over missense when a variant carries both terms.
    if ptv_terms & consequence:
        return "lof"

    if "missense_variant" in consequence:
        # A NaN score compares False against both thresholds and therefore
        # falls through to the pd.NA return below.
        if ser.del_score == 9:
            return "missense_strict"
        if ser.del_score > 6:
            return "missense_lenient"
    return pd.NA

def keep_most_del(vals):
    """Return the most deleterious annotation found in ``vals``.

    Annotations are ranked ``"lof"`` > ``"missense_strict"`` >
    ``"missense_lenient"``.

    Parameters
    ----------
    vals : iterable
        Annotation values; may contain missing markers (``pd.NA``/NaN/None).

    Returns
    -------
    object
        The highest-ranked annotation present. If none of the ranked
        annotations is present, the first value of ``vals`` in iteration
        order; ``pd.NA`` if ``vals`` is empty.
    """
    vals = list(vals)
    # Only strings can be annotations; filtering first keeps missing markers
    # out of the membership tests.
    present = {val for val in vals if isinstance(val, str)}
    for annot in ("lof", "missense_strict", "missense_lenient"):
        if annot in present:
            return annot
    # Picking the first input value (rather than an arbitrary set element)
    # makes the fallback deterministic; an empty input yields pd.NA.
    return vals[0] if vals else pd.NA

def create_helper_files(chr_exome_file):
    """Load one per-chromosome annotation table and keep the burden variants.

    Parameters
    ----------
    chr_exome_file : str or path-like
        Tab-separated file with at least the columns named in the ``dtype``
        mapping below.

    Returns
    -------
    pandas.DataFrame
        The input rows, with two added columns (``variants`` and
        ``annotation``), restricted to rows that
        * are annotated "lof", "missense_strict" or "missense_lenient",
        * have ``biotype == "protein_coding"``, and
        * have a non-missing ``samples`` value.
    """
    df = pd.read_csv(chr_exome_file, sep="\t", dtype={
        "locus": str, "alleles": str, "gene": str, "transcript": str, "consequence": str, "biotype": str, "loftee": str,
        "lof": bool, "splice_lof": bool, "missense": bool,  "del_score": float, "maf": float, "mac": float, "call_rate": float,
        "p_value_hwe": float, "min_rd": float, "samples": str, "hetz_samples": str, "homo_samples": str
    })
    # Build the variant id as "<locus without chr prefix>:<alleles with ':' separators>".
    # str.lstrip("chr") would strip any leading run of the characters c/h/r rather
    # than the literal prefix, so remove the prefix with an anchored pattern instead.
    locus_no_prefix = df.locus.str.replace(r"^chr", "", regex=True)
    df["variants"] = locus_no_prefix + ":" + df.alleles.str.replace("_", ":", regex=False)
    # add lof and missense annotations
    if df.empty:
        # apply(axis=1) on an empty frame returns a DataFrame, not a Series,
        # which cannot be assigned to a single column.
        df["annotation"] = pd.Series(pd.NA, index=df.index, dtype=object)
    else:
        df["annotation"] = df.apply(annotate_variant_consequence, axis=1)
    # filter by annotation, biotype, and presence of carrier samples
    df = df.loc[
        (df.annotation.isin(["lof", "missense_strict", "missense_lenient"]))&
        (df.biotype=="protein_coding")&
        (df.samples.notna()) # there are some variants which do not have samples because sqc was applied after vqc
    ]
    return df

def create_burden_df(chrm_dir, chrms=None):
    """Concatenate the filtered per-chromosome tables into one DataFrame.

    Parameters
    ----------
    chrm_dir : str
        Directory containing one ``chr<name>.tsv.gz`` file per chromosome.
    chrms : iterable, optional
        Chromosome names to load, in order. Each is formatted into
        ``chr<name>.tsv.gz``. Defaults to chromosomes 1 through 22.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of ``create_helper_files`` results, in the
        order given by ``chrms``. Each table keeps its own row index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``chrms`` is empty.
    """
    if chrms is None:
        chrms = range(1, 23)
    chr_dfs = [create_helper_files(f"{chrm_dir}/chr{chrm}.tsv.gz") for chrm in chrms]
    return pd.concat(chr_dfs)

def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder, then delete the local copy.

    Parameters
    ----------
    filename : str
        Path of the local file to upload; it is removed after the upload call
        returns.
    proj_dir : str
        Destination folder, passed to ``dxpy.upload_local_file`` as ``folder``
        together with ``parents=True``.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    done_msg = f"*********{filename} uploaded!!*********"
    print(done_msg)
    # The local copy is only deleted once the upload call has returned.
    os.remove(filename)


# Create and save the burden DataFrame

In [None]:
# Build the combined burden table from the per-chromosome annotation files.
burden_df = create_burden_df(
    chrm_dir="/mnt/project/notebooks/wes/variant_annot/data",
)


In [None]:
# Write the burden table to a local TSV, then upload it to the project
# (the upload helper deletes the local copy afterwards).
proj_dir = "/notebooks/wes/burden_preparation/data/"
filename = "ukb_burden.tsv.gz"
burden_df.to_csv(filename, sep="\t", index=False)
upload_file_to_project(filename=filename, proj_dir=proj_dir)


# Save unique locus/allele pairs

In [None]:
# Reload the saved burden table from the mounted project path.
burden_df = pd.read_csv(
    "/mnt/project/notebooks/wes/burden_preparation/data/ukb_burden.tsv.gz",
    sep="\t",
)

In [None]:
# Write each distinct (locus, alleles) combination once, then upload the file.
proj_dir = "/notebooks/wes/burden_preparation/data/"
filename = "ukb_ptv_locus.tsv"
(
    burden_df[["locus", "alleles"]]
    .drop_duplicates()
    .to_csv(filename, sep="\t", index=False)
)
upload_file_to_project(filename=filename, proj_dir=proj_dir)

In [None]:
# Display the locus and alleles columns of the burden table.
burden_df[["locus", "alleles"]]