In [None]:
from pyspark.sql import SparkSession
import hail as hl
import os
import time
import dxpy
import logging
import pandas as pd
import re


# Spark session for the gnomAD annotation job. The memory and core settings
# below were set explicitly to get around an RDD partition error.
builder = SparkSession.builder.appName("gnomad annotation")
builder = builder.config("spark.driver.memory", "12g")  # driver memory
builder = builder.config("spark.executor.memory", "12g")  # memory per executor
builder = builder.config("spark.executor.cores", "14")  # cores per executor
builder = builder.enableHiveSupport()
spark = builder.getOrCreate()

# Start Hail on top of the Spark context created above.
hl.init(sc=spark.sparkContext, idempotent=True)


In [None]:
def save_in_hail_format(hail_obj, db_name, hail_obj_name, rerun):
    """Ensure the database exists and optionally write a Hail object into it.

    Args:
        hail_obj: Object exposing ``write(url, overwrite=True)``; the callers
            in this notebook pass Hail tables.
        db_name: Name of the database to create when it does not exist yet.
        hail_obj_name: Name (path) of the object inside the database.
        rerun: When truthy, ``hail_obj`` is written to the URL and replaces any
            existing copy. When falsy, nothing is written.

    Returns:
        The ``dnax://<database-id>/<hail_obj_name>`` URL, whether or not a
        write happened.
    """
    # Create DB if it does not exist
    stmt = f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'"
    spark.sql(stmt).show()
    # Delegate the database-ID lookup and URL construction to get_url so the
    # writer and the readers cannot drift apart.
    url = get_url(db_name, hail_obj_name)
    if rerun:
        hail_obj.write(url, overwrite=True)
    return url

def get_url(db_name, hail_obj_name):
    """Return the ``dnax://`` URL of ``hail_obj_name`` inside ``db_name``.

    The database record is looked up through dxpy by its lower-cased name and
    its ID is placed in the URL. Nothing is read or written by this function.
    """
    database = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")
    return "dnax://{}/{}".format(database['id'], hail_obj_name)


In [None]:
# Stage switches. A truthy flag recomputes that stage and overwrites the stored
# table; a falsy flag makes the stage read the previously stored table instead.
RERUN_VAT = RERUN_VEP = RERUN_GNOMAD = False

# Save locus table in hail format

In [None]:
def process_locus_alleles(ht):
    """Turn the string ``locus`` / ``alleles`` fields into typed Hail keys.

    ``ht.locus`` is split on ":" into contig and position and rebuilt as a
    GRCh38 locus; ``ht.alleles`` is split on "_" into an array. The returned
    table is keyed by ``locus`` and ``alleles``.
    """
    locus_parts = ht.locus.split(":")
    typed_locus = hl.locus(locus_parts[0], hl.int(locus_parts[1]), reference_genome='GRCh38')
    allele_array = ht.alleles.split("_")
    return ht.annotate(locus=typed_locus, alleles=allele_array).key_by("locus", "alleles")

In [None]:
# Reuse the stored locus table unless RERUN_VAT asks for a rebuild from the TSV.
if not RERUN_VAT:
    url = get_url("variant_annot", "ukb_ptv_locus.ht")
    variant_ht = hl.read_table(url)
    print(variant_ht.count())
else:
    variant_file = "file:///mnt/project/notebooks/wes/burden_preparation/data/ukb_ptv_locus.tsv"
    variant_ht = process_locus_alleles(hl.import_table(variant_file)).repartition(100)
    url = save_in_hail_format(variant_ht, "variant_annot", "ukb_ptv_locus.ht", rerun=RERUN_VAT)
    # Continue from the table that was just written rather than the in-memory one.
    variant_ht = hl.read_table(url)


# Annotate with vep

In [None]:
def add_vep_annotations(ht, vep_file="file:///mnt/project/notebooks/wes/burden_preparation/data/vep_config_109_v8.json"):
    """Run ``hl.vep`` on ``ht`` with the configuration at ``vep_file``.

    Returns the table produced by ``hl.vep``.
    """
    return hl.vep(ht, vep_file)

In [None]:
# Reuse the stored VEP-annotated table unless RERUN_VEP asks for a fresh run.
if not RERUN_VEP:
    url = get_url("variant_annot", "ukb_ptv_locus_annot.ht")
    variant_ht = hl.read_table(url)
    print(variant_ht.count())
else:
    url = save_in_hail_format(
        add_vep_annotations(variant_ht),
        "variant_annot",
        "ukb_ptv_locus_annot.ht",
        rerun=RERUN_VEP,
    )
    # Continue from the table that was just written.
    variant_ht = hl.read_table(url)


# Process and filter gnomad columns

In [None]:
def add_gnomad_annotations(ht):
    """Extract the gnomAD frequency fields for the alt allele of each row.

    Reads ``ht.vep.colocated_variants.frequencies``, explodes it so each entry
    gets its own row, looks up the entry for ``ht.alleles[1]`` and keeps the
    ``gnomade*`` fields. ``gnomade_fin`` and ``gnomade_nfe`` are merged into
    ``gnomade_eur`` by taking their maximum.
    """
    freq_fields = (
        "gnomade", "gnomade_afr", "gnomade_amr", "gnomade_eas",
        "gnomade_fin", "gnomade_nfe", "gnomade_sas",
    )

    # Stand-in used when the frequencies are missing: a one-element array
    # holding a dict keyed "NA" with every field set to 0.0.
    placeholder = hl.struct(**{field: 0.0 for field in freq_fields})
    frequencies = hl.or_else(
        ht.vep.colocated_variants.frequencies,
        hl.array([{"NA": placeholder}]),
    )
    ht = ht.annotate(gnomad_freq=frequencies)
    ht = ht.explode(ht.gnomad_freq)

    # Frequency struct recorded for the alt allele in this row's dict.
    alt_freq = ht.gnomad_freq.get(ht.alleles[1])
    ht = ht.annotate(**{field: alt_freq[field] for field in freq_fields})

    return ht.select(
        ht.gnomade,
        ht.gnomade_afr,
        ht.gnomade_amr,
        ht.gnomade_eas,
        ht.gnomade_sas,
        gnomade_eur=hl.max(ht.gnomade_fin, ht.gnomade_nfe),
    )


In [None]:
# Reuse the stored gnomAD table unless RERUN_GNOMAD asks for it to be rebuilt.
if not RERUN_GNOMAD:
    url = get_url("variant_annot", "ukb_ptv_gnomad_annot.ht")
    variant_ht = hl.read_table(url)
    print(variant_ht.count())
else:
    url = save_in_hail_format(
        add_gnomad_annotations(variant_ht),
        "variant_annot",
        "ukb_ptv_gnomad_annot.ht",
        rerun=RERUN_GNOMAD,
    )
    # Continue from the table that was just written.
    variant_ht = hl.read_table(url)

# Save as pandas df

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload ``filename`` into the ``proj_dir`` folder, then delete the local file.

    The upload goes through ``dxpy.upload_local_file`` with ``parents=True``.
    A confirmation line is printed before the local copy is removed.
    Returns ``None``.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    print(f"*********{filename} uploaded!!*********")
    # The local copy is removed only after the upload call has returned.
    os.remove(filename)


In [None]:
# Convert the Hail table into a pandas DataFrame for the post-processing below.
gnomad_df = variant_ht.to_pandas()

# Get gnomad pop max

In [None]:
# Missing frequencies are treated as 0.
gnomad_df = gnomad_df.fillna(0)
# Flatten the keys to plain strings so they can be grouped on and written out.
gnomad_df["locus"] = gnomad_df.locus.astype(str)
gnomad_df["alleles"] = gnomad_df.alleles.apply(lambda x: "_".join(x))
# Collapse rows that share a (locus, alleles) key to one row holding the
# per-column maximum. The string alias "max" replaces the builtin `max`, which
# recent pandas versions deprecate as an aggregation argument.
# NOTE: locus/alleles become the index here, so re-running this cell without
# re-running the `to_pandas` cell first will fail.
gnomad_df = gnomad_df.groupby(["locus", "alleles"]).agg("max")

In [None]:
# Row-wise maximum over the ancestry-specific columns; the overall `gnomade`
# column is not part of the comparison.
gnomad_df["maf_gnomad_popmax"] = gnomad_df[
    ["gnomade_afr", "gnomade_amr", "gnomade_eas", "gnomade_sas", "gnomade_eur"]
].max(axis=1)


In [None]:
# Write the table as a tab-separated file (index included) and upload it to
# the project folder.
proj_dir = "/notebooks/wes/burden_preparation/data/"
filename = "gnomad_annot.tsv.gz"
gnomad_df.to_csv(filename, sep="\t", index=True)
upload_file_to_project(filename, proj_dir)
