In [None]:
# !pip install gnomad

In [None]:
from pyspark.sql import SparkSession
import hail as hl
import os
import time
import dxpy
import logging
import pandas as pd
import re


# Spark resources are set explicitly; this configuration was needed to get
# past an RDD partition error.
builder = SparkSession.builder
builder = builder.appName("HailApplication")
builder = builder.config("spark.driver.memory", "16g")     # driver memory
builder = builder.config("spark.executor.memory", "108g")  # memory per executor
builder = builder.config("spark.executor.cores", "30")     # cores per executor
builder = builder.enableHiveSupport()

# Reuse an already-running session when there is one, otherwise start it.
spark = builder.getOrCreate()

# Run Hail on top of this Spark context. With idempotent=True, re-running the
# cell after Hail is already initialised does not raise.
hl.init(sc=spark.sparkContext, idempotent=True)


In [None]:
from gnomad.utils.filtering import filter_to_adj


# Create database to store hail format files

In [None]:
# Name of the DNAX database that stores the Hail matrix tables.
db_name = "exomes"

# IF NOT EXISTS keeps this cell safe to re-run: an existing database is left alone.
stmt = "CREATE DATABASE IF NOT EXISTS {} LOCATION 'dnax://'".format(db_name)
print(stmt)

# Execute the DDL through Spark SQL and display whatever it returns.
spark.sql(stmt).show()


# Import pVCF files for a given chromosome

In [None]:
# Chromosome to process. The value is matched against `_c{chr_num}_` in the
# pVCF file names and embedded in the names of the matrix tables written out.
chr_num = "1"

In [None]:
def import_and_save_vcf(chr_num, db_name, vcf_dir=None):
    """Import one chromosome's pVCF shards into Hail and save them to DNAX.

    Every file in ``vcf_dir`` whose name contains ``_c{chr_num}_`` and ends
    with ``vcf.gz`` is imported into a single matrix table (GRCh38),
    multiallelic sites are split, and the result is written to the DNAX
    database ``db_name`` as ``chr{chr_num}_unfiltered.mt``. An existing table
    with that name is overwritten.

    Args:
        chr_num: Chromosome label as it appears in the shard file names
            (e.g. ``"1"``).
        db_name: Name of an existing DNAX database. It is lower-cased before
            being looked up with dxpy.
        vcf_dir: Directory containing the pVCF shards. Defaults to the
            project-mounted "Population level exome OQFE variants, pVCF
            format - final release" folder.

    Returns:
        The ``dnax://`` URL the matrix table was written to.

    Raises:
        FileNotFoundError: If ``vcf_dir`` contains no pVCF shard for
            ``chr_num`` (or, from ``os.listdir``, if ``vcf_dir`` is missing).
    """
    if vcf_dir is None:
        vcf_dir = "/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/"

    # Shard names carry the chromosome between underscores, so "_c1_" does not
    # accidentally match "_c11_".
    shard_tag = f"_c{chr_num}_"
    vcf_files = sorted(
        "file://" + os.path.join(vcf_dir, fp)
        for fp in os.listdir(vcf_dir)
        if shard_tag in fp and fp.endswith("vcf.gz")
    )
    # Fail early with a readable message instead of handing Hail an empty list.
    if not vcf_files:
        raise FileNotFoundError(
            f"No pVCF files matching '*{shard_tag}*vcf.gz' found in {vcf_dir!r}"
        )

    mt = hl.import_vcf(
        vcf_files, force_bgz=True, reference_genome="GRCh38", array_elements_required=False
    )
    # split multiallelic sites
    mt = hl.split_multi_hts(mt, permit_shuffle=True)
    # Find database ID of the target database using dxpy
    db_uri = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")['id']
    # save matrix table
    mt_name = f"chr{chr_num}_unfiltered.mt"
    print(mt_name)
    url = f"dnax://{db_uri}/{mt_name}" # Note: the dnax url must follow this format to properly save MT to DNAX
    mt.write(url, overwrite=True) # Note: output should describe size of MT (i.e. number of rows, columns, partitions)
    return url

In [None]:
# Import the selected chromosome and keep the URL of the saved matrix table.
mt_url = import_and_save_vcf(chr_num, db_name)

In [None]:
# Load the matrix table back from the URL it was written to.
mt = hl.read_matrix_table(mt_url)

# Perform initial variant QC

In [None]:
def initial_variant_qc(
    mt, db_name, min_call_rate=0.9, min_hardy_weinberg_threshold=1e-15, chr_num=None,
):
    """Apply a first-pass variant QC and save the filtered matrix table to DNAX.

    Steps, in order: restrict genotypes with gnomAD's ``filter_to_adj``,
    annotate rows with the ``Ensembl_homo_sapiens_low_complexity_regions``
    dataset from the Hail annotation DB, compute ``hl.variant_qc`` metrics,
    then keep only variants that are outside low complexity regions, exceed
    both thresholds, and still have a non-zero count for the alternate allele.
    The result is written to ``chr{chr_num}_initial_variant_qc.mt`` in the
    DNAX database ``db_name``, overwriting any existing table.

    Args:
        mt: Hail matrix table to filter.
        db_name: Name of an existing DNAX database. It is lower-cased before
            being looked up with dxpy.
        min_call_rate: Variants must have a call rate strictly above this.
        min_hardy_weinberg_threshold: Variants must have a Hardy-Weinberg
            p-value strictly above this.
        chr_num: Chromosome label used in the output table name. When omitted,
            the notebook-level ``chr_num`` variable is used, which keeps
            existing ``initial_variant_qc(mt, db_name)`` calls working.

    Returns:
        The ``dnax://`` URL the filtered matrix table was written to.

    Raises:
        ValueError: If ``chr_num`` is not passed and no notebook-level
            ``chr_num`` is defined.
    """
    # Resolve the output label up front so a missing chromosome is reported
    # before any Hail work is set up, and so the table name does not silently
    # depend on hidden notebook state when the caller passes it explicitly.
    if chr_num is None:
        chr_num = globals().get("chr_num")
        if chr_num is None:
            raise ValueError(
                "chr_num was not provided and no notebook-level `chr_num` is defined"
            )

    # get gnomad suggested genotypes only
    qc_mt = filter_to_adj(mt)
    # annotate low complexity regions
    db = hl.experimental.DB(region='us', cloud='aws')
    qc_mt = db.annotate_rows_db(qc_mt, 'Ensembl_homo_sapiens_low_complexity_regions')
    # perform variant qc
    qc_mt = hl.variant_qc(qc_mt)
    # filter low complexity regions and those which fail minimal qc
    qc_mt = qc_mt.filter_rows(
        (qc_mt.Ensembl_homo_sapiens_low_complexity_regions==False)&
        (qc_mt.variant_qc.call_rate>min_call_rate)&
        (qc_mt.variant_qc.p_value_hwe>min_hardy_weinberg_threshold)&
        # AC[1] is the alternate-allele count; drop variants with no remaining carriers
        (qc_mt.variant_qc.AC[1]>0)
    )
    # Find database ID of the target database using dxpy
    db_uri = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")['id']
    mt_name = f"chr{chr_num}_initial_variant_qc.mt"
    print(mt_name)
    url = f"dnax://{db_uri}/{mt_name}" # Note: the dnax url must follow this format to properly save MT to DNAX
    qc_mt.write(url, overwrite=True) # Note: output should describe size of MT (i.e. number of rows, columns, partitions)
    return url


In [None]:
# Run the initial QC with its default thresholds; mt_url now points at the filtered table.
mt_url = initial_variant_qc(mt, db_name)

In [None]:
# Load the QC-filtered matrix table back from the URL it was written to.
mt = hl.read_matrix_table(mt_url)