In [None]:
from pyspark.sql import SparkSession
import hail as hl
import os
import time
import dxpy
import logging
import pandas as pd
import re


# Spark session for the QC run. The configuration had to be tuned at one point to
# get around an RDD partition error; those settings are kept here, disabled, for
# reference:
#   .config("spark.driver.memory", "96g")
#   .config("spark.executor.memory", "108g")
#   .config("spark.executor.cores", "30")
builder = SparkSession.builder.appName("Autosome QC").enableHiveSupport()
spark = builder.getOrCreate()

# Attach Hail to the Spark context created above. idempotent=True is passed so the
# cell can be re-run in the same kernel.
hl.init(sc=spark.sparkContext, idempotent=True)


In [None]:
# Re-run switches. A True flag makes the matching cell recompute its stage and
# overwrite the stored Hail object; a False flag makes the cell load the object
# that was written previously.
AUTOSOMES_RERUN = False         # union of the per-chromosome matrix tables
SAMPLE_QC_RERUN = False         # hl.sample_qc metrics plus the SNV/indel ratio
SAMPLE_QC_RES_RERUN = True      # residuals of the QC metrics
SAMPLE_ANNOT_ALL_RERUN = True   # combined sample annotation table

In [None]:
def save_in_hail_format(hail_obj, db_name, hail_obj_name, rerun):
    """Ensure the database exists and return the dnax URL for a Hail object.

    The ``CREATE DATABASE IF NOT EXISTS`` statement is always issued, the
    database ID is then looked up through dxpy, and ``hail_obj`` is written
    (with ``overwrite=True``) only when ``rerun`` is truthy.

    Args:
        hail_obj: Object exposing ``write(url, overwrite=...)``.
        db_name: Database name; it is lower-cased for the dxpy lookup.
        hail_obj_name: Name of the object inside the database.
        rerun: When truthy, write ``hail_obj`` to the computed URL.

    Returns:
        The URL ``dnax://<database id>/<hail_obj_name>``.
    """
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'").show()

    database = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")
    target = f"dnax://{database['id']}/{hail_obj_name}"

    # Nothing to persist: hand back the location of the existing object.
    if not rerun:
        return target

    hail_obj.write(target, overwrite=True)
    return target

def get_url(db_name, hail_obj_name):
    """Return the dnax URL of ``hail_obj_name`` inside the database ``db_name``.

    The database ID is resolved through dxpy using the lower-cased database
    name. Nothing is read from or written to the URL here.
    """
    database = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")
    return f"dnax://{database['id']}/{hail_obj_name}"

def get_chrm_mt(chr_num):
    """Read the matrix table ``chr<chr_num>_vqc.mt`` from the ``exomes`` database.

    Args:
        chr_num: Chromosome label interpolated into the table name.

    Returns:
        Whatever ``hl.read_matrix_table`` returns for that URL.
    """
    exomes_db = dxpy.find_one_data_object(name="exomes", classname="database")
    return hl.read_matrix_table(f"dnax://{exomes_db['id']}/chr{chr_num}_vqc.mt")

def upload_file_to_project(filename, proj_dir):
    """Upload a local file to ``proj_dir``, then delete the local copy.

    ``parents=True`` is forwarded to ``dxpy.upload_local_file``. Returns ``None``.

    NOTE(review): a later cell defines a function with the same name that skips
    the ``os.remove`` step; once that cell has run, this version is shadowed.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    banner = f"*********{filename} uploaded!!*********"
    print(banner)
    # The local file is only removed after the upload call has returned.
    os.remove(filename)


In [None]:
# Build (or reload) the combined matrix table for chromosomes 1-22.
if AUTOSOMES_RERUN:
    # Row-wise union of the per-chromosome tables, persisted before being re-read.
    autosome_mts = list(map(get_chrm_mt, range(1, 23)))
    url = save_in_hail_format(
        hl.MatrixTable.union_rows(*autosome_mts),
        "exomes",
        "autosomes_vqc.mt",
        AUTOSOMES_RERUN,
    )
else:
    url = get_url("exomes", "autosomes_vqc.mt")

# Either way, work from the stored copy.
autosome_mt = hl.read_matrix_table(url)
if not AUTOSOMES_RERUN:
    print(autosome_mt.n_partitions())

In [None]:
# Display the result of count() for the combined autosome matrix table.
autosome_mt.count()

In [None]:
# Per-sample QC metrics, extended with the SNV-to-indel ratio.
if SAMPLE_QC_RERUN:
    autosome_mt = hl.sample_qc(autosome_mt)
    sample_qc_ht = autosome_mt.cols()
    qc_stats = sample_qc_ht.sample_qc
    # NOTE(review): the denominator is zero for a sample with no insertions or
    # deletions; confirm how that case should be handled downstream.
    sample_qc_ht = sample_qc_ht.annotate(
        r_snv_indel=qc_stats.n_snp / (qc_stats.n_insertion + qc_stats.n_deletion)
    )
    url = save_in_hail_format(
        sample_qc_ht, "sample_qc", "sample_qc_annot.ht", rerun=SAMPLE_QC_RERUN
    )
else:
    url = get_url("sample_qc", "sample_qc_annot.ht")

# Either way, work from the stored copy.
sample_qc_ht = hl.read_table(url)


In [None]:
# Display the result of count() for the sample QC table.
sample_qc_ht.count()

In [None]:
# Residuals of the sample QC metrics with respect to the PCA scores.
if SAMPLE_QC_RES_RERUN:
    from gnomad.sample_qc.filtering import compute_qc_metrics_residuals

    # The PCA scores are taken from the sample annotation table, looked up by `s`.
    sample_annot_ht = hl.read_table(get_url("sample_qc", "sample_annot.ht"))
    sample_qc_ht = sample_qc_ht.annotate(pca=sample_annot_ht[sample_qc_ht.s].pca)

    qc_stats = sample_qc_ht.sample_qc
    metrics_to_regress = {
        "r_ti_tv": qc_stats.r_ti_tv,
        "r_het_hom_var": qc_stats.r_het_hom_var,
        "r_insertion_deletion": qc_stats.r_insertion_deletion,
        "n_singleton": qc_stats.n_singleton,
        "r_snv_indel": sample_qc_ht.r_snv_indel,
    }
    sample_residuals_ht = compute_qc_metrics_residuals(
        sample_qc_ht,
        pc_scores=sample_qc_ht.pca,
        qc_metrics=metrics_to_regress,
    )
    url = save_in_hail_format(
        sample_residuals_ht, "sample_qc", "sample_annot_res.ht", rerun=SAMPLE_QC_RES_RERUN
    )
else:
    url = get_url("sample_qc", "sample_annot_res.ht")

# Either way, work from the stored copy.
sample_qc_residuals_ht = hl.read_table(url)


Final set of sample QC annotations:

1. Duplicates
2. Related
3. Ratio of heterozygous concordance between array and exomes
4. Sample call rate
5. Eight-SD deviation from the ancestry-normalized mean for:
    - Transition/transversion ratio
    - Insertion/deletion allele ratio
    - Heterozygous/homozygous call ratio
    - SNV/indel ratio
    - Number of singletons
6. Sex from survey
7. Genetic sex from array
8. Genetic sex from exomes
9. Sex chromosome aneuploidy
10. Genetic kinship to other participants
11. Outlier for heterozygosity or missingness

In [None]:
# Combined sample annotation table: base annotations + call rate + QC residuals.
if SAMPLE_ANNOT_ALL_RERUN:
    sample_annot_ht = hl.read_table(get_url("sample_qc", "sample_annot.ht"))
    # NOTE(review): join() is called without `how`, so Hail's default join type
    # applies; confirm that is the intended handling of samples missing from
    # the residuals table.
    sample_annot_ht = (
        sample_annot_ht
        .annotate(call_rate=sample_qc_ht[sample_annot_ht.s].sample_qc.call_rate)
        .join(sample_qc_residuals_ht)
        .drop("lms")
    )
    url = save_in_hail_format(
        sample_annot_ht, "sample_qc", "sample_annot_all.ht", rerun=SAMPLE_ANNOT_ALL_RERUN
    )
else:
    url = get_url("sample_qc", "sample_annot_all.ht")

# Either way, work from the stored copy.
sample_annot_ht = hl.read_table(url)
    

# Convert to pandas and upload as TSV

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to ``proj_dir``, leaving the local copy in place.

    ``parents=True`` is forwarded to ``dxpy.upload_local_file``. Returns ``None``.

    NOTE(review): this redefines the helper of the same name from an earlier
    cell; that earlier version also deletes the local file after uploading.
    Consider keeping a single definition.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    banner = f"*********{filename} uploaded!!*********"
    print(banner)


In [None]:
# Convert the Hail sample annotation table into a pandas DataFrame.
# NOTE(review): presumably this pulls every row into local memory; confirm the
# table is small enough for that.
sample_df = sample_annot_ht.to_pandas()


In [None]:
# Write the sample annotations to a local tab-separated file, then upload it to
# the project folder.
proj_dir = "/notebooks/wes/sample_qc/data/"
filename = "sample_qc_annot_all.tsv"
sample_df.to_csv(filename, sep="\t", index=False)
upload_file_to_project(filename, proj_dir)
