In [None]:
from pyspark.sql import SparkSession
import hail as hl
import os
import time
import dxpy
import logging
import pandas as pd
import numpy as np
import re


# Spark session setup. These resource settings were set explicitly to get
# past an RDD partition error.
_spark_settings = {
    "spark.driver.memory": "16g",    # driver memory
    "spark.executor.memory": "24g",  # memory per executor
    "spark.executor.cores": "12",    # cores per executor
}

builder = SparkSession.builder.appName("HailApplication")
for _setting, _value in _spark_settings.items():
    builder = builder.config(_setting, _value)
builder = builder.enableHiveSupport()

spark = builder.getOrCreate()

# Run Hail on this Spark context; idempotent=True lets the cell be re-run.
hl.init(sc=spark.sparkContext, idempotent=True)


In [None]:
def save_in_hail_format(hail_obj, db_name, hail_obj_name, rerun):
    """Resolve the ``dnax://`` URL of a Hail object and optionally write it there.

    The database ``db_name`` is created through Spark SQL if it does not
    exist, its database ID is looked up with ``dxpy``, and the URL is built
    from that ID and ``hail_obj_name``.

    Args:
        hail_obj: Object with a ``write(url, overwrite=True)`` method.
        db_name: Database name; the dxpy lookup uses its lowercase form.
        hail_obj_name: Name of the object inside the database.
        rerun: If truthy, ``hail_obj`` is written to the URL with
            ``overwrite=True``. If falsy, nothing is written.

    Returns:
        The URL ``dnax://<database id>/<hail_obj_name>``.
    """
    # Make sure the database exists before looking up its ID.
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'").show()

    database = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")
    target = f"dnax://{database['id']}/{hail_obj_name}"

    if not rerun:
        return target
    hail_obj.write(target, overwrite=True)
    return target

def get_url(db_name, hail_obj_name):
    """Build the ``dnax://`` URL of a Hail object without writing anything.

    Args:
        db_name: Database name; the dxpy lookup uses its lowercase form.
        hail_obj_name: Name of the object inside the database.

    Returns:
        The URL ``dnax://<database id>/<hail_obj_name>``.
    """
    database_id = dxpy.find_one_data_object(
        classname="database",
        name=f"{db_name}".lower(),
    )["id"]
    return f"dnax://{database_id}/{hail_obj_name}"
    

In [None]:
# Rerun switches, one per stored pipeline stage.
#   True  -> recompute the stage and overwrite its stored Hail object
#   False -> load the previously stored object
HQC_READ_RERUN = False       # import of the high-quality autosome PLINK files
ARRAY_READ_RERUN = False     # import of the array PLINK files
UNRELATED_PCA_RERUN = False  # PCA on unrelated samples
RELATED_PCA_RERUN = False    # projection of related samples onto the PCs
CONCORDANCE_RERUN = False    # exome vs. array concordance
SAMPLE_ANNOT_RERUN = True    # assembly of the per-sample annotation table


# Read high quality autosome data

In [None]:
# Pruned high-quality autosomal genotypes: re-import from PLINK and store them
# when HQC_READ_RERUN is set, otherwise just resolve the stored copy's URL.
if HQC_READ_RERUN:
    hqc_data_pre = "file:///mnt/project/notebooks/wes/sample_qc/high_quality_variants/autosomes/autosome_hqc_pruned"
    mt = hl.import_plink(
        bed=f"{hqc_data_pre}.bed",
        bim=f"{hqc_data_pre}.bim",
        fam=f"{hqc_data_pre}.fam",
        reference_genome="GRCh38",
    )
    url = save_in_hail_format(mt, "sample_qc", "autosomes_hqc_pruned.mt", rerun=HQC_READ_RERUN)
else:
    url = get_url("sample_qc", "autosomes_hqc_pruned.mt")

# Either way, continue from the stored matrix table.
mt = hl.read_matrix_table(url)

In [None]:
# Display the size of the exome matrix table as a quick check that it loaded.
mt.count()

# Read array data

In [None]:
# Array genotypes: re-import from PLINK and store them when ARRAY_READ_RERUN
# is set, otherwise just resolve the stored copy's URL.
if ARRAY_READ_RERUN:
    array_data_pre = "file:///mnt/project/notebooks/snp/liftover/ukb_c1-22_GRCh38_full_analysis_set_plus_decoy_hla_merged"

    # The three PLINK files share one prefix and differ only in extension.
    plink_paths = {ext: f"{array_data_pre}.{ext}" for ext in ("bed", "bim", "fam")}
    geno_mt = hl.import_plink(reference_genome="GRCh38", **plink_paths)

    url = save_in_hail_format(geno_mt, "sample_qc", "array.mt", rerun=ARRAY_READ_RERUN)
else:
    url = get_url("sample_qc", "array.mt")

# Either way, continue from the stored matrix table.
geno_mt = hl.read_matrix_table(url)

In [None]:
# Display the size of the array matrix table as a quick check that it loaded.
geno_mt.count()

# Get relatedness statistics for individuals based on KING estimates

In [None]:
# Relatedness flags per sample, keyed by sample ID `s` so other tables can
# look them up by sample. No field types are passed to import_table; the cells
# below compare the flags against the strings "True"/"False".
relatedness_file = "file:///mnt/project//notebooks/wes/sample_qc/relatedness/related_exome.tsv"
relatedness_table = hl.import_table(relatedness_file).key_by("s")


In [None]:
# Add a `related` column field from the table's `third_degree` flag; samples
# with no entry in the relatedness table fall back to "False".
third_degree = relatedness_table[mt.s].third_degree
mt = mt.annotate_cols(related=hl.or_else(third_degree, "False"))


# Calculate PCA for unrelated individuals

In [None]:
# PCA on unrelated samples. On a rerun the scores and loadings are recomputed
# and stored; otherwise only their URLs are resolved. Both tables are then
# read back from storage.
if UNRELATED_PCA_RERUN:
    # Drop every sample flagged as related.
    unrelated_mt = mt.filter_cols(mt.related == "True", keep=False)

    # 20 principal components, keeping the loadings so other samples can be
    # projected onto them later.
    eigenvalues, scores, loadings_ht = hl.hwe_normalized_pca(
        unrelated_mt.GT,
        k=20,
        compute_loadings=True,
    )

    # Store the per-variant alternate allele frequency next to the loadings.
    unrelated_mt = unrelated_mt.annotate_rows(
        af=hl.agg.mean(unrelated_mt.GT.n_alt_alleles()) / 2
    )
    loadings_ht = loadings_ht.annotate(af=unrelated_mt.rows()[loadings_ht.key].af)

    loadings_url = save_in_hail_format(
        loadings_ht, "sample_qc", "unrelated_samples_loadings.ht", rerun=UNRELATED_PCA_RERUN
    )
    url = save_in_hail_format(
        scores, "sample_qc", "unrelated_samples_scores.ht", rerun=UNRELATED_PCA_RERUN
    )
else:
    loadings_url = get_url("sample_qc", "unrelated_samples_loadings.ht")
    url = get_url("sample_qc", "unrelated_samples_scores.ht")

loadings_ht = hl.read_table(loadings_url)
unrelated_pca_ht = hl.read_table(url)

In [None]:
# Display the number of rows in the unrelated-sample scores table.
unrelated_pca_ht.count()

# Project PCA for related individuals

In [None]:
# Scores for related samples come from projecting their genotypes onto the
# loadings computed from the unrelated samples.
if RELATED_PCA_RERUN:
    # Keep only the samples flagged as related.
    related_mt = mt.filter_cols(mt.related == "True", keep=True)

    related_ht = hl.experimental.pc_project(
        related_mt.GT,
        loadings_ht.loadings,
        loadings_ht.af,
    )
    url = save_in_hail_format(
        related_ht, "sample_qc", "related_samples_scores.ht", rerun=RELATED_PCA_RERUN
    )
else:
    url = get_url("sample_qc", "related_samples_scores.ht")

related_pca_ht = hl.read_table(url)


In [None]:
# Display the number of rows in the related-sample scores table.
related_pca_ht.count()

In [None]:
# Combine the scores of unrelated samples (from the PCA) and related samples
# (projected onto the same loadings) into a single table.
pca_ht = unrelated_pca_ht.union(related_pca_ht)

# Get sex imputations

In [None]:
# Load the stored imputed-sex table. Unlike the other stages there is no rerun
# branch here, so "imputed_sex.ht" must already exist in the sample_qc database
# (presumably written by a separate notebook; verify).
url = get_url("sample_qc", "imputed_sex.ht")
imputed_sex = hl.read_table(url)

# Calculate concordance between exome and array

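Summary (output recorded from earlier runs of the concordance cell below; each block lists the 5×5 concordance matrix, the heterozygous concordance ratio, and the overlap count):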

[[0, 10982378230, 265133751019, 58002191288, 11555617883], [35915807, 60381, 1293830, 1223639, 3991], [61992352350, 188536797, 19407337412, 6822719, 107976], [6540281780, 18154898, 4927111, 1827226779, 786962], [3482473571, 4615467, 98920, 1161281, 270922177]]

0.9949861324152116

469452 overlap


[[0, 11050233119, 271184739424, 59029313047, 11719489922], [24208429, 39640, 932677, 634431, 944], [42348255958, 131819648, 13358936421, 3759103, 41894], [2630485865, 9723962, 2759599, 804378228, 326711], [1438527752, 1929404, 40171, 540897, 107579518]]

0.9939029360962955

469452 overlap

In [None]:
# Genotype concordance between the exome data (left) and the array data (right).
if CONCORDANCE_RERUN:
    summary_conc, samples_conc, variants_conc = hl.concordance(mt, geno_mt)
    print(summary_conc)

    # Heterozygous concordance: the het/het count divided by the sum of
    # column 3 over rows 1-4 of the summary matrix.
    het_in_both = summary_conc[3][3]
    column_3_total = np.array(summary_conc)[1:, 3].sum()
    print(het_in_both / column_3_total)

    url = save_in_hail_format(
        samples_conc, "sample_qc", "sample_concordance.ht", rerun=CONCORDANCE_RERUN
    )
else:
    url = get_url("sample_qc", "sample_concordance.ht")

samples_conc = hl.read_table(url)

In [None]:
# Per-sample heterozygous concordance, defined the same way as the
# summary-level ratio printed by the concordance cell
# (summary_conc[3][3] / np.array(summary_conc)[1:, 3].sum()):
#   numerator   = concordance[3][3]
#   denominator = column 3 summed over rows 1-4
# The previous denominator, concordance[3][3:], summed row 3 over columns 3-4
# instead, so the per-sample value did not match the summary statistic.
column_3_counts = samples_conc.concordance[1:].map(lambda row: row[3])
samples_conc = samples_conc.annotate(
    hetz_concordance_array=samples_conc.concordance[3][3] / hl.sum(column_3_counts)
)

# Get previous sample qc (based on array data) performed by UKB

In [None]:
# Sample QC table for the array data, keyed by its `sample_names` column so
# rows can be looked up by sample ID.
geno_sample_qc_file = "file:///mnt/project/fields/data/sample_qc/sample_qc_info.tsv"
geno_sample_qc_table = hl.import_table(geno_sample_qc_file).key_by("sample_names")


# Mark the following samples

1. Duplicates
2. Related
3. Ratio of heterozygous concordance between array and exomes
4. Sex from survey
5. Genetic sex from array
6. Genetic sex from exomes
7. Sex chromosome aneuploidy
8. Genetic kinship to other participants
9. Outlier for heterozygosity or missingness

Note:
1. Sample call rate
2. More than eight standard deviations from the ancestry-normalized mean in any of:
    - Transition/transversion ratio
    - Insertion/deletion allele ratio
    - Heterozygous/homozygous call ratio
    - SNV/indel ratio
    - Number of singletons

These will be marked after QC of all autosomal variants is complete.

In [None]:
# Assemble the per-sample annotation table. On a rerun it is rebuilt from the
# tables loaded above and stored; otherwise only its URL is resolved.
if SAMPLE_ANNOT_RERUN:
    sample_ht = mt.cols()

    # PCA scores and duplicate flag; samples with no entry in the relatedness
    # table get duplicate="False".
    sample_ht = sample_ht.annotate(
        pca=pca_ht[sample_ht.s].scores,
        duplicate=hl.or_else(relatedness_table[sample_ht.s].duplicate_ind, "False"),
    )

    # Imputed sex. NOTE(review): join() is called with its default join type,
    # which may drop samples missing from imputed_sex; confirm that is intended.
    sample_ht = sample_ht.join(imputed_sex)

    # Heterozygote concordance plus three fields from the array sample QC table.
    array_qc_row = geno_sample_qc_table[sample_ht.s]
    sample_ht = sample_ht.annotate(
        hetz_concordance_array=samples_conc[sample_ht.s].hetz_concordance_array,
        sex_chromosome_aneuploidy=array_qc_row.sex_chromosome_aneuploidy,
        genetic_kinship_to_other_participants=array_qc_row.genetic_kinship_to_other_participants,
        out_hetz_missing=array_qc_row.out_hetz_missing,
    )

    # Remove fields that are not wanted in the output. `is_female_1` is
    # presumably the renamed duplicate created by the join; TODO confirm.
    sample_ht = sample_ht.drop("fam_id", "pat_id", "mat_id", "is_female", "is_case", "is_female_1")

    url = save_in_hail_format(sample_ht, "sample_qc", "sample_annot.ht", rerun=SAMPLE_ANNOT_RERUN)
else:
    url = get_url("sample_qc", "sample_annot.ht")

sample_ht = hl.read_table(url)
    

In [None]:
# Shut down the Hail session started by hl.init at the top of the notebook.
hl.stop()