In [None]:
from pyspark.sql import SparkSession
import hail as hl
import os
import time
import dxpy
import logging
import pandas as pd
import re


# Spark resources are set explicitly: this configuration was needed to get
# past an RDD partition error.
builder = SparkSession.builder.appName("HailApplication")
builder = builder.config("spark.driver.memory", "12g")  # driver memory
builder = builder.config("spark.executor.memory", "24g")  # memory per executor
builder = builder.config("spark.executor.cores", "12")  # cores per executor
builder = builder.enableHiveSupport()
spark = builder.getOrCreate()

# Initialise Hail on the Spark context created above.
hl.init(sc=spark.sparkContext, idempotent=True)


# Import previously filtered high-quality chrX variants

In [None]:
def save_in_hail_format(hail_obj, db_name, hail_obj_name, rerun):
    """Return the dnax:// URL for a Hail object, writing the object there on request.

    Runs ``CREATE DATABASE IF NOT EXISTS`` for ``db_name`` through the
    module-level ``spark`` session, looks up the database ID with ``dxpy``
    (searching by the lower-cased database name), and builds the URL
    ``dnax://<database id>/<hail_obj_name>``.

    Args:
        hail_obj: Object exposing ``write(url, overwrite=True)``.
        db_name: Name of the database to create or reuse.
        hail_obj_name: Name of the object inside the database.
        rerun: When truthy, ``hail_obj`` is written to the URL with
            ``overwrite=True``; when falsy, nothing is written.

    Returns:
        The ``dnax://`` URL string, whether or not a write happened.
    """
    # Make sure the database exists before looking up its ID.
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'").show()

    # The lookup is done with the lower-cased database name.
    database = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")
    database_id = database['id']

    url = f"dnax://{database_id}/{hail_obj_name}"
    if not rerun:
        return url
    hail_obj.write(url, overwrite=True)
    return url

In [None]:
# Load the chrX PLINK fileset (.bed/.bim/.fam) produced by the earlier
# high-quality-variant filtering step, against the GRCh38 reference.
plink_files = {
    "bed": 'file:///mnt/project/notebooks/wes/sample_qc/high_quality_variants/chrX/chrX_hqc_nopar_pruned.bed',
    "bim": 'file:///mnt/project/notebooks/wes/sample_qc/high_quality_variants/chrX/chrX_hqc_nopar_pruned.bim',
    "fam": 'file:///mnt/project/notebooks/wes/sample_qc/high_quality_variants/chrX/chrX_hqc_nopar_pruned.fam',
}
mt = hl.import_plink(reference_genome="GRCh38", **plink_files)

In [None]:
# Persist the imported MatrixTable in the "sample_qc" database; rerun=True
# makes the helper write it with overwrite enabled.
url = save_in_hail_format(
    hail_obj=mt,
    db_name="sample_qc",
    hail_obj_name="chrX_hqc_pruned.mt",
    rerun=True,
)

# Import the sample QC table created previously from genotype data

In [None]:
# Load the sample QC table and key it by "sample_names" so later cells can
# look rows up by sample ID.
geno_sample_qc_file = "file:///mnt/project/fields/data/sample_qc/sample_qc_info.tsv"
geno_sample_qc_table = hl.import_table(geno_sample_qc_file).key_by("sample_names")


# Calculate the F-statistic and impute sex following published guidelines
We also checked the F-statistic distribution by plotting it.

https://blog.hail.is/whole-exome-and-whole-genome-sequencing-recommendations/


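Thresholds used: `female_threshold=0.5`, `male_threshold=0.75` (samples with F below 0.5 are called female; samples with F above 0.75 are called male).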

In [None]:
# Impute sex from the chrX genotype calls using explicit F-statistic
# cut-offs rather than Hail's defaults.
imputed_sex = hl.impute_sex(
    mt.GT,
    female_threshold=0.5,
    male_threshold=0.75,
)

In [None]:
# Look up each sample's row in the QC table once and reuse it for both
# the survey-reported and array-derived sex fields.
sample_qc_row = geno_sample_qc_table[imputed_sex.s]
imputed_sex = imputed_sex.annotate(
    survey_sex=sample_qc_row.sex,
    array_sex=sample_qc_row.genetic_sex,
    # Text label derived from the imputed is_female flag.
    exome_sex=hl.if_else(imputed_sex.is_female, "Female", "Male"),
)

In [None]:
# Persist the annotated table in the "sample_qc" database; rerun=True makes
# the helper write it with overwrite enabled.
url = save_in_hail_format(
    hail_obj=imputed_sex,
    db_name="sample_qc",
    hail_obj_name="imputed_sex.ht",
    rerun=True,
)

In [None]:
# Reload the table from `url` (set by the preceding save cell) so later
# cells work from the stored copy.
imputed_sex = hl.read_table(url)

In [None]:
# Number of rows in the imputed-sex table.
imputed_sex.count()

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file into a project folder via ``dxpy`` and print a confirmation.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder, passed to ``dxpy.upload_local_file`` as
            ``folder`` together with ``parents=True`` (presumably so missing
            parent folders are created; confirm against the dxpy docs).

    Returns:
        None.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    message = f"*********{filename} uploaded!!*********"
    print(message)

In [None]:
# Convert the Hail table to a pandas DataFrame for local export.
imputed_sex_df = imputed_sex.to_pandas()

In [None]:
# Show the last rows of the DataFrame as a quick visual check.
imputed_sex_df.tail()

In [None]:
# Write the imputations to a local tab-separated file, then upload that file
# to the project folder.
proj_dir = "/notebooks/wes/sample_qc/impute_sex/"
filename = "sex_imputations_exome.tsv"
imputed_sex_df.to_csv(filename, sep="\t", index=False)
upload_file_to_project(filename, proj_dir)


In [None]:
# Stop Hail now that the results have been exported.
hl.stop()