# Initialize Spark and Hail

In [None]:
# Running this cell prints a red-colored message; this is expected.
# The 'Welcome to Hail' message in the output indicates that Hail is ready to use in the notebook.

from pyspark.sql import SparkSession
import hail as hl

# Assemble a session builder with Hive support switched on, then obtain the
# Spark session that the rest of the notebook uses (e.g. for spark.sql calls).
builder = SparkSession.builder.enableHiveSupport()
spark = builder.getOrCreate()

# Hail is initialised on the SparkContext owned by the session above.
spark_context = spark.sparkContext
hl.init(sc=spark_context)

# Import VCF files in batches and store them as tables

In [None]:
import os

# Directory holding the pVCF files, and the chromosome to process.
vcf_dir = "/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/"
chr_num = "1"

# Keep only file names that carry this chromosome's tag and end in "vcf.gz",
# turn each into a file:// URL, and sort the URLs lexicographically.
# Later cells name their outputs by position in this sorted list.
chr_tag = f"_c{chr_num}_"
vcf_files = sorted(
    "file://" + os.path.join(vcf_dir, fname)
    for fname in os.listdir(vcf_dir)
    if chr_tag in fname and fname.endswith("vcf.gz")
)

In [None]:
# Create the per-chromosome database in DNAX (no-op if it already exists).
# `db_name` is reused further down to look up the database ID.
db_name = f"exome_chr{chr_num}"

# Echo the DDL before running it so the executed statement is visible in the output.
stmt = f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'"
print(stmt)

create_result = spark.sql(stmt)
create_result.show()

# Create Hail MatrixTables (MT) in batches and write them to the database

In [None]:
# Create mt table for each file and write to the database
import time
import dxpy
import logging

# Send INFO-level timing records to a per-chromosome log file in the working directory.
logging.basicConfig(filename=f"chr{chr_num}_mt.log", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Find database ID of newly created database using dxpy method.
# The name is lower-cased for the lookup (presumably because the platform stores
# database names in lower case -- confirm).
db_uri = dxpy.find_one_data_object(name=f"{db_name}".lower(), classname="database")['id']

# Position in the sorted `vcf_files` list of the first file to convert.
# Files before this position are skipped (e.g. when resuming an interrupted run);
# set to 0 to convert every file. Defined once so that the slice, the MT name and
# the log message can never disagree.
start_block = 48

pending_vcfs = vcf_files[start_block:]
if not pending_vcfs:
    # Without this the loop below would silently do nothing.
    logging.warning(f"No VCF files to process from block {start_block} onwards ({len(vcf_files)} files found)")

# `block_idx` is the file's position in the full sorted list, not in the slice.
for block_idx, vcf in enumerate(pending_vcfs, start=start_block):
    time_start = time.time()
    # force_bgz: read the .vcf.gz input as block-gzipped.
    # array_elements_required=False: tolerate missing elements inside array fields.
    mt = hl.import_vcf(
        vcf, force_bgz=True, reference_genome="GRCh38", array_elements_required=False, block_size=512
    )
    mt_name = f"block_{block_idx}.mt"
    url = f"dnax://{db_uri}/{mt_name}" # Note: the dnax url must follow this format to properly save MT to DNAX
    mt.write(url, overwrite=True) # Note: output should describe size of MT (i.e. number of rows, columns, partitions)
    time_end = time.time()
    # Wall-clock duration of import + write, in minutes.
    time_taken = (time_end - time_start)/60
    logging.info(f"Time to create block {block_idx}: {time_taken} mins\n")

In [None]:
%%bash
dx upload "chr1_mt.log" --path "exome_annot/annot_run/notebooks/chr1/"