# Initiate Spark and Hail

In [1]:
# Running this cell prints a red-colored message; this is expected.
# The 'Welcome to Hail' banner in the output indicates that Hail is ready to use in the notebook.

from pyspark.sql import SparkSession
import hail as hl

# Configure a Hive-enabled session builder, obtain the session from it
# (getOrCreate reuses an existing session when there is one), and initialise
# Hail on that session's SparkContext.
builder = SparkSession.builder.enableHiveSupport()
spark = builder.getOrCreate()
hl.init(sc=spark.sparkContext)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-08-29 00:40:54.859 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-08-29 00:40:55.190 WARN  Utils:69 - Service 'sparkDriver' could not bind on port 42000. Attempting port 42001.
2023-08-29 00:40:55.932 WARN  Utils:69 - Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 43000. Attempting port 43001.
2023-08-29 00:40:56.187 WARN  MetricsReporter:84 - No metrics configured for reporting
2023-08-29 00:40:56.189 WARN  LineProtoUsageReporter:48 - Telegraf configurations: url [metrics.push.telegraf.hostport], user [metrics.push.telegraf.user] or password [metrics.push.telegraf.password] missing.
2023-08-29 00:40:56.189 WARN  MetricsReporter:117 - metrics.scraping.httpserver.port


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar

log4j: Parsing for [root] with value=[INFO, logfile].
log4j: Level token is [INFO].
log4j: Category root set to INFO
log4j: Parsing appender named "logfile".
log4j: Parsing layout options for "logfile".
log4j: Setting property [conversionPattern] to [%d{yyyy-MM-dd HH:mm:ss.SSS} %c{1}: %p: %m%n].
log4j: End of parsing for "logfile".
log4j: Setting property [append] to [false].
log4j: Setting property [threshold] to [INFO].
log4j: Setting property [file] to [/opt/notebooks/hail-20230829-0040-0.2.116-cd64e0876c94.log].
log4j: setFile called: /opt/notebooks/hail-20230829-0040-0.2.116-cd64e0876c94.log, false
log4j: setFile ended
log4j: Parsed "logfile" options.
log4j: Parsing for [Hail] with value=[INFO, HailSocketAppender].
log4j: Level token is [INFO].
log4j: Category Hail set to INFO
log4j: Parsing appender named "HailSocketAppender".
log4j: Parsed "HailSocketAppender" options.
log4j: Handling log4j.additivity.Hail=[null]
log4j: Finished configuring.


Running on Apache Spark version 3.2.3
SparkUI available at http://ip-10-60-107-222.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.116-cd64e0876c94
LOGGING: writing to /opt/notebooks/hail-20230829-0040-0.2.116-cd64e0876c94.log


In [2]:
import os

# Directory that is scanned for the pVCF files.
vcf_dir = "/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/"
# Chromosome label; file names are matched on the substring "_c<chr_num>_".
chr_num = "2"

# Collect the matching files as file:// URIs.
# NOTE: sorted() compares the URIs as plain strings (lexicographic order), so a
# file's position in this list is not necessarily any number embedded in its name.
vcf_files = sorted(
    "file://" + os.path.join(vcf_dir, entry)
    for entry in os.listdir(vcf_dir)
    if entry.endswith("vcf.gz") and f"_c{chr_num}_" in entry
)

In [3]:
# Create the per-chromosome database in DNAX.
# `chr_num` and `spark` are not defined in this cell; they must already exist
# in the kernel when it runs.
# IF NOT EXISTS in the statement lets this cell be re-run when the database is
# already present.
db_name = f"exome_chr{chr_num}"
stmt = f"CREATE DATABASE IF NOT EXISTS {db_name} LOCATION 'dnax://'"
print(stmt)  # echo the exact SQL so it is visible in the cell output
spark.sql(stmt).show()  # run the statement through the Spark session and display the result

# Next cell: create one MatrixTable (mt) per VCF file and write each one to the database

CREATE DATABASE IF NOT EXISTS exome_chr2 LOCATION 'dnax://'
++
||
++
++



In [4]:
# Create mt table for each file and write to the database
import time
import dxpy
import logging

# Send INFO-level log records (per-block timings and failures) to a
# per-chromosome log file in the working directory.
logging.basicConfig(filename=f"chr{chr_num}_mt.log", level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Find database ID of newly created database using dxpy method
db_uri = dxpy.find_one_data_object(name=db_name, classname="database")['id']

# Index in `vcf_files` of the first file to convert; earlier files are skipped
# (presumably already written by a previous run -- confirm before changing).
# The same index is used to name the output, so block_<N>.mt always corresponds
# to vcf_files[N].
START_BLOCK = 52

for block_idx, vcf in enumerate(vcf_files[START_BLOCK:], start=START_BLOCK):
    # Monotonic clock: the measured duration is unaffected by system clock adjustments.
    time_start = time.monotonic()
    try:
        mt = hl.import_vcf(
            vcf, force_bgz=True, reference_genome="GRCh38", array_elements_required=False, block_size=512
        )
        mt_name = f"block_{block_idx}.mt"
        url = f"dnax://{db_uri}/{mt_name}" # Note: the dnax url must follow this format to properly save MT to DNAX
        mt.write(url, overwrite=True) # Note: output should describe size of MT (i.e. number of rows, columns, partitions)
    except Exception:
        # Record which block/file failed (with traceback) in the log file, then
        # re-raise so the loop still stops exactly as it did before.
        logging.exception(f"Failed to create block {block_idx} from {vcf}")
        raise
    time_end = time.monotonic()
    time_taken = (time_end - time_start)/60
    logging.info(f"Time to create block {block_idx}: {time_taken} mins\n")

2023-08-29 00:41:40.311 Hail: INFO: scanning VCF for sortedness...
2023-08-29 00:45:50.600 Hail: INFO: Coerced sorted VCF - no additional import work to do
2023-08-29 01:02:08.889 Hail: INFO: wrote matrix table with 24551 rows and 469835 columns in 43 partitions to dnax://database-GYbqJY8JX0K0VXVv7GFGky02/block_52.mt
2023-08-29 01:02:50.420 Hail: INFO: scanning VCF for sortedness...
2023-08-29 01:07:32.158 Hail: INFO: Coerced sorted VCF - no additional import work to do
[1693271253968] ResourceNotFound: The specified database file could not be found.. Code: 404 Request ID: 
2023-08-29 01:23:01.346 Hail: INFO: wrote matrix table with 23840 rows and 469835 columns in 45 partitions to dnax://database-GYbqJY8JX0K0VXVv7GFGky02/block_53.mt
2023-08-29 01:23:26.057 Hail: INFO: scanning VCF for sortedness...
2023-08-29 01:27:55.726 Hail: INFO: Coerced sorted VCF - no additional import work to do
[1693272477386] ResourceNotFound: The specified database file could not be found.. Code: 404 Request

In [5]:
%%bash
# Upload the local log file chr2_mt.log to the project folder given by --path.
# The file name is hard-coded here; keep it in sync with the log file name
# used by the Python cells if the chromosome changes.
dx upload "chr2_mt.log" \
    --path "exome_annot/annot_run/notebooks/chr2/"

ID                          file-GYfkgk8JX0K66fjg07VB579y
Class                       file
Project                     project-GQpgZf8JX0KKbFBGK9yff4Zg
Folder                      /exome_annot/annot_run/notebooks/chr2
Name                        chr2_mt.log
State                       closing
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Tue Aug 29 07:19:01 2023
Created by                  dzb5732
 via the job                job-GYfYVK8JX0K3zV1fQkPyy8fb
Last modified               Tue Aug 29 07:19:02 2023
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
