In [None]:
import os
from glob import glob
from tqdm import tqdm
import datetime
import hail as hl

from hail.plot import show
import pandas as pd
from pprint import pprint
hl.plot.output_notebook()

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

In [None]:
# Work out the two addresses used for the Spark driver further down:
# `internal_ip` (what this machine resolves its own hostname to) and
# `external_ip` (the address set as spark.driver.host).
import socket

hostname = socket.gethostname()
print(f"hostname: {hostname}")
internal_ip = socket.gethostbyname(hostname)
# The external address differs per machine. Export SPARK_DRIVER_EXTERNAL_IP to
# set it without editing the notebook; when the variable is unset the
# previously hard-coded address is used, so existing setups keep working.
external_ip = os.environ.get("SPARK_DRIVER_EXTERNAL_IP", "172.19.179.46")
print(f"internal ip: {internal_ip}")
print(f"external ip: {external_ip}")

In [None]:
# Comma-separated jar list passed to Spark as `spark.jars`: the Hail assembly
# jar located inside the installed hail package, then the AWS SDK bundle and
# hadoop-aws jars under /spark/jars.
HAIL_JARS = ",".join(
    [
        hl.__path__[0] + "/backend/hail-all-spark.jar",
        "/spark/jars/aws-java-sdk-bundle-1.11.1026.jar",
        "/spark/jars/hadoop-aws-3.3.2.jar",
    ]
)
print(HAIL_JARS)

#### Start an [Apache Spark](https://en.wikipedia.org/wiki/Apache_Spark) instance

In [None]:
# Build the Spark configuration for this notebook and bind `sc` to a
# SparkContext created from it (or to the one already running in this kernel).

# Hail log file, timestamped at the moment this cell runs; passed to hl.init.
log_file_name = f"logs/hail-{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.log"
# run spark
spark_conf = SparkConf().setAppName("hail-test").setMaster("spark://172.19.179.106:30077")
# hail
spark_conf.set("spark.jars", HAIL_JARS)
# The driver advertises `external_ip` but binds to `internal_ip`, on fixed ports.
spark_conf.set("spark.driver.host", external_ip)
spark_conf.set("spark.driver.bindAddress", internal_ip)
spark_conf.set("spark.driver.port", 32123)
spark_conf.set("spark.blockManager.port", 32124)
# s3
spark_conf.set("spark.hadoop.fs.s3a.endpoint", "http://172.19.179.106:30900/")
# NOTE(review): credentials should not live in the notebook. They are taken
# from S3A_ACCESS_KEY / S3A_SECRET_KEY when those variables are set; the
# literals remain only as a backward-compatible fallback and should be dropped
# once the environment provides them.
spark_conf.set("spark.hadoop.fs.s3a.access.key", os.environ.get("S3A_ACCESS_KEY", "root"))
spark_conf.set("spark.hadoop.fs.s3a.secret.key", os.environ.get("S3A_SECRET_KEY", "passpass"))
spark_conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
spark_conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
spark_conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark_conf.set("spark.hadoop.fs.s3a.connection.maximum", 1024)
spark_conf.set("spark.hadoop.fs.s3a.threads.max", 1024)
# Route plain s3:// URLs through the S3A implementation as well.
spark_conf.set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# varia
spark_conf.set("spark.executor.memory", "200g")
spark_conf.set("spark.driver.memory", "10g")
spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
spark_conf.set("spark.kryoserializer.buffer.max", "2047m")
spark_conf.set("spark.rpc.message.maxSize", "512")
spark_conf.set("spark.network.timeout", "600s")
spark_conf.set("spark.driver.maxResultSize", "4g")
spark_conf.set("spark.speculation", "true")
spark_conf.set("spark.speculation.quantile", "0.75")
spark_conf.set("spark.speculation.multiplier", "1.5")

try:
    sc = SparkContext(conf=spark_conf)
except ValueError:
    # PySpark raises ValueError when a SparkContext is already active in this
    # process (for example when this cell is re-run). Fetch that context so
    # `sc` is always bound. Any other start-up failure is no longer swallowed:
    # it either propagates directly or is raised again by getOrCreate.
    print ("Spark session already up")
    sc = SparkContext.getOrCreate(conf=spark_conf)

In [None]:
# Initialise Hail on the SparkContext `sc`, logging to `log_file_name`.
# idempotent=True is documented as making this call a no-op when Hail has
# already been initialised, so the cell can be re-run in a live kernel.
hl.init(sc=sc, log=log_file_name, idempotent=True)

In [None]:
# Smoke test for the Hail setup: generate a small Balding-Nichols dataset
# (3 populations, 100 samples, 1000 variants) and display its count.
mt = hl.balding_nichols_model(
    n_populations=3,
    n_samples=100,
    n_variants=1000,
)
mt.count()

In [None]:
# Read the matrix table at `mt_fn` into the `mt` variable, print how many
# partitions it has, and keep the tables returned by rows(), cols() and
# entries() for the cells that follow.
mt_fn = "s3://lifemap/1kg_sparse.mt"
mt = hl.read_matrix_table(mt_fn)
print(f"partitions: {mt.n_partitions()}")
row_table, col_table, entry_fields = mt.rows(), mt.cols(), mt.entries()

In [None]:
# Show a summary of the matrix table loaded into `mt`.
mt.describe()

#### Row table:

In [None]:
# Display the row table (mt.rows()), limited to 5 rows.
row_table.show(5)