In [None]:
import os
from glob import glob
from tqdm import tqdm
import datetime
import hail as hl

from hail.plot import show
import pandas as pd
from pprint import pprint
hl.plot.output_notebook()

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

import boto3
from botocore.exceptions import NoCredentialsError

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Timestamped Hail log path; consumed later by `hl.init(log=...)`.
log_file_name = f"logs/hail-{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}.log"
# Make sure the log directory is there so a fresh checkout does not depend on
# it having been created by hand.
os.makedirs(os.path.dirname(log_file_name), exist_ok=True)

# MinIO (S3-compatible) connection settings for the Hadoop S3A connector.
# Each one can be overridden through the environment so real credentials never
# have to be committed with the notebook; the fallbacks are the values this
# notebook previously hard-coded.
s3a_endpoint = os.environ.get("MINIO_ENDPOINT", "http://lifemap-minio:9000/")
s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "root")
s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "passpass")

# run spark
spark_conf = SparkConf().setAppName("hail-test")
# Disabled: point at a standalone cluster instead of the default master.
# .setMaster("spark://spark-master:7077")
spark_conf.setAll([
    ("spark.hadoop.fs.s3a.endpoint", s3a_endpoint),
    ("spark.hadoop.fs.s3a.access.key", s3a_access_key),
    ("spark.hadoop.fs.s3a.secret.key", s3a_secret_key),
    # The endpoint above is plain HTTP and addressed by path, not by
    # virtual-host bucket name.
    ("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"),
    ("spark.hadoop.fs.s3a.path.style.access", "true"),
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("spark.hadoop.fs.s3a.connection.maximum", "1024"),
    ("spark.hadoop.fs.s3a.threads.max", "1024"),
    # Route plain `s3://` URIs through the S3A implementation as well.
    ("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
])

# getOrCreate() reuses an already-running context, so re-executing this cell
# in the same kernel no longer fails with "Cannot run multiple SparkContexts".
sc = SparkContext.getOrCreate(conf=spark_conf)

In [None]:
# S3 configuration
# Endpoint and credentials can be overridden through the environment; the
# fallbacks are the values this notebook previously hard-coded.
s3 = boto3.client(
    's3',
    endpoint_url=os.environ.get("MINIO_ENDPOINT", "http://lifemap-minio:9000"),
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "root"),
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "passpass"),
)

bucket_name = "data-hail"

# Check if the bucket exists, if not, create it
try:
    s3.head_bucket(Bucket=bucket_name)
    print(f"Bucket '{bucket_name}' exists.")
except s3.exceptions.ClientError as err:
    # Only a "not found" answer means the bucket is missing. Anything else
    # (e.g. access denied) is a real problem and must not be masked by a
    # create_bucket attempt, so it is re-raised. Non-ClientError failures such
    # as an unreachable endpoint propagate on their own.
    error_code = str(err.response.get("Error", {}).get("Code", ""))
    if error_code not in ("404", "NoSuchBucket", "NotFound"):
        raise
    s3.create_bucket(Bucket=bucket_name)
    print(f"Bucket '{bucket_name}' created.")


In [None]:
# Start Hail on top of the SparkContext built above, writing its log to the
# timestamped file chosen earlier.
hl.init(
    log=log_file_name,
    sc=sc,
)

In [None]:
# Output: where the imported matrix table is written (object storage).
mt_fn = 's3://data-hail/1kg.mt'

# Inputs: local VCF and the tab-delimited sample annotations that go with it.
annotations_fn = 'data/1kg_annotations.txt'
vcf_fn = 'data/1kg.vcf'

In [None]:
# --- Load ---------------------------------------------------------------
# Import the VCF, persist it as a matrix table, then read that table back.
# The write result is bound to a throwaway name, as in the original cell.
vcf_mt = hl.import_vcf(vcf_fn)
_ = vcf_mt.write(mt_fn, overwrite=True)
mt = hl.read_matrix_table(mt_fn)

# Sample annotations, keyed by the 'Sample' column so they can be joined on mt.s.
annotation_table = hl.import_table(annotations_fn, impute=True)
annotation_table = annotation_table.key_by('Sample')

# --- Annotate and sample QC ----------------------------------------------
mt = mt.annotate_cols(pheno=annotation_table[mt.s])
mt = hl.sample_qc(mt)

# Keep samples with mean dp_stats >= 4 and call rate >= 0.97.
sample_depth_ok = mt.sample_qc.dp_stats.mean >= 4
sample_call_rate_ok = mt.sample_qc.call_rate >= 0.97
mt = mt.filter_cols(sample_depth_ok & sample_call_rate_ok)

# --- Genotype QC -----------------------------------------------------------
# `ab` is AD[1] as a fraction of the summed AD; each genotype class is kept
# only when that fraction falls inside its own window.
ab = mt.AD[1] / hl.sum(mt.AD)
hom_ref_ok = mt.GT.is_hom_ref() & (ab <= 0.1)
het_ok = mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)
hom_var_ok = mt.GT.is_hom_var() & (ab >= 0.9)
filter_condition_ab = hom_ref_ok | het_ok | hom_var_ok
mt = mt.filter_entries(filter_condition_ab)

# --- Variant QC ------------------------------------------------------------
# Keep rows whose variant_qc.AF[1] is above 0.01.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

# --- PCA and association test ----------------------------------------------
# Only the per-sample scores are used below; the third return value is discarded.
eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)
mt = mt.annotate_cols(scores=pcs[mt.s].scores)

# Covariates: intercept, isFemale, and the first three PCA scores.
gwas_covariates = [1.0, mt.pheno.isFemale]
gwas_covariates += [mt.scores[0], mt.scores[1], mt.scores[2]]

gwas = hl.linear_regression_rows(
    y=mt.pheno.CaffeineConsumption,
    x=mt.GT.n_alt_alleles(),
    covariates=gwas_covariates,
)

In [None]:
# Preview the regression results produced by hl.linear_regression_rows above.
gwas.show()

In [None]:
# Q-Q plot built from the regression p-values, rendered inline in the notebook.
p = hl.plot.qq(pvals=gwas.p_value)
show(p)

In [None]:
# Manhattan plot built from the regression p-values, rendered inline in the notebook.
p = hl.plot.manhattan(pvals=gwas.p_value)
show(p)