In [None]:
# Set the output bucket to write to, dataproc service account must have write access
# Do not include trailing slash or "gs://"
# The gsutil helpers in this notebook prepend "gs://<output_bucket>/" to any path that
# does not already start with "gs://"; the input id list is read from this same bucket.
output_bucket = "clingen-dataproc-workspace-kferrite"
# Set the TSV path to write into bucket. Can contain slash like "folder/file.tsv"
# Do not include leading slash
# The same relative path is also used as the Hail export path and, joined onto the
# current working directory, as the machine-local copy of the report.
report_filename = "clinvar-annotation.tsv"


In [None]:
import hail as hl
# `idempotent=True` is useful for running all cells in the notebook
hl.init(idempotent=True)

In [None]:
# utility functions for file placement
import os
import subprocess

def run_args(args, fail_on_stderr=False, success_codes=(0,)) -> tuple: # (stdout,stderr,returncode)
    """Run a command (no shell) and capture its output.

    Args:
        args: argv-style list, e.g. ["gsutil", "cp", src, dst]. Echoed to stdout before running.
        fail_on_stderr: when True, any bytes written to stderr count as a failure,
            regardless of the exit code.
        success_codes: collection of exit codes that are accepted as success.

    Returns:
        Tuple of (stdout, stderr, returncode); stdout and stderr are bytes.

    Raises:
        RuntimeError: if the exit code is not in `success_codes`, or if
            `fail_on_stderr` is set and stderr is non-empty.
    """
    print(args)
    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    wrote_to_stderr = fail_on_stderr and len(stderr) > 0
    if wrote_to_stderr or p.returncode not in success_codes:
        # Decode so the message shows the tool's own text rather than a bytes repr (b'...')
        raise RuntimeError("command {} failed with code {}:{}".format(
            args, p.returncode, stderr.decode(errors="replace")))
    return (stdout, stderr, p.returncode)

def local_to_bucket(local_path:str, gcs_path:str):
    """Upload a local file to GCS using `gsutil cp`.

    `gcs_path` may be a full "gs://..." URL; otherwise it is treated as a path
    inside `output_bucket`.
    """
    if gcs_path.startswith("gs://"):
        destination = gcs_path
    else:
        destination = "gs://{}/{}".format(output_bucket, gcs_path)
    run_args(["gsutil", "cp", local_path, destination])
    
def bucket_to_local(gcs_path:str, local_path:str):
    """Download a GCS object to a local path using `gsutil cp`.

    `gcs_path` may be a full "gs://..." URL; otherwise it is treated as a path
    inside `output_bucket`.
    """
    if gcs_path.startswith("gs://"):
        source = gcs_path
    else:
        source = "gs://{}/{}".format(output_bucket, gcs_path)
    run_args(["gsutil", "cp", source, local_path])
    
def local_to_hdfs(local_path:str, hdfs_path:str):
    """Copy a local file into HDFS at `hdfs_path`, deleting any previous copy first."""
    hdfs_cli = ["hdfs", "dfs"]
    # Exit code 1 from the delete is tolerated (presumably: nothing to delete yet)
    run_args(hdfs_cli + ["-rm", hdfs_path], success_codes=[0,1])
    # The source must be addressed with the file:// scheme to refer to local disk
    run_args(hdfs_cli + ["-cp", "file://" + local_path, hdfs_path])
    
def hdfs_to_local(hdfs_path:str, local_path:str):
    """Copy a file out of HDFS to `local_path`, replacing any existing local file.

    Args:
        hdfs_path: source path in HDFS.
        local_path: absolute destination path on the local filesystem
            (it is appended to the "file://" scheme as-is).

    Raises:
        RuntimeError: propagated from `run_args` if the `hdfs dfs -cp` command fails.
    """
    # Remove a stale copy up front, the same way local_to_hdfs clears its destination.
    # Attempting the removal directly avoids the gap between an exists() check and remove().
    try:
        os.remove(local_path)
    except FileNotFoundError:
        pass  # nothing to replace
    run_args(["hdfs", "dfs", "-cp", hdfs_path, "file://" + local_path])

In [None]:
# Load the ClinVar variation ids to annotate (one per line) from the input file in the bucket
import io, re

# Path of the id list; used both as the object path inside the bucket and as the
# local path it is downloaded to.
input_filename = "input_files/clinvar_variation_ids.txt"
bucket_to_local(input_filename, input_filename)
with open(input_filename) as f_in:
    # One id per line. Filter on the stripped text: a raw line still carries its
    # newline, so a length check on it never removes blank lines.
    stripped_lines = (line.strip() for line in f_in)
    variation_ids = [variation_id for variation_id in stripped_lines if variation_id]
print("Loaded {} variation ids".format(len(variation_ids)))
print(variation_ids)

In [None]:
# Import ClinVar VCF as Hail Table
# clinvar = hl.import_vcf("/path/to/clinvar.vcf.gz", force_bgz=True, drop_samples=True, skip_invalid_loci=True).rows()

# Download clinvar BGZF
import os, requests, subprocess

# Function to download a file to a localpath. ClinVar VCF is small enough to download to dataproc default local disk.
def download_to_file(url, filepath):
    """Stream the content at `url` into the local file `filepath`.

    Args:
        url: address to fetch with an HTTP GET.
        filepath: local path to write; opened in binary mode and overwritten.

    Raises:
        RuntimeError: if the response status is anything other than 200.
    """
    r = requests.get(url, stream=True)
    try:
        if r.status_code != 200:
            # Both placeholders must be filled: the status code, then the URL requested
            raise RuntimeError("Failed to obtain ClinVar VCF:{}\n{}".format(r.status_code, url))
        with open(filepath, "wb") as fout:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip empty chunks
                    fout.write(chunk)
    finally:
        # A streamed response keeps its connection until it is closed
        r.close()
# This url always points to the latest dump file, updated periodically by ClinVar
clinvar_vcf_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
# Local download target on the machine running this notebook
clinvar_vcf_localpath = "/home/hail/clinvar.vcf.gz"
# HDFS destination (relative path; presumably resolved against the HDFS user directory — TODO confirm)
clinvar_vcf_hdfs = "clinvar.vcf.gz"
download_to_file(clinvar_vcf_url, clinvar_vcf_localpath)
# Sanity check that the download produced a file before copying it on
assert(os.path.exists(clinvar_vcf_localpath))
print("Downloaded ClinVar VCF, file size (expecting ~30M): %d" % os.path.getsize(clinvar_vcf_localpath))

# Hail needs the file in HDFS
local_to_hdfs(clinvar_vcf_localpath, clinvar_vcf_hdfs)

# Only variant-level row fields are used below, so samples are dropped and the
# rows table of the imported matrix table is kept.
# NOTE(review): the URL above is the GRCh38 VCF, but no reference_genome is passed
# here, and skip_invalid_loci=True discards records whose locus does not validate
# instead of raising. Confirm which reference Hail applies and that no wanted
# records are silently dropped.
clinvar = hl.import_vcf(
    clinvar_vcf_hdfs,
    force_bgz=True,
    drop_samples=True,
    skip_invalid_loci=True
).rows()
# count() forces the import to be evaluated once, as an early check
print("Imported {} records from ClinVar".format(clinvar.count()))


In [None]:
# clinvar.describe()
# clinvar.show()

# Filter to input set
# Broadcast the ids as a typed set rather than an array: membership tests on a set
# avoid a linear scan per ClinVar row, and the explicit type lets an empty input
# list through (Hail cannot infer the element type of an empty collection).
variation_ids_hl = hl.literal(set(variation_ids), dtype="set<str>")
clinvar_filtered = clinvar.filter(
    variation_ids_hl.contains(clinvar.rsid)
)

In [None]:
# Find any ids in input that don't exist in table

# up to megabytes in size
clinvar_ids = [
    rec.rsid for rec in clinvar_filtered.select(clinvar_filtered.rsid).collect()
]

# Look ids up in a set so the scan over the input stays linear instead of
# re-walking the collected list for every input id.
clinvar_id_set = set(clinvar_ids)
missing_ids = [i for i in variation_ids if i not in clinvar_id_set]
print("Missing:\n" + "\n".join(missing_ids))

# Count how often each id occurs in the input; anything seen more than once is a duplicate.
id_counts = {}
for i in variation_ids:
    id_counts[i] = id_counts.get(i, 0) + 1
# Duplicated ids, in order of first appearance in the input
duplicate_ids = [k for k, v in id_counts.items() if v > 1]
print("Duplicates:")
for k in duplicate_ids:
    print("{}, count={}".format(k, id_counts[k]))

In [None]:
# Select desired output fields (columns are ordered as provided)
output_ds = clinvar_filtered

output_ds = output_ds.select(
    # The `rsid` field is the one matched against the input variation ids when filtering
    clinvar_variation_id=output_ds.rsid,
    # Each INFO field below is passed through hl.delimit with "," so it is
    # written as one comma-joined string column.
    clinvar_review_status=hl.delimit(output_ds.info["CLNREVSTAT"], ","),
    clinvar_significance=hl.delimit(output_ds.info["CLNSIG"], ","),
    clinvar_significance_interpretations=hl.delimit(output_ds.info["CLNSIGCONF"], ","),
    # Hail parses the CLNDN (and related like CLNDNINCL) incorrectly
    # Since ',' is allowed in condition names, ClinVar uses '|' to separate them
    # But Hail separates into an array based on ',' instead of '|'
    # If we re-join the string with ',' it will match that from ClinVar
    clinvar_conditions=hl.delimit(output_ds.info["CLNDN"], ",")
)
# Sort numerically rather than as text by casting the id to an integer
output_ds = output_ds.order_by(
    hl.int(output_ds.clinvar_variation_id) # Assume all clinvar variation ids are integers
)

# output_ds.describe()

# Export to TSV
# report_filename is handed to Hail unchanged; the following cell copies the result
# out of HDFS to a machine-local file.
import time
print("Starting export to %s" % report_filename)
# Wall-clock timing around the export call
start_time = time.time()
output_ds.export(report_filename)
end_time = time.time()
print("Export took %.2f seconds" % (end_time - start_time))

In [None]:
# The export is in HDFS now, copy to machine-local file
report_localpath = os.path.join(os.getcwd(), report_filename)
# report_filename may include a folder component (e.g. "folder/file.tsv"), so make
# sure the matching local directory exists before copying into it.
os.makedirs(os.path.dirname(report_localpath), exist_ok=True)
hdfs_to_local(report_filename, report_localpath)

In [None]:
# Upload to bucket and filepath set at top of notebook
# (report_filename has no "gs://" prefix, so local_to_bucket places it under output_bucket)
print("Uploading {} bytes to GCS".format(os.path.getsize(report_localpath)))
local_to_bucket(report_localpath, report_filename)