In [None]:
# Set up dependencies: install (or upgrade to) the latest qc-metric-aggregator
# release, bypassing pip's download cache.
# pip is run via `sys.executable -m pip` so the package is installed for the
# same interpreter this kernel is running.
# NOTE(review): presumably this package provides the `process_metrics`
# modules imported later in the notebook -- confirm.
# NOTE(review): the version is not pinned, so results may change between runs.
import sys
!{sys.executable} -m pip install --no-cache-dir --upgrade  qc-metric-aggregator

import os
import firecloud.api as fapi

In [None]:
#set up constants
bucket = os.environ['WORKSPACE_BUCKET']
workspace_namespace = os.environ['WORKSPACE_NAMESPACE']
workspace_name = os.environ['WORKSPACE_NAME']
threshold_file_name = "thresholds.yml"
final_output_file_name = "qc_results.tsv"

#we should pull this from a central place rather than a workspace specific bucket
master_thresholds_file = bucket + "/" + threshold_file_name
#copy the thresholds file to the notebook env
!gsutil cp $master_thresholds_file .

In [None]:
# Fetch the rows of the Terra workspace's "sample" table and collect their ids.
samples_response = fapi.get_entities(workspace_namespace, workspace_name, "sample")
# Fail here on an HTTP error (wrong workspace, expired credentials, ...)
# instead of with a confusing TypeError/KeyError when the error payload is
# indexed below.
samples_response.raise_for_status()
samples = samples_response.json()
# Each entity's 'name' field is used as the sample id.
sample_ids = [s['name'] for s in samples]

In [None]:
#figure out which files from cromwell runs we need to localize
#this heuristic can definitely be futher optimized
files_in_bucket = !gsutil ls -r $bucket/**
files_to_localize = [f for f in files_in_bucket if any(sample_id in f for sample_id in sample_ids)]

In [None]:
# Write the paths selected for localization to a manifest file, one per line;
# the per-sample loop further down filters this file and pipes the matching
# paths to `gsutil cp -I`.
with open('files_to_localize', 'w') as fout:
    fout.write("\n".join(files_to_localize))

# Scratch directory for the localized files. exist_ok keeps this cell
# re-runnable: a plain `mkdir` reports an error once the directory exists.
os.makedirs("localized_files", exist_ok=True)

In [None]:
#run the metric aggregator for each sample and write out the results
from process_metrics.threshold_file_parser import ThresholdFileParser
from process_metrics.qc_validator import QcValidator
from process_metrics.metrics import AvailableMetrics
from process_metrics.report_generator import ReportGenerator

pass_fail_thresholds = ThresholdFileParser(threshold_file_name).thresholds()

qc_results = []
first_sample = True
with open(final_output_file_name, 'w') as fout:   
    for sample_id in sample_ids:
        
        #localize files relevant to this sample
        !cat files_to_localize | grep $sample_id | grep -v .cram | gsutil -m cp -I ./localized_files
        
        metrics = AvailableMetrics(sample_id)
        validator = QcValidator("localized_files/")
        res = ReportGenerator(sample_id, pass_fail_thresholds, metrics, validator).gather_metrics()
        headers = res[0]
        values = res[1]
        if first_sample:
            first_sample = False
            headers[0] = "entity:qc_result_sample_id"
            lowercased_headers = [h.lower() for h in headers]
            print(str.join("\t", lowercased_headers), file=fout)
        print(str.join("\t", values), file=fout)
        
        #clean up localized files for this sample
        !rm -rf localized_files/*

In [None]:
#copy the results into terra as a datatable
fapi.upload_entities_tsv(workspace_namespace, workspace_name, final_output_file_name, "flexible")

#copy the TSV to the workspace bucket
uploaded_tsv = bucket + '/' + final_output_file_name
!gsutil cp $final_output_file_name $uploaded_tsv