# Initialize

To install prerequisites, open a terminal in Jupyter and run the following commands:
```
git clone https://github.com/claudiofahey/p3_test_driver
cd p3_test_driver
pip install -e p3_data
```
Then restart the Python kernel so that the newly installed `p3_data` package can be imported.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pylab as plt
import matplotlib.dates as mdates
import matplotlib.cm as cm
import seaborn as sns
import json
from io import StringIO
import importlib
import re

In [12]:
import p3_data
# Re-execute the already-imported module so that edits to it are picked up
# without restarting the kernel.
# NOTE(review): presumably needed because the package is installed in editable
# mode (`pip install -e`) and is being changed while this notebook is open — confirm.
importlib.reload(p3_data)
from p3_data import (glob_file_list , load_json_from_file, merge_dicts, plot_groups, 
                    get_varying_column_names, filter_dataframe, take_varying_columns,
                    load_json_records_as_dataframe, flatten_multiindex_columns,
                    regex_first_group)

# Load and Clean Results

In [101]:
# Read the P3 Test Driver summary records into a single raw DataFrame.
# The source list holds path patterns; add more entries to pull in other result sets.
src_files = ['../../mnt/isilon/data/genomics/summary/*.json.bz2']
raw_df = load_json_records_as_dataframe(
    src=src_files,
    ignore_error=True,
)

Loading records from 18 files...


In [131]:
# Clean raw results
def clean_result(result):
    """Derive timing and error columns from one raw P3 Test Driver record.

    Intended to be used row-wise, e.g. ``raw_df.apply(clean_result, axis=1)``.

    Parameters
    ----------
    result : pandas.Series
        One raw record. The code reads ``utc_begin``, ``utc_end``,
        ``germline_result`` and ``deepvariant_result`` from it. The two
        ``*_result`` entries are indexed by ``'elapsed_sec'`` and ``'error'``;
        ``germline_result['errors']`` is searched for the timing lines
        (presumably the captured tool log — confirm against the test driver).

    Returns
    -------
    pandas.Series or None
        A copy of the record with the derived columns added. ``None`` is
        returned (after printing an ``ERROR:`` line) when any step raises, so
        a single malformed record does not abort the whole load.
    """
    # Bound before the try block so the error handler can always report
    # something, even if the failure happens before the working copy exists.
    record_uuid = '<unknown>'
    try:
        r = result.copy()
        record_uuid = r.get('record_uuid', record_uuid)
        r['utc_begin'] = pd.to_datetime(r['utc_begin'], utc=True)
        r['utc_end'] = pd.to_datetime(r['utc_end'], utc=True)
        r['germline_sec'] = r.germline_result['elapsed_sec']
        r['deepvariant_sec'] = r.deepvariant_result['elapsed_sec']
        # NOTE(review): this is parsed before the error check, so a record whose
        # log lacks the BWA timing line is reported as an ERROR and dropped.
        r['bwa_mem_sec'] = float(regex_first_group('GPU-BWA Mem time: (.*) seconds', r.germline_result['errors'], search=True))
        r['error'] = r.germline_result['error'] or r.deepvariant_result['error']
        if not r['error']:
            # GPU-BWA Mem + Sorting + MarkingDups + BQSR Generation + BAM writing
            r['bwa_mem_to_bam_writing_sec'] = float(regex_first_group('Processing time: (.*) seconds', r.germline_result['errors'], search=True))
            # Sorting + MarkingDups + BQSR Generation + BAM writing
            r['sorting_to_bam_writing_sec'] = r['bwa_mem_to_bam_writing_sec'] - r['bwa_mem_sec']
            r['haplotypecaller_sec'] = float(regex_first_group('Total time taken: (.*)$', r.germline_result['errors'], search=True))
            # Minute-based versions of every duration, for easier reading in tables.
            r['germline_minutes'] = r['germline_sec'] / 60.0
            r['bwa_mem_minutes'] = r['bwa_mem_sec'] / 60.0
            r['bwa_mem_to_bam_writing_minutes'] = r['bwa_mem_to_bam_writing_sec'] / 60.0
            r['sorting_to_bam_writing_minutes'] = r['sorting_to_bam_writing_sec'] / 60.0
            r['haplotypecaller_minutes'] = r['haplotypecaller_sec'] / 60.0
            r['deepvariant_minutes'] = r['deepvariant_sec'] / 60.0
        return pd.Series(r)
    except Exception as e:
        # Deliberate best-effort: report the failing record and keep going.
        # Uses the pre-bound local rather than `r`, which may not exist yet.
        print('ERROR: %s: %s' % (record_uuid, e))
        return None

In [132]:
# r = clean_result(raw_df.iloc[-1])
# pd.DataFrame(r)

In [134]:
# Clean every raw record, key the frame by record UUID (keeping the column),
# keep only runs whose error flag is False, and order them chronologically.
clean1_df = (
    raw_df
    .apply(clean_result, axis=1)
    .set_index('record_uuid', drop=False)
    .loc[lambda df: df.error == False]
    .sort_values(['utc_begin'])
)

In [136]:
# Transposed view: one column per record, one row per field.
clean1_df.transpose()

record_uuid,b3e996eb-d5db-4cd4-9fcb-10a572849a53,5d91a753-7402-45ac-b9b1-1482abe9c00a,a5779fcc-6598-45a3-9e6f-90140a045aec,d6b6a636-add0-4f99-9024-62ed12b7749f,e9484792-6e72-416e-a8f9-077fabc6c48d,70e9cb1c-60d5-42ea-9c44-d1a5afd6bf6b,21e94ca8-95ac-45f9-99f2-1b4090c70d00,476d3bc7-da3c-473f-bf41-4a58406eae36,b0a1c102-c9b6-4c31-a9b9-ee99934fbb8a,77b6650f-c385-4733-9e49-f96800daf122,eb4192e2-ea9a-4103-9a96-d6bdae12333b,6a89e9c8-7d3c-4a44-8f2e-138737330c6b,522ececf-48a6-468d-8657-4a93079765d0,ba348c4d-9501-439c-9416-1522fc44e90e,3e73437f-bdc2-46b1-96d0-4a2af468f347,9fb0f8a8-59b9-4896-8ac1-88fa62866e6d,c861920c-bee0-41f2-aa4b-e9399a1e1676
args,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...
bwa_mem_minutes,71.8455,139.509,73.8669,77.1487,53.1297,61.2301,51.7234,52.6988,76.6242,113.519,111.703,134.927,41.9934,40.2688,36.6592,39.6213,41.2982
bwa_mem_sec,4310.73,8370.55,4432.02,4628.92,3187.78,3673.81,3103.4,3161.93,4597.45,6811.12,6702.19,8095.62,2519.6,2416.13,2199.55,2377.28,2477.89
bwa_mem_to_bam_writing_minutes,96.4555,176.165,100.117,99.3316,70.2135,77.3551,69.074,68.5155,99.6954,143.31,147.746,173.376,64.8448,61.6672,55.7765,63.7874,66.4093
bwa_mem_to_bam_writing_sec,5787.33,10569.9,6007.04,5959.9,4212.81,4641.3,4144.44,4110.93,5981.72,8598.6,8864.78,10402.6,3890.69,3700.03,3346.59,3827.25,3984.56
cuda_visible_devices,12131415,891011,0123,4567,891011,0123,4567,12131415,891011,4567,0123,12131415,01234567,89101112131415,89101112131415,01234567,89101112131415
deepvariant_minutes,114.249,78.8076,113.57,119.934,92.3317,86.2529,90.3816,91.0151,123.376,134.871,124.437,123.601,43.8309,42.804,36.5184,43.1053,45.1994
deepvariant_result,"{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ..."
deepvariant_sec,6854.94,4728.45,6814.18,7196.04,5539.9,5175.17,5422.9,5460.91,7402.54,8092.25,7466.24,7416.05,2629.86,2568.24,2191.1,2586.32,2711.97
elapsed_sec,16463,21602.6,16643.1,16391.9,12802.6,12698.3,12533.2,12622.9,16587.5,22435,21893.4,24432.7,8265.55,8041.15,7264.49,8169.78,8464.24


In [137]:
# Per-sample metadata, indexed by sample_id so it can be joined onto the results.
sample_ids_df = (
    pd.read_csv('sample_ids_300.csv')
    .set_index('sample_id')
)
sample_ids_df.head(5)

Unnamed: 0_level_0,coverage
sample_id,Unnamed: 1_level_1
LP6005592-DNA_E05,31.97
LP6005441-DNA_A10,33.59
LP6005442-DNA_B12,33.86
LP6005519-DNA_B12,33.94
LP6005442-DNA_A04,33.95


In [138]:
# Attach the per-sample metadata to each result by matching on sample_id,
# then preview the first three records (transposed for readability).
clean_df = clean1_df.join(sample_ids_df, on='sample_id')
clean_df.head(3).transpose()

record_uuid,b3e996eb-d5db-4cd4-9fcb-10a572849a53,5d91a753-7402-45ac-b9b1-1482abe9c00a,a5779fcc-6598-45a3-9e6f-90140a045aec
args,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...,{'config': 'parabricks_germline_pipeline.yaml'...
bwa_mem_minutes,71.8455,139.509,73.8669
bwa_mem_sec,4310.73,8370.55,4432.02
bwa_mem_to_bam_writing_minutes,96.4555,176.165,100.117
bwa_mem_to_bam_writing_sec,5787.33,10569.9,6007.04
cuda_visible_devices,12131415,891011,0123
deepvariant_minutes,114.249,78.8076,113.57
deepvariant_result,"{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ...","{'command': ['pbrun', 'deepvariant', '--ref', ..."
deepvariant_sec,6854.94,4728.45,6814.18
elapsed_sec,16463,21602.6,16643.1


# Explore Data

In [139]:
# Columns that identify the parameters of each test run
param_cols = ['sample_id', 'coverage', 'num_gpus']

In [140]:
# Columns holding the measured outputs of each experiment
# ('utc_begin' is intentionally left out of this list)
output_cols = ['bwa_mem_to_bam_writing_minutes', 'haplotypecaller_minutes', 'deepvariant_minutes']

In [141]:
# Parameters first, then measured outputs: the column order used by the views below.
cols = [*param_cols, *output_cols]

In [142]:
# Show the three latest runs (the frame is ordered by utc_begin), one column per run
clean_df[cols].tail(3).transpose()

record_uuid,3e73437f-bdc2-46b1-96d0-4a2af468f347,9fb0f8a8-59b9-4896-8ac1-88fa62866e6d,c861920c-bee0-41f2-aa4b-e9399a1e1676
sample_id,LP6005443-DNA_G11,LP6005443-DNA_A08,LP6005441-DNA_B10
coverage,44.48,45.06,45.12
num_gpus,8,8,8
bwa_mem_to_bam_writing_minutes,55.7765,63.7874,66.4093
haplotypecaller_minutes,27.6343,27.9703,28.1338
deepvariant_minutes,36.5184,43.1053,45.1994


In [143]:
# Results keyed by sample, ordered from lowest to highest coverage
(
    clean_df[cols]
    .set_index('sample_id')
    .sort_values(by='coverage')
)

Unnamed: 0_level_0,coverage,num_gpus,bwa_mem_to_bam_writing_minutes,haplotypecaller_minutes,deepvariant_minutes
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LP6005441-DNA_A10,33.59,4,77.355059,46.615667,86.252898
LP6005442-DNA_B12,33.86,4,70.213467,49.428167,92.33171
LP6005442-DNA_A04,33.95,4,69.073996,48.244833,90.381603
LP6005442-DNA_H09,34.11,4,68.515539,49.589667,91.015119
SS6004478,44.4,4,99.695402,52.235833,123.375651
SS6004478,44.4,8,61.66718,28.300833,42.803965
SS6004472,44.45,4,99.331589,52.5845,119.933933
SS6004472,44.45,8,64.844841,27.8045,43.83092
LP6005443-DNA_G11,44.48,8,55.776514,27.634333,36.518402
LP6005443-DNA_G11,44.48,4,100.11739,62.42,113.569695


In [144]:
# Write the parameter and output columns (plus the record_uuid index) to a CSV file
clean_df.loc[:, cols].to_csv('results.csv')

In [145]:
# First filtering pass: no criteria are supplied yet, so this is the hook
# where filters on the cleaned results would be added.
filt_df = filter_dataframe(clean_df)
len(filt_df)

17

In [149]:
# Pivot num_gpus into the columns so the 4-GPU and 8-GPU timings
# for each (coverage, sample_id) pair sit side by side.
(
    filt_df[cols]
    .set_index(['coverage', 'sample_id', 'num_gpus'])
    .unstack('num_gpus')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,bwa_mem_to_bam_writing_minutes,bwa_mem_to_bam_writing_minutes,haplotypecaller_minutes,haplotypecaller_minutes,deepvariant_minutes,deepvariant_minutes
Unnamed: 0_level_1,num_gpus,4,8,4,8,4,8
coverage,sample_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
33.59,LP6005441-DNA_A10,77.355059,,46.615667,,86.252898,
33.86,LP6005442-DNA_B12,70.213467,,49.428167,,92.33171,
33.95,LP6005442-DNA_A04,69.073996,,48.244833,,90.381603,
34.11,LP6005442-DNA_H09,68.515539,,49.589667,,91.015119,
44.4,SS6004478,99.695402,61.66718,52.235833,28.300833,123.375651,42.803965
44.45,SS6004472,99.331589,64.844841,52.5845,27.8045,119.933933,43.83092
44.48,LP6005443-DNA_G11,100.11739,55.776514,62.42,27.634333,113.569695,36.518402
44.56,LP6005441-DNA_G04,96.455525,,62.405167,,114.249046,
45.06,LP6005443-DNA_A08,,63.787425,,27.970333,,43.105347
45.12,LP6005441-DNA_B10,,66.409265,,28.133833,,45.199438
