The autoreload magics below dynamically reload changes to the `manual_review_classifier` package, which is convenient during development ([source](http://blog.comperiosearch.com/blog/2015/05/11/using-ipython-notebooks-and-pycharm-together/)).

In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2

%aimport manual_review_classifier

Populating the interactive namespace from numpy and matplotlib


In [2]:
import re, os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn import preprocessing

In [4]:
import pickle

In [5]:
from manual_review_classifier.ReadCount import ReadCount

In [6]:
# Let pandas render up to 2000 rows before truncating displayed frames.
# (Optional, currently disabled: pd.options.display.max_columns = 5000)
pd.options.display.max_rows = 2000

### Traverse all samples and concatenate DFs

In [7]:
# Build `counts`: for every individual, compute read-count metrics for each of
# their sequenced samples, merge the per-sample frames side by side on the
# variant keys, and stack all individuals into one table.

# Read in dataframe of sample information and variant calls
# NOTE(review): unpickling can execute arbitrary code; only load a full_df.pkl
# that comes from a trusted source.
data = pd.read_pickle('../data/full_df.pkl')

#Standardize reviewers to avoid merge conflicts
data.loc[data.individual_name == 'H_KA-452198', 'reviewer']= 'Lee'

# Select tumor normal pairs and groupby individual
#g = data[data.tumor_normal_pair].groupby('individual_name')

# Group by individual
g = data.groupby('individual_name')

# Columns whose distinct combinations define one sample row of an individual
sample_cols = ['build_id','tissue_type', 'sequencing_context', 'project', 'disease', 'reviewer']
# Keys shared by every per-sample frame; the merge joins the samples on them
merge_keys = ['chromosome', 'start', 'stop', 'ref', 'var', 'call', 'disease', 'reviewer']

# Collect one frame per individual and concatenate once after the loop
# (concatenating inside the loop re-copies the accumulated table every time).
individual_frames = []
for name, group in g:

    print(name)
    individual_df = pd.DataFrame()
    # Samples seen so far per column prefix; used to number the column groups
    sample_counts = {'normal': 0, 'tumor_rna': 0, 'tumor': 0, 'relapse': 0, 'met': 0}

    # 'reviewer' is one of the merge keys and is constant within each sample
    # frame, so samples carrying different reviewers would not join at all.
    if len(group.reviewer.unique()) > 1:
        print(group.reviewer.unique())
        raise ValueError('Multiple reviewers for the same individual')

    bed_fp = '../data/bam-read-counts/normal/{0}/{0}_full.bed'.format(name)
    for index, row in group[sample_cols].drop_duplicates().iterrows():
        # Normal samples are classified first; RNA-seq is checked before the
        # remaining tissue types, so every non-normal RNA sample is 'tumor_rna'.
        if row['tissue_type'] == 'normal':
            prefix = 'normal'
        elif row['sequencing_context'] == 'rna seq':
            prefix = 'tumor_rna'
        elif row['tissue_type'] in ('tumor', 'relapse', 'met'):
            prefix = row['tissue_type']
        else:
            # Without this branch the previous sample's column name would be
            # silently reused (or a NameError raised on the very first sample).
            raise ValueError(
                'Unrecognized tissue_type {0!r} for build {1} of {2}'.format(
                    row['tissue_type'], row['build_id'], name))
        sample_counts[prefix] += 1
        column_name = '{0}_{1}'.format(prefix, sample_counts[prefix])

        print('\t'+column_name)
        print('\t'+row['build_id'],row['tissue_type'],  row['sequencing_context'], row['project'], row['disease'], row['reviewer'])
        count_fp = '../data/bam-read-counts/normal/{0}/{1}.counts'.format(name,row['build_id'])
        rc = ReadCount(count_fp)
        # NOTE(review): presumably returns one row per variant in the bed file,
        # with metric columns prefixed by column_name -- confirm in ReadCount.
        temp_df = rc.compute_variant_metrics(bed_fp, column_name)
        temp_df['disease'] = row['disease']
        temp_df['reviewer'] = row['reviewer']

        if len(individual_df) == 0:
            # First sample of this individual: nothing to merge with yet
            individual_df = temp_df
            continue

        if len(individual_df) != len(temp_df):
            raise ValueError('Dataframes cannot be merged. They are differing lengths.')
        individual_df = pd.merge(individual_df, temp_df, on=merge_keys)
        # The default inner join drops or duplicates rows when the keys
        # disagree; fail loudly instead of leaving `counts` partially built.
        if len(individual_df) != len(temp_df):
            raise ValueError('The merge operation has not completed as expected')

    individual_df['individual_name']=name
    individual_frames.append(individual_df)

# Same result as growing `counts` one individual at a time, built in one pass;
# pd.concat rejects an empty list, so fall back to an empty frame.
if individual_frames:
    counts = pd.concat(individual_frames, ignore_index=True)
else:
    counts = pd.DataFrame()


H_BS-312340G
	normal_1
	4ed71ccd17df455dae7200fb136b8ce2 normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	6c19a2eb9ef3448491c7f6ff7c05e67d tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
H_BS-807615G
	normal_1
	4b0aba5fed324014bf82eb0cb880342f normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	b0d8896570b64b3b97b49d49297dfe34 tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
H_GM-113971
	normal_1
	f54e721735334d08b954e1ec59005788 normal capture allegra/AML_Post-Transplant_Relapse AML Heather
	tumor_1
	bb45156f99014815993e189982c158db tumor capture allegra/AML_Post-Transplant_Relapse AML Heather
	met_1
	755cecc25a5a4aa9b92a01cb71f43ca1 met capture allegra/AML_Post-Transplant_Relapse AML Heather
H_GM-456892
	normal_1
	b3d14d1ba54e4b60a0a48c894929c373 normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	cb6fae2054464abbb33b3cda8757f809 tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
H_GM-545259
	normal_1
	a5a75a140f9143

H_KA-220882
	normal_1
	1c0b7a2805094670992301df94d4b651 normal exome allegra/AML_Quads AML Heather
	tumor_1
	3befd877badb4639ad749b3cc8f91b75 tumor exome allegra/AML_Quads AML Heather
	tumor_2
	050c3692db3b4b77be5cde843f7efdb8 tumor exome allegra/AML_Quads AML Heather
	tumor_3
	e3b3b0d7033241ccbbd5a4a58c474458 tumor exome allegra/AML_Quads AML Heather
H_KA-242129
	normal_1
	509647fff4d44171ae7c8bc3453a4e03 normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	dc1394f52f8a4c8c975957cafacb617b tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
	normal_2
	80321eb25f1949798b43a9935883a7e4 normal capture allegra/AML_Post-Transplant_Relapse AML Heather
	tumor_2
	f686915417334960afd651b586e60930 tumor capture allegra/AML_Post-Transplant_Relapse AML Heather
	relapse_1
	0ce4cf0f31a0449880d0fc5478fdb3f3 relapse capture allegra/AML_Post-Transplant_Relapse AML Heather
H_KA-255421
	normal_1
	d0206a20270e4d0b9ef086d6aceaf490 normal capture gue_su/SMR_06-09-rAML-review1 AML Heather


	3089cdeb218e4509921224175805c7d0 normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	b49b5afb8d7c42298733624ac2bd798a tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
H_KA-505186
	normal_1
	fb5a87d95c954b9da19077729fd8cbfb normal capture allegra/AML_Decitabine AML Heather
	tumor_1
	ca72d9266d034446991bc9b41c506699 tumor capture allegra/AML_Decitabine AML Heather
H_KA-509754G
	normal_1
	984d180e22754801a9d694f60c0679b1 normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	805941afe87e4e3c98189d874f1b5f7d tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
H_KA-548327
	normal_1
	02d07941ce814fa49355a3fd1457c74b normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	8ade2a75587748549924922fee471092 tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
	normal_2
	9e3792a3cc584c9ba8974cd32ad0e49f normal capture allegra/AML_Post-Transplant_Relapse AML Heather
	relapse_1
	1f2413dd536a4b32a6799e2178f99fa5 relapse capture allegra/AML_Post-Tr

	10de98907424492fb796e55a4b84a6be tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
H_KA-868442
	normal_1
	8d26b223d0ff4ace81f3fb566af6d13a normal capture gue_su/SMR_06-09-rAML-review1 AML Heather
	tumor_1
	12a2d8ac3cee44d181d59e8be6b1352e tumor capture gue_su/SMR_06-09-rAML-review1 AML Heather
H_KA-875663
	normal_1
	e8743c37d08d42328a781e7ee4cb9bf7 normal exome allegra/AML_Quads AML Heather
	tumor_1
	a9b3db6216f5464c8a81a055ff7c8826 tumor exome allegra/AML_Quads AML Heather
	tumor_2
	58e30ed8031e49febf4da0e6cbe52e14 tumor exome allegra/AML_Quads AML Heather
	tumor_3
	fd5cc8d1997e4515865553aeee31f168 tumor exome allegra/AML_Quads AML Heather
H_KA-959485
	normal_1
	6f5c1f83e8114428bc2ea44983bd6688 normal exome allegra/AML_Quads AML Heather
	tumor_1
	81da1704c9874f3894a54187540fad17 tumor exome allegra/AML_Quads AML Heather
	tumor_2
	d9fae61b0b254808a1d7c38d6ec19628 tumor exome allegra/AML_Quads AML Heather
	tumor_3
	ffd34be7d042416b82abec0660e30ed1 tumor exome allegra/AML_Quads AM

H_LB-597501
	tumor_1
	9e4995a179804991aa8b78c4054b394a tumor capture GTB11 AML Lee
	normal_1
	108db7bc8c6441398d3e268b2a997821 normal capture GTB11 AML Lee
H_LB-608870
	tumor_1
	1fc50990e8d24a1b8d483af0b78e8693 tumor capture GTB11 AML Lee
	normal_1
	65e2c34bdd6b4a36a1bf1f713edc4a4b normal capture GTB11 AML Lee
H_LB-610184
	tumor_1
	1a11a007d18d445383fbca8babf27cac tumor capture GTB11 AML Lee
	normal_1
	11e0baea28964d62be5be811ba55249c normal capture GTB11 AML Lee
H_LB-611835
	tumor_1
	f1775d0067104bb4837034c3d604e11e tumor capture GTB11 AML Lee
	normal_1
	2c40c2a0e01440cebef456747c10d06e normal capture GTB11 AML Lee
H_LB-625765
	tumor_1
	6f110038105145b28ad51f743f0bb557 tumor capture GTB11 AML Lee
	normal_1
	ebdf379202504f03871fdecd1153ad0f normal capture GTB11 AML Lee
H_LB-631698
	tumor_1
	b7c66ec10e77454dac3d63e9e2a809a7 tumor capture GTB11 AML Lee
	normal_1
	6979055637b3414aaa2615aba35dc139 normal capture GTB11 AML Lee
H_LB-641987
	tumor_1
	8f7ed266a30b40358c741c9e12e16e9e tumor cap

	8451a182361c4fe181f44352933a8b5b normal capture GTB11 AML Lee
H_LB-987089
	tumor_1
	2131ac2dcf134604b1f259be7804f754 tumor capture GTB11 AML Lee
	normal_1
	5df71e2522d947cda1811ba1c28d90ca normal capture GTB11 AML Lee
H_LB-988428
	tumor_1
	05d5d893b5a748f59bbac8a7ca69c0b2 tumor capture GTB11 AML Lee
	normal_1
	2548243a7d8f4d10a40487cab7f89e79 normal capture GTB11 AML Lee
H_LB-989739
	tumor_1
	4cf250f2b9114fba809fddefb26f4c70 tumor capture GTB11 AML Lee
	normal_1
	698f91175e90411ea79d600e60f5110b normal capture GTB11 AML Lee
H_LB-994096
	tumor_1
	8228829315a445dcb0e75f4b193d4e1b tumor capture GTB11 AML Lee
	normal_1
	631a4ee0ee914a04995fd89cdd5f92ab normal capture GTB11 AML Lee
H_LB-999103
	tumor_1
	b4fee78b9e914c71aa1b3f442394e140 tumor capture GTB11 AML Lee
	normal_1
	6c8bfd7b82454b4fb6e5ea9678a2a4b3 normal capture GTB11 AML Lee
H_LE-548417
	normal_1
	3eb13a7b02814de6a2dc5bda70759819 normal capture allegra/AML_Decitabine AML Heather
	tumor_1
	f3a10a8b237f4a92ac09a484cec72f42 tumor ca

	8cd4c4a2c5294ec381efa1b52c8f9933 tumor capture tli/rohan breast Heather
H_ND-056G
	tumor_1
	b5f468c5f3294c5283deb0330ee330d5 tumor capture tli/rohan breast Heather
H_ND-056Y
	tumor_1
	7b60a465317b410aba16e39a6f57cb12 tumor capture tli/rohan breast Heather
H_ND-060G
	tumor_1
	a6b32e7f45ef44708c2966e351c269dc tumor capture tli/rohan breast Heather
H_ND-061G
	tumor_1
	7fb3aaaa9e0c45b5b98936ce45b1af72 tumor capture tli/rohan breast Heather
H_ND-061Y
	tumor_1
	4d6ea968963c4161b281dc7aea4640c8 tumor capture tli/rohan breast Heather
H_ND-062G
	tumor_1
	4d24b9e2778a448a8c75c07c220fb5ee tumor capture tli/rohan breast Heather
H_ND-062Y
	tumor_1
	6d5b7e2d14834df9b96cf576527d3a02 tumor capture tli/rohan breast Heather
H_ND-066G
	tumor_1
	5e0c405295784dcd86f49c21642abe54 tumor capture tli/rohan breast Heather
H_ND-066Y
	tumor_1
	3cf71bc0aa9f4565b8dea52a66b27646 tumor capture tli/rohan breast Heather
H_ND-068G
	tumor_1
	aa39831f5e814d70ac17f04920e60849 tumor capture tli/rohan breast Heather
H_ND-06

	84e3ff883ec844aaa652d882f82c88de tumor capture tli/rohan breast Heather
H_ND-165G
	tumor_1
	edcbf68f9fd940da90c26557cda2664b tumor capture tli/rohan breast Heather
H_ND-165Y
	tumor_1
	3bc286d73bcf4bb38625d7201c772165 tumor capture tli/rohan breast Heather
H_ND-166G
	tumor_1
	8eeffa09442b412e99e80c9ac4e16af6 tumor capture tli/rohan breast Heather
H_ND-166Y
	tumor_1
	2c0cc2d4f34b4227afbad94b7dbab307 tumor capture tli/rohan breast Heather
H_ND-170G
	tumor_1
	65d5b4d924ec497c998c50b44c54e59f tumor capture tli/rohan breast Heather
H_ND-170Y
	tumor_1
	27db8d67b8004b9abff67549484c44c6 tumor capture tli/rohan breast Heather
H_ND-171G
	tumor_1
	068aa343cd964df292dc2c7e06cb6b04 tumor capture tli/rohan breast Heather
H_ND-171Y
	tumor_1
	fed6c87c74104ab59f75cc7b4a39df0c tumor capture tli/rohan breast Heather
H_ND-176G
	tumor_1
	14836c6ebb644cc89468536a5d82b2d0 tumor capture tli/rohan breast Heather
H_ND-176Y
	tumor_1
	c6a4605e56e74444b3cb4e6f1e224317 tumor capture tli/rohan breast Heather
H_ND-17

H_ND-248N
	tumor_1
	2d4adc997ab94faf9712bcfe1589138a tumor capture tli/rohan breast Heather
H_ND-248X
	tumor_1
	706546d0939149a6ae96290c17302d9b tumor capture tli/rohan breast Heather
H_ND-250Y
	tumor_1
	f7041e6087414141bb539d395bd50502 tumor capture tli/rohan breast Heather
H_ND-252G
	tumor_1
	876ab56ba37347d5afaa8a2d51a0fcf3 tumor capture tli/rohan breast Heather
H_ND-252Y
	tumor_1
	b0793a91ad724772ab8d77656755522e tumor capture tli/rohan breast Heather
H_ND-253G
	tumor_1
	8240a310e332438292635c4e66cb3c23 tumor capture tli/rohan breast Heather
H_ND-253Y
	tumor_1
	f39ee90b7b094950984b2c9b82efeda8 tumor capture tli/rohan breast Heather
H_ND-254G
	tumor_1
	baf3e3e6ee79473db14ac49eb15a86b5 tumor capture tli/rohan breast Heather
H_ND-254Y
	tumor_1
	5828e8548823464292465ea5903334d7 tumor capture tli/rohan breast Heather
H_ND-257Y
	tumor_1
	e433174431d347c390b32f579f70f868 tumor capture tli/rohan breast Heather
H_ND-258Y
	tumor_1
	e9ba61bd7ed24b6799c1cf9d063d8452 tumor capture tli/rohan bre

H_ND-317Y
	tumor_1
	d5af0bbfdc0e472e81fa1b5580ff072e tumor capture tli/rohan breast Heather
H_ND-318G
	tumor_1
	b1c3acaebe0d439e8c2887f65281f060 tumor capture tli/rohan breast Heather
H_ND-318Y
	tumor_1
	17499b4d5bac4bb8a151b6f7c318a70e tumor capture tli/rohan breast Heather
H_ND-321G
	tumor_1
	8612fd24e0774750b1dcb15cf870b962 tumor capture tli/rohan breast Heather
H_ND-321Y
	tumor_1
	adc6daf0b1ed4e6690769bef2bf5e031 tumor capture tli/rohan breast Heather
H_ND-323G
	tumor_1
	24ad638ab1de41e79b5414626c38ef7e tumor capture tli/rohan breast Heather
H_ND-323Y
	tumor_1
	5c4ffcd08241479481dcd6750b74f76d tumor capture tli/rohan breast Heather
H_ND-325G
	tumor_1
	d3df0b2eb1224a64bfc9d1b391941c63 tumor capture tli/rohan breast Heather
H_ND-325Y
	tumor_1
	131be8caeeda47e2884ca8c41d62e04b tumor capture tli/rohan breast Heather
H_ND-327G
	tumor_1
	a3b98de28ffd41588b472649da761525 tumor capture tli/rohan breast Heather
H_ND-327Y
	tumor_1
	cabb7b135bb94ce7adbce91069e8bbfb tumor capture tli/rohan bre

H_ND-384Y
	tumor_1
	8aa9022d1ecd46e28b487e30eeaf5d04 tumor capture tli/rohan breast Heather
H_ND-421G
	tumor_1
	c2784f51cf63467e9ed750742299487d tumor capture tli/rohan breast Heather
H_ND-421Y
	tumor_1
	b40bee71d30f42f8834fc2f54413ebdc tumor capture tli/rohan breast Heather
H_ND-501G
	tumor_1
	f2d3b3aff9c24baf9352dbc9605a2034 tumor capture tli/rohan breast Heather
H_ND-501Y
	tumor_1
	6d4511111685424fb9d27ec426139c42 tumor capture tli/rohan breast Heather
H_ND-502G
	tumor_1
	8246ce9be6f1405092877c3a5850a763 tumor capture tli/rohan breast Heather
H_ND-502Y
	tumor_1
	6e3c8965fa164493a976ecda2735d7cd tumor capture tli/rohan breast Heather
H_ND-506G
	tumor_1
	ffaffd3913bd479aa496a3e8a5bb3d2a tumor capture tli/rohan breast Heather
H_ND-506Y
	tumor_1
	b857987a42694b6d96c1c57e8c65b299 tumor capture tli/rohan breast Heather
H_ND-509G
	tumor_1
	2f8729a55cb34d4489fa1630c6ed0100 tumor capture tli/rohan breast Heather
H_ND-509Y
	tumor_1
	1714ca3fa17d4e02b5be309ff7a283fc tumor capture tli/rohan bre

H_ND-583G
	tumor_1
	8088c88f58ec416aa80ae97824de7936 tumor capture tli/rohan breast Heather
H_ND-583Y
	tumor_1
	56bbfdecf77b49c9ab8978e8f4ebd419 tumor capture tli/rohan breast Heather
H_ND-587G
	tumor_1
	33810da674e448468ddf6cfc49a966fe tumor capture tli/rohan breast Heather
H_ND-587Y
	tumor_1
	269cdcdad115470582dd42ef2099731a tumor capture tli/rohan breast Heather
H_ND-588Y
	tumor_1
	a2838789b8de41d0a67edb93a3a31de4 tumor capture tli/rohan breast Heather
H_ND-58Y
	normal_1
	01ea3fd4a4174d79b692d5c33b8ae51b normal capture tli/rohan breast Heather
	tumor_1
	a8b93fa9e4f643b5943bf6097422c5f2 tumor capture tli/rohan breast Heather
H_NI-LBHDec-001
	normal_1
	135579781 normal capture gue_su/06_MDS-DECITABINE_06-28-manual_review1_4 AML Heather
	tumor_1
	135584181 tumor capture gue_su/06_MDS-DECITABINE_06-28-manual_review1_4 AML Heather
H_NI-LBHDec-002
	normal_1
	135609493 normal capture gue_su/06_MDS-DECITABINE_06-28-manual_review1_4 AML Heather
	tumor_1
	135609505 tumor capture gue_su/06_MDS

H_NZ-WU36
	normal_1
	7c9426c2849b4012b3619691e587c698 normal capture lee/LGG1 glioblastoma Lee
	normal_2
	ab83e57a51524de4ad584f713cd192bd normal WGS lee/LGG1 glioblastoma Lee
	tumor_1
	06a72a2d0ba54a889103ffc46638aeda tumor capture lee/LGG1 glioblastoma Lee
	tumor_2
	d7941980d4fb4342821e7a3c97ab8c91 tumor WGS lee/LGG1 glioblastoma Lee
H_OM-15920
	tumor_1
	447c5d3b6913446c83df66d1bb789f69 tumor WGS traztuzamab breast Lee
	normal_1
	cd84a7f84f994acd9ae48abdc4530221 normal WGS traztuzamab breast Lee
	tumor_2
	ba63ec224d964d849134b1d0b1a3e8d2 tumor exome traztuzamab breast Lee
	normal_2
	fcc7eff845044f47b06996aee94e4114 normal exome traztuzamab breast Lee
H_OM-15938
	tumor_1
	b1b6c280f292497da3fa5ac453cc78dc tumor WGS traztuzamab breast Lee
	normal_1
	2cc276cc7d754e239b3e07a7456f0325 normal WGS traztuzamab breast Lee
	tumor_2
	a640bc00570048688b0273f5a17018b0 tumor exome traztuzamab breast Lee
	normal_2
	24dba785fae248e0b9e0a0a9ed8fc34e normal exome traztuzamab breast Lee
	tumor_rna_1
	af

	8b2837ff60c046ecb9ede3c3128fc345 normal exome traztuzamab breast Lee
	tumor_rna_1
	a436e2f516f04e439d18f5a33a87d759 tumor rna seq traztuzamab breast Lee
H_OM-16693
	tumor_1
	501926aae4834332969914a994b0272f tumor WGS traztuzamab breast Lee
	normal_1
	5ed846df6af64dafa4790bf873558852 normal WGS traztuzamab breast Lee
	tumor_2
	769a44a7012f461abf764a810686dad4 tumor exome traztuzamab breast Lee
	normal_2
	befbcb9ecce044498233ae7ab1e87225 normal exome traztuzamab breast Lee
	tumor_rna_1
	0ec50bda9b734d34884b317d059ec2b3 tumor rna seq traztuzamab breast Lee
H_OM-16708
	tumor_1
	256541f355b24dd8b225f63c98a9b790 tumor WGS traztuzamab breast Lee
	normal_1
	6dac4fbeb8eb4fe89ed3a923bc3302d9 normal WGS traztuzamab breast Lee
	tumor_2
	64419e2a482e426598db0953ea218626 tumor exome traztuzamab breast Lee
	normal_2
	f0caf885df6f4cd1977e15e64dc52ac7 normal exome traztuzamab breast Lee
	tumor_rna_1
	a9cf1210b0264ace89c1968058d99967 tumor rna seq traztuzamab breast Lee
H_OM-16941
	tumor_1
	efe0b0158ca

	normal_2
	ec698a998d404ed8bf375985837676da normal exome traztuzamab breast Lee
	tumor_rna_1
	dff0c715d7b043c6ab22749ae25efc60 tumor rna seq traztuzamab breast Lee
H_OM-6198
	tumor_1
	d285659c1d2949b4ad0238404efb9a34 tumor WGS traztuzamab breast Lee
	normal_1
	93334b70c2b54a2187e8b99188d2db87 normal WGS traztuzamab breast Lee
	tumor_2
	f9a7993b19434d448aec3c11f1c1fd9d tumor exome traztuzamab breast Lee
	normal_2
	b2ef7489b63e498781af680b6e7d3071 normal exome traztuzamab breast Lee
	tumor_rna_1
	f97b0056b32d4c268420c48e4dd03ea6 tumor rna seq traztuzamab breast Lee
H_OM-7801
	tumor_1
	165f6ea7710c4139873752c89319f1a2 tumor WGS traztuzamab breast Lee
	normal_1
	815a46a4eae24ce0b5ccc226a9c7056a normal WGS traztuzamab breast Lee
	tumor_2
	51b3303af9fb469cbdc9f6e925b2b6be tumor exome traztuzamab breast Lee
	normal_2
	535292d82c8c42418ef7dc6440c4c85d normal exome traztuzamab breast Lee
	tumor_rna_1
	3701bfed6c9a4156975c029ce4c5ec56 tumor rna seq traztuzamab breast Lee
H_OM-8085
	tumor_1
	0f5f

H_OO-10DD-1014
	normal_1
	03a50a47bde34f9a87b8581c5ded7b5c normal exome allegra/AML_Decitabine AML Heather
	tumor_1
	de0b6fe87f73426bbb3d69aa1ce5e8a8 tumor exome allegra/AML_Decitabine AML Heather
	tumor_2
	16b402a8f45944a09e0a2d211db4c4fc tumor exome allegra/AML_Decitabine AML Heather
	tumor_3
	7708a9c6ceff44d2a7446747e0ce183c tumor exome allegra/AML_Decitabine AML Heather
	tumor_4
	c89f7cbe47864ac495cdca6867966f32 tumor exome allegra/AML_Decitabine AML Heather
	tumor_5
	07db87b2216744c2b76cd1b33ca89146 tumor exome allegra/AML_Decitabine AML Heather
	tumor_6
	4eb2031c9af6439aa5c9b0a455882fcc tumor exome allegra/AML_Decitabine AML Heather
	tumor_7
	80dd8155d2364a92b71eb9c4057ee3c2 tumor exome allegra/AML_Decitabine AML Heather
H_OO-10DD-1016
	normal_1
	d9622964cc6648a798b8137f97eaa2eb normal exome allegra/AML_Decitabine AML Heather
	tumor_1
	a12e0253fe254b409b782d7fb4d5c34e tumor exome allegra/AML_Decitabine AML Heather
	tumor_2
	8f98a06eac0849d5932d6fa9e902910e tumor exome allegra/AML

	e9bfd8ef16364bb2b2987a492107fc7c tumor exome allegra/AML_Decitabine AML Heather
	tumor_3
	05818f25be304e2b8ae10f9c5ba824c9 tumor exome allegra/AML_Decitabine AML Heather
H_OO-10DD-1051
	normal_1
	740b29fb925547e6ab1960d8b392e462 normal exome allegra/AML_Decitabine AML Heather
	tumor_1
	28ba1087eff147628b43a4b4868a5a6d tumor exome allegra/AML_Decitabine AML Heather
	tumor_2
	dfe0206449684f95ba38c42d3926606c tumor exome allegra/AML_Decitabine AML Heather
	tumor_3
	c9e084cd1c2640fd8959f720a5699619 tumor exome allegra/AML_Decitabine AML Heather
H_OO-10DD-1055
	normal_1
	7426368ded154824a304a6b1d2870cab normal exome allegra/AML_Decitabine AML Heather
	tumor_1
	08624fdeb3e5426f87d88a85b2087b59 tumor exome allegra/AML_Decitabine AML Heather
	tumor_2
	4ce243df773c4a6c8c30e08afb1dd748 tumor exome allegra/AML_Decitabine AML Heather
H_OO-10DD-1058
	normal_1
	091840a8924f4903a430b4cb9dd20126 normal exome allegra/AML_Decitabine AML Heather
	tumor_1
	2048dc225c414067b232393ef2f4c161 tumor exome all

H_SU-B67
	normal_1
	991fb3ab480542a1a156ab4f08c4770c normal exome tli/gbm glioblastoma Heather
	tumor_1
	199caf9cad1f4ebf89f81d22f4cc1daf tumor exome tli/gbm glioblastoma Heather
	tumor_2
	9d72f1ce18cf4f43925e0361fc1e3b43 tumor exome tli/gbm glioblastoma Heather
	tumor_3
	8e873012a0004c4ca55ab957304fd75b tumor exome tli/gbm glioblastoma Heather
	tumor_4
	dfc7e24565d64ad59517c83bce7484b8 tumor exome tli/gbm glioblastoma Heather
H_SU-B71
	normal_1
	c2da24821b7f4e129b666428a8858745 normal exome tli/gbm glioblastoma Heather
	tumor_1
	e28d5defbd4e4b388291cfa8a17cf539 tumor exome tli/gbm glioblastoma Heather
	tumor_2
	548952af9ce445f4ab3526c8da439230 tumor exome tli/gbm glioblastoma Heather
	tumor_3
	dce080359dfc4d1aa568684d59a5d058 tumor exome tli/gbm glioblastoma Heather
	tumor_4
	9b76830f963742208de0505749b34e0a tumor exome tli/gbm glioblastoma Heather
H_SU-B72
	normal_1
	03ae2b8a86e248ea9b66183981d08b8f normal exome tli/gbm glioblastoma Heather
	tumor_1
	4e940e5dc03a491483d6b8ef4e3d240b 

In [8]:
# bed_df = pd.read_csv(bed_fp, sep='\t', names=['chromosome','start','stop','ref','var','call'])
# print(bed_fp)
# print(len(bed_df))
# print(count_fp)
# print(len(rc.read_count_df))

In [9]:
# len(bed_df[bed_df[['chromosome', 'start','stop','ref','var']].duplicated(keep=False)].sort_values(['chromosome','start','ref']))

In [10]:
# # Compare DFs for count errors
# bed_set = set([ tuple(line) for line in bed_df.values.tolist()])
# readcount_set = set([ tuple(line) for line in rc.read_count_df[['chromosome','start','stop','ref','var','call']].values.tolist()])
# diff = pd.DataFrame(list(bed_set.difference(readcount_set)),columns=['chr','start','stop','ref','var','call'])
# diff.sort_values(['chr','start','ref'])

In [11]:
# data[data.individual_name == bed_fp.split('/')[4]].project.unique()

### Filter and normalize training data

In [12]:
# Total number of variant rows collected across all individuals
counts.shape[0]

45326

In [13]:
# Column names of the training table: the variant identifiers and labels,
# followed by one identical group of read-count metrics for the 'normal'
# sample and one for the 'tumor' sample.
training_cols = (
    ['call', 'chromosome', 'start', 'stop', 'ref', 'var',
     'disease', 'reviewer', 'individual_name']
    + ['{0}_{1}'.format(sample, metric)
       for sample in ('normal', 'tumor')
       for metric in (
           # metrics that carry no allele prefix
           ['VAF', 'depth', 'other_bases_count']
           # the same statistics for the 'ref' allele, then the 'var' allele
           + ['{0}_{1}'.format(allele, stat)
              for allele in ('ref', 'var')
              for stat in (
                  'avg_basequality',
                  'avg_clipped_length',
                  'avg_distance_to_effective_3p_end',
                  'avg_distance_to_q2_start_in_q2_reads',
                  'avg_mapping_quality',
                  # 'mismaches' spelling is kept exactly as in the original list
                  'avg_num_mismaches_as_fraction',
                  'avg_pos_as_fraction',
                  'avg_se_mapping_quality',
                  'avg_sum_mismatch_qualities',
                  'count',
                  'num_minus_strand',
                  'num_plus_strand',
                  'num_q2_containing_reads',
              )]
       )]
)

In [14]:
# Columns of `counts` that describe a tumor/normal pair: the variant
# identifiers and labels, then the metric group of the 'normal_1' sample
# followed by the metric group of the 'tumor_1' sample.
tn_cols = (
    ['call', 'chromosome', 'start', 'stop', 'ref', 'var',
     'disease', 'reviewer', 'individual_name']
    + ['{0}_{1}'.format(sample, metric)
       for sample in ('normal_1', 'tumor_1')
       for metric in (
           # metrics that carry no allele prefix
           ['VAF', 'depth', 'other_bases_count']
           # the same statistics for the 'ref' allele, then the 'var' allele
           + ['{0}_{1}'.format(allele, stat)
              for allele in ('ref', 'var')
              for stat in (
                  'avg_basequality',
                  'avg_clipped_length',
                  'avg_distance_to_effective_3p_end',
                  'avg_distance_to_q2_start_in_q2_reads',
                  'avg_mapping_quality',
                  # 'mismaches' spelling is kept exactly as in the original list
                  'avg_num_mismaches_as_fraction',
                  'avg_pos_as_fraction',
                  'avg_se_mapping_quality',
                  'avg_sum_mismatch_qualities',
                  'count',
                  'num_minus_strand',
                  'num_plus_strand',
                  'num_q2_containing_reads',
              )]
       )]
)

In [15]:
# Keep the variants that have read counts for both the first tumor sample and
# the first normal sample, restricted to the tumor/normal columns.
has_tumor_1 = counts.tumor_1_depth.notnull()
has_normal_1 = counts.normal_1_depth.notnull()
tumor_normal_counts = counts[has_tumor_1 & has_normal_1][tn_cols]
# Positional rename: tn_cols and training_cols must list the columns in the same order.
tumor_normal_counts.columns = training_cols

In [16]:
# Columns of `counts` that describe a relapse/normal pair: the variant
# identifiers and labels, then the metric group of the 'normal_1' sample
# followed by the metric group of the 'relapse_1' sample.
rn_cols = (
    ['call', 'chromosome', 'start', 'stop', 'ref', 'var',
     'disease', 'reviewer', 'individual_name']
    + ['{0}_{1}'.format(sample, metric)
       for sample in ('normal_1', 'relapse_1')
       for metric in (
           # metrics that carry no allele prefix
           ['VAF', 'depth', 'other_bases_count']
           # the same statistics for the 'ref' allele, then the 'var' allele
           + ['{0}_{1}'.format(allele, stat)
              for allele in ('ref', 'var')
              for stat in (
                  'avg_basequality',
                  'avg_clipped_length',
                  'avg_distance_to_effective_3p_end',
                  'avg_distance_to_q2_start_in_q2_reads',
                  'avg_mapping_quality',
                  # 'mismaches' spelling is kept exactly as in the original list
                  'avg_num_mismaches_as_fraction',
                  'avg_pos_as_fraction',
                  'avg_se_mapping_quality',
                  'avg_sum_mismatch_qualities',
                  'count',
                  'num_minus_strand',
                  'num_plus_strand',
                  'num_q2_containing_reads',
              )]
       )]
)

In [17]:
# Keep the variants that have no first-tumor read counts but do have counts
# for both the first relapse sample and the first normal sample, restricted
# to the relapse/normal columns.
lacks_tumor_1 = counts.tumor_1_depth.isnull()
has_relapse_1 = counts.relapse_1_depth.notnull()
has_normal_1 = counts.normal_1_depth.notnull()
relapse_normal_counts = counts[lacks_tumor_1 & has_relapse_1 & has_normal_1][rn_cols]
# Positional rename: rn_cols and training_cols must list the columns in the same order.
relapse_normal_counts.columns = training_cols


In [18]:
# Columns selected for metastasis/normal pairs: the nine annotation columns,
# followed by every bam-readcount metric for the normal sample and then for
# the metastasis sample. The names are generated rather than typed out; the
# order is: VAF, depth, other_bases_count, all ref_* metrics, all var_* metrics.
mn_cols = (
    ['call', 'chromosome', 'start', 'stop', 'ref', 'var', 'disease',
     'reviewer', 'individual_name']
    + ['{}_1_{}'.format(sample, metric)
       for sample in ('normal', 'met')
       for metric in (
           ['VAF', 'depth', 'other_bases_count']
           + ['{}_{}'.format(allele, stat)
              for allele in ('ref', 'var')
              for stat in ('avg_basequality',
                           'avg_clipped_length',
                           'avg_distance_to_effective_3p_end',
                           'avg_distance_to_q2_start_in_q2_reads',
                           'avg_mapping_quality',
                           'avg_num_mismaches_as_fraction',
                           'avg_pos_as_fraction',
                           'avg_se_mapping_quality',
                           'avg_sum_mismatch_qualities',
                           'count',
                           'num_minus_strand',
                           'num_plus_strand',
                           'num_q2_containing_reads')]
       )]
)

In [19]:
# Metastasis/normal pairs: rows with neither a primary-tumor depth nor a
# relapse depth, but with both a normal depth and a metastasis depth.
# Columns are renamed to the shared training_cols labels.
is_met_normal = (
    counts['tumor_1_depth'].isnull()
    & counts['relapse_1_depth'].isnull()
    & counts['normal_1_depth'].notnull()
    & counts['met_1_depth'].notnull()
)
met_normal_counts = counts.loc[is_met_normal, mn_cols]
met_normal_counts.columns = training_cols

In [20]:
# Stack the three pairings (all relabelled to the same columns) into one
# frame; the original row labels are discarded in favour of a fresh 0..n-1 index.
paired_frames = [tumor_normal_counts, relapse_normal_counts, met_normal_counts]
training_data = pd.concat(paired_frames, ignore_index=True)

In [21]:
# Sanity check: count the rows that contain at least one missing value (expect 0)
int(training_data.isnull().any(axis=1).sum())

0

In [22]:
# Total number of reviewed calls in the combined training set
len(training_data.index)

41855

#### How many calls are there of each type?

In [23]:
# Number of rows for each manual-review call label
calls_by_label = training_data.groupby('call').size()
calls_by_label

call
a    14589
f     1600
g     4309
s    21156
v      201
dtype: int64

'v' calls appear to be outliers: there are very few of them, and the call has no clear definition in the manual review guidelines. They are therefore likely to be noisy, so I will remove them from all further analysis.

In [24]:
# Keep every row whose call label is anything other than 'v'
training_data = training_data.loc[training_data['call'] != 'v']

In [25]:
# Re-count rows per call label to confirm the 'v' label is gone
calls_after_filter = training_data.groupby('call').size()
calls_after_filter

call
a    14589
f     1600
g     4309
s    21156
dtype: int64

In [26]:
# Label each row with its variant, formatted as
# '<chromosome>:<start>-<stop><ref>><var>' (no separator between stop and ref).
position_range = training_data['start'].map(str) + '-' + training_data['stop'].map(str)
allele_change = training_data['ref'] + '>' + training_data['var']
training_data.index = training_data['chromosome'] + ':' + position_range + allele_change

In [27]:
# Remove genomic location information for now.
# TODO: include important genomic features (e.g. predicted function, region
#       complexity, splicing information, etc.)
# Also remove individual_name.
# ???: Could individual_name be helpful for the classifier?
location_and_identity_cols = ['chromosome', 'start', 'stop', 'ref', 'var',
                              'individual_name']
training_data = training_data.drop(location_and_identity_cols, axis=1)

In [28]:
# Replace the disease and reviewer columns with one-hot indicator columns.
# get_dummies drops each encoded column and appends its indicators, named
# '<column>_<value>', after the remaining columns (disease first, then reviewer).
training_data = pd.get_dummies(training_data, columns=['disease', 'reviewer'])

#### Normalize bam-readcount metrics

In [29]:
# Select the bam-readcount metric columns that will be rescaled: all normal
# metrics followed by all tumor metrics. Within each sample the order is
# VAF, depth, other_bases_count, the ref_* metrics, then the var_* metrics.
to_normalize = training_data[[
    '{}_{}'.format(sample, metric)
    for sample in ('normal', 'tumor')
    for metric in (
        ['VAF', 'depth', 'other_bases_count']
        + ['{}_{}'.format(allele, stat)
           for allele in ('ref', 'var')
           for stat in ('avg_basequality',
                        'avg_clipped_length',
                        'avg_distance_to_effective_3p_end',
                        'avg_distance_to_q2_start_in_q2_reads',
                        'avg_mapping_quality',
                        'avg_num_mismaches_as_fraction',
                        'avg_pos_as_fraction',
                        'avg_se_mapping_quality',
                        'avg_sum_mismatch_qualities',
                        'count',
                        'num_minus_strand',
                        'num_plus_strand',
                        'num_q2_containing_reads')]
    )
]]

In [30]:
# Source http://stackoverflow.com/a/26415620
# Min-max scale every selected bam-readcount metric, column by column.
# NOTE(review): the scaler is fitted on the full data set, before any
# train/test split, and is not saved; revisit if held-out evaluation or
# scoring of new samples needs the same scaling.
x = to_normalize.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Take the row and column labels from the frame that was actually scaled
# instead of repeating the column list, so the two can never drift apart.
scaled = pd.DataFrame(x_scaled, index=to_normalize.index,
                      columns=to_normalize.columns)

In [31]:
# Reassemble the training set as: call label, one-hot disease/reviewer
# indicators, then the scaled metrics (which replace the unscaled ones).
# The indicator columns are found by prefix rather than listed by name, so a
# change in the set of diseases or reviewers neither raises a KeyError nor
# silently drops a feature.
one_hot_cols = [col for col in training_data.columns
                if col.startswith(('disease_', 'reviewer_'))]
training_data = pd.concat([training_data[['call'] + one_hot_cols], scaled],
                          axis=1)

In [32]:
# Persist the assembled training set for the downstream modelling notebooks
training_data_path = '../data/training_data.pkl'
training_data.to_pickle(training_data_path)