In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
import itertools
from collections import OrderedDict

def add_summary(data, with_as=False):
    """
    Add summary stats to a dataframe.

    The frame is modified in place (new columns are written onto ``data``)
    and nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the count columns 'Concordant', 'Discordant', 'Missing'
        and 'Filtered', plus the confusion-matrix counts 'TP', 'FP', 'TN'
        and 'FN' (each prefixed with 'as_' when ``with_as`` is True).
    with_as : bool, default False
        When True, the confusion-matrix counts are read from the
        'as_'-prefixed columns and every derived confusion-matrix metric is
        written with the same 'as_' prefix.

    Columns written
    ---------------
    'Total Calls', 'Total Genotyped', 'Missing Rate', 'GT Concordance', and
    (with the optional prefix) 'compP', 'baseP', 'compN', 'baseN', 'ppv',
    'tpr', 'tnr', 'npv', 'acc', 'ba', 'f1'.

    Zero denominators are not guarded; the affected cells simply hold
    whatever pandas division produces for them.
    """
    e = "as_" if with_as else ""
    call_columns = ['Concordant', 'Discordant', 'Missing', 'Filtered']

    data['Total Calls'] = data[call_columns].sum(axis=1)
    data['Total Genotyped'] = data['Total Calls'] - data['Missing']
    # A rate is a fraction of the whole: divide by every call, not by the
    # genotyped subset (which already excludes the missing calls and would
    # let this value exceed 1).
    data['Missing Rate'] = data['Missing'] / data['Total Calls']
    data['GT Concordance'] = data['Concordant'] / data['Total Genotyped']

    # Confusion-matrix marginals: positives/negatives as seen by the
    # comparison calls (comp*) and by the baseline (base*).
    data[f'{e}compP'] = data[f'{e}TP'] + data[f'{e}FP']
    data[f'{e}baseP'] = data[f'{e}TP'] + data[f'{e}FN']
    data[f'{e}compN'] = data[f'{e}TN'] + data[f'{e}FN']
    data[f'{e}baseN'] = data[f'{e}TN'] + data[f'{e}FP']

    # Per-class performance ratios.
    data[f'{e}ppv'] = data[f'{e}TP'] / data[f'{e}compP']
    data[f'{e}tpr'] = data[f'{e}TP'] / data[f'{e}baseP']
    data[f'{e}tnr'] = data[f'{e}TN'] / data[f'{e}baseN']
    data[f'{e}npv'] = data[f'{e}TN'] / data[f'{e}compN']

    # Aggregate scores: accuracy, balanced accuracy, and the harmonic mean
    # of ppv and tpr.
    data[f'{e}acc'] = (data[f'{e}TP'] + data[f'{e}TN']) / (data[f'{e}baseP'] + data[f'{e}baseN'])
    data[f'{e}ba'] = (data[f'{e}tpr'] + data[f'{e}tnr']) / 2
    data[f'{e}f1'] = 2 * ((data[f'{e}ppv'] * data[f'{e}tpr']) / (data[f'{e}ppv'] + data[f'{e}tpr']))
    
    
def _read_tsv(path):
    """Read one tab-separated results table into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Tables from the 'all.*' files
gt_data = _read_tsv("../results/tables/all.gt_dist.table.txt")
inter_data = _read_tsv("../results/tables/all.intersect.table.txt")
neigh_data = _read_tsv("../results/tables/all.neighbor.table.txt")
svtype_data = _read_tsv("../results/tables/all.svtype.table.txt")

# Tables from the 'tr.*' files
tr_gt_data = _read_tsv("../results/tables/tr.gt_dist.table.txt")
tr_inter_data = _read_tsv("../results/tables/tr.intersect.table.txt")
tr_neigh_data = _read_tsv("../results/tables/tr.neighbor.table.txt")
tr_svtype_data = _read_tsv("../results/tables/tr.svtype.table.txt")

# Tables from the 'non_tr.*' files
ntr_gt_data = _read_tsv("../results/tables/non_tr.gt_dist.table.txt")
ntr_inter_data = _read_tsv("../results/tables/non_tr.intersect.table.txt")
ntr_neigh_data = _read_tsv("../results/tables/non_tr.neighbor.table.txt")
ntr_svtype_data = _read_tsv("../results/tables/non_tr.svtype.table.txt")

On average, kanpig’s genotype concordance on the 32x coverage experiments was 0.851, compared to 0.763 for svjedi and 0.781 for cuteSV. Because sniffles was used to discover the SVs, we measure its original discovery genotypes and its force-genotyping mode separately; their concordances were 0.805 and 0.804, respectively.

In [None]:
# Restrict the intersect table to the 'ds' experiment rows at 32x coverage.
is_ds = inter_data['experiment'] == 'ds'
is_32x = inter_data['coverage'] == '32x'
view = inter_data[is_ds & is_32x]

# Mean gt_concordance per program (displayed as the cell output).
view.groupby(['program'])[['gt_concordance']].mean()


Unnamed: 0_level_0,gt_concordance
program,Unnamed: 1_level_1
cutesv,0.781145
kanpig,0.85074
orig,0.805352
sniffles,0.804316
svjedi,0.762801


(TODO: include the full stratification supplementary table here?)

In [28]:
# Full stratification sup table
# Add the derived summary columns (including 'GT Concordance') to each
# svtype table; add_summary writes them onto the frame in place.
for frame in (svtype_data, tr_svtype_data, ntr_svtype_data):
    add_summary(frame)

# Keep only the 'ds' experiment rows of each table and tag every row with
# the table it came from via a new 'TR' column.
a, b, c = (
    frame[frame['experiment'] == 'ds'].assign(TR=label)
    for frame, label in (
        (svtype_data, 'all'),
        (tr_svtype_data, 'within'),
        (ntr_svtype_data, 'outside'),
    )
)

view = pd.concat([a, b, c])

# Mean and standard deviation of GT Concordance per stratum, flattened
# back to ordinary columns and written out as a tab-separated file.
grouped = view.groupby(['program', 'coverage', 'svtype', 'TR'])['GT Concordance']
sup_table = grouped.describe()[['mean', 'std']].reset_index()
sup_table.to_csv("SupTable_DS_GTconcordance.txt", sep='\t', index=False)

In [44]:
# 'ds' experiment rows; for the "orig" program only its 32x rows are kept,
# every other program keeps all of its coverages.
is_ds = inter_data['experiment'] == 'ds'
not_orig_or_32x = (inter_data['program'] != "orig") | (inter_data['coverage'] == '32x')
view = inter_data[is_ds & not_orig_or_32x]

# Per (program, coverage) mean of each reported metric.
metric_columns = ['gt_concordance', 'TP-base', 'TP-comp', 'FN', 'FP', 'precision', 'recall', 'f1']
view2 = view.groupby(['program', 'coverage'])[metric_columns].mean()

# Reference row: the "orig" program at 32x.
o = view2.loc[('orig', '32x')]

# All programs at 32x, re-indexed by (program, coverage) so that the
# subtraction below aligns the reference row against the metric columns.
t = view2.reset_index()
at_32x = t['coverage'] == '32x'
g = t[at_32x].set_index(['program', 'coverage'])

# Difference of every program's 32x means from the "orig" 32x means.
diff = g - o
diff

Unnamed: 0_level_0,Unnamed: 1_level_0,gt_concordance,TP-base,TP-comp,FN,FP,precision,recall,f1
program,coverage,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cutesv,32x,-0.024206,-723.914894,-745.808511,631.489362,-50.87234,0.001536,-0.028023,-0.013964
kanpig,32x,0.045388,-760.297872,-205.638298,914.765957,-51.957447,0.002317,-0.039303,-0.019597
orig,32x,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sniffles,32x,-0.001036,-79.255319,-87.340426,516.361702,-23.042553,0.001025,-0.021185,-0.01055
svjedi,32x,-0.042551,-647.789598,-633.661939,6.413712,63.033097,-0.004181,-0.002025,-0.003071


In [71]:
# Flatten the (program, coverage) index into columns and write the
# per-program/coverage metric means as a tab-separated file.
precision_recall_table = view2.reset_index()
precision_recall_table.to_csv("SupTable_DS_PrecisionRecall.txt", sep='\t', index=False)