Let's first check that all our variants identified to be in calls of two different tools are indeed there:

In [27]:
%%bash
cd /data/NCR_SBRB/simplex/
for id in `grep -v '#' snv_arm/10033_trio1_gatkANDdng.vcf | awk 'BEGIN {FS="\t"}; {print $2}' -`; do
    if ! grep -q $id dng/10033_trio1_dnm.vcf || ! grep -q $id gatk_refine/10033_trio1_hiConfDeNovo.vcf; then
        echo $id;
    fi;
done

OK, so let's assume this also works for the other trios. Now, let's figure out how many interesting DNVs showed up both in GATK and DNG:

In [64]:
%%bash
suffix=gatkANDdng.vcf
cd /data/NCR_SBRB/simplex/snv_arm
rm interesting_snvs_${suffix}.txt
# figure out all family IDs
ls -1 *_trio1_${suffix} > famids.txt
sed -i -e "s/_trio1_${suffix}//g" famids.txt

# for each family ID
while read fam; do
  # figure out how many trios we have
   ntrios=`ls -1 ${fam}_trio?_${suffix} | wc -l`;
   ntrios=$(($ntrios))
   echo $fam;
   echo $ntrios
   # if we have more than one (assuming the first one is affected)
   if [ $ntrios -gt 1 ]; then
      # get all SNVs in the affected trio in the family
      cut -f 1,2 ${fam}_trio1_${suffix} | grep -v '#' - > ${fam}_possible_snvs_${suffix}.txt;
      # combine the vcf files of all unnafected trios
      cat ${fam}_trio[2..$ntrios]_${suffix} > ${fam}_control_snvs_${suffix}.txt;
      # for each possible SNV in affected trio, mark it as interesting if it's not
      # in the unnafected trios
      while read snv; do
         if ! grep -q "$snv" ${fam}_control_snvs_${suffix}.txt; then
            echo $snv >> interesting_snvs_${suffix}.txt;
         fi;
      done < ${fam}_possible_snvs_${suffix}.txt;
   fi;
done < famids.txt

10033
2
10042
1
10090
2
10094
2
10128
2
10131
4
10153
3
10164
2
10173
2
10178
2
10182
3
10197
2
10215
4
10369
2
10406
3
10448
2
1892
2
1893
2
1895
2
1976
3
855
2


OK, so now interesting_snvs.txt hold all DNVs that occur in affected kids but not in the unaffected sibs, called by both GATK and DenovoGear. If we count how often each variant appears, we get how many families:

In [65]:
import pandas as pd
snps_gatkANDdng = pd.read_table('/data/NCR_SBRB/simplex/snv_arm/interesting_snvs_gatkANDdng.vcf.txt',
                               header=None, names=['snp'])
snps_gatkANDdng.head()

Unnamed: 0,snp
0,chr1 91852821
1,chr1 142726878
2,chr1 142825133
3,chr1 142825147
4,chr1 142825151


In [67]:
from scipy import stats
counts_gatkANDdng = stats.itemfreq(snps_gatkANDdng['snp'])
my_max = np.max(counts_gatkANDdng[:, 1])
print 'Maximum frequency: %d in %d unique snps' % (my_max, len(counts_gatkANDdng))
print counts_gatkANDdng[counts_gatkANDdng[:, 1] == my_max]

Maximum frequency: 2 in 536 unique snps
[['chr17 72762902' 2]
 ['chr18 15271172' 2]
 ['chr2 89105006' 2]
 ['chr20 29637674' 2]
 ['chr20 29637691' 2]
 ['chr6 32548712' 2]
 ['chr6 32557647' 2]]


We can do the same thing for the combination of GATK and triodenovo:

In [69]:
%%bash
suffix=gatkANDtdn.vcf
cd /data/NCR_SBRB/simplex/snv_arm
rm interesting_snvs_${suffix}.txt
# figure out all family IDs
ls -1 *_trio1_${suffix} > famids.txt
sed -i -e "s/_trio1_${suffix}//g" famids.txt

# for each family ID
while read fam; do
  # figure out how many trios we have
   ntrios=`ls -1 ${fam}_trio?_${suffix} | wc -l`;
   ntrios=$(($ntrios))
   echo $fam;
   echo $ntrios
   # if we have more than one (assuming the first one is affected)
   if [ $ntrios -gt 1 ]; then
      # get all SNVs in the affected trio in the family
      cut -f 1,2 ${fam}_trio1_${suffix} | grep -v '#' - > ${fam}_possible_snvs_${suffix}.txt;
      # combine the vcf files of all unnafected trios
      cat ${fam}_trio[2..$ntrios]_${suffix} > ${fam}_control_snvs_${suffix}.txt;
      # for each possible SNV in affected trio, mark it as interesting if it's not
      # in the unnafected trios
      while read snv; do
         if ! grep -q "$snv" ${fam}_control_snvs_${suffix}.txt; then
            echo $snv >> interesting_snvs_${suffix}.txt;
         fi;
      done < ${fam}_possible_snvs_${suffix}.txt;
   fi;
done < famids.txt

10033
2
10042
1
10090
2
10094
2
10128
2
10131
4
10153
3
10164
2
10173
2
10178
2
10182
3
10197
2
10215
3
10369
2
10406
3
10448
2
1892
2
1893
2
1895
2
1976
3
855
2


In [70]:
snps_gatkANDtdn = pd.read_table('/data/NCR_SBRB/simplex/snv_arm/interesting_snvs_gatkANDtdn.vcf.txt',
                               header=None, names=['snp'])
counts_gatkANDtdn = stats.itemfreq(snps_gatkANDtdn['snp'])
my_max = np.max(counts_gatkANDtdn[:, 1])
print 'Maximum frequency: %d in %d unique snps' % (my_max, len(counts_gatkANDtdn))
print counts_gatkANDtdn[counts_gatkANDtdn[:, 1] == my_max]

Maximum frequency: 2 in 631 unique snps
[['chr17 72762902' 2]
 ['chr18 15271172' 2]
 ['chr2 89105006' 2]
 ['chr20 29637674' 2]
 ['chr20 29637691' 2]
 ['chr6 29968761' 2]
 ['chr6 32548712' 2]
 ['chr6 32557647' 2]
 ['chr7 154002420' 2]]


Good... some intersection too. What if we try the intersection of all 3?

In [72]:
%%bash
suffix=ensemble.vcf
cd /data/NCR_SBRB/simplex/snv_arm
rm interesting_snvs_${suffix}.txt
# figure out all family IDs
ls -1 *_trio1_${suffix} > famids.txt
sed -i -e "s/_trio1_${suffix}//g" famids.txt

# for each family ID
while read fam; do
  # figure out how many trios we have
   ntrios=`ls -1 ${fam}_trio?_${suffix} | wc -l`;
   ntrios=$(($ntrios))
   echo $fam;
   echo $ntrios
   # if we have more than one (assuming the first one is affected)
   if [ $ntrios -gt 1 ]; then
      # get all SNVs in the affected trio in the family
      cut -f 1,2 ${fam}_trio1_${suffix} | grep -v '#' - > ${fam}_possible_snvs_${suffix}.txt;
      # combine the vcf files of all unnafected trios
      cat ${fam}_trio[2..$ntrios]_${suffix} > ${fam}_control_snvs_${suffix}.txt;
      # for each possible SNV in affected trio, mark it as interesting if it's not
      # in the unnafected trios
      while read snv; do
         if ! grep -q "$snv" ${fam}_control_snvs_${suffix}.txt; then
            echo $snv >> interesting_snvs_${suffix}.txt;
         fi;
      done < ${fam}_possible_snvs_${suffix}.txt;
   fi;
done < famids.txt

10033
2
10042
1
10090
2
10094
2
10128
2
10131
4
10153
3
10164
2
10173
2
10178
2
10182
3
10197
2
10215
3
10369
2
10406
3
10448
2
1892
2
1893
2
1895
2
1976
3
855
2


In [98]:
snps = pd.read_table('/data/NCR_SBRB/simplex/snv_arm/interesting_snvs_ensemble.vcf.txt',
                               header=None, names=['snp'])
counts = stats.itemfreq(snps['snp'])
my_max = np.max(counts[:, 1])
print 'Maximum frequency: %d in %d unique snps' % (my_max, len(counts))
print counts[counts[:, 1] == my_max]

Maximum frequency: 2 in 535 unique snps
[['chr17 72762902' 2]
 ['chr18 15271172' 2]
 ['chr2 89105006' 2]
 ['chr20 29637674' 2]
 ['chr20 29637691' 2]
 ['chr6 32548712' 2]
 ['chr6 32557647' 2]]


Then, the question is: out of 535 unique SNPs, if I were to choose 20 with replacement (as one of the families only has trio1, and therefore didn't get counted), how often does the same SNP get selected twice just by chance? 

In [77]:
import numpy as np
def do_perms(snps, noften, nboot=10000, npicks=20):
    success = 0
    for i in range(nboot):
        picks = np.random.choice(snps, npicks, replace = True)
        counts = stats.itemfreq(picks)
        nmax = np.max(counts[:,1])
        if (nmax >= noften):
            success += 1
    return(success/float(nboot))

In [86]:
do_perms(counts[:, 0], 2)

0.3053

Not good... well, do any of those IDs come up in the affected trio I'm not counting?

In [93]:
fid = open('/home/sudregp/tmp/good_snps.txt', 'w')
idx = counts[:, 1] == my_max
for rsid in counts[idx, 0]:
    fid.write('%s\n' % rsid.split(' ')[1])
fid.close()

In [95]:
%%bash
echo `wc -l /home/sudregp/tmp/good_snps.txt`
grep -f /home/sudregp/tmp/good_snps.txt /data/NCR_SBRB/simplex/snv_arm/10042_trio1_ensemble.vcf

7 /home/sudregp/tmp/good_snps.txt


Not realy... oh well. Our options here are:
* do this within tool, with the hopes of making the stats better
* look at all DNVs in affected siblings, and check that the more frequent ones are not as frequent in unaffected sibs (no within-family requirement)
* drop the initial quality filter in GATK pipeline?

# Within tools stats

## GATK

In [96]:
%%bash
suffix=hiConfDeNovo.vcf 
cd /data/NCR_SBRB/simplex/gatk_refine
rm interesting_snvs_${suffix}.txt
# figure out all family IDs
ls -1 *_trio1_${suffix} > famids.txt
sed -i -e "s/_trio1_${suffix}//g" famids.txt

# for each family ID
while read fam; do
  # figure out how many trios we have
   ntrios=`ls -1 ${fam}_trio?_${suffix} | wc -l`;
   ntrios=$(($ntrios))
   echo $fam;
   echo $ntrios
   # if we have more than one (assuming the first one is affected)
   if [ $ntrios -gt 1 ]; then
      # get all SNVs in the affected trio in the family
      cut -f 1,2 ${fam}_trio1_${suffix} | grep -v '#' - > ${fam}_possible_snvs_${suffix}.txt;
      # combine the vcf files of all unnafected trios
      cat ${fam}_trio[2..$ntrios]_${suffix} > ${fam}_control_snvs_${suffix}.txt;
      # for each possible SNV in affected trio, mark it as interesting if it's not
      # in the unnafected trios
      while read snv; do
         if ! grep -q "$snv" ${fam}_control_snvs_${suffix}.txt; then
            echo $snv >> interesting_snvs_${suffix}.txt;
         fi;
      done < ${fam}_possible_snvs_${suffix}.txt;
   fi;
done < famids.txt

10033
2
10042
1
10090
2
10094
2
10128
2
10131
4
10153
3
10164
2
10173
2
10178
2
10182
3
10197
2
10215
4
10369
2
10406
3
10448
2
1892
2
1893
2
1895
2
1976
3
855
2


rm: cannot remove `interesting_snvs_hiConfDeNovo.vcf.txt': No such file or directory


In [99]:
snps_gatk = pd.read_table('/data/NCR_SBRB/simplex/gatk_refine/interesting_snvs_hiConfDeNovo.vcf.txt',
                               header=None, names=['snp'])
counts_gatk = stats.itemfreq(snps_gatk['snp'])
my_max = np.max(counts_gatk[:, 1])
print 'Maximum frequency: %d in %d unique snps' % (my_max, len(counts_gatk))
print counts_gatk[counts_gatk[:, 1] == my_max]

Maximum frequency: 2 in 3388 unique snps
[['chr1 40229504' 2]
 ['chr17 72762902' 2]
 ['chr18 15271172' 2]
 ['chr2 89105006' 2]
 ['chr20 29637674' 2]
 ['chr20 29637691' 2]
 ['chr6 29968761' 2]
 ['chr6 32548712' 2]
 ['chr6 32557647' 2]
 ['chr7 154002420' 2]]


In [102]:
do_perms(counts_gatk[:, 0], 2)

0.0578

This is looking better, as we have increased the options of unique SNPs. Not great yet, but getting there.

## Triodenovo

In [103]:
%%bash
suffix=denovo_v2.vcf 
cd /data/NCR_SBRB/simplex/triodenovo
rm interesting_snvs_${suffix}.txt
# figure out all family IDs
ls -1 *_trio1_${suffix} > famids.txt
sed -i -e "s/_trio1_${suffix}//g" famids.txt

# for each family ID
while read fam; do
  # figure out how many trios we have
   ntrios=`ls -1 ${fam}_trio?_${suffix} | wc -l`;
   ntrios=$(($ntrios))
   echo $fam;
   echo $ntrios
   # if we have more than one (assuming the first one is affected)
   if [ $ntrios -gt 1 ]; then
      # get all SNVs in the affected trio in the family
      cut -f 1,2 ${fam}_trio1_${suffix} | grep -v '#' - > ${fam}_possible_snvs_${suffix}.txt;
      # combine the vcf files of all unnafected trios
      cat ${fam}_trio[2..$ntrios]_${suffix} > ${fam}_control_snvs_${suffix}.txt;
      # for each possible SNV in affected trio, mark it as interesting if it's not
      # in the unnafected trios
      while read snv; do
         if ! grep -q "$snv" ${fam}_control_snvs_${suffix}.txt; then
            echo $snv >> interesting_snvs_${suffix}.txt;
         fi;
      done < ${fam}_possible_snvs_${suffix}.txt;
   fi;
done < famids.txt

10033
2
10042
1
10090
2
10094
2
10128
2
10131
4
10153
3
10164
2
10173
2
10178
2
10182
3
10197
2
10215
4
10369
2
10406
3
10448
2
1892
2
1893
2
1895
2
1976
3
855
2


rm: cannot remove `interesting_snvs_denovo_v2.vcf.txt': No such file or directory


In [105]:
snps_tdn = pd.read_table('/data/NCR_SBRB/simplex/triodenovo/interesting_snvs_denovo_v2.vcf.txt',
                               header=None, names=['snp'])
counts_tdn = stats.itemfreq(snps_tdn['snp'])
my_max = np.max(counts_tdn[:, 1])
print 'Maximum frequency: %d in %d unique snps' % (my_max, len(counts_tdn))
print counts_tdn[counts_tdn[:, 1] == my_max]

Maximum frequency: 5 in 18802 unique snps
[['chr1 16914580' 5]
 ['chr10 49319323' 5]
 ['chr12 69667893' 5]
 ['chrX 102973509' 5]
 ['chrX 118751076' 5]
 ['chrX 13397236' 5]
 ['chrX 135307049' 5]
 ['chrX 14599572' 5]
 ['chrX 152610294' 5]
 ['chrX 153008911' 5]
 ['chrX 153880181' 5]
 ['chrX 153904473' 5]
 ['chrX 154774663' 5]
 ['chrX 16876980' 5]
 ['chrX 38262808' 5]
 ['chrX 43601142' 5]
 ['chrX 96502650' 5]]


In [106]:
do_perms(counts_tdn[:, 0], 5)

0.0

This is much better, with 5 trios showing the variant. I'd ignore the chrX variants for now. Somewhat concerning that the GATK refinement pipeline didn't show those 3 other variants... maybe a function of the QC?

## DenovoGear

In [108]:
%%bash
suffix=dnm.vcf 
cd /data/NCR_SBRB/simplex/dng
rm interesting_snvs_${suffix}.txt
# figure out all family IDs
ls -1 *_trio1_${suffix} > famids.txt
sed -i -e "s/_trio1_${suffix}//g" famids.txt

# for each family ID
while read fam; do
  # figure out how many trios we have
   ntrios=`ls -1 ${fam}_trio?_${suffix} | wc -l`;
   ntrios=$(($ntrios))
   echo $fam;
   echo $ntrios
   # if we have more than one (assuming the first one is affected)
   if [ $ntrios -gt 1 ]; then
      # get all SNVs in the affected trio in the family
      cut -f 1,2 ${fam}_trio1_${suffix} | grep -v '#' - > ${fam}_possible_snvs_${suffix}.txt;
      # combine the vcf files of all unnafected trios
      cat ${fam}_trio[2..$ntrios]_${suffix} > ${fam}_control_snvs_${suffix}.txt;
      # for each possible SNV in affected trio, mark it as interesting if it's not
      # in the unnafected trios
      while read snv; do
         if ! grep -q "$snv" ${fam}_control_snvs_${suffix}.txt; then
            echo $snv >> interesting_snvs_${suffix}.txt;
         fi;
      done < ${fam}_possible_snvs_${suffix}.txt;
   fi;
done < famids.txt

10033
2
10042
1
10090
2
10094
2
10128
2
10131
4
10153
3
10164
2
10173
2
10178
2
10182
3
10197
2
10215
4
10369
2
10406
3
10448
2
1892
2
1893
2
1895
2
1976
3
855
2


rm: cannot remove `interesting_snvs_dnm.vcf.txt': No such file or directory


In [109]:
snps_dng = pd.read_table('/data/NCR_SBRB/simplex/dng/interesting_snvs_dnm.vcf.txt',
                               header=None, names=['snp'])
counts_dng = stats.itemfreq(snps_dng['snp'])
my_max = np.max(counts_dng[:, 1])
print 'Maximum frequency: %d in %d unique snps' % (my_max, len(counts_dng))
print counts_dng[counts_dng[:, 1] == my_max]

Maximum frequency: 6 in 20901 unique snps
[['chr2 89072915' 6]
 ['chr8 97156476' 6]]


Well, 6 families is better than 5 (or 2). But the other tools didn't pick that one up. Still worth analyzing it...

Before we continue, do any of these come up in the family without unaffected data?

In [113]:
%%bash
grep 89072915 /data/NCR_SBRB/simplex/dng/10042_trio1_dnm.vcf
grep 97156476 /data/NCR_SBRB/simplex/dng/10042_trio1_dnm.vcf
grep 16914580 /data/NCR_SBRB/simplex/triodenovo/10042_trio1_denovo_v2.vcf
grep 49319323 /data/NCR_SBRB/simplex/triodenovo/10042_trio1_denovo_v2.vcf
grep 69667893 /data/NCR_SBRB/simplex/triodenovo/10042_trio1_denovo_v2.vcf

Nothing... oh well. We can still look for biological plausibility on them, just for kicks.

# Within group stats

The approach here is to calculate the best stats in all ADHD samples, and see what's the best we can do for a specific variable in non-affected siblings. For comparison, we can do it the other way around as well.

## Triodenovo

In [122]:
%%bash
cd /data/NCR_SBRB/simplex/triodenovo/

# concatenate all affected trios and extract the SNPs
cat *_trio1_*.vcf | grep -v '#' - | awk 'BEGIN {FS="\t"; OFS=":"}; {print $1, $2}' - > affected_snvs.txt;
cat *_trio[2..4]_*.vcf | grep -v '#' - | awk 'BEGIN {FS="\t"; OFS=":"}; {print $1, $2}' - > unaffected_snvs.txt;

In [123]:
naff = 21
nunaff = 29
aff_tdn = pd.read_table('/data/NCR_SBRB/simplex/triodenovo/affected_snvs.txt',
                               header=None, names=['snp'])
counts_aff_tdn = stats.itemfreq(aff_tdn['snp'])
my_max = np.max(counts_aff_tdn[:, 1])
print 'Maximum frequency: %d in %d affected families' % (my_max, naff)
print counts_aff_tdn[counts_aff_tdn[:, 1] == my_max]

unaff_tdn = pd.read_table('/data/NCR_SBRB/simplex/triodenovo/unaffected_snvs.txt',
                               header=None, names=['snp'])
counts_unaff_tdn = stats.itemfreq(unaff_tdn['snp'])
my_max = np.max(counts_unaff_tdn[:, 1])
print 'Maximum frequency: %d in %d unaffected families' % (my_max, nunaff)
print counts_unaff_tdn[counts_unaff_tdn[:, 1] == my_max]

Maximum frequency: 28 in 21 affected families
[['chrX:41093413' 28]]
Maximum frequency: 24 in 29 unaffected families
[['chrX:9686187' 24]]


*WEIRD!!! DUPLICATE RSIDS IN VCF FILES? HOWNCOME MORE OCCURRENCES THAN FAMILIES?*

## DenovoGear

In [124]:
%%bash
cd /data/NCR_SBRB/simplex/dng/

# concatenate all affected trios and extract the SNPs
cat *_trio1_*.vcf | grep -v '#' - | awk 'BEGIN {FS="\t"; OFS=":"}; {print $1, $2}' - > affected_snvs.txt;
cat *_trio[2..4]_*.vcf | grep -v '#' - | awk 'BEGIN {FS="\t"; OFS=":"}; {print $1, $2}' - > unaffected_snvs.txt;

In [125]:
naff = 21
nunaff = 29
aff_dng = pd.read_table('/data/NCR_SBRB/simplex/dng/affected_snvs.txt',
                               header=None, names=['snp'])
counts_aff_dng = stats.itemfreq(aff_dng['snp'])
my_max = np.max(counts_aff_dng[:, 1])
print 'Maximum frequency: %d in %d affected families' % (my_max, naff)
print counts_aff_dng[counts_aff_dng[:, 1] == my_max]

unaff_dng = pd.read_table('/data/NCR_SBRB/simplex/dng/unaffected_snvs.txt',
                               header=None, names=['snp'])
counts_unaff_dng = stats.itemfreq(unaff_dng['snp'])
my_max = np.max(counts_unaff_dng[:, 1])
print 'Maximum frequency: %d in %d unaffected families' % (my_max, nunaff)
print counts_unaff_dng[counts_unaff_dng[:, 1] == my_max]

Maximum frequency: 8 in 21 affected families
[['chr17:21319860' 8]
 ['chrX:8432783' 8]]
Maximum frequency: 7 in 29 unaffected families
[['chr4:113190251' 7]
 ['chrX:13779124' 7]
 ['chrX:152772473' 7]
 ['chrX:8432783' 7]
 ['chrY:9967496' 7]]


## GATK

In [117]:
%%bash
cd /data/NCR_SBRB/simplex/gatk_refine

# concatenate all affected trios and extract the SNPs
cat *_trio1_*.vcf | grep -v '#' - | awk 'BEGIN {FS="\t"; OFS=":"}; {print $1, $2}' - > affected_snvs.txt;
cat *_trio[2..4]_*.vcf | grep -v '#' - | awk 'BEGIN {FS="\t"; OFS=":"}; {print $1, $2}' - > unaffected_snvs.txt;

In [121]:
naff = 21
nunaff = 29
aff_gatk = pd.read_table('/data/NCR_SBRB/simplex/gatk_refine/affected_snvs.txt',
                               header=None, names=['snp'])
counts_aff_gatk = stats.itemfreq(aff_gatk['snp'])
my_max = np.max(counts_aff_gatk[:, 1])
print 'Maximum frequency: %d in %d affected families' % (my_max, naff)
print counts_aff_gatk[counts_aff_gatk[:, 1] == my_max]

unaff_gatk = pd.read_table('/data/NCR_SBRB/simplex/gatk_refine/unaffected_snvs.txt',
                               header=None, names=['snp'])
counts_unaff_gatk = stats.itemfreq(unaff_gatk['snp'])
my_max = np.max(counts_unaff_gatk[:, 1])
print 'Maximum frequency: %d in %d unaffected families' % (my_max, nunaff)
print counts_unaff_gatk[counts_unaff_gatk[:, 1] == my_max]

Maximum frequency: 2 in 21 affected families
[['chr17:72762902' 2]
 ['chr18:15271172' 2]
 ['chr1:40229504' 2]
 ['chr20:29637674' 2]
 ['chr20:29637691' 2]
 ['chr2:89105006' 2]
 ['chr6:29968761' 2]
 ['chr6:32548712' 2]
 ['chr6:32557647' 2]
 ['chr6:32725367' 2]
 ['chr7:154002420' 2]
 ['chr7:154467985' 2]]
Maximum frequency: 2 in 29 unaffected families
[['chr12:11214231' 2]
 ['chr12:11214232' 2]
 ['chr12:50745851' 2]
 ['chr12:50745893' 2]
 ['chr12:50745894' 2]
 ['chr12:99139159' 2]
 ['chr16:33940111' 2]
 ['chr16:33940122' 2]
 ['chr1:142813302' 2]
 ['chr1:148902738' 2]
 ['chr1:206566826' 2]
 ['chr20:29638202' 2]
 ['chr2:9546136' 2]
 ['chr6:32083111' 2]
 ['chrX:116025284' 2]]


In [115]:
%%bash
cd /data/NCR_SBRB/simplex/gatk_refine
head affected_snvs.txt

head: cannot open `affected_snvs_.txt' for reading: No such file or directory


In [56]:
%%bash
wc -l /data/NCR_SBRB/simplex/snv_arm/interesting_snvs.txt

640 /data/NCR_SBRB/simplex/snv_arm/interesting_snvs.txt
