### Filters

The `filters` module defines two classes that process blocks of SNPs:
* `SNPFilter` applies filtering operations to the SNPs found by the "peak finder".
* `NCOFilter` performs "postprocessing" steps to find NCOs within blocks.

When instantiating a filter object, pass the constructor a dictionary of filter settings.

Both filters have an `apply` method that takes a data frame as a parameter and returns a data frame with the filters applied.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from xo.filters import SNPFilter, NCOFilter

In [3]:
# Create an SNPFilter from an empty settings dictionary.
# Presumably every setting then keeps its built-in default — confirm in xo.filters.
sf = SNPFilter({})

In [4]:
# Show the length_range attribute of the SNP filter built with no explicit settings
# (displayed as a 2-tuple in the cell output).
sf.length_range

(0, 10000)

In [5]:
# Create an NCOFilter, likewise passing an empty settings dictionary.
nf = NCOFilter({})

In [6]:
# Show the min_z attribute of the NCO filter built with no explicit settings.
nf.min_z

0.9

In [7]:
# Load the SNP table from a CSV file located one directory above the working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which usually
# means the CSV was written together with its index; index_col=0 may be wanted here —
# confirm that nothing downstream relies on that column before changing it.
df = pd.read_csv('../chr1.csv')

In [8]:
# Preview the first rows to check the column layout of the loaded table.
df.head()

Unnamed: 0,SNP,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background,chr_length,location,homozygosity
0,67168,BSP-OR-001,3,31010,unknown,N2,unknown,G,0,A,0,BSP-OR-001-3,0,CB4856,13819453,0.002244,
1,67169,BSP-OR-001,3,31025,unknown,N2,unknown,C,0,T,0,BSP-OR-001-3,0,CB4856,13819453,0.002245,
2,67170,BSP-OR-001,3,31030,N2,N2,N2,T,8,G,0,BSP-OR-001-3,0,CB4856,13819453,0.002245,1.0
3,67171,BSP-OR-001,3,31031,unknown,N2,N2,C,0,T,0,BSP-OR-001-3,0,CB4856,13819453,0.002245,
4,67172,BSP-OR-001,3,31036,unknown,N2,N2,A,0,G,0,BSP-OR-001-3,0,CB4856,13819453,0.002246,


In [9]:
# Apply the NCO filter to the whole frame and display the result;
# the displayed frame carries one extra column, `nco`, after the input columns.
nf.apply(df)

Unnamed: 0,SNP,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background,chr_length,location,homozygosity,nco
0,67168,BSP-OR-001,3,31010,unknown,N2,unknown,G,0,A,0,BSP-OR-001-3,0,CB4856,13819453,0.002244,,0
1,67169,BSP-OR-001,3,31025,unknown,N2,unknown,C,0,T,0,BSP-OR-001-3,0,CB4856,13819453,0.002245,,0
2,67170,BSP-OR-001,3,31030,N2,N2,N2,T,8,G,0,BSP-OR-001-3,0,CB4856,13819453,0.002245,1.0,1
3,67171,BSP-OR-001,3,31031,unknown,N2,N2,C,0,T,0,BSP-OR-001-3,0,CB4856,13819453,0.002245,,0
4,67172,BSP-OR-001,3,31036,unknown,N2,N2,A,0,G,0,BSP-OR-001-3,0,CB4856,13819453,0.002246,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4560,194796,BSP-OR-001,5,20629483,unknown,N2,N2,T,0,A,0,BSP-OR-001-5,136,CB4856,20953657,0.984529,,0
4561,194797,BSP-OR-001,5,20629489,N2,N2,N2,G,13,A,0,BSP-OR-001-5,136,CB4856,20953657,0.984529,1.0,1
4562,194798,BSP-OR-001,5,20629505,unknown,N2,N2,C,0,T,0,BSP-OR-001-5,136,CB4856,20953657,0.984530,,0
4563,194799,BSP-OR-001,5,20629514,unknown,N2,N2,A,0,G,0,BSP-OR-001-5,136,CB4856,20953657,0.984530,,0


In [15]:
# Apply the NCO filter only to the rows whose chrom_id is 'BSP-OR-001-4'
# and display the result (nothing is stored by this cell).
nf.apply(df[df.chrom_id == 'BSP-OR-001-4'])

Unnamed: 0,SNP,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background,chr_length,location,homozygosity,nco
868,93067,BSP-OR-001,4,1997145,unknown,N2,unknown,T,0,A,0,BSP-OR-001-4,0,CB4856,17493838,0.114163,,0
869,93068,BSP-OR-001,4,1997147,unknown,N2,unknown,T,0,C,0,BSP-OR-001-4,0,CB4856,17493838,0.114163,,0
870,93069,BSP-OR-001,4,1997149,unknown,N2,unknown,T,0,A,0,BSP-OR-001-4,0,CB4856,17493838,0.114163,,0
871,93070,BSP-OR-001,4,1997234,N2,N2,N2,G,6,A,0,BSP-OR-001-4,0,CB4856,17493838,0.114168,1.0,2
872,93071,BSP-OR-001,4,1997248,N2,N2,N2,T,8,A,0,BSP-OR-001-4,0,CB4856,17493838,0.114169,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1181,109332,BSP-OR-001,4,14549576,unknown,N2,N2,C,0,G,0,BSP-OR-001-4,14,CB4856,17493838,0.831697,,0
1182,109333,BSP-OR-001,4,14549580,N2,N2,N2,T,14,G,0,BSP-OR-001-4,14,CB4856,17493838,0.831697,1.0,1
1183,109334,BSP-OR-001,4,14549597,N2,N2,N2,C,11,T,0,BSP-OR-001-4,14,CB4856,17493838,0.831698,1.0,1
1184,109335,BSP-OR-001,4,14549605,N2,N2,N2,T,8,C,0,BSP-OR-001-4,14,CB4856,17493838,0.831699,1.0,1


In [16]:
# Keep the NCO-filtered rows for chrom_id 'BSP-OR-001-4' for the block-level queries.
chr4 = nf.apply(df.loc[df['chrom_id'] == 'BSP-OR-001-4'])

In [17]:
# Group the filtered rows by block id so each block can be inspected on its own.
blocks = chr4.groupby(by='blk_id')

In [18]:
# Display the rows that belong to the block with blk_id 1.
blocks.get_group(1)

Unnamed: 0,SNP,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background,chr_length,location,homozygosity,nco
887,93434,BSP-OR-001,4,2157039,unknown,N2,unknown,G,0,A,0,BSP-OR-001-4,1,CB4856,17493838,0.123303,,0
888,93435,BSP-OR-001,4,2157047,unknown,N2,unknown,G,0,A,0,BSP-OR-001-4,1,CB4856,17493838,0.123303,,0
889,93436,BSP-OR-001,4,2157050,unknown,N2,unknown,A,0,G,0,BSP-OR-001-4,1,CB4856,17493838,0.123303,,0
890,93437,BSP-OR-001,4,2157055,unknown,N2,unknown,A,0,C,0,BSP-OR-001-4,1,CB4856,17493838,0.123304,,0
891,93438,BSP-OR-001,4,2157060,unknown,N2,unknown,G,0,A,0,BSP-OR-001-4,1,CB4856,17493838,0.123304,,0
892,93439,BSP-OR-001,4,2157068,unknown,N2,unknown,G,0,A,0,BSP-OR-001-4,1,CB4856,17493838,0.123304,,0
893,93440,BSP-OR-001,4,2157071,unknown,N2,unknown,A,0,G,0,BSP-OR-001-4,1,CB4856,17493838,0.123305,,0
894,93441,BSP-OR-001,4,2157089,N2,N2,unknown,G,9,A,0,BSP-OR-001-4,1,CB4856,17493838,0.123306,1.0,1
895,93442,BSP-OR-001,4,2157092,unknown,N2,unknown,A,0,G,0,BSP-OR-001-4,1,CB4856,17493838,0.123306,,0
896,93443,BSP-OR-001,4,2157095,unknown,N2,unknown,G,0,A,0,BSP-OR-001-4,1,CB4856,17493838,0.123306,,0


In [21]:
# Map each block id to the largest value found in that block's `nco` column.
{blk_id: blk_rows['nco'].max() for blk_id, blk_rows in blocks}

{0: np.int8(2),
 1: np.int8(1),
 3: np.int8(1),
 4: np.int8(2),
 5: np.int8(1),
 7: np.int8(2),
 8: np.int8(1),
 10: np.int8(1),
 11: np.int8(1),
 12: np.int8(1),
 13: np.int8(1),
 14: np.int8(2)}

In [23]:
# Collect the ids of the blocks whose largest `nco` value equals 2.
{blk_id for blk_id, blk_rows in blocks if blk_rows['nco'].max() == 2}

{0, 4, 7, 14}

### NCOs in the Full Data Set

In [None]:
# Build a fresh NCOFilter from an empty settings dictionary for this section
# (rebinds the name `nf` used earlier in the notebook).
nf = NCOFilter({})

In [None]:
# Load the filtered data set from a CSV file one directory above the working directory.
# NOTE(review): `blocks` was bound to a groupby object earlier in the notebook and is
# rebound to a DataFrame here; a distinct name would make out-of-order runs safer.
blocks = pd.read_csv('../filtered.csv')

In [None]:
# Number of rows in the loaded frame.
len(blocks)

In [None]:
# Preview the first rows of the loaded frame.
blocks.head()

In [None]:
# Per-row coverage: reference reads plus variant reads.
coverage = blocks['ref_reads'] + blocks['var_reads']

In [None]:
# Preview the first coverage values.
coverage.head()

In [None]:
# Mean coverage over all rows.
coverage.mean()

In [None]:
# Standard deviation of coverage over all rows.
coverage.std()

In [None]:
# Mean coverage restricted to rows whose coverage is strictly greater than 10.
coverage.loc[coverage > 10].mean()

In [None]:
# Histogram of coverage (ref_reads + var_reads) for rows with coverage > 10.
# Title and axis labels let the figure stand alone; the trailing semicolon
# keeps the return value of the last call out of the cell output.
fig, ax = plt.subplots()
ax.hist(coverage[coverage > 10])
ax.set(title='coverage > 10', xlabel='coverage (ref_reads + var_reads)', ylabel='number of rows');

In [None]:
# Histogram of coverage (ref_reads + var_reads) for rows with coverage < 10.
# Title and axis labels let the figure stand alone; the trailing semicolon
# keeps the return value of the last call out of the cell output.
fig, ax = plt.subplots()
ax.hist(coverage[coverage < 10])
ax.set(title='coverage < 10', xlabel='coverage (ref_reads + var_reads)', ylabel='number of rows');

In [None]:
# Histogram of coverage (ref_reads + var_reads) for rows with coverage < 20.
# Title and axis labels let the figure stand alone; the trailing semicolon
# keeps the return value of the last call out of the cell output.
fig, ax = plt.subplots()
ax.hist(coverage[coverage < 20])
ax.set(title='coverage < 20', xlabel='coverage (ref_reads + var_reads)', ylabel='number of rows');

In [None]:
# Histogram of coverage (ref_reads + var_reads) for rows with coverage < 30.
# Title and axis labels let the figure stand alone; the trailing semicolon
# keeps the return value of the last call out of the cell output.
fig, ax = plt.subplots()
ax.hist(coverage[coverage < 30])
ax.set(title='coverage < 30', xlabel='coverage (ref_reads + var_reads)', ylabel='number of rows');

In [None]:
# Histogram of coverage (ref_reads + var_reads) for rows with 30 <= coverage < 100.
# Title and axis labels let the figure stand alone; the trailing semicolon
# keeps the return value of the last call out of the cell output.
fig, ax = plt.subplots()
ax.hist(coverage[(coverage >= 30) & (coverage < 100)])
ax.set(title='30 <= coverage < 100', xlabel='coverage (ref_reads + var_reads)', ylabel='number of rows');

In [None]:
# Histogram of coverage (ref_reads + var_reads) for rows with coverage < 100.
# Title and axis labels let the figure stand alone; the trailing semicolon
# keeps the return value of the last call out of the cell output.
fig, ax = plt.subplots()
ax.hist(coverage[coverage < 100])
ax.set(title='coverage < 100', xlabel='coverage (ref_reads + var_reads)', ylabel='number of rows');

In [None]:
# Number of rows whose coverage is at least 100 (counted from the boolean mask).
int((coverage >= 100).sum())

In [None]:
# Largest var_reads value per chromosome.
# The column is selected before aggregating: the first positional parameter of
# GroupBy.max is `numeric_only`, so passing 'var_reads' to max() does not pick a column.
blocks.groupby('chromosome')['var_reads'].max()

In [None]:
# List the distinct chrom_id values, taken from the keys of the groupby's groups mapping.
blocks.groupby('chrom_id').groups.keys()

In [None]:
# Group rows by chromosome id first and block id second.
grps = blocks.groupby(by=['chrom_id', 'blk_id'])

In [None]:
# Distinct chrom_id labels, read from the first level of the grouped result's index.
# size() produces the same (chrom_id, blk_id) index as count() without counting
# the non-null values of every column.
chr_names = grps.size().index.levels[0]

In [None]:
# Check whether the chromosome id examined earlier in the notebook is present in this data set.
'BSP-OR-001-4' in chr_names

In [None]:
# Number of distinct chrom_id values.
len(chr_names)