# Postprocessing Steps

Scratch code for testing the function that looks for NCOs in filtered blocks.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_data(fn):
    """Read a CSV file and split it into one DataFrame per block.

    Rows are grouped by ``chrom_id`` first and then, inside each of those
    groups, by ``blk_id``.

    Args:
        fn: path (or anything else ``pd.read_csv`` accepts) of the CSV file.
            The file must contain ``chrom_id`` and ``blk_id`` columns.

    Returns:
        A list of DataFrames, one per (chrom_id, blk_id) combination, in the
        order the nested groupbys yield them. Each frame keeps the row
        labels assigned by ``read_csv``.
    """
    frame = pd.read_csv(fn)
    return [
        block
        for _, per_chrom in frame.groupby('chrom_id')
        for _, block in per_chrom.groupby('blk_id')
    ]

In [3]:
# Split the peaks file into a list with one DataFrame per (chrom_id, blk_id) group.
blocks = load_data('../peaks.csv')

In [4]:
# Number of blocks returned by load_data.
len(blocks)

76975

In [5]:
# Row count of every block, in the same order as `blocks`.
data = list(map(len, blocks))

In [6]:
# One length per block, so this equals len(blocks).
len(data)

76975

In [7]:
import os

In [8]:
# Take one block (list position 4) to experiment on.
df = blocks[4]

In [9]:
# Row labels 160 through 169 (inclusive) used for the filtering experiments below.
s = set(range(160, 170))

In [10]:
# Rows of the block whose index label is one of the labels in s.
df.loc[df.index.isin(s)]

Unnamed: 0.1,Unnamed: 0,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background
160,68076,BSP-OR-001,3,88644,unknown,N2,N2,A,0,C,0,BSP-OR-001-3,4,CB4856
161,68077,BSP-OR-001,3,88646,unknown,N2,N2,A,0,C,0,BSP-OR-001-3,4,CB4856
162,68078,BSP-OR-001,3,88740,unknown,N2,N2,C,0,G,0,BSP-OR-001-3,4,CB4856
163,68079,BSP-OR-001,3,88741,unknown,N2,N2,C,0,A,0,BSP-OR-001-3,4,CB4856
164,68080,BSP-OR-001,3,88785,unknown,N2,N2,A,0,G,0,BSP-OR-001-3,4,CB4856
165,68081,BSP-OR-001,3,88786,unknown,N2,N2,T,0,A,0,BSP-OR-001-3,4,CB4856
166,68082,BSP-OR-001,3,88830,N2,N2,N2,C,4,G,0,BSP-OR-001-3,4,CB4856
167,68083,BSP-OR-001,3,88850,N2,N2,N2,G,4,C,0,BSP-OR-001-3,4,CB4856
168,68084,BSP-OR-001,3,88851,unknown,N2,N2,C,0,G,0,BSP-OR-001-3,4,CB4856
169,68085,BSP-OR-001,3,88854,N2,N2,N2,G,4,A,0,BSP-OR-001-3,4,CB4856


In [11]:
# Same label filter, further restricted to rows where base_geno equals hmm_state1.
df.loc[df.index.isin(s) & df['base_geno'].eq(df['hmm_state1'])]

Unnamed: 0.1,Unnamed: 0,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background
166,68082,BSP-OR-001,3,88830,N2,N2,N2,C,4,G,0,BSP-OR-001-3,4,CB4856
167,68083,BSP-OR-001,3,88850,N2,N2,N2,G,4,C,0,BSP-OR-001-3,4,CB4856
169,68085,BSP-OR-001,3,88854,N2,N2,N2,G,4,A,0,BSP-OR-001-3,4,CB4856


In [12]:
# Work on a copy so that narrowing s2 in the next cell leaves s untouched.
s2 = set(s)

In [13]:
# Narrow s2 in place to the labels in s whose base_geno equals hmm_state1.
s2.intersection_update(
    df.loc[df.index.isin(s) & (df.base_geno == df.hmm_state1)].index
)

In [14]:
# Labels that survived the intersection above.
s2

{166, 167, 169}

In [15]:
# Switch to a different block (list position 5) for the read-ratio experiments.
# NOTE(review): this rebinds the builtin name `chr` for the rest of the session;
# a name such as `blk` would avoid that, but the later cells that use `chr`
# would have to be renamed together with this one.
chr = blocks[5]

In [16]:
# First rows of the selected block.
chr.head()

Unnamed: 0.1,Unnamed: 0,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background
194,68475,BSP-OR-001,3,118319,unknown,N2,unknown,G,0,T,0,BSP-OR-001-3,5,CB4856
195,68476,BSP-OR-001,3,118329,N2,N2,N2,G,17,A,0,BSP-OR-001-3,5,CB4856
196,68477,BSP-OR-001,3,118331,unknown,N2,N2,G,0,A,0,BSP-OR-001-3,5,CB4856
197,68478,BSP-OR-001,3,118337,unknown,N2,N2,A,0,T,0,BSP-OR-001-3,5,CB4856
198,68479,BSP-OR-001,3,118374,N2,N2,N2,T,15,C,0,BSP-OR-001-3,5,CB4856


In [17]:
# Number of rows in the selected block.
len(chr)

57

In [18]:
# Scratch cell: presumably just checking how NaN is displayed.
np.nan

nan

In [19]:
# NaN-filled Series that shares the block's row index (one entry per row).
hzi = pd.Series(np.nan, index=chr.index)

In [20]:
# Same length as the block, because hzi was built on the block's index.
len(hzi)

57

In [21]:
# ref_reads as a fraction of all reads (ref_reads + var_reads) for each row.
# Rows where both counts are 0 come out as NaN (0 / 0).
chr['ref_reads'] / (chr['ref_reads'] + chr['var_reads'])

194         NaN
195    1.000000
196         NaN
197         NaN
198    1.000000
199    1.000000
200    1.000000
201    1.000000
202    1.000000
203         NaN
204         NaN
205         NaN
206         NaN
207         NaN
208         NaN
209    1.000000
210    1.000000
211    1.000000
212    0.909091
213    0.909091
214    0.909091
215    0.913043
216    0.807692
217    0.814815
218    0.800000
219    0.722222
220    0.666667
221    0.631579
222    0.588235
223    0.562500
224    1.000000
225         NaN
226         NaN
227    1.000000
228    1.000000
229    1.000000
230    1.000000
231    1.000000
232    1.000000
233    1.000000
234    1.000000
235    1.000000
236    1.000000
237    1.000000
238    1.000000
239    0.736842
240    0.761905
241    0.785714
242    0.785714
243    0.857143
244    0.875000
245    0.875000
246    0.875000
247    0.875000
248    1.000000
249         NaN
250         NaN
dtype: float64

In [22]:
# Ten random normal values, used to try out Series.where in the cell after next.
# NOTE(review): no seed is set in the cells above, so these values change on
# every run. This also rebinds `s`, which held the set of row labels from
# In [9]; re-running the earlier filtering cells after this one would use the
# Series instead of that set.
s = pd.Series(np.random.randn(10))

In [23]:
# Show the random values.
s

0   -2.034816
1    0.080189
2    0.807504
3   -0.349904
4    0.504301
5    1.354754
6    0.225808
7    0.472462
8   -1.018808
9    0.972643
dtype: float64

In [24]:
# Keep values greater than 0 and replace everything else with 0.
s.where(s.gt(0), other=0)

0    0.000000
1    0.080189
2    0.807504
3    0.000000
4    0.504301
5    1.354754
6    0.225808
7    0.472462
8    0.000000
9    0.972643
dtype: float64

The command line app (`xo post`) saves a frame that can be grouped by chromosome and block ID to get the set of NCOs.

In [25]:
# Load the frame saved by `xo post`.
# NOTE(review): this rebinds `df`, which previously held blocks[4].
df = pd.read_csv('../ncos.csv')

In [26]:
# First rows of the NCO frame.
df.head()

Unnamed: 0,SNP,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background,chr_length,location,homozygosity
0,67647,BSP-OR-001,3,68256,N2,N2,N2,T,6,A,0,BSP-OR-001-3,2,CB4856,13819453,0.004939,1.0
1,67648,BSP-OR-001,3,68258,N2,N2,N2,C,6,T,0,BSP-OR-001-3,2,CB4856,13819453,0.004939,1.0
2,67649,BSP-OR-001,3,68267,N2,N2,N2,G,6,A,0,BSP-OR-001-3,2,CB4856,13819453,0.00494,1.0
3,67651,BSP-OR-001,3,68280,N2,N2,N2,T,4,C,0,BSP-OR-001-3,2,CB4856,13819453,0.004941,1.0
4,67659,BSP-OR-001,3,68496,N2,N2,N2,A,16,C,0,BSP-OR-001-3,2,CB4856,13819453,0.004956,1.0


In [27]:
# One group per (chrom_id, blk_id) pair; per the note above, each group is one NCO.
ncos = df.groupby(['chrom_id','blk_id'])

To iterate over all NCOs:

In [28]:
# for name, group in ncos:
#    print(name)    # the group key, e.g. ('BSP-OR-001-3', 4)
#    print(group)   # the rows of that group, as a DataFrame

To get a single group:

In [29]:
# The key is a (chrom_id, blk_id) tuple, matching the columns used in the groupby.
ncos.get_group(('BSP-OR-001-3',4))

Unnamed: 0,SNP,Sample,chromosome,position,base_geno,hmm_state1,hmm_state2,reference,ref_reads,variant,var_reads,chrom_id,blk_id,background,chr_length,location,homozygosity
10,68067,BSP-OR-001,3,88606,N2,N2,N2,G,6,A,0,BSP-OR-001-3,4,CB4856,13819453,0.006412,1.0
11,68069,BSP-OR-001,3,88614,N2,N2,N2,C,6,A,0,BSP-OR-001-3,4,CB4856,13819453,0.006412,1.0
12,68070,BSP-OR-001,3,88623,N2,N2,N2,A,6,T,0,BSP-OR-001-3,4,CB4856,13819453,0.006413,1.0
13,68082,BSP-OR-001,3,88830,N2,N2,N2,C,4,G,0,BSP-OR-001-3,4,CB4856,13819453,0.006428,1.0
14,68083,BSP-OR-001,3,88850,N2,N2,N2,G,4,C,0,BSP-OR-001-3,4,CB4856,13819453,0.006429,1.0
15,68085,BSP-OR-001,3,88854,N2,N2,N2,G,4,A,0,BSP-OR-001-3,4,CB4856,13819453,0.00643,1.0


To apply an aggregation function, select a column and then call the function on it:

In [30]:
# Number of non-null SNP entries in each (chrom_id, blk_id) group.
ncos['SNP'].count()

chrom_id      blk_id
BSP-OR-001-3  2         10
              4          6
              11        10
              12        33
              14         7
                        ..
BSP-OR-001-5  125        6
              126       15
              129       10
              133       20
              135       11
Name: SNP, Length: 69, dtype: int64

Mean number of SNPs per NCO:

In [31]:
# Average of the per-group SNP counts.
ncos['SNP'].count().mean()

np.float64(13.478260869565217)

The output of `count` is a Series with a MultiIndex (`chrom_id`, `blk_id`), so it can itself be grouped by one of those index levels to give further summaries.

In [32]:
# Per-group SNP counts, indexed by (chrom_id, blk_id).
sf = ncos['SNP'].count()

In [33]:
# Confirm that the aggregation result is a Series.
type(sf)

pandas.core.series.Series

The number of NCOs in each chromosome:

In [34]:
# sf has one entry per (chrom_id, blk_id) group, so counting entries per
# chrom_id gives the number of groups (NCOs) under each chrom_id.
sf.groupby(level='chrom_id').count()

chrom_id
BSP-OR-001-3     9
BSP-OR-001-4     6
BSP-OR-001-5    54
Name: SNP, dtype: int64

In [35]:
# Average of the per-chrom_id NCO counts computed in the previous cell.
sf.groupby(level='chrom_id').count().mean()

np.float64(23.0)