# AnnoSeqCell class

> Read a table of sequence counts per cell and join per-cell annotations onto it

In [76]:
#| hide
# Run nbdev's export step from inside the notebook.
# NOTE(review): presumably this writes the `#| export` cells to the module named by `#| default_exp` - confirm against the nbdev docs.
from nbdev import nbdev_export
nbdev_export()

In [42]:
#| default_exp annoseqcell

In [43]:
#| export
import numpy as np
import pandas as pd
import gzip
from digestion_rates.preprocess import SequencesCells
from pathlib import Path
from multiprocessing import Pool

# AnnoSeqCell class





## Test data

In [44]:
# Toy count table: one row per (sequence, cell barcode) pair with its count.
data = [
    ('ACCA', 'PLATE-01_CCCC', 1),
    ('ACCA', 'PLATE-01_CCCA', 1),
    ('CAAC', 'PLATE-01_CCCA', 2),
    ('ACCA', 'PLATE-02_CCCC', 1),
    ('CAAC', 'PLATE-02_CCCT', 2),
    ('CCCA', 'PLATE-02_CCAA', 3),
]
df = pd.DataFrame(data=data, columns=['seq', 'cb', 'counts'])
df

Unnamed: 0,seq,cb,counts
0,ACCA,PLATE-01_CCCC,1
1,ACCA,PLATE-01_CCCA,1
2,CAAC,PLATE-01_CCCA,2
3,ACCA,PLATE-02_CCCC,1
4,CAAC,PLATE-02_CCCT,2
5,CCCA,PLATE-02_CCAA,3


In [65]:
# Toy per-cell annotation table. 'PLATE-02_CCAA' is listed twice and its
# second entry has no sort population (NaN).
data = [
    ('PLATE-01_CCCC', 1, 'G0'),
    ('PLATE-01_CCCA', 2, 'G0'),
    ('PLATE-02_CCCC', 1, 'Interphase'),
    ('PLATE-02_CCCT', 2, 'Interphase'),
    ('PLATE-02_CCAA', 1, 'G0'),
    ('PLATE-02_CCAA', 1, np.nan),
]
anno_df = pd.DataFrame(data=data, columns=['cb', 'units_mnase', 'sort_population'])
# Written gzip-compressed and without the index column.
anno_df.to_csv('annotation.csv.gz', index=False, compression='gzip')

In [66]:
anno_df

Unnamed: 0,cb,units_mnase,sort_population
0,PLATE-01_CCCC,1,G0
1,PLATE-01_CCCA,2,G0
2,PLATE-02_CCCC,1,Interphase
3,PLATE-02_CCCT,2,Interphase
4,PLATE-02_CCAA,1,G0
5,PLATE-02_CCAA,1,


In [47]:
# Attach the annotation columns to the count table by matching each row's
# `cb` value against the annotation table indexed by `cb`.
anno_seq = df.join(anno_df.set_index('cb'), on='cb')

In [48]:
anno_seq

Unnamed: 0,seq,cb,counts,units_mnase,sort_population
0,ACCA,PLATE-01_CCCC,1,1,G0
1,ACCA,PLATE-01_CCCA,1,2,G0
2,CAAC,PLATE-01_CCCA,2,2,G0
3,ACCA,PLATE-02_CCCC,1,1,Interphase
4,CAAC,PLATE-02_CCCT,2,2,Interphase
5,CCCA,PLATE-02_CCAA,3,1,G0


## AnnoSequencesCells

In [49]:
#| export
def build_group_dictionary(anno_seq, group):
    """Split a table into one sub-table per value of `group`.

    Parameters
    ----------
    anno_seq : pandas.DataFrame
        Table to split.
    group : str
        Column name (or any key accepted by ``DataFrame.groupby``) to
        group the rows on.

    Returns
    -------
    dict
        Maps each group key to the sub-DataFrame of rows carrying that key,
        in the order ``groupby`` yields them.
    """
    # A dict comprehension states the intent directly; the previous version
    # used a list comprehension only for its side effect.
    return {key: rows for key, rows in anno_seq.groupby(group)}

In [50]:
# Preview the 'G0' sub-table. The dictionary is built here because `dz` is
# only assigned in a later cell, so `dz['G0']` fails on a top-to-bottom run.
build_group_dictionary(anno_seq, 'sort_population')['G0']

Unnamed: 0,seq,cb,counts,units_mnase,sort_population
0,ACCA,PLATE-01_CCCC,1,1,G0
1,ACCA,PLATE-01_CCCA,1,2,G0
2,CAAC,PLATE-01_CCCA,2,2,G0
5,CCCA,PLATE-02_CCAA,3,1,G0


In [68]:
# Group the annotated table by sort population, then check the group keys
# and that the 'G0' group holds four rows, all labelled 'G0'.
dz = build_group_dictionary(anno_seq, 'sort_population')
assert list(dz) == ['G0', 'Interphase']
assert dz['G0'].sort_population.tolist() == ['G0'] * 4
    

In [69]:
#| export
def join_df_anno(df, anno_df):
    """Inner-join the annotation table onto `df` using the `cb` column.

    `anno_df` is indexed by its `cb` column and matched against `df['cb']`;
    rows of `df` whose `cb` has no match in `anno_df` are left out of the
    result.
    """
    anno_by_cb = anno_df.set_index('cb')
    return df.join(anno_by_cb, on='cb', how='inner')

In [70]:

# Annotation rows containing a missing value are dropped before the join;
# the joined table is then expected to have 6 rows and 5 columns.
anno_seq = join_df_anno(df, anno_df.dropna())
assert anno_seq.shape == (6, 5)


In [71]:
#| export
def filter_unshared_sequences(df, groups, by='sort_population'):
    """Keep only the sequences whose set of `by` values is exactly `groups`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a `seq` column and the annotation column named by `by`.
    groups : iterable
        Group labels every retained sequence must cover. Order and
        duplicates are irrelevant; missing values (NaN/None) are ignored.
    by : str, default 'sort_population'
        Annotation column whose values are compared against `groups`.

    Returns
    -------
    pandas.DataFrame
        All rows of `df` belonging to a sequence whose non-missing `by`
        values are exactly the labels in `groups`.
    """
    # Build the target once rather than once per sequence. A set comparison
    # also avoids sorting, which fails when string labels are mixed with NaN.
    wanted = {label for label in groups if not pd.isna(label)}
    return df.groupby('seq').filter(
        lambda rows: set(rows[by].dropna().unique()) == wanted
    )

In [72]:
# Only sequences seen in both populations should survive the filter.
filtered = filter_unshared_sequences(anno_seq, ['Interphase', 'G0'])
assert filtered.seq.unique().tolist() == ['ACCA', 'CAAC']

In [73]:
#| export
class AnnoSequencesCells(SequencesCells):
    """`SequencesCells` whose table can be annotated per cell and split by group.

    The methods read and replace `self.table`.
    NOTE(review): `self.table` is presumably set by `SequencesCells.__init__`
    from `df` - confirm against the base class.
    """

    def __init__(self, df=None):
        super().__init__(df=df)

    def add_cell_anno(self, path, compression='gzip', usecols=None, dropna=False):
        """Read a per-cell annotation CSV and inner-join it onto `self.table`.

        Parameters
        ----------
        path : str or path-like
            Location of the annotation CSV.
        compression : str, default 'gzip'
            Passed to ``pandas.read_csv``.
        usecols : list-like, optional
            Passed to ``pandas.read_csv``. It is applied before the header is
            lower-cased, so names must match the file's own header.
        dropna : bool, default False
            If true, drop every joined row that contains a missing value.

        Returns
        -------
        AnnoSequencesCells
            `self`, so calls can be chained.
        """
        anno_df = pd.read_csv(path, compression=compression, usecols=usecols)
        # Lower-case the header so the join key is found as 'cb' whatever
        # its case in the file.
        anno_df.columns = [name.lower() for name in anno_df.columns]
        self.table = join_df_anno(self.table, anno_df)
        if dropna:
            self.table = self.table.dropna()
        return self

    def split_cells(self, by='sort_population', keep_only_common=False):
        """Split `self.table` into one sub-table per value of column `by`.

        Parameters
        ----------
        by : str, default 'sort_population'
            Annotation column that defines the groups.
        keep_only_common : bool, default False
            If true, first restrict `self.table` to the sequences that occur
            in every group of `by` present in the table.

        Returns
        -------
        AnnoSequencesCells
            `self`; the group-to-sub-table dictionary is stored in
            `self.group`.
        """
        if keep_only_common:
            # Evaluate the filter on `by` itself so the filter and the split
            # always use the same column. Because the groups come from this
            # very table, a sequence is in every group exactly when its
            # number of distinct `by` values equals the table-wide number.
            # `nunique` ignores missing values.
            n_groups = self.table[by].nunique()
            groups_per_seq = self.table.groupby('seq')[by].transform('nunique')
            # Positional mask: the joined table may carry repeated index labels.
            self.table = self.table[(groups_per_seq == n_groups).to_numpy()]
        self.group = build_group_dictionary(self.table, by)
        return self
    
    

In [74]:
# Wrap the toy count table and display the table held by the object.
annseqcell = AnnoSequencesCells(df)
annseqcell.table 

Unnamed: 0,seq,cb,counts
0,ACCA,PLATE-01_CCCC,1
1,ACCA,PLATE-01_CCCA,1
2,CAAC,PLATE-01_CCCA,2
3,ACCA,PLATE-02_CCCC,1
4,CAAC,PLATE-02_CCCT,2
5,CCCA,PLATE-02_CCAA,3


In [75]:
# Join the annotation file written earlier in this notebook, dropping joined
# rows that contain a missing value, then display the annotated table.
annseqcell.add_cell_anno('./annotation.csv.gz', dropna=True)
annseqcell.table

Unnamed: 0,seq,cb,counts,units_mnase,sort_population
0,ACCA,PLATE-01_CCCC,1,1,G0
1,ACCA,PLATE-01_CCCA,1,2,G0
2,CAAC,PLATE-01_CCCA,2,2,G0
3,ACCA,PLATE-02_CCCC,1,1,Interphase
4,CAAC,PLATE-02_CCCT,2,2,Interphase
5,CCCA,PLATE-02_CCAA,3,1,G0


In [29]:
# Split by the default column, keeping only sequences present in every
# group; both populations must still be represented afterwards.
annseqcell.split_cells(keep_only_common=True)
assert sorted(annseqcell.group) == ['G0', 'Interphase']

In [31]:
# After the common-sequence filter the 'G0' group is expected to hold three rows.
assert len(annseqcell.group['G0']) == 3

In [37]:
# Selecting units_mnase > 1 is expected to give three entries, all equal to 2.
# NOTE(review): this indexes the AnnoSequencesCells object itself rather than
# `.table`; it relies on item access that is not defined in this notebook -
# presumably inherited from SequencesCells, confirm.
assert (annseqcell[annseqcell['units_mnase']> 1]['units_mnase'] == [2]*3).all()