# AnnoSeqCell class

> Read a sequence-by-cell count table, add per-cell annotations, and split the cells into annotation groups

In [None]:
#| hide
from nbdev import nbdev_export
nbdev_export()

In [None]:
#| default_exp annoseqcell

In [None]:
#| export
import numpy as np
import pandas as pd
import gzip
from digestion_rates.preprocess import SequencesCells
from pathlib import Path
from multiprocessing import Pool

# AnnoSeqCell class





## Test data

In [None]:
# Toy count table: each row is (sequence, cell barcode, read count).
# Barcodes are '<plate>_<cell>' strings spread over two plates.
data = [
    ('ACCA', 'PLATE-01_CCCC', 1),
    ('ACCA', 'PLATE-01_CCCA', 1),
    ('CAAC', 'PLATE-01_CCCA', 2),
    ('ACCA', 'PLATE-02_CCCC', 1),
    ('CAAC', 'PLATE-02_CCCT', 2),
    ('CCCA', 'PLATE-02_CCAA', 3),
]
df = pd.DataFrame(data=data, columns=['seq', 'cb', 'counts'])
df

Unnamed: 0,seq,cb,counts
0,ACCA,PLATE-01_CCCC,1
1,ACCA,PLATE-01_CCCA,1
2,CAAC,PLATE-01_CCCA,2
3,ACCA,PLATE-02_CCCC,1
4,CAAC,PLATE-02_CCCT,2
5,CCCA,PLATE-02_CCAA,3


In [None]:
# Toy per-cell annotation: each row is (cell barcode, units_mnase, sort_population).
data = [
    ('PLATE-01_CCCC', 1, 'G0'),
    ('PLATE-01_CCCA', 2, 'G0'),
    ('PLATE-02_CCCC', 1, 'Interphase'),
    ('PLATE-02_CCCT', 2, 'Interphase'),
    ('PLATE-02_CCAA', 1, 'G0'),
]
anno_df = pd.DataFrame(data=data, columns=['cb', 'units_mnase', 'sort_population'])
# Persist it as a gzipped CSV (without the index) so it can be read back from disk later.
anno_df.to_csv('annotation.csv.gz', index=False, compression='gzip')

In [None]:
# Wrap the toy count table in the base container class imported from digestion_rates.preprocess.
seqcell = SequencesCells(df)

In [None]:
seqcell.table

Unnamed: 0,seq,cb,counts
0,ACCA,PLATE-01_CCCC,1
1,ACCA,PLATE-01_CCCA,1
2,CAAC,PLATE-01_CCCA,2
3,ACCA,PLATE-02_CCCC,1
4,CAAC,PLATE-02_CCCT,2
5,CCCA,PLATE-02_CCAA,3


## AnnoSequencesCells

In [None]:
def build_group_dictionary(anno_seq, group):
    """Split an annotated table into one sub-table per group label.

    Parameters
    ----------
    anno_seq : pandas.DataFrame
        Table that contains the column(s) named by ``group``.
    group : str
        Column name (or anything accepted by ``DataFrame.groupby``) to split on.

    Returns
    -------
    dict
        Maps each group label to the rows of ``anno_seq`` carrying that label,
        in the order produced by ``DataFrame.groupby``.
    """
    # A dict comprehension builds the mapping directly instead of abusing a
    # list comprehension for its ``dict.update`` side effect.
    return {label: rows for label, rows in anno_seq.groupby(group)}

In [None]:
# Split the annotated test table by sort population and check both the keys and one group.
# NOTE: `anno_seq` is created by the `join_df_anno` test cell further down the notebook.
dz = build_group_dictionary(anno_seq, 'sort_population')
assert list(dz) == ['G0', 'Interphase']
assert dz['G0'].sort_population.tolist() == ['G0'] * 4
    

In [None]:
def join_df_anno(df, anno_df):
    """Attach per-cell annotation columns to a count table.

    Both frames must have a ``'cb'`` column. The annotation is re-indexed by
    ``'cb'`` and joined onto ``df`` with ``DataFrame.join``, so every row of
    ``df`` is kept and rows whose barcode is missing from ``anno_df`` get NaN
    in the annotation columns.

    Returns a new DataFrame; neither input is modified.
    """
    anno_by_barcode = anno_df.set_index('cb')
    return df.join(anno_by_barcode, on='cb')

In [None]:
# Annotate the toy count table: 6 rows are kept, 3 count columns + 2 annotation columns.
anno_seq = join_df_anno(df, anno_df)
assert anno_seq.shape == (6, 5)


In [None]:
def filter_unshared_sequences(df, groups, by='sort_population'):
    """Keep only the sequences that occur in every one of ``groups``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'seq'`` column and the column named by ``by``.
    groups : list
        Group labels a sequence must cover. A sequence is kept only when the
        sorted unique labels found on its rows equal ``sorted(groups)``.
    by : str, default 'sort_population'
        Column holding the group labels.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` belonging to the retained sequences, in their
        original order and with their original index.
    """
    # Sort the reference labels once instead of once per sequence.
    wanted = sorted(groups)
    return df.groupby('seq').filter(
        lambda rows: sorted(rows[by].unique()) == wanted
    )
    
    
# 'CCCA' only occurs in G0 cells, so it must be dropped; the other two sequences are shared.
filtered = filter_unshared_sequences(anno_seq, ['Interphase', 'G0'])
assert filtered.seq.unique().tolist() == ['ACCA', 'CAAC']

In [None]:
anno_seq

Unnamed: 0,seq,cb,counts,units_mnase,sort_population
0,ACCA,PLATE-01_CCCC,1,1,G0
1,ACCA,PLATE-01_CCCA,1,2,G0
2,CAAC,PLATE-01_CCCA,2,2,G0
3,ACCA,PLATE-02_CCCC,1,1,Interphase
4,CAAC,PLATE-02_CCCT,2,2,Interphase
5,CCCA,PLATE-02_CCAA,3,1,G0


In [None]:
#| export
class AnnoSequencesCells(SequencesCells):
    """Sequence-by-cell count table with per-cell annotations.

    Extends ``SequencesCells`` with two chainable steps: joining a per-cell
    annotation file onto ``self.table`` and splitting the table into one
    sub-table per annotation group (stored in ``self.group``).

    The methods are implemented directly on the frame so that the exported
    module does not depend on helper functions that live in non-exported
    notebook cells.
    """

    def __init__(self, df=None):
        """Forward ``df`` to the ``SequencesCells`` constructor."""
        super().__init__(df=df)

    def add_cell_anno(self, path, compression='gzip', usecols=None):
        """Read a per-cell annotation CSV and join it onto ``self.table``.

        Parameters
        ----------
        path : str or path-like
            Location of the annotation CSV.
        compression : str, default 'gzip'
            Passed through to ``pandas.read_csv``.
        usecols : list, optional
            Columns to read, named as they appear in the file (before
            lower-casing). Passed through to ``pandas.read_csv``.

        Returns
        -------
        AnnoSequencesCells
            ``self``, to allow method chaining.

        Notes
        -----
        Column names are lower-cased after reading, and the lower-cased names
        must include ``'cb'``, which is the join key. Every row of
        ``self.table`` is kept.
        """
        anno_df = pd.read_csv(path, compression=compression, usecols=usecols)
        anno_df.columns = [name.lower() for name in anno_df.columns]
        self.table = self.table.join(anno_df.set_index('cb'), on='cb')
        return self

    def split_cells(self, by='sort_population', keep_only_common=False):
        """Split ``self.table`` into one sub-table per value of ``by``.

        Parameters
        ----------
        by : str, default 'sort_population'
            Annotation column whose values define the groups.
        keep_only_common : bool, default False
            When true, ``self.table`` is first reduced (in place on the
            instance) to the sequences observed in every group of ``by``.

        Returns
        -------
        AnnoSequencesCells
            ``self``; the mapping ``{group label: sub-table}`` is stored in
            ``self.group``.
        """
        if keep_only_common:
            # A sequence is shared when it covers as many distinct labels of
            # `by` as the whole table has. Counting on `by` itself (rather
            # than a fixed column) makes the filter follow the caller's choice.
            n_groups = self.table[by].nunique()
            labels_per_seq = self.table.groupby('seq')[by].transform('nunique')
            self.table = self.table[labels_per_seq == n_groups]
        self.group = {label: rows for label, rows in self.table.groupby(by)}
        return self
        
    

In [None]:
# Build the annotated container from the toy count table and display its table.
annseqcell = AnnoSequencesCells(df)
annseqcell.table

Unnamed: 0,seq,cb,counts
0,ACCA,PLATE-01_CCCC,1
1,ACCA,PLATE-01_CCCA,1
2,CAAC,PLATE-01_CCCA,2
3,ACCA,PLATE-02_CCCC,1
4,CAAC,PLATE-02_CCCT,2
5,CCCA,PLATE-02_CCAA,3


In [None]:
# Read the gzipped annotation CSV written earlier and join it onto the table by 'cb'.
annseqcell.add_cell_anno('./annotation.csv.gz')
annseqcell.table

Unnamed: 0,seq,cb,counts,units_mnase,sort_population
0,ACCA,PLATE-01_CCCC,1,1,G0
1,ACCA,PLATE-01_CCCA,1,2,G0
2,CAAC,PLATE-01_CCCA,2,2,G0
3,ACCA,PLATE-02_CCCC,1,1,Interphase
4,CAAC,PLATE-02_CCCT,2,2,Interphase
5,CCCA,PLATE-02_CCAA,3,1,G0


In [None]:
# Drop sequences that are not present in every sort population, then split the
# table per population; the resulting sub-tables are stored in `annseqcell.group`.
annseqcell.split_cells(keep_only_common=True)
annseqcell.group.keys()

dict_keys(['G0', 'Interphase'])

In [None]:
annseqcell.group['Interphase']

Unnamed: 0,seq,cb,counts,units_mnase,sort_population
3,ACCA,PLATE-02_CCCC,1,1,Interphase
4,CAAC,PLATE-02_CCCT,2,2,Interphase


In [None]:
# Load the real per-cell annotation, keeping only the barcode and the two
# annotation columns, then lower-case the column names ('CB' -> 'cb').
# NOTE: this rebinds `df`, which held the toy count table in earlier cells.
df = pd.read_csv(
    '../RPMD1/meta_data/cell_annotations.csv.gz',
    compression='gzip',
    usecols=['CB', 'units_mnase', 'sort_population'],
)
df.columns = [name.lower() for name in df.columns]
df

Unnamed: 0,cb,units_mnase,sort_population
0,RPMD1-03_AACAGCAATG,25.000000,Interphase
1,RPMD1-03_AACCACGTCA,0.390625,Interphase
2,RPMD1-03_AACCGTAACA,0.048828,Interphase
3,RPMD1-03_AACCTAGACG,0.000000,Interphase
4,RPMD1-03_AACGCGGTAG,0.000000,Interphase
...,...,...,...
6126,RPMD1-18_TTGGTGTGTC,0.048828,G0
6127,RPMD1-18_TTGTCTCTAC,0.012207,Mit
6128,RPMD1-18_TTGTGCTTGG,0.000000,Mit
6129,RPMD1-18_TTGTTCGTGT,25.000000,Interphase
