# FastQ and SequencesCells class

> Read and create count table of cells

In [None]:
#| hide
# Run nbdev's export step for this notebook.
# NOTE(review): presumably this writes the `#| export` cells to the module
# named by `#| default_exp` — confirm against the nbdev documentation.
from nbdev import nbdev_export
nbdev_export()

In [None]:
#| default_exp preprocess

In [None]:
#| export
import numpy as np
import pandas as pd
import gzip
from Bio import SeqIO

# SeqCell class


Call the `parse_file(fastq.gz)` method to get a `SequencesCells` object whose `table` attribute holds a long-format dataframe:

|       | **plate** | **seq**   | **cb** | **counts** |
|-------|-----------|-----------|--------|------------|
| **0** |  RPMD1-01 |    CTCTGA |  GTNCB |          1 |
| **1** |  RPMD1-01 |    CTCTGA |  NTGCB |          2 |
| **2** |  RPMD1-01 | CTCTGAGGG |  GTNCB |          1 |

which can be transformed into a wide dataframe:

|                | **NTGCB** | **GTNCB** | **cell3** |
|----------------|-----------|-----------|-----------|
| **CTCTGA**     | 2         | 1         | ..        |
| **CTCTGAGGG**  | 0         | 1         | ..        |
| **sequence_3** | ..        | ..        | ..        |

You can then add another plate to a `SequencesCells` object with the `.join_plate()` method.


`RPFv4D-TEST-03_R1.trimmed.fastq.gz` contains 100 sequences, each repeated twice.

In [None]:
#| export

def extract_umi_cb(nm):
    """Pull the UMI and the cell barcode out of a read name.

    The read name is cut at the first space, then split on ``_``; the second
    field is taken as the cell barcode and the third as the UMI.

    Returns the pair ``(umi, cb)``. Raises ``ValueError`` when fewer than
    three ``_``-separated fields are present.
    """
    read_id, _, _ = nm.partition(' ')
    fields = read_id.split('_')
    cb, umi = fields[1:3]
    return umi, cb

In [None]:
# Example read name: the cell barcode and UMI are the 2nd and 3rd
# '_'-separated fields of the part before the first space.
nm = '@VH00225:8:AAAKNTKHV:1:1101:62862:1057_NTGCB_GGCAUMI 1:N:0:GTTTCG'
umi, cb = extract_umi_cb(nm)

# The function returns (umi, cb), in that order.
assert cb == 'NTGCB'
assert umi == 'GGCAUMI'

In [None]:
#| export

def parse_plate(path, plate_name):
    """Build the long-format count table for one gzipped FASTQ file.

    Every record contributes a ``(plate, seq, cb, umi)`` row, where ``cb`` and
    ``umi`` come from the record id via ``extract_umi_cb``. Exact duplicate
    rows are dropped, then the remaining rows are counted per
    ``(plate, seq, cb)``.

    Parameters
    ----------
    path : path of the gzip-compressed FASTQ file, opened in text mode.
    plate_name : value written in the ``plate`` column of every row.

    Returns
    -------
    DataFrame with columns ``plate``, ``seq``, ``cb`` and ``counts``, where
    ``counts`` is the number of distinct UMIs seen for that combination.
    """
    rows = []
    with gzip.open(path, "rt") as handle:
        for record in SeqIO.parse(handle, "fastq"):
            umi, cb = extract_umi_cb(str(record.id))
            rows.append((plate_name, str(record.seq), cb, umi))

    reads = pd.DataFrame(rows, columns=['plate', 'seq', 'cb','umi'])
    # Identical (plate, seq, cb, umi) rows collapse to a single row.
    unique_reads = reads.drop_duplicates().reset_index(drop=True)
    umi_counts = unique_reads.groupby(['plate','seq', 'cb']).count()
    return umi_counts.reset_index().rename(columns={'umi':'counts'})


In [None]:
# Parse the small test plate and check the shape of the resulting table.
seqcell_table = parse_plate('./test-RPMD1-01_R1.trimmed.fastq.gz', 'RPMD1-01')
print(list(seqcell_table.columns))
# The 'umi' column is renamed to 'counts' after counting.
assert list(seqcell_table.columns) == ['plate', 'seq', 'cb', 'counts']
assert list(seqcell_table['counts']) == [1,2,1]

['plate', 'seq', 'cb', 'counts']


In [None]:
seqcell_table

Unnamed: 0,plate,seq,cb,counts
0,RPMD1-01,CTCTGA,GTNCB,1
1,RPMD1-01,CTCTGA,NTGCB,2
2,RPMD1-01,CTCTGAGGG,GTNCB,1


In [None]:
#| export
class SequencesCells():
    """Long-format table of sequence counts per cell barcode, for one or more plates.

    The data lives in ``self.table``, a pandas DataFrame. The methods below
    rely on its ``plate`` and ``seq`` columns; ``parse_file`` fills it with the
    output of ``parse_plate``.
    """

    def __init__(self, df=None):
        """Wrap an existing table, or start empty (``table`` is ``None``)."""
        self.table = df

    def join_plate(self, sequences_cells):
        """Add the plates of another object, keeping only shared sequences.

        Plates already present in ``self.table`` are ignored. After the new
        plates are appended, only sequences observed in every plate (old and
        new) are kept. ``self.table`` is replaced in place.

        Parameters
        ----------
        sequences_cells : SequencesCells whose ``table`` provides the plates
            to add.

        Returns
        -------
        ``self``, to allow chaining.
        """
        addtable = sequences_cells.table
        inplates = self.table['plate'].unique()
        known_plates = set(inplates)
        plates_to_add = [p for p in addtable['plate'].unique() if p not in known_plates]
        if not plates_to_add:
            return self
        n_plates = len(inplates) + len(plates_to_add)
        other = addtable.set_index('plate').loc[plates_to_add, :].reset_index()
        cat_table = pd.concat([self.table, other])
        # Keep a sequence only if it was seen in every one of the plates.
        shared = cat_table.groupby('seq').filter(
            lambda x: len(x['plate'].unique()) == n_plates
        )
        self.table = shared.reset_index(drop=True)
        return self

    def select_plate(self, plate):
        """Return a new SequencesCells restricted to the given plate(s).

        Parameters
        ----------
        plate : a single plate label or a list-like of plate labels.

        Raises
        ------
        KeyError
            If a requested plate is not in the table.
        """
        # A list of labels makes ``.loc`` return a DataFrame even when a
        # single row matches (a scalar label would give a Series then).
        plates = list(plate) if pd.api.types.is_list_like(plate) else [plate]
        df = self.table.set_index('plate').loc[plates, :].reset_index()
        return SequencesCells(df)

    @staticmethod
    def _plate_name_from_path(fastq_file):
        """Derive the plate name from a file name like ``test-RPMD1-02_R1...``.

        Takes the file name up to the first ``_`` and drops its first
        ``-``-separated field, e.g. ``test-RPMD1-02_R1.fastq.gz`` -> ``RPMD1-02``.
        """
        import os

        # Work on the file name only, so that '_' or '-' in a directory name
        # cannot corrupt the result.
        stem = os.path.basename(fastq_file).split('_')[0]
        return '-'.join(stem.split('-')[1:])

    def parse_file(self, fastq_file, plate_name=None):
        """Parse a gzipped FASTQ file into ``self.table`` and return ``self``.

        Parameters
        ----------
        fastq_file : path of the file, passed on to ``parse_plate``.
        plate_name : label for the ``plate`` column; when ``None`` it is
            derived from the file name.
        """
        if plate_name is None:
            plate_name = self._plate_name_from_path(fastq_file)
        self.table = parse_plate(fastq_file, plate_name)
        return self

    def save_table(self, path, compression='gzip'):
        """Write ``self.table`` (index included) to a CSV file at ``path``."""
        self.table.to_csv(path, compression=compression)

    def read_csv(self, path, compression='gzip'):
        """Load ``self.table`` from a CSV file, using its first column as index.

        Returns ``self``.
        """
        self.table = pd.read_csv(path, index_col=0, compression=compression)
        return self

In [None]:
# Parse the second test plate; no plate_name is given, so it is derived
# from the file name.
fastq = SequencesCells()
# parse_file returns the object itself, so `plate2` and `fastq` are the same object.
plate2 = fastq.parse_file('./test-RPMD1-02_R1.trimmed.fastq.gz')
plate2.table

Unnamed: 0,plate,seq,cb,counts
0,RPMD1-02,CTCTGA,GTNCB,1
1,RPMD1-02,CTCTGA,NTGCB,2


In [None]:
# Parse the first test plate the same way.
sc = SequencesCells()
plate1 = sc.parse_file('./test-RPMD1-01_R1.trimmed.fastq.gz')
plate1.table

Unnamed: 0,plate,seq,cb,counts
0,RPMD1-01,CTCTGA,GTNCB,1
1,RPMD1-01,CTCTGA,NTGCB,2
2,RPMD1-01,CTCTGAGGG,GTNCB,1


In [None]:
# Manual version of the shared-sequence filter: stack both plates and keep
# only the sequences that appear in all `n_plates` plates.
n_plates = 2

seq_grouped = pd.concat([plate1.table, plate2.table]).groupby('seq')
df =seq_grouped.filter(lambda x: len(x['plate'].unique()) == n_plates).reset_index(drop=True)
df

Unnamed: 0,plate,seq,cb,counts
0,RPMD1-01,CTCTGA,GTNCB,1
1,RPMD1-01,CTCTGA,NTGCB,2
2,RPMD1-02,CTCTGA,GTNCB,1
3,RPMD1-02,CTCTGA,NTGCB,2


In [None]:
# join_plate replaces plate1.table in place and returns plate1 itself,
# so `plates` and `plate1` refer to the same object afterwards.
plates = plate1.join_plate(plate2)

# Only the sequence present in both plates remains.
assert list(plates.table.plate.unique()) == ['RPMD1-01', 'RPMD1-02']
assert list(plates.table.seq) == ['CTCTGA']*4

2


In [None]:
plates.table

Unnamed: 0,plate,seq,cb,counts
0,RPMD1-01,CTCTGAGGG,GTNCB,1


In [None]:
plates.table

Unnamed: 0,plate,seq,cb,counts
0,RPMD1-01,CTCTGA,GTNCB,1
1,RPMD1-01,CTCTGA,NTGCB,2
2,RPMD1-02,CTCTGA,GTNCB,1
3,RPMD1-02,CTCTGA,NTGCB,2


In [None]:
#| export

def concat_plates(sequences_cells, n_plates=None):
    """Stack several SequencesCells tables and keep only widely shared sequences.

    Parameters
    ----------
    sequences_cells : iterable of objects exposing a ``table`` DataFrame with
        ``plate`` and ``seq`` columns.
    n_plates : number of distinct plates a sequence must appear in to be
        kept. When ``None`` it defaults to the number of distinct plates in
        the stacked table, i.e. a sequence must be present in every plate.

    Returns
    -------
    A new ``SequencesCells`` holding the filtered table with a fresh index.
    """
    cat_table = pd.concat([sc.table for sc in sequences_cells])
    if n_plates is None:
        # Count plates, not objects: one object may already hold several
        # plates, and two objects may carry the same plate.
        n_plates = len(cat_table['plate'].unique())
    seq_grouped = cat_table.groupby('seq')
    table = seq_grouped.filter(
        lambda x: len(x['plate'].unique()) == n_plates
    ).reset_index(drop=True)
    return SequencesCells(df=table)

In [None]:
# Combine the plates with concat_plates and check that only the sequence
# shared by both plates remains.
plates = concat_plates([plate1,plate2])

assert list(plates.table.plate.unique()) == ['RPMD1-01', 'RPMD1-02']
assert list(plates.table.seq) == ['CTCTGA']*4

In [None]:
plates.table

Unnamed: 0,plate,seq,cb,counts
0,RPMD1-01,CTCTGA,GTNCB,1
1,RPMD1-01,CTCTGA,NTGCB,2
2,RPMD1-02,CTCTGA,GTNCB,1
3,RPMD1-02,CTCTGA,NTGCB,2


In [None]:
# Write the combined table to a CSV file (save_table compresses with gzip by default).
plates.save_table('all_plates.csv.gz')

In [None]:
# Load the saved table back into a fresh object; read_csv returns the object itself.
results = SequencesCells().read_csv('all_plates.csv.gz')