# FastQ and SequencesCells class

> Read and create count table of cells

In [36]:
#| hide
# Run nbdev's notebook export step (see the nbdev documentation for what it
# writes); this cell is tagged `#| hide` above.
from nbdev import nbdev_export
nbdev_export()

In [37]:
#| default_exp fastq_reader

In [3]:
#| export
import gzip
import os

import numpy as np
import pandas as pd
from Bio import SeqIO

# FastQ class

Initialize it with the path to a gzip-compressed FASTQ file.

Call the `parse_file()` method to get a `SequencesCells` object whose `.table` attribute holds a count dataframe (sequences as rows, cells as columns):

|                | **NTGCB** | **GTNCB** | **cell3** |
|----------------|-----------|-----------|-----------|
| **CTCTGA**     | 2         | 1         | ..        |
| **CTCTGAGGG**  | 0         | 1         | ..        |
| **sequence_3** | ..        | ..        | ..        |

You can then add another plate to a `SequencesCells` object with its `.join_plate()` method.


`RPFv4D-TEST-03_R1.trimmed.fastq.gz` contains 100 sequences all repeated 2 times

In [5]:
#| export

def extract_umi_cb(nm):
    """Pull the UMI and the cell barcode out of a FASTQ read name.

    The name is cut at the first space; what remains is split on underscores
    and its second and third fields are taken as the cell barcode and the UMI.

    Returns the pair `(umi, cb)`. Raises `ValueError` when the name has fewer
    than three underscore-separated fields.
    """
    read_id = nm.split(' ')[0]
    fields = read_id.split('_')
    cell_barcode, molecule_id = fields[1:3]
    return molecule_id, cell_barcode

In [6]:
nm = '@VH00225:8:AAAKNTKHV:1:1101:62862:1057_NTGCB_GGCAUMI 1:N:0:GTTTCG'
umi, cb = extract_umi_cb(nm)

# Both fields are checked together against the values embedded in the read name.
assert (umi, cb) == ('GGCAUMI', 'NTGCB')

## extract dictionary & count unique sequences

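`extract_dictionary` is a function that builds a dictionary of the form `{'plate_cell_barcode': {'sequence': count_number}}`. A read is counted only the first time its (UMI, cell barcode, sequence) combination is seen, so exact duplicates are ignored.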



In [8]:
#| export

def extract_dictionary(path, plate_name):
    """Count de-duplicated reads per cell and sequence in a gzipped FASTQ file.

    Every record id is parsed with `extract_umi_cb` to obtain its UMI and cell
    barcode. A read contributes to the counts only the first time its
    `(umi, cell barcode, sequence)` combination is encountered; later repeats
    of the same combination are skipped.

    Parameters
    ----------
    path
        Path of the gzip-compressed FASTQ file (opened in text mode).
    plate_name : str
        Prefix prepended to every cell barcode as `plate_name + '_' + cb`.

    Returns
    -------
    dict
        `{'<plate_name>_<cell barcode>': {sequence: count}}`, with keys in
        first-seen order.
    """
    # A set keeps the "already seen?" test constant-time per read, so the
    # whole pass stays linear in the number of records.
    seen = set()
    fastq_dict = {}
    with gzip.open(path, "rt") as handle:
        for record in SeqIO.parse(handle, "fastq"):
            umi, cb = extract_umi_cb(str(record.id))
            sequence = str(record.seq)
            uniq_id = (umi, cb, sequence)
            if uniq_id in seen:
                continue
            seen.add(uniq_id)
            # Create the per-cell counter on first sight, then bump the count.
            cell_counts = fastq_dict.setdefault(plate_name + '_' + cb, {})
            cell_counts[sequence] = cell_counts.get(sequence, 0) + 1
    return fastq_dict


In [9]:
fastq_dict = extract_dictionary('./test-RPMD1-01_R1.trimmed.fastq.gz', 'RPMD1-01')

# Outer keys become the columns (cells), inner keys become the rows (sequences).
assert pd.DataFrame(fastq_dict).columns.tolist() == ['RPMD1-01_NTGCB', 'RPMD1-01_GTNCB']
assert pd.DataFrame(fastq_dict).index.tolist() == ['CTCTGA', 'CTCTGAGGG']

In [10]:
# Display the nested {cell: {sequence: count}} dictionary.
fastq_dict

{'RPMD1-01_NTGCB': {'CTCTGA': 2},
 'RPMD1-01_GTNCB': {'CTCTGA': 1, 'CTCTGAGGG': 1}}

In [12]:
#| export
class SequencesCells():
    """A count table together with the names of the plates it covers.

    `table` is the DataFrame passed in; its columns are expected to be named
    `<plate>_<cell barcode>` (the form produced by `extract_dictionary`).
    `plates` is always stored as a list of plate names.
    """
    def __init__(self, df, plate):
        """Store `df` as `table` and `plate` (a name or a list of names) as `plates`."""
        self.table = df
        self.plates = plate if isinstance(plate, list) else [plate]

    @staticmethod
    def _column_in_plate(column, plate):
        """Return True when `column` is named `<plate>_...`.

        A prefix test is used rather than a substring test so that plate
        `P1` does not also pick up the columns of plate `P11`.
        """
        return column.startswith(plate + '_')

    def join_plate(self, sequences_cells):
        """Add the plates of `sequences_cells` that this object does not hold yet.

        Only columns belonging to the new plates are appended; after the
        column-wise concatenation, rows containing any missing value are
        dropped. The object is modified in place and returned. When there is
        no new plate the object is returned unchanged.
        """
        plates_to_add = [p for p in sequences_cells.plates if p not in self.plates]
        if not plates_to_add:
            return self
        addtable = sequences_cells.table
        select_cells = [any(self._column_in_plate(name_cell, plate) for plate in plates_to_add)
                        for name_cell in addtable.columns]
        other = addtable.loc[:, select_cells]
        self.table = pd.concat([self.table, other], axis=1).dropna()
        self.plates = self.plates + plates_to_add
        return self

    def select_plate(self, plate):
        """Return a new `SequencesCells` restricted to the columns of `plate`."""
        # Select from this object's own table, not from any notebook global.
        select_cells = [self._column_in_plate(col, plate) for col in self.table.columns]
        df = self.table.loc[:, select_cells]
        return SequencesCells(df, plate)

In [13]:
#| export
class FastQ():
    """A gzip-compressed FASTQ file for one plate.

    `fastq_file` is the path of the file; `plate_name` is the plate label, or
    `None` to have it derived from the file name when `parse_file` runs.
    """
    def __init__(self, fastq_file, plate_name=None):
        self.fastq_file = fastq_file
        self.plate_name = plate_name

    @staticmethod
    def _infer_plate_name(fastq_file):
        """Derive the plate name from the file name of `fastq_file`.

        The file name (directories stripped) is cut at its first underscore,
        the part before it is split on '-', the first piece is dropped and
        the rest is re-joined with '-'; e.g. `test-RPMD1-01_R1.trimmed.fastq.gz`
        gives `RPMD1-01`. Only the base name is used so that '_' or '-' in a
        directory name cannot leak into the plate name.
        """
        file_name = os.path.basename(os.fspath(fastq_file))
        return '-'.join(file_name.split('_')[0].split('-')[1:])

    def parse_file(self):
        """Parse the file and return its counts as a `SequencesCells`.

        If no plate name was given, it is inferred from the file name and
        stored on the instance. Cell/sequence combinations that were never
        observed are filled with 0.
        """
        if self.plate_name is None:
            self.plate_name = self._infer_plate_name(self.fastq_file)

        fastq = extract_dictionary(self.fastq_file, self.plate_name)
        df_fastq = pd.DataFrame(fastq).fillna(0)
        return SequencesCells(df_fastq, self.plate_name)

            

In [14]:
#| export

def concatenate_tables(list_of_seqcells):
    """Merge several `SequencesCells` objects into a single one.

    The tables are concatenated column-wise and rows containing any missing
    value are dropped. The plate lists are chained in the order given.

    Raises `ValueError` if the same plate name appears more than once across
    the inputs.
    """
    all_plates = []
    for seqcells in list_of_seqcells:
        all_plates.extend(seqcells.plates)

    # Any duplicate shrinks the set, so the sizes differ exactly when a plate repeats.
    if len(all_plates) != len(set(all_plates)):
        raise ValueError('Repeated plates in the passed list')

    merged = pd.concat([seqcells.table for seqcells in list_of_seqcells], axis=1)
    return SequencesCells(merged.dropna(), all_plates)
    

In [15]:
# No plate name is passed, so parse_file() derives it from the file name.
fastq = FastQ('./test-RPMD1-01_R1.trimmed.fastq.gz')
plate1 = fastq.parse_file()
plate1.table

Unnamed: 0,RPMD1-01_NTGCB,RPMD1-01_GTNCB
CTCTGA,2.0,1
CTCTGAGGG,0.0,1


In [17]:
# Second plate; note that `fastq` is rebound to the new file here.
fastq = FastQ('./test-RPMD1-02_R1.trimmed.fastq.gz')
plate2 = fastq.parse_file()
plate2.table

Unnamed: 0,RPMD1-02_NTGCB,RPMD1-02_GTNCB
CTCTGA,2,1


In [18]:
cat_plates = concatenate_tables([plate1, plate2])
assert cat_plates.plates == ['RPMD1-01', 'RPMD1-02']
# Compare as a list: `Index == list` is element-wise and raises on a length
# mismatch instead of failing the assertion cleanly.
assert list(cat_plates.table.index) == ['CTCTGA']

In [20]:
# NOTE: join_plate() updates `plate1` in place and returns it, so `plates`
# and `plate1` refer to the same object after this cell.
plates = plate1.join_plate(plate2)

assert plates.plates == ['RPMD1-01', 'RPMD1-02']
# Compare as a list: `Index == list` is element-wise and raises on a length
# mismatch instead of failing the assertion cleanly.
assert list(plates.table.index) == ['CTCTGA']

In [21]:
# Display the count table after joining the two plates.
plates.table

Unnamed: 0,RPMD1-01_NTGCB,RPMD1-01_GTNCB,RPMD1-02_NTGCB,RPMD1-02_GTNCB
CTCTGA,2.0,1,2.0,1.0
