# FastQ and SequencesCells class

> Read and create count table of cells

In [36]:
#| hide
# Run nbdev's export step; the `#| hide` directive above keeps this cell out of the rendered docs.
from nbdev import nbdev_export
nbdev_export()

In [37]:
#| default_exp fastq_reader

In [38]:
#| export
import gzip
import os

import numpy as np
import pandas as pd

# FastQ class

Initialize it with the path to a gzip-compressed FASTQ file (and, optionally, a plate name).

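Calling the `parse_file()` method returns a `SequencesCells` object whose `table` attribute holds a dataframe with one row per sequence, one column per cell barcode, and the number of distinct UMIs as values: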

|                | **NTGCB** | **GTNCB** | **cell3** |
|----------------|-----------|-----------|-----------|
| **CTCTGA**     | 2         | 1         | ..        |
| **CTCTGAGGG**  | 0         | 1         | ..        |
| **sequence_3** | ..        | ..        | ..        |

You can then add another plate to the `SequencesCells` object with its `.join_plate()` method.


In [39]:
#| export

def extract_umi_cb(nm):
    """Return the `(cell_barcode, umi)` pair encoded in a FASTQ header line.

    Only the text before the first space is considered. That text is split on
    '_' and its second and third fields are returned, in that order.

    Raises
    ------
    ValueError
        If fewer than two fields follow the first '_'-separated field.
    """
    read_id, _, _ = nm.partition(' ')
    fields = read_id.split('_')
    # Unpacking enforces that both the barcode and the UMI field are present.
    cell_barcode, molecule_tag = fields[1:3]
    return cell_barcode, molecule_tag

In [40]:
# Example header line: the cell barcode and the UMI are the 2nd and 3rd
# '_'-separated fields of the part that precedes the first space.
nm = '@VH00225:8:AAAKNTKHV:1:1101:62862:1057_NTGCB_GGCAUMI 1:N:0:GTTTCG'
cb, umi = extract_umi_cb(nm)
assert cb == 'NTGCB'
assert umi == 'GGCAUMI'

## Extract dictionary & count unique sequences

`extract_dictionary` is a function that builds a dictionary of the form `{'plate_cell_barcode': {'sequence': [list_of_associated_UMI]}}`.

This form is easy to convert to a pandas dataframe, and applying `set` to each list makes it possible to count the unique identifiers (UMIs).



In [41]:
#| export

def extract_dictionary(path, plate_name):
    """Collect the UMIs seen for every (cell barcode, sequence) pair of a gzipped FASTQ file.

    The file is read as consecutive four-line records. The first line of each
    record is the header, parsed with `extract_umi_cb`; the second line is the
    read sequence. The remaining two lines of a record are ignored.

    Parameters
    ----------
    path
        Path of the gzip-compressed FASTQ file; it is opened in text mode.
    plate_name : str
        Prefix prepended, followed by '_', to every cell barcode.

    Returns
    -------
    dict
        `{plate_name + '_' + cell_barcode: {sequence: [umi, ...]}}`. One UMI is
        appended per read, so repeated UMIs are kept in the lists.

    Raises
    ------
    IndexError
        If the file ends right after a header line, i.e. the last record has
        no sequence line.
    """
    fastq_dict = {}
    header = None
    with gzip.open(path, 'rt') as f:
        # Stream the file line by line rather than materialising it with
        # readlines(): only the current record has to be held in memory.
        for i, line in enumerate(f):
            index_seq = i % 4
            if index_seq == 0:
                header = line.strip()
            elif index_seq == 1:
                sequence = line.strip()
                cb, umi = extract_umi_cb(header)
                cb = plate_name + '_' + cb
                fastq_dict.setdefault(cb, {}).setdefault(sequence, []).append(umi)
                header = None
    if header is not None:
        # A header without its sequence line: keep the IndexError that indexing
        # past the end of the line list used to raise, but say what went wrong.
        raise IndexError('FASTQ file ends after a header line; the last record is truncated')
    return fastq_dict


In [42]:
# Parse the bundled test file (path is relative to the working directory) and
# check which cell barcodes (columns) and sequences (rows) were found.
fastq_dict = extract_dictionary('./test-RPMD1-01_R1.trimmed.fastq.gz', 'RPMD1-01')

assert list(pd.DataFrame(fastq_dict).columns) == ['RPMD1-01_NTGCB', 'RPMD1-01_GTNCB']
assert list(pd.DataFrame(fastq_dict).index) == ['CTCTGA', 'CTCTGAGGG']

In [43]:
# Show the raw nested dictionary; repeated UMIs are still present at this stage.
fastq_dict

{'RPMD1-01_NTGCB': {'CTCTGA': ['GGCAUMI', 'ACGGUMI', 'GGCAUMI', 'GGCAUMI']},
 'RPMD1-01_GTNCB': {'CTCTGA': ['GGCAUMI'], 'CTCTGAGGG': ['ACGGUMI']}}

In [44]:
#| export
def count_unique_sequences(dz):
    """Turn a nested `{column: {row: [items]}}` dictionary into a table of distinct-item counts.

    Parameters
    ----------
    dz : dict
        Anything accepted by `pd.DataFrame`; here the output of
        `extract_dictionary`, whose innermost values are lists of UMIs.

    Returns
    -------
    pandas.DataFrame
        Same rows and columns as `pd.DataFrame(dz)`. Every cell holding a list
        is replaced by the number of distinct elements of that list; every
        other cell (for example the NaN of a missing row/column combination)
        becomes 0.
    """
    df = pd.DataFrame(dz)

    def n_distinct(cell):
        return len(set(cell)) if isinstance(cell, list) else 0

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Check the class, not the instance, so that a column that
    # happens to be called 'map' cannot be mistaken for the method.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    return elementwise(n_distinct)

In [45]:
# Count table for the test plate: distinct UMIs per sequence (rows) and cell (columns).
df = count_unique_sequences(fastq_dict)
df

Unnamed: 0,RPMD1-01_NTGCB,RPMD1-01_GTNCB
CTCTGA,2,1
CTCTGAGGG,0,1


In [46]:
# np.array_equal also returns False (instead of erroring) when the shapes differ.
assert np.array_equal(count_unique_sequences(fastq_dict).values, np.array([[2, 1], [0, 1]]))

In [47]:
#| export
class SequencesCells():
    """A count table together with the list of plates whose cells it contains.

    Column names are expected to contain the plate name (`extract_dictionary`
    builds them as `<plate>_<cell barcode>`); plates are matched against
    column names by substring.
    """

    def __init__(self, df, plate):
        """Store `df` (not copied) and `plate`, which may be one name or a list of names."""
        # table: the dataframe exactly as passed in.
        self.table = df
        # plates: always a list, even when a single plate name was given.
        self.plates = plate if isinstance(plate, list) else [plate]

    def join_plate(self, sequences_cells):
        """Add the plates of `sequences_cells` that this object does not have yet.

        The columns of the other table whose name contains one of the new
        plate names are appended to `self.table`, and rows with a missing
        value after the join are dropped. The object is modified in place.

        Returns
        -------
        SequencesCells
            `self`, to allow chaining. Nothing changes when every plate of
            `sequences_cells` is already present.
        """
        addtable = sequences_cells.table
        plates_to_add = [p for p in sequences_cells.plates if p not in self.plates]
        if not plates_to_add:
            return self
        # Boolean mask over the other table's columns: True when the column
        # name contains any of the plates being added.
        select_cells = np.array(
            [any(plate in name_cell for plate in plates_to_add)
             for name_cell in addtable.columns],
            dtype=bool,
        )
        other = addtable.loc[:, select_cells]
        # Column-wise concat aligns on the row index; dropna() then keeps only
        # the rows that have a value in every column.
        self.table = pd.concat([self.table, other], axis=1).dropna()
        self.plates = self.plates + plates_to_add
        return self

    def select_plate(self, plate):
        """Return a new `SequencesCells` limited to the columns whose name contains `plate`."""
        # Use this object's own table; the previous code read a module-level
        # `plates` variable that only exists in the notebook session.
        matching = [col for col in self.table.columns if plate in col]
        df = self.table.loc[:, matching]
        return SequencesCells(df, plate)

In [48]:
#| export
class FastQ():
    """A gzip-compressed FASTQ file that can be parsed into a `SequencesCells` count table."""

    def __init__(self, fastq_file, plate_name=None):
        """Remember the file path and, optionally, the plate name.

        Parameters
        ----------
        fastq_file
            Path of the gzip-compressed FASTQ file.
        plate_name : str, optional
            Name used as prefix for the cell barcodes. When omitted it is
            derived from the file name by `parse_file`.
        """
        self.fastq_file = fastq_file
        self.plate_name = plate_name

    def parse_file(self):
        """Read the file and return its count table as a `SequencesCells`.

        When no plate name was given, it is taken from the file name: the part
        before the first '_' is split on '-' and everything after the first
        piece is kept, so `test-RPMD1-01_R1.trimmed.fastq.gz` gives
        `RPMD1-01`. A file name without '-' before its first '_' yields an
        empty plate name. The derived name is stored in `self.plate_name`.
        """
        if self.plate_name is None:
            # Work on the file name only: '_' or '-' in a directory name must
            # not leak into (or wipe out) the plate name.
            file_name = os.path.basename(self.fastq_file)
            self.plate_name = '-'.join(file_name.split('_')[0].split('-')[1:])

        fastq = extract_dictionary(self.fastq_file, self.plate_name)
        df_fastq = count_unique_sequences(fastq)
        return SequencesCells(df_fastq, self.plate_name)

            

In [49]:
#| export

def concatenate_tables(list_of_seqcells):
    """Merge several `SequencesCells` objects into a single new one.

    The tables are concatenated column-wise and rows with a missing value in
    any column are dropped. The plate lists are chained in the order given.

    Raises
    ------
    ValueError
        If the same plate appears more than once across the inputs.
    """
    all_plates = []
    for seqcells in list_of_seqcells:
        all_plates.extend(seqcells.plates)

    # Validate before touching any table.
    if len(all_plates) > len(set(all_plates)):
        raise ValueError('Repeated plates in the passed list')

    merged = pd.concat([seqcells.table for seqcells in list_of_seqcells], axis=1)
    return SequencesCells(merged.dropna(), all_plates)
    

In [50]:
# No plate_name is passed, so parse_file() derives it from the path ('RPMD1-01').
fastq = FastQ('./test-RPMD1-01_R1.trimmed.fastq.gz')
plate1 = fastq.parse_file()
plate1.table

Unnamed: 0,RPMD1-01_NTGCB,RPMD1-01_GTNCB
CTCTGA,2,1
CTCTGAGGG,0,1


In [51]:
# Second test plate; note that `fastq` is rebound to a new FastQ object here.
fastq = FastQ('./test-RPMD1-02_R1.trimmed.fastq.gz')
plate2 = fastq.parse_file()
plate2.table

Unnamed: 0,RPMD1-02_NTGCB,RPMD1-02_GTNCB
CTCTGA,2,1


In [52]:
# Only the sequences present in both plates survive the concatenation.
cat_plates = concatenate_tables([plate1, plate2])
assert cat_plates.plates == ['RPMD1-01', 'RPMD1-02']
# Compare as a list: `Index == list` is elementwise and only works as an
# assert condition when the index has exactly one element.
assert list(cat_plates.table.index) == ['CTCTGA']

In [53]:
# join_plate() modifies plate1 in place and returns it, so `plates` is plate1.
plates = plate1.join_plate(plate2)

assert plates.plates == ['RPMD1-01', 'RPMD1-02']
# Compare as a list: `Index == list` is elementwise and only works as an
# assert condition when the index has exactly one element.
assert list(plates.table.index) == ['CTCTGA']