# FastQ and SequencesCells class

> Read a FASTQ file and build a count table of sequences per cell

In [8]:
#| hide
# NOTE(review): this import fails with the ImportError shown in this cell's output:
# the installed nbdev does not expose `notebook2script`. Presumably this is nbdev v2,
# where the export entry point is `nbdev_export` — confirm, then update or drop this cell.
from nbdev import notebook2script


ImportError: cannot import name 'notebook2script' from 'nbdev' (/opt/homebrew/anaconda3/lib/python3.9/site-packages/nbdev/__init__.py)

In [1]:
#| default_exp fastq_reader

In [4]:
#| export
import numpy as np
import pandas as pd
import gzip

# FastQ class

Initialize it with the path to a FASTQ file that has already been unzipped.

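Call the `parse_file()` method to build the count table. In the code below it returns the pandas DataFrame directly (it is not wrapped in a `SequencesCells()` object), with one row per sequence and one column per cell barcode: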

|                | **NTGCB** | **GTNCB** | **cell3** |
|----------------|-----------|-----------|-----------|
| **CTCTGA**     | 2         | 1         | ..        |
| **CTCTGAGGG**  | 0         | 1         | ..        |
| **sequence_3** | ..        | ..        | ..        |



In [5]:
#| export

def extract_umi_cb(nm):
    """Pull the cell barcode and UMI out of a read header line.

    Everything from the first space onwards is discarded, the remainder is
    split on underscores, and the second and third fields are returned.

    Parameters
    ----------
    nm : str
        Header line, e.g. ``'@<read id>_<barcode>_<umi> <description>'``.

    Returns
    -------
    tuple of (str, str)
        ``(cb, umi)``: the cell barcode and the UMI.

    Raises
    ------
    ValueError
        If fewer than two fields follow the first underscore-separated field.
    """
    read_id, _, _ = nm.partition(' ')
    fields = read_id.split('_')
    cb, umi = fields[1:3]
    return cb, umi

In [6]:
# Example header: the barcode and UMI are the 2nd and 3rd underscore-separated
# fields of the part before the first space.
nm = '@VH00225:8:AAAKNTKHV:1:1101:62862:1057_NTGCB_GGCAUMI 1:N:0:GTTTCG'
cb, umi = extract_umi_cb(nm)
assert (cb, umi) == ('NTGCB', 'GGCAUMI')

In [18]:
#| export

def extract_dictionary(path):
    """Parse a FASTQ file into a nested ``{barcode: {sequence: [umi, ...]}}`` dict.

    The file is consumed as 4-line records (header, sequence, separator,
    quality). Only the header and the sequence line are used: the header is
    passed to ``extract_umi_cb`` to obtain the cell barcode and UMI, and the
    UMI is appended to the list stored under ``result[barcode][sequence]``.
    Duplicate UMIs are kept; de-duplication is left to the caller.

    Parameters
    ----------
    path : str or path-like
        Location of an uncompressed (plain text) FASTQ file.

    Returns
    -------
    dict
        ``{cell_barcode: {sequence: [umi, umi, ...]}}`` in order of first
        appearance in the file. Empty if the file holds no records.

    Raises
    ------
    ValueError
        If a header line is the last line of the file (no sequence follows),
        or if ``extract_umi_cb`` cannot unpack a barcode and UMI from a header.
    """
    fastq_dict = {}
    with open(path, 'r') as f:
        # Stream the file instead of materialising every line with readlines():
        # FASTQ files can be far larger than available memory.
        for header in f:
            header = header.strip()
            if not header:
                # Blank lines where a header is expected (typically extra
                # newlines at the end of the file) are not records; skip them.
                continue
            sequence = next(f, None)
            if sequence is None:
                raise ValueError(
                    f"Truncated FASTQ record in {path!r}: "
                    f"header {header!r} is not followed by a sequence line"
                )
            # Consume the separator and quality lines so the next iteration
            # starts on a header. A missing tail is tolerated because neither
            # line is used.
            next(f, None)
            next(f, None)
            cb, umi = extract_umi_cb(header)
            fastq_dict.setdefault(cb, {}).setdefault(sequence.strip(), []).append(umi)
    return fastq_dict
# Smoke test on the example file `./test.fastq`: a DataFrame built from the parsed
# dict should have the cell barcodes as columns and the sequences as its index.
# NOTE(review): this cell is marked `#| export`, so these statements are presumably
# written to the exported module as well — consider moving them to a non-exported cell.
fastq_dict = extract_dictionary('./test.fastq')
assert list(pd.DataFrame(fastq_dict).columns) == ['NTGCB', 'GTNCB']
assert list(pd.DataFrame(fastq_dict).index) == ['CTCTGA', 'CTCTGAGGG']

In [19]:
# Inspect the nested {barcode: {sequence: [UMIs]}} mapping parsed from the example file.
fastq_dict

{'NTGCB': {'CTCTGA': ['GGCAUMI', 'ACGGUMI', 'GGCAUMI', 'GGCAUMI']},
 'GTNCB': {'CTCTGA': ['GGCAUMI'], 'CTCTGAGGG': ['ACGGUMI']}}

In [21]:
#| export
def count_unique_sequences(dz):
    """Turn a nested barcode/sequence/UMI mapping into a table of distinct-UMI counts.

    Parameters
    ----------
    dz : dict
        Nested mapping ``{column: {row: [umi, ...]}}`` such as the one returned
        by ``extract_dictionary``; it is passed to ``pd.DataFrame`` as is.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``pd.DataFrame(dz)``. Each cell holds the
        number of distinct items in the list found there, or 0 when the cell
        is not a list.
    """
    df = pd.DataFrame(dz)
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; fall back to applymap on older pandas. The lookup is done
    # on the class so that a column literally named "map" cannot shadow it.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    # (row, column) pairs absent from the input become NaN in the frame; they
    # are not lists, so they count as 0.
    return elementwise(df, lambda x: len(set(x)) if isinstance(x, list) else 0)

In [26]:
# Display the distinct-UMI count table for the parsed example data.
count_unique_sequences(fastq_dict)

Unnamed: 0,NTGCB,GTNCB
CTCTGA,2,1
CTCTGAGGG,0,1


In [25]:
# The example data must give exactly this matrix (rows: sequences, columns: barcodes).
assert np.array_equal(count_unique_sequences(fastq_dict).values, [[2, 1], [0, 1]])

In [27]:
#| export
class FastQ:
    """Thin wrapper that turns a FASTQ file into a sequence-by-barcode count table."""

    def __init__(self, fastq_file):
        """Remember the path of the FASTQ file; nothing is read at this point."""
        self.fastq_file = fastq_file

    def parse_file(self):
        """Parse the stored file and return its count table.

        Returns
        -------
        pandas.DataFrame
            The result of ``count_unique_sequences`` applied to the dict
            produced by ``extract_dictionary`` for ``self.fastq_file``.
        """
        return count_unique_sequences(extract_dictionary(self.fastq_file))

            

In [28]:
# End-to-end example: parse the example file through the FastQ class and show the table.
fastq = FastQ('./test.fastq')
df = fastq.parse_file()
df

Unnamed: 0,NTGCB,GTNCB
CTCTGA,2,1
CTCTGAGGG,0,1
