# FastQ and SequencesCells classes

> Read a FASTQ file and build a count table of sequences per cell

In [None]:
#| hide
# Run nbdev's export step (see the nbdev documentation for `nbdev_export`).
from nbdev import nbdev_export
nbdev_export()

In [None]:
#| default_exp fastq_reader

In [None]:
#| export
import gzip
import os

import numpy as np
import pandas as pd

# FastQ class

Initialize it with the path to a FASTQ file that has already been unzipped.

Call the `parse_file()` method to get a `SequencesCells` object whose `table` attribute is a dataframe of unique-UMI counts, with one row per sequence and one column per cell:

|                | **NTGCB** | **GTNCB** | **cell3** |
|----------------|-----------|-----------|-----------|
| **CTCTGA**     | 2         | 1         | ..        |
| **CTCTGAGGG**  | 0         | 1         | ..        |
| **sequence_3** | ..        | ..        | ..        |

You can then add the cells of another plate to a `SequencesCells` object with its `join_plate()` method.

In [None]:
#| export

def extract_umi_cb(nm):
    """Pull the cell barcode and the UMI out of a FASTQ read header.

    Everything from the first space onwards is discarded. The remaining
    read identifier is split on ``_`` and its second and third fields are
    returned as ``(cb, umi)``.

    Raises ``ValueError`` (from the tuple unpacking) when the identifier
    has fewer than three ``_``-separated fields.
    """
    read_id, _, _ = nm.partition(' ')
    fields = read_id.split('_')
    cb, umi = fields[1:3]
    return cb, umi

In [None]:
# Check extract_umi_cb on a sample read header: the barcode and UMI are the
# 2nd and 3rd '_'-separated fields of the part before the first space.
nm = '@VH00225:8:AAAKNTKHV:1:1101:62862:1057_NTGCB_GGCAUMI 1:N:0:GTTTCG'
cb, umi = extract_umi_cb(nm)
assert cb == 'NTGCB'
assert umi == 'GGCAUMI'

In [None]:
#| export

def extract_dictionary(path, plate_name):
    """Collect the UMIs observed for every (cell, sequence) pair in a FASTQ file.

    The file is read as four-line records; only the header (first line) and
    the sequence (second line) of each record are used. The cell barcode and
    UMI are parsed from the header with ``extract_umi_cb``.

    Args:
        path: FASTQ file to read. A path ending in ``.gz`` is opened with
            ``gzip``; anything else is opened as plain text.
        plate_name: Prefix for the cell names; each key of the result is
            ``plate_name + '_' + cell_barcode``.

    Returns:
        A dict ``{cell: {sequence: [umi, ...]}}``. UMIs are appended in file
        order and duplicates are kept.

    Raises:
        ValueError: If a header line is not followed by a sequence line
            (truncated file), or if ``extract_umi_cb`` cannot parse a header.
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    fastq_dict = {}
    # Stream the file record by record instead of loading it all with
    # readlines(); FASTQ files can be very large.
    with opener(path, 'rt') as handle:
        for header in handle:
            header = header.strip()
            if not header:
                # Blank line where a header is expected (e.g. a trailing
                # newline at the end of the file): nothing to parse.
                continue
            sequence = next(handle, None)
            if sequence is None:
                raise ValueError(
                    f"Truncated FASTQ record in {path!r}: "
                    f"no sequence line after header {header!r}"
                )
            # Consume the '+' separator and the quality line (unused). The
            # default keeps a record missing these lines from raising.
            next(handle, None)
            next(handle, None)

            cb, umi = extract_umi_cb(header)
            cell = plate_name + '_' + cb
            umis = fastq_dict.setdefault(cell, {}).setdefault(sequence.strip(), [])
            umis.append(umi)
    return fastq_dict


In [None]:
# Parse the test FASTQ of plate RPMD1-01 and check that, once turned into a
# DataFrame, cells are the columns and sequences are the rows.
fastq_dict = extract_dictionary('./test-RPMD1-01_R1.trimmed.fastq', 'RPMD1-01')

assert list(pd.DataFrame(fastq_dict).columns) == ['RPMD1-01_NTGCB', 'RPMD1-01_GTNCB']
assert list(pd.DataFrame(fastq_dict).index) == ['CTCTGA', 'CTCTGAGGG']

In [None]:
# Display the raw {cell: {sequence: [UMIs]}} mapping.
fastq_dict

{'RPMD1-01_NTGCB': {'CTCTGA': ['GGCAUMI', 'ACGGUMI', 'GGCAUMI', 'GGCAUMI']},
 'RPMD1-01_GTNCB': {'CTCTGA': ['GGCAUMI'], 'CTCTGAGGG': ['ACGGUMI']}}

In [None]:
#| export
def count_unique_sequences(dz):
    """Build a table counting the distinct UMIs per sequence and cell.

    Args:
        dz: Nested mapping ``{cell: {sequence: [umi, ...]}}``, passed to
            ``pd.DataFrame`` so cells become columns and sequences rows.

    Returns:
        A DataFrame of the same shape in which every list is replaced by
        the number of distinct items it holds, and every non-list entry
        (e.g. the NaN of a sequence never seen in a cell) by 0.
    """
    df = pd.DataFrame(dz)

    def n_unique(entry):
        return len(set(entry)) if isinstance(entry, list) else 0

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The check is made on the class, not the instance,
    # because `df.map` would resolve to a column named "map" if one existed.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(n_unique)
    return df.applymap(n_unique)

In [None]:
# Turn the UMI lists into unique-UMI counts and display the table.
df = count_unique_sequences(fastq_dict)
df

Unnamed: 0,RPMD1-01_NTGCB,RPMD1-01_GTNCB
CTCTGA,2,1
CTCTGAGGG,0,1


In [None]:
# Expected counts: rows are the sequences, columns the cells; duplicated UMIs count once.
assert (count_unique_sequences(fastq_dict).values == np.array([[2,1],[0,1]])).all()

In [None]:
#| export
class SequencesCells():
    """Count table of sequences (rows) by cells (columns) for one or more plates."""

    def __init__(self, df, plate):
        """Wrap the count table `df` of the single plate named `plate`."""
        # DataFrame with one column per cell; the methods below assume each
        # column name contains the name of the plate the cell belongs to.
        self.table = df
        # Names of the plates whose cells are present in `table`.
        self.plates = [plate]

    def join_plate(self, sequences_cells):
        """Add the cells of the plates in `sequences_cells` not yet held here.

        The object is modified in place and also returned. Plates already
        listed in `self.plates` are ignored; if none is new, nothing changes.
        """
        new_plates = [p for p in sequences_cells.plates if p not in self.plates]
        if not new_plates:
            return self

        incoming = sequences_cells.table
        # Keep only the columns belonging to one of the new plates.
        # NOTE(review): this is a substring match, so a plate name that is
        # contained in another plate's name would also match - confirm that
        # plate names can never overlap like that.
        keep = [any(plate in col for plate in new_plates)
                for col in incoming.columns]
        other = incoming.loc[:, keep]
        # NOTE(review): dropna() removes every sequence that is missing from
        # either table, so only sequences common to all plates survive -
        # confirm this is intended rather than filling the gaps with 0.
        self.table = pd.concat([self.table, other], axis=1).dropna()
        self.plates = self.plates + new_plates
        return self

    def select_plate(self, plate):
        """Return a new SequencesCells holding only the cells of `plate`.

        Columns are selected when their name contains `plate`.
        """
        plate_columns = [col for col in self.table.columns if plate in col]
        return SequencesCells(self.table.loc[:, plate_columns], plate)

In [None]:
#| export
class FastQ():
    """A FASTQ file of one plate, parsed on demand into a SequencesCells table."""

    def __init__(self, fastq_file, plate_name=None):
        """Store the file path and, optionally, the plate name.

        When `plate_name` is None it is inferred from the file name the
        first time `parse_file()` is called.
        """
        self.fastq_file = fastq_file
        self.plate_name = plate_name

    @staticmethod
    def _plate_name_from_path(fastq_file):
        """Infer the plate name from the file name.

        Takes the part of the file name before the first '_' and drops its
        first '-'-separated field, e.g. 'test-RPMD1-01_R1.trimmed.fastq'
        gives 'RPMD1-01'. The result is an empty string when that part
        contains no '-'.
        """
        # Only the base name is inspected, so '_' or '-' characters in the
        # directory part of the path cannot corrupt the plate name.
        file_name = os.path.basename(os.fspath(fastq_file))
        return '-'.join(file_name.split('_')[0].split('-')[1:])

    def parse_file(self):
        """Read the FASTQ file and return its unique-UMI counts as a SequencesCells.

        Sets `self.plate_name` from the file name if it was not provided.
        """
        if self.plate_name is None:
            self.plate_name = self._plate_name_from_path(self.fastq_file)

        umis_by_cell = extract_dictionary(self.fastq_file, self.plate_name)
        counts = count_unique_sequences(umis_by_cell)
        return SequencesCells(counts, self.plate_name)

            

In [None]:
# Parse the first test plate; no plate_name is given, so it is inferred from the file name.
fastq = FastQ('./test-RPMD1-01_R1.trimmed.fastq')
plate1 = fastq.parse_file()
plate1.table

Unnamed: 0,RPMD1-01_NTGCB,RPMD1-01_GTNCB
CTCTGA,2,1
CTCTGAGGG,0,1


In [None]:
# Parse the second test plate; no plate_name is given, so it is inferred from the file name.
fastq = FastQ('./test-RPMD1-02_R1.trimmed.fastq')
plate2 = fastq.parse_file()
plate2.table

Unnamed: 0,RPMD1-02_NTGCB,RPMD1-02_GTNCB
CTCTGA,2,1


In [None]:
# Join the second plate into the first. join_plate modifies plate1 in place
# and returns it, so `plates` and `plate1` are the same object.
plates = plate1.join_plate(plate2)
print(plates.plates)
assert plates.plates == ['RPMD1-01', 'RPMD1-02']
# The joined table holds the cells of both plates.
assert list(plates.table.columns) == ['RPMD1-01_NTGCB', 'RPMD1-01_GTNCB',
                                      'RPMD1-02_NTGCB', 'RPMD1-02_GTNCB']

[]
['RPMD1-01', 'RPMD1-02']


In [None]:
# Show the plates held by the joined object.
plates.plates

['RPMD1-01', 'RPMD1-02']

In [None]:
# Scratch check: `in` on a list tests element membership, not sub-list
# containment, so this evaluates to False.
l1 = [1,2,3]
l2 = [2,3]
l2 in l1

In [None]:
# Scratch check: `&` of two substring tests; False because 'cr' is not in 'aaaa'.
('cr' in 'cr-01') & ('cr' in 'aaaa')

In [None]:
# Scratch check of the mask expression used in SequencesCells.join_plate:
# the result is True for each word that contains at least one keyword.
keyw = ['chiave1', 'chiave2']
words =['aaa-chiave1', 'bbb-chiave1', 'aaaa-chiave3']
np.array([[w in word for word in words] for w in keyw]).sum(axis=0).astype(bool)

In [None]:
# Same expression without the bool cast: number of keywords found in each word.
np.array([[w in word for word in words] for w in keyw]).sum(axis=0)


In [None]:

# IPython help lookup for np.sum (IPython syntax, not plain Python).
? np.sum
