In [1]:
from config import *
from utils import *

import csv
import numpy as np
import collections
import multiprocessing
import pickle

from tqdm.notebook import tqdm

# Barcode - Sequence Filter and Group
This notebook processes the sequences, barcodes, and per-base quality strings that were previously extracted (see Sequence_Extraction.ipynb).

## Quality filtering
Any sequence or barcode that contains at least one base with a Phred quality score less than 11 (i.e. a score of 10 or lower, which corresponds to at most 90% base call accuracy) is rejected.

## Create Barcode : Sequence (and counts) Lookup Tables
Create and pickle dictionaries that map each barcode to a nested dictionary of `{sequence: count}`. This way we can efficiently look up the sequences associated with a barcode (and how often each was observed) in later analyses.

In [2]:
def generate_bc_seq_map(bc_seq_fn, bc_qual_thres=20, seq_qual_thres=11):
    """Group quality-filtered reads from a CSV file into ``{barcode: {sequence: count}}``.

    Each non-empty CSV row must have one of two layouts:

    * ``barcode, barcode_quality, sequence, sequence_quality`` -- the read is
      accepted only if both quality strings pass ``is_good_quality`` with
      their respective thresholds.
    * ``barcode, barcode_quality, SKIPPED`` -- only the barcode quality is
      checked; an accepted read is counted under the literal sequence key
      ``'SKIPPED'``.

    ``is_good_quality`` is defined outside this cell (presumably in ``utils``);
    its exact comparison semantics are not visible here.

    Args:
        bc_seq_fn: Path of the CSV file to read.
        bc_qual_thres: Threshold passed to ``is_good_quality`` for the barcode
            quality string (default 20).
        seq_qual_thres: Threshold passed to ``is_good_quality`` for the
            sequence quality string (default 11).

    Returns:
        A ``defaultdict`` mapping barcode -> ``defaultdict(int)`` mapping
        sequence -> number of accepted reads. The returned object can be
        serialized with the standard ``pickle`` module.

    Raises:
        ValueError: If a non-empty row matches neither layout above.

    Side effects:
        Prints the accepted, rejected and total read counts.
    """
    from functools import partial

    # A lambda default_factory cannot be serialized by the standard pickle
    # module; partial(defaultdict, int) builds the same inner defaultdict(int)
    # while keeping the returned table picklable.
    bc_seq_map = collections.defaultdict(partial(collections.defaultdict, int))
    accepted = 0
    rejected = 0
    with open(bc_seq_fn, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        for line in tqdm(csvreader):
            if not line:
                # csv.reader yields [] for a blank line; there is nothing to count.
                continue
            if len(line) == 3 and line[2] == 'SKIPPED':
                bc, bc_qual, seq = line
                is_accepted = is_good_quality(bc_qual, bc_qual_thres)
            elif len(line) == 4:
                bc, bc_qual, seq, seq_qual = line
                is_accepted = (is_good_quality(bc_qual, bc_qual_thres)
                               and is_good_quality(seq_qual, seq_qual_thres))
            else:
                # Same exception type as the bare tuple-unpacking failure this
                # replaces, but it tells the user where the bad record is.
                raise ValueError(
                    "{}: line {}: expected 4 fields, or 3 fields ending in "
                    "'SKIPPED', got {} field(s)".format(
                        bc_seq_fn, csvreader.line_num, len(line)))
            if is_accepted:
                bc_seq_map[bc][seq] += 1
                accepted += 1
            else:
                rejected += 1
    print("Accepted:", accepted)
    print("Rejected:", rejected)
    print("Total Instances:", accepted + rejected)
    return bc_seq_map

## Barcode - gRNA Sequence Filter and Group

In [3]:
# Build and save the barcode -> gRNA sequence lookup tables for the PRECAS
# input and the two POSTCAS inputs.
# Output Dictionary format {bc: {seq: count}}
#
# Each table is regenerated only when pickle_exists() is falsy for its target
# (presumably a file-existence check from utils -- confirm). As a consequence
# the bc_grna*_map variables are bound only on runs that actually regenerate.

if not pickle_exists(BC_GRNA_PRECAS_MAP):
    bc_grna_precas_map = generate_bc_seq_map(PRECAS_GDNA_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    save_bc_seq(bc_grna_precas_map, BC_GRNA_PRECAS_MAP)
    # Stats (same format as generate_bc_seq_map's printed summary; presumably
    # copied from an earlier run):
    # Accepted: 5711756
    # Rejected: 1159
    # Total Instances: 5712915

if not pickle_exists(BC_GRNA1_POSTCAS_MAP):
    bc_grna1_postcas_map = generate_bc_seq_map(POSTCAS_GDNA1_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    save_bc_seq(bc_grna1_postcas_map, BC_GRNA1_POSTCAS_MAP)
    # Stats:
    # Accepted: 4875055
    # Rejected: 981
    # Total Instances: 4876036

if not pickle_exists(BC_GRNA2_POSTCAS_MAP):
    bc_grna2_postcas_map = generate_bc_seq_map(POSTCAS_GDNA2_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    save_bc_seq(bc_grna2_postcas_map, BC_GRNA2_POSTCAS_MAP)
    # Stats:
    # Accepted: 3032287
    # Rejected: 622
    # Total Instances: 3032909

## Barcode - Target Sequence (Unspliced) Filter and Group

In [4]:
# Build and save the barcode -> unspliced target sequence lookup tables for
# the PRECAS input and the two POSTCAS inputs.
# Output Dictionary format {bc: {seq: count}}
#
# Each table is regenerated only when pickle_exists() is falsy for its target
# (presumably a file-existence check from utils -- confirm), so the
# bc_unspliced*_map variables are bound only on runs that actually regenerate.
#
# NOTE(review): sanitize() (defined outside this notebook, behavior not visible
# here) is applied to the two POSTCAS maps before saving but not to the PRECAS
# map -- confirm that this asymmetry is intentional.

if not pickle_exists(BC_UNSPLICED_PRECAS_MAP):
    bc_unspliced_precas_map = generate_bc_seq_map(PRECAS_UNSPLICED_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    save_bc_seq(bc_unspliced_precas_map, BC_UNSPLICED_PRECAS_MAP)
    # Stats (same format as generate_bc_seq_map's printed summary; presumably
    # copied from an earlier run):
    # Accepted: 14998547
    # Rejected: 4198
    # Total Instances: 15002745

if not pickle_exists(BC_UNSPLICED1_POSTCAS_MAP):
    bc_unspliced1_postcas_map = generate_bc_seq_map(POSTCAS_UNSPLICED1_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    # The return value of sanitize() is discarded, so it presumably modifies the map in place.
    sanitize(bc_unspliced1_postcas_map)
    save_bc_seq(bc_unspliced1_postcas_map, BC_UNSPLICED1_POSTCAS_MAP)
    # Stats:
    # Accepted: 21286282
    # Rejected: 6125
    # Total Instances: 21292407

if not pickle_exists(BC_UNSPLICED2_POSTCAS_MAP):
    bc_unspliced2_postcas_map = generate_bc_seq_map(POSTCAS_UNSPLICED2_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    # The return value of sanitize() is discarded, so it presumably modifies the map in place.
    sanitize(bc_unspliced2_postcas_map)
    save_bc_seq(bc_unspliced2_postcas_map, BC_UNSPLICED2_POSTCAS_MAP)
    # Stats:
    # Accepted: 23138618
    # Rejected: 6712
    # Total Instances: 23145330

## Barcode - Target Sequence (Spliced) Filter and Group

In [5]:
# Build and save the barcode -> spliced target sequence lookup tables for the
# PRECAS input and the two POSTCAS inputs.
# Output Dictionary format {bc: {seq: count}}
#
# Each table is regenerated only when pickle_exists() is falsy for its target
# (presumably a file-existence check from utils -- confirm), so the
# bc_spliced*_map variables are bound only on runs that actually regenerate.

if not pickle_exists(BC_SPLICED_PRECAS_MAP):
    bc_spliced_precas_map = generate_bc_seq_map(PRECAS_SPLICED_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    save_bc_seq(bc_spliced_precas_map, BC_SPLICED_PRECAS_MAP)
    # Stats (same format as generate_bc_seq_map's printed summary; presumably
    # copied from an earlier run):
    # Accepted: 4218814
    # Rejected: 688
    # Total Instances: 4219502

if not pickle_exists(BC_SPLICED1_POSTCAS_MAP):
    bc_spliced1_postcas_map = generate_bc_seq_map(POSTCAS_SPLICED1_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    save_bc_seq(bc_spliced1_postcas_map, BC_SPLICED1_POSTCAS_MAP)
    # Stats:
    # Accepted: 5475016
    # Rejected: 911
    # Total Instances: 5475927

if not pickle_exists(BC_SPLICED2_POSTCAS_MAP):
    bc_spliced2_postcas_map = generate_bc_seq_map(POSTCAS_SPLICED2_BC_SEQ, BC_QUAL_THRES, SEQ_QUAL_THRES)
    save_bc_seq(bc_spliced2_postcas_map, BC_SPLICED2_POSTCAS_MAP)
    # Stats:
    # Accepted: 8034262
    # Rejected: 1216
    # Total Instances: 8035478