# dat-B: Characterize Skipping
dat-B in this study refers to the set of repair genotypes observed in the Spliced libraries (PreCas9 and PostCas9; PreCas9 is the control and gives us the wildtype $\Psi$), together with their counts of transcripts with or without Exon B. The code here extracts this information.

In [1]:
from config import *
from utils import *

import sys
import regex
import copy
import numpy as np
import collections
import multiprocessing
import pickle

import numpy as np
import scipy

# Suppress pandas future warning, which messes tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd

from tqdm.notebook import tqdm

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Spliced-read tables keyed by barcode: the PreCas9 control and the two
# PostCas9 spliced datasets (merged further down in this notebook).
bc_spliced_precas_map, bc_spliced1_postcas_map, bc_spliced2_postcas_map = map(
    load_bc_seq,
    (BC_SPLICED_PRECAS_MAP, BC_SPLICED1_POSTCAS_MAP, BC_SPLICED2_POSTCAS_MAP),
)

# dat-A PostCas lookup table (barcode -> target -> repair genotypes) and the
# per-(gRNA, target, orientation) indel distributions.
bc_tid_indel_postcas_map = load_bc_id_indel(BC_TID_INDEL_POSTCAS_MAP)
unnorm_gt_indel_dist_map, gt_indel_dist_map = map(
    load_var,
    (UNNORM_GT_INDEL_DIST_MAP, GT_INDEL_DIST_MAP),
)

In [3]:
# k-mer length used when comparing an observed read against designed targets.
K = 5
# Pre-compute the k-mers of every designed target once, so that each read
# comparison only has to look them up.
design_t_kmers_map = {
    target_seq: kmers(target_seq, K)
    for target_seq in exp_target_tid_map
}

# Characterize Skipping
For a given spliced RNA transcript sequence, we need to figure out the original designed target (splice-acceptor) sequence. Similar to what we did in the construction of dat-A, for a query sequence Q, we find the most similar designed sequence D from lib-SA by computing the Dice Coefficient between Q and D, using 5-mers, and picking the D that is most similar to Q.

For a transcript with Exon B skipped, only the barcode information is known. To associate it with its repair genotype, its barcode is queried on the dat-A PostCas lookup table (variable bc_tid_indel_postcas_map) to find the repair genotype with the same barcode. If there are multiple repair genotypes with the same barcode, then the one with the largest read count is chosen for association. If the barcode is not observed in dat-A, then the transcript is discarded.

For a transcript with Exon B retained, naturally only the Exon B portion of the repair outcome is discernible. Similarly, to identify its repair genotype, its barcode is queried on the dat-A lookup table. If there is exactly one repair genotype with the same barcode and that repair genotype agrees with the partial Exon B we observe in the transcript, then we're set. If there are multiple repair genotypes that have the same barcode AND agree with the partial Exon B we observe in the transcript, then one of them is picked randomly, sampled according to the frequency of observing each candidate (see code). How can there be multiple possible repair genotypes? Consider the following original and spliced sequences:

```
Original ('|' denote intron-exon boundary)
AGTCAAG|CGGTCA

Repair outcomes
AGTC---|CGGTCA
AGTCAA-|CGGTCA
AGTCA--|CGGTCA
AGTCA--|--GTCA

Observed transcript:
ExonA|CGGTCA
```

We see that the first three repair outcomes can all give rise to the observed transcript, and there is no way of knowing which one it is if all three happen to be associated with the same barcode (rare, but possible).

## Characterize PreCas Spliced

In [4]:
def search_splice(ref, obs):
    """Fuzzy-search `ref` for the observed sequence `obs`.

    Args:
        ref: Reference sequence to search in.
        obs: Observed sequence to look for. It is treated as literal text.

    Returns:
        The `regex` match object for the best match with at most three
        errors, or None when no such match exists.
    """
    # Escape `obs` so that a stray metacharacter in a read cannot alter (or
    # break) the pattern; plain nucleotide strings are unaffected.
    # (?b) requests the best match rather than the first one, and {e<=3}
    # allows up to three errors in the group.
    pattern = r'(?b)(' + regex.escape(obs) + r'){e<=3}'
    return regex.compile(pattern).search(ref)


def get_spliceidx(ref, obs):
    """Return the start index of the fuzzy match of `obs` in `ref`, or -1 if none."""
    match = search_splice(ref, obs)
    return match.start() if match else -1


def get_tid_spliced(obs_t):
    """Find the designed target sequence most similar to an observed read.

    A leading 12-nt constant sequence (within Hamming distance 2 of
    'CACAACATCGAG') is stripped first when present. A designed sequence that
    contains, or is contained in, the read is accepted immediately with
    similarity 1; otherwise the design with the highest Dice coefficient
    over K-mers wins.

    Args:
        obs_t: Observed transcript sequence.

    Returns:
        None when the read is too short to classify, otherwise a tuple
        `(best_design_t, obs_t, max_sim)` where `obs_t` is the read after
        optional leader stripping.
    """
    min_len = 5
    leader = 'CACAACATCGAG'

    if len(obs_t) < min_len:
        return None

    if len(obs_t) > len(leader) and hamming_distance(leader, obs_t[:len(leader)]) <= 2:
        obs_t = obs_t[len(leader):]
        # Re-apply the length guard: stripping the leader from a 13-16 nt read
        # leaves fewer than `min_len` bases, and such a fragment would
        # substring-match an arbitrary design below with similarity 1.
        if len(obs_t) < min_len:
            return None

    max_sim = -float('inf')
    best_design_t = None
    for design_t in exp_target_tid_map:
        # Exact containment in either direction is treated as a perfect hit.
        if design_t in obs_t or obs_t in design_t:
            return design_t, obs_t, 1

        sim = dice_coefficient(obs_t, design_t_kmers_map[design_t], K)
        if sim > max_sim:
            max_sim = sim
            best_design_t = design_t

    return best_design_t, obs_t, max_sim


def get_tid_spliceidx_precas(obs_t):
    """Map a PreCas read to `(target id, splice index)`.

    Returns `(None, -1)` when the read cannot be classified or when its best
    similarity to any designed target is below 0.3. The splice index itself
    is -1 when the read has no fuzzy match in the chosen design.
    """
    assignment = get_tid_spliced(obs_t)
    if assignment is None:
        return None, -1

    design_seq, trimmed_read, similarity = assignment
    if similarity < 0.3:
        return None, -1

    splice_start = get_spliceidx(design_seq, trimmed_read)
    return exp_target_tid_map[design_seq], splice_start


def generate_bc_tid_spliceidx_precas_map(bc_precas_splice_map):
    """Build the PreCas map barcode -> target id -> splice index -> read count.

    Each distinct observed read is classified once, in parallel, with
    `get_tid_spliceidx_precas`; the per-barcode counts are then re-keyed by
    the resulting (target id, splice index). Reads recorded as 'SKIPPED' are
    kept under tid 'SKIPPED' with splice index -1. Reads that cannot be
    assigned a target or a splice index are dropped.

    Args:
        bc_precas_splice_map: Mapping barcode -> {observed read or 'SKIPPED': count}.

    Returns:
        Nested defaultdict `[bc][tid][spliceidx] -> count`.
    """
    bc_tid_spliceidx_map = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(int)))

    # 'SKIPPED' is a sentinel, not a sequence, so it is not sent through the
    # sequence classifier (its result was never read anyway).
    obs_ts = list({obs_t
                   for bc in tqdm(bc_precas_splice_map)
                   for obs_t in bc_precas_splice_map[bc]
                   if obs_t != 'SKIPPED'})

    # Create the pool before entering `try`: if construction fails there is
    # nothing to clean up, and `finally` must not touch an unbound name.
    p = multiprocessing.Pool(NUM_PROCESSES)
    try:
        tid_spliceidxs = list(tqdm(p.imap(get_tid_spliceidx_precas, obs_ts, chunksize=1),
                                   total=len(obs_ts)))
    finally:
        p.close()
        p.join()

    # imap preserves input order, so results line up with obs_ts.
    obs_tidspliceidx_map = dict(zip(obs_ts, tid_spliceidxs))

    for bc in tqdm(bc_precas_splice_map):
        for obs_t in bc_precas_splice_map[bc]:
            if obs_t == 'SKIPPED':
                tid, spliceidx = 'SKIPPED', -1
            else:
                tid, spliceidx = obs_tidspliceidx_map[obs_t]
                if tid is None or spliceidx == -1:
                    continue

            bc_tid_spliceidx_map[bc][tid][spliceidx] += bc_precas_splice_map[bc][obs_t]

    print("(Before) Num Unique BCs:", len(bc_precas_splice_map))
    print("(After) Num Unique BCs:", len(bc_tid_spliceidx_map))

    return bc_tid_spliceidx_map

In [5]:
# Reuse the cached PreCas result when it exists; otherwise compute and cache it.
if pickle_exists(BC_TID_SPLICEIDX_PRECAS_MAP):
    bc_tid_spliceidx_precas_map = load_bc_id_indel(BC_TID_SPLICEIDX_PRECAS_MAP)
else:
    bc_tid_spliceidx_precas_map = generate_bc_tid_spliceidx_precas_map(bc_spliced_precas_map)
    save_bc_id_indel(bc_tid_spliceidx_precas_map, BC_TID_SPLICEIDX_PRECAS_MAP)

## Characterize PostCas Spliced

In [6]:
def get_gid_from_indel(indel, tid):
    """Return the gRNA id of target `tid` whose cut site matches `indel`.

    Only deletions and 1-bp insertions are supported; any other indel yields
    None. Both supported shapes are 5-tuples whose last field is the cut site.

    Args:
        indel: Repair-genotype tuple; `indel[1]` is its signature and
            `indel[2]` its size.
        tid: Target id used to look up the candidate gRNAs and the designed
            target sequence.

    Returns:
        The first gid in `exp_tid_gids_map[tid]` whose computed cut site
        equals the indel's cut site, or None when there is none.
    """
    signature = indel[1]
    is_deletion = signature in DELETION_SIGNATURES
    if not is_deletion and not (signature in INSERTION_SIGNATURES and indel[2] == 1):
        return None

    # Strict 5-way unpacking is kept so malformed tuples still fail loudly.
    _, _, _, _, cutsite = indel

    # The designed sequence depends only on `tid`, so look it up once.
    design_t = exp_tid_target_map[tid]
    for gid in exp_tid_gids_map[tid]:
        grna = exp_gid_grna_map[gid]
        g_orientation = exp_grna_gid_map[grna][1]
        if cutsite == get_cutsite(grna, design_t, g_orientation):
            return gid
    return None


# Kept for backward compatibility with anything else relying on the global
# NumPy seed; get_indel_spliceidx itself no longer draws from the global RNG.
np.random.seed(42)
def get_indel_spliceidx(a):
    """Associate a PostCas read that retains Exon B with a repair genotype.

    Every supported repair genotype (deletion or 1-bp insertion) recorded in
    dat-A for this barcode and target is simulated on the designed sequence,
    and the read is fuzzy-matched against each product. The genotype(s) with
    the fewest total edits are kept. If the unedited design matches strictly
    better than every genotype, the read is discarded, because an uncut
    wildtype cannot be told apart from an intron-only indel.

    When several genotypes tie, one is sampled in proportion to its frequency
    in `gt_indel_dist_map`.

    Args:
        a: Tuple `(t_spliced, bc)` where `t_spliced` is the result of
            `get_tid_spliced` (or None) and `bc` is the barcode.

    Returns:
        `(spliceidx, indel, tid)` for the chosen genotype, or None when the
        read cannot be associated.
    """
    import zlib  # function-scope import: only used to derive the RNG seed

    t_spliced, bc = a
    if t_spliced is None:
        return None

    best_design_t, obs_t, max_sim = t_spliced
    tid = exp_target_tid_map[best_design_t]

    if tid not in bc_tid_indel_postcas_map[bc]:
        return None

    m_wt = search_splice(best_design_t, obs_t)

    min_fuzzy_count = float('inf')
    best_indel_spliceidx = []
    for indel in bc_tid_indel_postcas_map[bc][tid]:
        if indel[1] in DELETION_SIGNATURES or (indel[1] in INSERTION_SIGNATURES and indel[2] == 1):
            prod = get_simulated_product(indel, best_design_t)
            m = search_splice(prod, obs_t)
            if m:
                # Total number of edits used by the fuzzy match.
                fuzzy_count = sum(m.fuzzy_counts)
                if fuzzy_count == min_fuzzy_count:
                    best_indel_spliceidx.append((m.start(), indel, tid))
                elif fuzzy_count < min_fuzzy_count:
                    min_fuzzy_count = fuzzy_count
                    best_indel_spliceidx = [(m.start(), indel, tid)]

    # Unsure: could be uncut WT, or an indel only in the intron.
    # There is no way of knowing which, so just ignore them.
    if m_wt and sum(m_wt.fuzzy_counts) < min_fuzzy_count:
        return None

    if not best_indel_spliceidx:
        return None

    # Unique best match found.
    if len(best_indel_spliceidx) == 1:
        return best_indel_spliceidx[0]

    # Several repair genotypes explain the read equally well. Rather than
    # discarding the read, weight each candidate by its observed frequency in
    # dat-A and sample one of them.
    candidates = []
    weights = []
    for candidate in best_indel_spliceidx:
        _, indel, _ = candidate
        gid = get_gid_from_indel(indel, tid)
        if gid is None:
            # No gRNA of this target cuts at the indel's cut site, so there is
            # no distribution to read a frequency from. Drop the candidate
            # instead of using None as a lookup key.
            continue
        grna_seq = exp_gid_grna_map[gid]
        grna_orientation = exp_grna_gid_map[grna_seq][1]

        dist = gt_indel_dist_map[(grna_seq, best_design_t, grna_orientation)]

        if indel[1] in DELETION_SIGNATURES:
            _, _, deletion_size, genotype_pos, cutsite = indel
            weights.append(dist[-deletion_size][genotype_pos])
        elif indel[1] in INSERTION_SIGNATURES:
            _, _, _, inserted_base, cutsite = indel
            weights.append(dist[1][inserted_base])
        else:
            continue
        candidates.append(candidate)

    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]

    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    # With no positive weight the normalisation would divide by zero; fall
    # back to a uniform draw (p=None) in that case.
    probs = weights / total if total > 0 else None

    # This function is executed in pool workers, so draws from the shared
    # global RNG would depend on how tasks happen to be scheduled. Seeding a
    # private generator from the item itself makes the choice reproducible
    # regardless of worker or ordering. crc32 is used because, unlike hash(),
    # it is stable across processes and runs.
    seed = zlib.crc32(repr((best_design_t, obs_t, bc)).encode('utf-8'))
    rng = np.random.RandomState(seed)
    return candidates[rng.choice(len(candidates), p=probs)]
    

def generate_bc_tid_spliceidx_postcas_map(bc_postcas_splice_map):
    """Build the PostCas map barcode -> target id -> (splice index, indel) -> count.

    Only barcodes present in the dat-A lookup table
    (`bc_tid_indel_postcas_map`) are kept. Reads recorded as 'SKIPPED' are
    stored under tid 'SKIPPED' with key -1. Every other read is first matched
    to a designed target (`get_tid_spliced`) and then to a repair genotype
    (`get_indel_spliceidx`); reads without an association are dropped.

    Args:
        bc_postcas_splice_map: Mapping barcode -> {observed read or 'SKIPPED': count}.

    Returns:
        Nested defaultdict `[bc][tid][(spliceidx, indel)] -> count`
        (and `[bc]['SKIPPED'][-1] -> count`).
    """
    bc_tid_spliceidx_map = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(int)))

    # 'SKIPPED' is a sentinel, not a sequence: it is handled separately below
    # and is therefore kept out of both classification stages.
    obs_ts = list({obs_t
                   for bc in tqdm(bc_postcas_splice_map)
                   for obs_t in bc_postcas_splice_map[bc]
                   if obs_t != 'SKIPPED'})

    # Create the pool before entering `try`: if construction fails there is
    # nothing to clean up, and `finally` must not touch an unbound name.
    p = multiprocessing.Pool(NUM_PROCESSES)
    try:
        # Stage 1: read -> designed target (independent of the barcode).
        t_splices = list(tqdm(p.imap(get_tid_spliced, obs_ts, chunksize=1),
                              total=len(obs_ts)))
        obs_tspliced_map = dict(zip(obs_ts, t_splices))

        # Stage 2: (target assignment, barcode) -> repair genotype. The
        # distinct pairs are collected first so each is resolved only once.
        tsplice_bcs = set()
        for bc in tqdm(bc_postcas_splice_map):
            if bc in bc_tid_indel_postcas_map:
                for obs_t in bc_postcas_splice_map[bc]:
                    if obs_t != 'SKIPPED':
                        tsplice_bcs.add((obs_tspliced_map[obs_t], bc))
        tsplice_bcs = list(tsplice_bcs)

        indel_spliceidxs = list(tqdm(p.imap(get_indel_spliceidx, tsplice_bcs, chunksize=1),
                                     total=len(tsplice_bcs)))
    finally:
        p.close()
        p.join()

    tsplicebc_spliceidx_map = dict(zip(tsplice_bcs, indel_spliceidxs))

    for bc in tqdm(bc_postcas_splice_map):
        if bc in bc_tid_indel_postcas_map:
            for obs_t in bc_postcas_splice_map[bc]:
                if obs_t == 'SKIPPED':
                    bc_tid_spliceidx_map[bc]['SKIPPED'][-1] += bc_postcas_splice_map[bc][obs_t]
                    continue

                spliceidx_indel_tid = tsplicebc_spliceidx_map[(obs_tspliced_map[obs_t], bc)]
                if spliceidx_indel_tid:
                    spliceidx, indel, tid = spliceidx_indel_tid
                    bc_tid_spliceidx_map[bc][tid][(spliceidx, indel)] += bc_postcas_splice_map[bc][obs_t]

    print("(Before) Num Unique BCs:", len(bc_postcas_splice_map))
    print("(After) Num Unique BCs:", len(bc_tid_spliceidx_map))

    return bc_tid_spliceidx_map

In [7]:
# Characterize the first PostCas spliced dataset, but only when the cached
# PostCas result does not exist yet (otherwise this name stays undefined).
if not pickle_exists(BC_TID_SPLICEIDX_POSTCAS_MAP):
    bc_tid1_spliceidx_postcas_map = generate_bc_tid_spliceidx_postcas_map(bc_spliced1_postcas_map)

In [8]:
# Characterize the second PostCas spliced dataset, but only when the cached
# PostCas result does not exist yet (otherwise this name stays undefined).
if not pickle_exists(BC_TID_SPLICEIDX_POSTCAS_MAP):
    bc_tid2_spliceidx_postcas_map = generate_bc_tid_spliceidx_postcas_map(bc_spliced2_postcas_map)

In [9]:
def merge_bc_tid_spliceidx_maps(bc_tid1_spliceidx_map, bc_tid2_spliceidx_map):
    """Merge two `[bc][tid][spliceidx] -> count` maps.

    An entry present in both maps gets the mean of the two counts; an entry
    present in only one map keeps its count unchanged. The number of entries
    found in both maps is printed.

    The output is filled in the iteration order of the first map followed by
    the entries that exist only in the second map. This makes the insertion
    order deterministic (iterating a set intersection of string keys is not),
    which matters to consumers that break ties by first occurrence.

    Args:
        bc_tid1_spliceidx_map: First nested count map.
        bc_tid2_spliceidx_map: Second nested count map.

    Returns:
        Nested defaultdict `[bc][tid][spliceidx] -> count`.
    """
    bc_tid_spliceidx_map = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(int)))

    merged = 0

    # Pass 1: everything in the first map, averaged with the second map's
    # count when that exists. `.get` is used so that a defaultdict input is
    # never extended by the lookup.
    for bc, tids1 in tqdm(bc_tid1_spliceidx_map.items()):
        tids2 = bc_tid2_spliceidx_map.get(bc, {})
        for tid, counts1 in tids1.items():
            counts2 = tids2.get(tid, {})
            for spliceidx, count1 in counts1.items():
                if spliceidx in counts2:
                    bc_tid_spliceidx_map[bc][tid][spliceidx] += np.mean([count1, counts2[spliceidx]])
                    merged += 1
                else:
                    bc_tid_spliceidx_map[bc][tid][spliceidx] += count1

    # Pass 2: entries that exist only in the second map.
    for bc, tids2 in tqdm(bc_tid2_spliceidx_map.items()):
        tids1 = bc_tid1_spliceidx_map.get(bc, {})
        for tid, counts2 in tids2.items():
            counts1 = tids1.get(tid, {})
            for spliceidx, count2 in counts2.items():
                if spliceidx not in counts1:
                    bc_tid_spliceidx_map[bc][tid][spliceidx] += count2

    print("Num Common:", merged)
    return bc_tid_spliceidx_map

In [10]:
# Reuse the cached merged PostCas result when it exists; otherwise merge the
# two PostCas maps computed above and cache the outcome.
if pickle_exists(BC_TID_SPLICEIDX_POSTCAS_MAP):
    bc_tid_spliceidx_postcas_map = load_bc_id_indel(BC_TID_SPLICEIDX_POSTCAS_MAP)
else:
    bc_tid_spliceidx_postcas_map = merge_bc_tid_spliceidx_maps(bc_tid1_spliceidx_postcas_map, bc_tid2_spliceidx_postcas_map)
    save_bc_id_indel(bc_tid_spliceidx_postcas_map, BC_TID_SPLICEIDX_POSTCAS_MAP)

In [11]:
def get_skipped_tid_precas(bc):
    """Guess which target the skipped PreCas transcripts of `bc` belong to.

    Among the non-'SKIPPED' targets recorded for the barcode, the one with
    the largest total count is chosen (first one wins on ties), together with
    its most frequent splice index (largest index wins on ties).

    Returns:
        `(tid, spliceidx)`, or `(None, None)` when the barcode has at most
        one entry or no target with a positive count.
    """
    per_tid = bc_tid_spliceidx_precas_map[bc]
    if len(per_tid) <= 1:
        return None, None

    top_tid, top_total = None, 0
    for tid, idx_counts in per_tid.items():
        if tid == 'SKIPPED':
            continue
        total = sum(idx_counts.values())
        if total > top_total:
            top_tid, top_total = tid, total

    if top_tid is None:
        return None, None

    top_idx_counts = per_tid[top_tid]
    top_spliceidx = max(top_idx_counts, key=lambda idx: (top_idx_counts[idx], idx))
    return top_tid, top_spliceidx


def get_skipped_indel_postcas(bc):
    """Guess which repair genotype the skipped PostCas transcripts of `bc` belong to.

    Scans every non-'SKIPPED' target of the barcode and returns the entry
    with the largest count (first one wins on ties).

    Returns:
        `(tid, key)` where `key` is the `(spliceidx, indel)` key of the
        winning entry, or None when the barcode has at most one entry or no
        entry with a positive count.
    """
    per_tid = bc_tid_spliceidx_postcas_map[bc]
    if len(per_tid) <= 1:
        return None

    top_entry, top_count = None, 0
    for tid, key_counts in per_tid.items():
        if tid == 'SKIPPED':
            continue
        for key, count in key_counts.items():
            if count > top_count:
                top_entry, top_count = (tid, key), count

    return top_entry


def generate_gid_spliceidx_count_map(bc_tid_spliceidx_map, mode='precas'):
    """Pool per-barcode counts into a map id -> (spliceidx, payload) -> count.

    Skipped transcripts are stored under the key `(-1, payload)` of the entry
    they are attributed to; retained transcripts under `(spliceidx, payload)`.

    * mode 'precas': the outer key is the target id and the payload is
      `(designed target, 'N')`. For skipped transcripts the payload is
      `(spliceidx, (designed target, 'N'))`.
    * any other mode (PostCas): retained entries are keyed by the gRNA id
      resolved from the indel, and entries without one are dropped.

    Args:
        bc_tid_spliceidx_map: Nested map `[bc][tid][key] -> count`.
        mode: 'precas' for the PreCas layout, anything else for PostCas.

    Returns:
        defaultdict `[id][(spliceidx or -1, payload)] -> count`.
    """
    gid_spliceidx_count_map = collections.defaultdict(lambda: collections.defaultdict(int))
    is_precas = mode == 'precas'

    for bc in tqdm(bc_tid_spliceidx_map):
        per_tid = bc_tid_spliceidx_map[bc]
        for tid in per_tid:
            is_skipped = tid == 'SKIPPED'

            if is_precas and is_skipped:
                guessed_tid, spliceidx = get_skipped_tid_precas(bc)
                if guessed_tid is None:
                    continue
                skip_key = (-1, (spliceidx, (exp_tid_target_map[guessed_tid], 'N')))
                gid_spliceidx_count_map[guessed_tid][skip_key] += per_tid['SKIPPED'][-1]

            elif is_precas:
                wt_payload = (exp_tid_target_map[tid], 'N')
                for spliceidx in per_tid[tid]:
                    gid_spliceidx_count_map[tid][(spliceidx, wt_payload)] += per_tid[tid][spliceidx]

            elif is_skipped:
                guess = get_skipped_indel_postcas(bc)
                if guess is None:
                    continue
                # NOTE(review): the first element returned by
                # get_skipped_indel_postcas is a target id, whereas retained
                # PostCas entries below are keyed by gRNA id -- confirm that
                # mixing the two as outer keys is intended.
                owner_id, spliceidx_indel = guess
                gid_spliceidx_count_map[owner_id][(-1, spliceidx_indel)] += per_tid['SKIPPED'][-1]

            else:
                for payload in per_tid[tid]:
                    _, indel = payload
                    gid = get_gid_from_indel(indel, tid)
                    if gid is not None:
                        gid_spliceidx_count_map[gid][payload] += per_tid[tid][payload]

    return gid_spliceidx_count_map


def generate_indel_splice_count_map(gid_spliceidx_count_map):
    """Collapse per-id splice counts into one record per indel.

    Output layout::

        {
            indel: {
                'C': count of transcripts with Exon B skipped,
                'B': count of transcripts with Exon B retained,
                'spliceidx': representative splice index,
                'gid': id the counts were last seen under,
            }
        }

    Input keys are `(spliceidx, indel)` for retained transcripts and
    `(-1, (spliceidx, indel))` for skipped ones. For retained transcripts the
    representative splice index is the one with the highest count (first one
    wins on ties). Within one id, the retained record overrides the
    'spliceidx' and 'gid' written by the skipped record.

    Each id's entries are scanned once, and indels are inserted in
    first-seen order, so the result's ordering is deterministic.

    Args:
        gid_spliceidx_count_map: Mapping id -> {key: count} as described above.

    Returns:
        defaultdict `indel -> defaultdict(int)` with the fields listed above.
    """
    indel_splice_count_map = collections.defaultdict(lambda: collections.defaultdict(int))

    for gid, key_counts in gid_spliceidx_count_map.items():
        # indel -> [total retained count, best single count, best spliceidx]
        retained = {}

        for p, count in key_counts.items():
            if p[0] == -1:
                # Skipped transcripts: the payload carries the splice index.
                spliceidx, indel = p[1]
                record = indel_splice_count_map[indel]
                record['C'] += count
                record['spliceidx'] = spliceidx
                record['gid'] = gid
            else:
                stats = retained.setdefault(p[1], [0, 0, None])
                stats[0] += count
                if count > stats[1]:
                    stats[1] = count
                    stats[2] = p[0]

        # Applied after the scan so that retained records take precedence
        # over the skipped ones for 'spliceidx' and 'gid'.
        for indel, (total_counts, _, best_spliceidx) in retained.items():
            record = indel_splice_count_map[indel]
            record['B'] += total_counts
            record['spliceidx'] = best_spliceidx
            record['gid'] = gid

    return indel_splice_count_map

In [12]:
# Pool PreCas counts per id; only needed when the per-indel PreCas table has
# not been cached yet (otherwise this name stays undefined).
if not pickle_exists(INDEL_SPLICE_PRECAS_COUNT_MAP):
    gid_spliceidx_precas_count_map = generate_gid_spliceidx_count_map(bc_tid_spliceidx_precas_map, 'precas')

In [13]:
# Layout of the per-indel PreCas table:
# {
#     indel: {
#         C: count,
#         B: count,
#         spliceidx: ,
#         gid:
#     }
# }

# Load the cached table when available; otherwise build it and cache it.
if pickle_exists(INDEL_SPLICE_PRECAS_COUNT_MAP):
    indel_splice_precas_count_map = load_bc_seq(INDEL_SPLICE_PRECAS_COUNT_MAP)
else:
    indel_splice_precas_count_map = generate_indel_splice_count_map(gid_spliceidx_precas_count_map)
    save_bc_seq(indel_splice_precas_count_map, INDEL_SPLICE_PRECAS_COUNT_MAP)

In [14]:
# Pool PostCas counts per id; only needed when the per-indel PostCas table
# has not been cached yet (otherwise this name stays undefined).
if not pickle_exists(INDEL_SPLICE_POSTCAS_COUNT_MAP):
    gid_spliceidx_postcas_count_map = generate_gid_spliceidx_count_map(bc_tid_spliceidx_postcas_map, 'postcas')

In [15]:
# Layout of the per-indel PostCas table:
# {
#     indel: {
#         C: count,
#         B: count,
#         spliceidx: ,
#         gid:
#     }
# }

# Load the cached table when available; otherwise build it and cache it.
if pickle_exists(INDEL_SPLICE_POSTCAS_COUNT_MAP):
    indel_splice_postcas_count_map = load_bc_seq(INDEL_SPLICE_POSTCAS_COUNT_MAP)
else:
    indel_splice_postcas_count_map = generate_indel_splice_count_map(gid_spliceidx_postcas_count_map)
    save_bc_seq(indel_splice_postcas_count_map, INDEL_SPLICE_POSTCAS_COUNT_MAP)

In [16]:
def generate_gt_splice_count_map(indel_splice_count_map):
    """Aggregate per-indel skipped ('C') and retained ('B') counts per gRNA-target pair.

    Each indel record is assigned, through its 'gid' field, to the key
    `(gRNA sequence, designed target, gRNA orientation)`, and its 'C' and 'B'
    counts are added to that key's totals.

    Returns:
        defaultdict `(grna, design_t, orientation) -> {'C': count, 'B': count}`.
    """
    gt_totals = collections.defaultdict(lambda: collections.defaultdict(int))

    for _, counts in indel_iterator(indel_splice_count_map):
        gid = counts['gid']
        grna = exp_gid_grna_map[gid]
        design_t = exp_tid_target_map[exp_gid_tid_map[gid]]
        orientation = exp_grna_gid_map[grna][1]

        bucket = gt_totals[(grna, design_t, orientation)]
        for field in ('C', 'B'):
            bucket[field] += counts[field]

    return gt_totals

In [17]:
# PostCas totals per (gRNA, target, orientation): load from cache when
# available; otherwise aggregate and cache.
if pickle_exists(GT_SPLICE_COUNT_MAP):
    gt_splice_count_map = load_bc_seq(GT_SPLICE_COUNT_MAP)
else:
    gt_splice_count_map = generate_gt_splice_count_map(indel_splice_postcas_count_map)
    save_bc_seq(gt_splice_count_map, GT_SPLICE_COUNT_MAP)

In [18]:
# PreCas totals per (gRNA, target, orientation): load from cache when
# available; otherwise aggregate and cache.
if pickle_exists(GT_PRECAS_SPLICE_COUNT_MAP):
    gt_precas_splice_count_map = load_bc_seq(GT_PRECAS_SPLICE_COUNT_MAP)
else:
    gt_precas_splice_count_map = generate_gt_splice_count_map(indel_splice_precas_count_map)
    save_bc_seq(gt_precas_splice_count_map, GT_PRECAS_SPLICE_COUNT_MAP)

## Compute Empirical Exon B Retention Frequencies ($\Psi_R$, $\Psi_G$, and WT $\Psi$)

$\Psi = \frac{T_B + 1}{T_B + T_C + 2}$

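Here $T_B$ and $T_C$ are the counts of transcripts with Exon B retained and skipped, respectively; a pseudocount of 1 is added to each. The code below shows how these values are calculated. The same calculations will happen in subsequent notebooks for further analysis.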

In [19]:
# Pseudocount added to both the retained and the skipped transcript counts.
PSEUDO_COUNT = 1

def get_B_retention_f(indel_splice_count_map, count_thres):
    """Compute the Exon B retention frequency for each repair genotype.

    Only wildtype ('N'), deletion and 1-bp insertion records are considered,
    and only those whose total transcript count is strictly greater than
    `count_thres`. The frequency is one minus the pseudocounted skipping
    frequency.

    Args:
        indel_splice_count_map: Per-indel records with 'C', 'B' and 'gid' fields.
        count_thres: Records with `C + B <= count_thres` are ignored.

    Returns:
        Six parallel lists: retention frequencies, gids, indels, retained
        counts, skipped counts and total counts.
    """
    retention_fs, kept_gids, kept_indels = [], [], []
    retained_counts, skipped_counts, totals = [], [], []

    for indel, counts in indel_iterator(indel_splice_count_map):
        signature = indel[1]
        supported = (
            signature == 'N'
            or signature in DELETION_SIGNATURES
            or (signature in INSERTION_SIGNATURES and indel[2] == 1)
        )
        if not supported:
            continue

        n_skipped = counts['C']
        n_retained = counts['B']
        total = n_skipped + n_retained
        if total <= count_thres:
            continue

        skip_f = (n_skipped + PSEUDO_COUNT) / (total + 2*PSEUDO_COUNT)
        retention_fs.append(1 - skip_f)
        totals.append(total)
        retained_counts.append(n_retained)
        skipped_counts.append(n_skipped)
        kept_gids.append(counts['gid'])
        kept_indels.append(indel)

    return retention_fs, kept_gids, kept_indels, retained_counts, skipped_counts, totals


def get_aggBf_scores(gt_splice_count_map):
    """Compute the Exon B retention frequency aggregated per gRNA-target pair.

    For every key of `gt_splice_count_map` (whose first element is the gRNA
    sequence) the pseudocounted skipping frequency is computed from the 'C'
    and 'B' totals and converted to a retention frequency.

    Returns:
        Five parallel lists: retention frequencies, gRNA ids, retained
        counts, skipped counts and total counts.
    """
    scores, gids = [], []
    retained_counts, skipped_counts, totals = [], [], []

    for gt, counts in gt_splice_count_map.items():
        gids.append(exp_grna_gid_map[gt[0]][0])

        n_skipped = counts['C']
        n_retained = counts['B']
        skip_f = (n_skipped + PSEUDO_COUNT) / (n_skipped + n_retained + 2*PSEUDO_COUNT)

        scores.append(1 - skip_f)
        retained_counts.append(n_retained)
        skipped_counts.append(n_skipped)
        totals.append(n_skipped + n_retained)

    return scores, gids, retained_counts, skipped_counts, totals

$\Psi_R$ ($\Psi$ from a repair genotype):

In [20]:
# Consider only repair genotypes with MORE than count_thres observed
# transcripts when computing PSI values (get_B_retention_f drops records
# whose total is <= the threshold).
count_thres = 50
# Pass the named threshold instead of repeating the literal, so changing
# count_thres actually takes effect.
splice_precas_Bf, _, _, _, _, _ = get_B_retention_f(indel_splice_precas_count_map, count_thres)
(splice_postcas_Bf,
 splice_postcas_gids,
 splice_postcas_indels,
 splice_postcas_B,
 splice_postcas_C,
 splice_postcas_total) = get_B_retention_f(indel_splice_postcas_count_map, count_thres)

$\Psi_G$ ($\Psi$ from a gRNA, considering all its repair genotypes observed) and 

WT $\Psi$ ($\Psi$ from a gRNA, preCas9, so no repair genotypes):

In [21]:
# Per-gRNA retention frequencies after Cas9 editing.
(aggBf_postcas_scores,
 postcas_gids,
 aggBf_postcas_B,
 aggBf_postcas_C,
 aggBf_postcas_counts) = get_aggBf_scores(gt_splice_count_map)

# Per-gRNA retention frequencies of the PreCas9 control (wildtype).
(aggBf_precas_scores,
 precas_gids,
 aggBf_precas_B,
 aggBf_precas_C,
 aggBf_precas_counts) = get_aggBf_scores(gt_precas_splice_count_map)

In [22]:
# Average total (retained + skipped) transcript count over the PostCas repair
# genotypes that passed the count threshold.
print('Mean splice transcript count per repair genotype:', np.mean(splice_postcas_total))

Mean splice transcript count per repair genotype: 246.1883577851396


In [23]:
# One retention frequency is produced per retained repair genotype.
print("Num repair genotypes represented:", len(splice_postcas_Bf))

Num repair genotypes represented: 2113


In [24]:
# Distinct gRNA ids among the retained PostCas repair genotypes.
print("Num gRNAs represented:", len(set(splice_postcas_gids)))

Num gRNAs represented: 1063


## S3 Table

In [25]:
# S3 table: one row per PostCas repair genotype that passed the count
# threshold. Anything that is not a deletion is labelled 'ins' with length 1;
# upstream filtering admits only deletions, 1-bp insertions and 'N' records.
datB_postcas_df = pd.DataFrame({
    'gRNA ID': splice_postcas_gids,
    'Category': ['del' if indel[1] in DELETION_SIGNATURES else 'ins' for indel in splice_postcas_indels],
    'Genotype position': [indel[3] if indel[1] in DELETION_SIGNATURES else None for indel in splice_postcas_indels],
    'Inserted Bases': [indel[3] if indel[1] in INSERTION_SIGNATURES else None for indel in splice_postcas_indels],
    'Length': [indel[2] if indel[1] in DELETION_SIGNATURES else 1 for indel in splice_postcas_indels],
    'Exon B Retained Count': splice_postcas_B,
    'Exon B Skipped Count': splice_postcas_C
})

In [26]:
# Preview the first rows of the S3 table.
datB_postcas_df.head()

Unnamed: 0,gRNA ID,Category,Genotype position,Inserted Bases,Length,Exon B Retained Count,Exon B Skipped Count
0,5902,del,2.0,,2,128.5,0.0
1,5902,del,1.0,,1,144.0,2.0
2,3586,del,2.0,,3,41.0,10.5
3,5839,del,3.0,,8,50.5,11.0
4,119,del,0.0,,1,322.5,3.0


In [27]:
# Number of rows (repair genotypes) in the S3 table.
datB_postcas_df.shape[0]

2113

In [28]:
# Number of distinct gRNA ids in the S3 table.
datB_postcas_df['gRNA ID'].nunique()

1063

In [29]:
# Write the S3 table as a gzip-compressed CSV without the index column.
# NOTE(review): `os` and TABLES_DIR are not imported explicitly in this
# notebook; presumably they come from the star imports -- verify.
datB_postcas_df.to_csv(os.path.join(TABLES_DIR, 'datB_postCas_table.csv.gz'), index=False, compression='gzip')

In [30]:
# Total transcripts (retained + skipped) covered by the S3 table.
print("Total Reads:", datB_postcas_df['Exon B Retained Count'].sum() + datB_postcas_df['Exon B Skipped Count'].sum())

Total Reads: 520196.0


## S4 Table

In [31]:
# S4 table: one row per PreCas9 (wildtype) gRNA-target pair with its
# aggregated retained and skipped transcript counts.
datB_precas_df = pd.DataFrame({
    'gRNA ID': precas_gids,
    'Exon B Retained Count': aggBf_precas_B,
    'Exon B Skipped Count': aggBf_precas_C
})

In [32]:
# Preview the first rows of the S4 table.
datB_precas_df.head()

Unnamed: 0,gRNA ID,Exon B Retained Count,Exon B Skipped Count
0,5902,7940,105
1,1447,3,1
2,1335,943,33
3,1126,13,3
4,3586,890,30


In [33]:
# Number of rows in the S4 table.
datB_precas_df.shape[0]

1697

In [34]:
# Distinct target ids reached from the S4 table's gRNA ids. This is a bare
# tuple expression, so the notebook displays it as a tuple rather than printing it.
"Num Targets represented:", len(set(exp_gid_tid_map[g] for g in datB_precas_df['gRNA ID']))

('Num Targets represented:', 1697)

In [35]:
# gRNA ids that appear both among the thresholded PostCas repair genotypes
# and in the PreCas9 table.
print("(S3 Fig) Number of gRNAs in both PostCas9 and PreCas9 (WT) datasets:")
print(len(set(splice_postcas_gids) & set(precas_gids)))

(S3 Fig) Number of gRNAs in both PostCas9 and PreCas9 (WT) datasets:
735


In [36]:
# Write the S4 table as a gzip-compressed CSV without the index column.
# NOTE(review): `os` and TABLES_DIR are not imported explicitly in this
# notebook; presumably they come from the star imports -- verify.
datB_precas_df.to_csv(os.path.join(TABLES_DIR, 'datB_preCas_table.csv.gz'), index=False, compression='gzip')

In [37]:
# Total transcripts (retained + skipped) covered by the S4 table.
print("Total Reads:", datB_precas_df['Exon B Retained Count'].sum() + datB_precas_df['Exon B Skipped Count'].sum())

Total Reads: 3443915
