In [1]:
from config import *
from utils import *

import os
import sys
import regex
import copy
import numpy as np
import collections
import multiprocessing
import pickle

import numpy as np
import scipy

# Suppress pandas future warning, which messes tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd

from tqdm.notebook import tqdm

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

inline_rc = dict(mpl.rcParams)
import subprocess

import time

from keras.models import load_model
from pkg_resources import resource_filename
from spliceai.utils import one_hot_encode
import numpy as np

The examples.directory rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2. In the future, examples will be found relative to the 'datapath' directory.
Using TensorFlow backend.


# SpliceAI Predict
SpliceAI can predict the probability of a base being part of an acceptor given its surrounding sequence context.

## SpliceAI Predict dat-B
For every repair product observed in dat-B, use SpliceAI to detect acceptor sites: compute P(acceptor) at every base of the product and take the maximum over all bases as its acceptor score. The predictions are saved for later analysis in MetaSplice_SkipGuide_Evaluation.ipynb.

In [2]:
# Load the inputs used by the prediction cells below. The UPPER_CASE path
# constants and the load_* helpers are not defined in this notebook; presumably
# they come from the `config` / `utils` star imports -- confirm.
indel_splice_precas_count_map = load_bc_seq(INDEL_SPLICE_PRECAS_COUNT_MAP)
indel_splice_postcas_count_map = load_bc_seq(INDEL_SPLICE_POSTCAS_COUNT_MAP)
# The two gt_* maps are loaded here but not referenced by the cells visible below.
gt_splice_count_map = load_bc_seq(GT_SPLICE_COUNT_MAP)
gt_precas_splice_count_map = load_bc_seq(GT_PRECAS_SPLICE_COUNT_MAP)
# Indexed by guide/target pair in save_spliceai_predict_gt_indel_dist.
predicted_gt_indel_dist_map = load_var(PREDICTED_GT_INDEL_DIST_MAP)

In [3]:
# Fixed sequence segments of the construct. Only INTRON_TARGETSTART and
# TARGETEND_EXON_B are used by the cells below: predict_SpliceAIScores places
# them immediately before and after each (repaired) target sequence.
# NOTE(review): the remaining segments are presumably the other flanking parts
# of the same construct, named after what they span -- confirm.
EXON_A = 'CAAGATCCGCCACAACATCGAG'
INTRON_TARGETSTART = 'GTAAGTTATCACCTTCGTGGCTACAGAGTTTCCTTATTTGTCTCTGTTGCCGGCTTATATGGACAAGCATATCACAGCCATTTATCGGAGCGCCTCCGTACACGCTATTATCGGACGCCTCGCGAGATCAATACGATTACCAGCTGCCCTCGTCGAC'
TARGETEND_EXON_B = 'TGATTACACATATAGACACGCGAGCAGCCATCTTTTATAGAATGGGTAGAACCCGTCCTAAGGACTCAGATTGAGCATCGTTTGCTTCTCGAGTACTACCTGGTACAGATGTCTCTTCAAACAG'
EXON_C_BCSTART = 'GACGGCAGCGTGCAGCTCGCC'
BCEND_EXON_C = 'GACCACTACCAGCAGAACACCCC'

In [4]:
# Load spliceai models
# `context` sizes the 'N' padding in predict_SpliceAIScores: context // 2 'N's
# are prepended to every input and the tail is padded up to a fixed length.
context = 10000
# Relative paths of the five model files (spliceai1.h5 .. spliceai5.h5),
# resolved inside the installed `spliceai` package by resource_filename.
# `paths` is a generator, so it is exhausted once `models` has been built.
paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
models = [load_model(resource_filename('spliceai', x)) for x in paths]

Instructions for updating:
Colocations handled automatically by placer.




In [5]:
def spliceai_predict(payload, verbose=0):
    """Run a single model's ``predict`` on the GPU selected by the payload.

    Parameters
    ----------
    payload : tuple
        ``(i, model, x)``: ``i`` is the GPU index used to build the device
        string ``'/device:GPU:<i>'``, ``model`` is an object exposing
        ``predict(x, verbose=...)``, and ``x`` is the input batch passed to it.
    verbose : int, optional
        Forwarded to ``model.predict``. Previously the body read a name
        ``verbose`` that was neither a parameter nor defined in this notebook,
        so the call raised ``NameError``; it is now an explicit keyword.

    Returns
    -------
    Whatever ``model.predict`` returns for ``x``.
    """
    i, model, x = payload
    # NOTE(review): `tf` is not imported by name in the notebook's import cell;
    # presumably it is supplied by one of the star imports -- confirm.
    with tf.device('/device:GPU:' + str(i)):
        return model.predict(x, verbose=verbose)
    

def predict_SpliceAIScores(indel_splice_count_map, verbose=1):
    """Run the SpliceAI ensemble on every supported repair product.

    Each ``(indel, splice_count_map)`` pair yielded by
    ``indel_iterator(indel_splice_count_map)`` is turned into a sequence:
    the unedited target when ``indel[1] == 'N'``, or the output of
    ``get_simulated_product`` when ``indel[1]`` is a deletion signature or a
    1-bp insertion (``indel[2] == 1``). Every other indel is skipped.

    The sequence is embedded as
    ``'N' * (context // 2) + INTRON_TARGETSTART + product + TARGETEND_EXON_B``
    and right-padded with ``'N'`` to a common length before one-hot encoding.

    Parameters
    ----------
    indel_splice_count_map : mapping
        Passed to ``indel_iterator``; each yielded value must provide a
        ``'gid'`` entry used to look up the target sequence.
    verbose : int, optional
        Forwarded to each model's ``predict``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the predictions of all entries in ``models``.
        Row ``k`` corresponds to the ``k``-th *supported* indel in iteration
        order. Previously rows were written at the index over *all* indels
        while the array only had one row per supported indel, so any skipped
        indel misaligned the rows or raised ``IndexError``.
    """
    # target len 61, plus 1 possible insertion = 62 max
    max_seq_len = context + len(INTRON_TARGETSTART) + 62 + len(TARGETEND_EXON_B)
    left_pad = 'N' * (context // 2)

    # Filter once and build the padded sequences, so the array size and the
    # row indices are derived from the same list and cannot drift apart.
    sequences = []
    for indel, splice_count_map in indel_iterator(indel_splice_count_map):
        target = exp_tid_target_map[exp_gid_tid_map[splice_count_map['gid']]]
        if indel[1] == 'N':
            product = target
        elif indel[1] in DELETION_SIGNATURES or (indel[1] in INSERTION_SIGNATURES and indel[2] == 1):
            product = get_simulated_product(indel, target)
        else:
            continue

        input_sequence = left_pad + INTRON_TARGETSTART + product + TARGETEND_EXON_B
        input_sequence += 'N' * (max_seq_len - len(input_sequence))
        sequences.append(input_sequence)

    x = np.zeros((len(sequences), max_seq_len, 4))
    for row, input_sequence in enumerate(sequences):
        x[row, :, :] = one_hot_encode(input_sequence)

    # Ensemble average over every loaded model.
    return np.mean([model.predict(x, verbose=verbose) for model in models], axis=0)

In [6]:
def get_acceptor_score(preds):
    """Return, per sequence, the largest channel-1 value over all positions.

    ``preds`` is indexed as ``preds[sequence, position, channel]``; channel 1
    is the one this notebook treats as the acceptor probability.
    """
    acceptor_channel = preds[:, :, 1]
    return acceptor_channel.max(axis=1)

In [7]:
# Reuse the cached precas predictions when the file exists; otherwise run the
# SpliceAI ensemble on the precas count map and cache the result with np.save.
if os.path.exists(SPLICEAI_PRECAS_PREDS_PATH):
    with open(SPLICEAI_PRECAS_PREDS_PATH, 'rb') as f:
        spliceai_precas_preds = np.load(f)
else:
    spliceai_precas_preds = predict_SpliceAIScores(indel_splice_precas_count_map)
    with open(SPLICEAI_PRECAS_PREDS_PATH, 'wb') as f:
        np.save(f, spliceai_precas_preds)

In [8]:
# Reuse the cached postcas predictions when the file exists; otherwise run the
# SpliceAI ensemble on the postcas count map and cache the result with np.save.
if os.path.exists(SPLICEAI_POSTCAS_PREDS_PATH):
    with open(SPLICEAI_POSTCAS_PREDS_PATH, 'rb') as f:
        spliceai_postcas_preds = np.load(f)
else:
    spliceai_postcas_preds = predict_SpliceAIScores(indel_splice_postcas_count_map)
    with open(SPLICEAI_POSTCAS_PREDS_PATH, 'wb') as f:
        np.save(f, spliceai_postcas_preds)

## SpliceAI Predict on inDelphi Predictions
For every designed guide/target pair, we used inDelphi to predict the repair genotypes and their frequencies. For each predicted repair product, use SpliceAI to detect acceptor sites: compute P(acceptor) at every base of the product and take the maximum over all bases as its acceptor score. The scores are saved for later analysis in MetaSplice_SkipGuide_Evaluation.ipynb.

In [9]:
def save_spliceai_predict_gt_indel_dist(pairs):
    """Score every predicted repair product of each pair and save the result.

    For each pair ``p`` in ``pairs``, the predicted distribution
    ``predicted_gt_indel_dist_map[p]`` is scanned for deletions of size
    ``1..MAX_INDEL_LEN`` and for 1-bp insertions of each base in ``'AGTC'``.
    Every entry with a frequency above zero is registered as an indel tuple
    ``('', 'DS' | 'IS', size, genotype_pos_or_base, cutsite)``. The registered
    indels are scored with ``predict_SpliceAIScores`` followed by
    ``get_acceptor_score``.

    The mapping ``gid -> [acceptor scores, predicted frequencies]`` is written
    with ``save_var`` under ``SPLICEAI_GT_REPAIR_PREDS``. Nothing is returned.
    """
    gid_spliceai_repair_preds = {}

    for pair in tqdm(pairs):
        gid = exp_grna_gid_map[pair[0]][0]
        cutsite = get_cutsite(*pair)
        distribution = predicted_gt_indel_dist_map[pair]
        indel_splice_count_map = collections.defaultdict(lambda: collections.defaultdict(int))

        def register(indel, frequency):
            # Record the owning guide id and the predicted frequency of this product.
            entry = indel_splice_count_map[indel]
            entry['gid'] = gid
            entry['indelphifreq'] = frequency

        # Deletions are stored under negative size keys in the distribution.
        for deletion_size in range(1, MAX_INDEL_LEN + 1):
            for genotype_pos in distribution[-deletion_size]:
                frequency = distribution[-deletion_size][genotype_pos]
                if frequency > 0:
                    register(('', 'DS', deletion_size, genotype_pos, cutsite), frequency)

        # 1-bp insertions are stored under key 1, one entry per inserted base.
        for base in 'AGTC':
            frequency = distribution[1][base]
            if frequency > 0:
                register(('', 'IS', 1, base, cutsite), frequency)

        acceptor_scores = get_acceptor_score(
            predict_SpliceAIScores(indel_splice_count_map, verbose=0))
        indelphifreq = [
            indel_splice_count_map[indel]['indelphifreq']
            for indel, _ in indel_iterator(indel_splice_count_map)
        ]

        gid_spliceai_repair_preds[gid] = [acceptor_scores.tolist(), indelphifreq]

    save_var(gid_spliceai_repair_preds, SPLICEAI_GT_REPAIR_PREDS)

In [10]:
# Only run the scoring pass when no saved result exists yet.
if not pickle_exists(SPLICEAI_GT_REPAIR_PREDS):
    # One pair per guide id in the library, in exp_gid_tid_map iteration order.
    lib_pairs = list(map(gid_to_gt, exp_gid_tid_map))
    save_spliceai_predict_gt_indel_dist(lib_pairs)

In [11]:
# Reload the result written by save_spliceai_predict_gt_indel_dist, which stores
# gid -> [acceptor scores, predicted frequencies] under this same key.
gid_spliceai_repair_preds = load_var(SPLICEAI_GT_REPAIR_PREDS)