In [None]:
# default_exp tile

# tile

> Extract sgRNAs from a sequence

In [None]:
# export
import pandas as pd

In [None]:
# export
def get_sequence_kmers(sequence, k):
    """Slide a window of width k along a sequence, one position at a time

    sequence: str, sequence to split into windows |
    k: int, width of each window |

    returns: list of str, every length-k window in order of start position
        (overlapping windows included); empty when the sequence is shorter than k
    """
    n_windows = len(sequence) - k + 1
    kmers = []
    for start in range(n_windows):
        kmers.append(sequence[start:start + k])
    return kmers

In [None]:
from sgrna_designer import ensembl

# Tile a transcript sequence into 30-nt context windows and check the tiling invariants.
sequence = ensembl.get_transcript_sequence('ENST00000381577')
context_subsequences = get_sequence_kmers(sequence, 30)
# One window per start position, so there are 30 - 1 fewer windows than bases.
assert (30 - 1) == (len(sequence) - len(context_subsequences))
assert all(len(kmer) == 30 for kmer in context_subsequences)
# No window is repeated within this sequence.
assert len(context_subsequences) == len(set(context_subsequences))
# The first and last windows line up with the two ends of the sequence.
assert sequence[:30] == context_subsequences[0]
assert sequence[-30:] == context_subsequences[-1]

In [None]:
# export
def extract_subsequences(sequences, subseq_len, subseq_start):
    """Given a list of sequences extract a subsequence from each one

    sequences: list, sequences to slice (any iterable of str works) |
    subseq_len: int, length of the subsequence to extract |
    subseq_start: int, starting position of the subsequence in each sequence;
        negative values count from the end of the sequence (-6 for a Cas9 PAM) |

    returns: list
    """
    subseq_end = subseq_start + subseq_len
    if subseq_start < 0 and subseq_end >= 0:
        # A negative start whose window reaches the end of the sequence gives a
        # non-negative stop (e.g. seq[-3:0]); Python reads that stop as an index
        # from the front and returns ''. Slice through to the end instead.
        subseq_end = None
    return [seq[subseq_start:subseq_end] for seq in sequences]

In [None]:
# Take 4 nt starting 6 positions from the end of every context window.
pam_sequences = extract_subsequences(context_subsequences,
                                     subseq_len=4, subseq_start=-6)
assert 'TTCA' == pam_sequences[0]

In [None]:
# Take 20 nt starting at offset 4 of every context window.
sgrna_sequences = extract_subsequences(context_subsequences,
                                       subseq_len=20, subseq_start=4)
assert 'GGATATTTGCTGTCTTTATA' == sgrna_sequences[0]

In [None]:
# export
def build_sgrna_df(sequence, context_len, pam_start, pam_len,
                   sgrna_start, sgrna_len, pams=None):
    """Tile a sequence into context windows and tabulate the sgRNA in each one

    sequence: str, sequence to design sgRNAs against |
    context_len: int, length of each context window |
    pam_start: int, position of the PAM start within the context window |
    pam_len: int, length of the PAM |
    sgrna_start: int, position of the sgRNA start within the context window |
    sgrna_len: int, length of the sgRNA sequence |
    pams: list or None, keep only windows whose PAM is in this list;
        None keeps every window |

    returns: DataFrame with columns context_sequence, pam_sequence,
        sgrna_sequence and sgrna_relative_start (window offset in `sequence`
        plus sgrna_start)
    """
    contexts = get_sequence_kmers(sequence, context_len)
    candidates = pd.DataFrame({'context_sequence': contexts,
                               'context_relative_start': range(len(contexts))})
    candidates['pam_sequence'] = extract_subsequences(
        candidates['context_sequence'], pam_len, pam_start)
    if pams is not None:
        # Restrict to the requested PAMs and renumber rows from 0.
        has_wanted_pam = candidates['pam_sequence'].isin(pams)
        candidates = candidates[has_wanted_pam].reset_index(drop=True)
    candidates['sgrna_sequence'] = extract_subsequences(
        candidates['context_sequence'], sgrna_len, sgrna_start)
    # Shift the window offset by the sgRNA's offset inside the window.
    candidates['sgrna_relative_start'] = (candidates['context_relative_start']
                                          + sgrna_start)
    return candidates.drop(columns='context_relative_start')

In [None]:
# Design against the four NGG PAMs, then check that the reported start
# position recovers the first sgRNA directly from the source sequence.
sgrna_df = build_sgrna_df(sequence, context_len=30,
                          pam_start=-6, pam_len=3,
                          sgrna_start=4, sgrna_len=20,
                          pams=['CGG', 'TGG', 'AGG', 'GGG'])
assert (sgrna_df.loc[0, 'sgrna_sequence'] ==
        sequence[sgrna_df.loc[0, 'sgrna_relative_start']:
                 (sgrna_df.loc[0, 'sgrna_relative_start'] + 20)])