### Code and Sanity Checks

In [1]:
# Download data
!wget --no-check https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

--2018-12-22 21:50:47--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net... 52.85.83.57, 52.85.83.80, 52.85.83.108, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|52.85.83.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: ‘lambda_virus.fa.2’


2018-12-22 21:50:47 (2.94 MB/s) - ‘lambda_virus.fa.2’ saved [49270/49270]



In [2]:
from Bio.Seq import Seq
import Bio.SeqIO

# Leverage what I wrote in a previous course
def find_seq(seq, codon, n=0, codon_index=None):
    """
    Searches a sequence and returns the start location of every
    occurrence of the specified codon (overlapping hits included).

    The search is case-insensitive: both the sequence and the codon
    are upper-cased before matching.

    Note: code supports arbitrary sequences and lengths.
          So, "codon" here is perhaps too application
          specific.

    Args:
        seq (sequence): nucleotide sequence; anything offering
                        ``upper()`` and ``find(sub, start)``
                        (e.g. str, or the Seq objects the callers pass)
        codon (str): triplet codon to search for
        n (int): starting index for search (0)
        codon_index (list): list to store start locations.
                            Ignored when n == 0 (a fresh search always
                            starts from an empty list); when n != 0 the
                            hits are appended to it. Defaults to a new
                            empty list.

    Returns:
        codon_index (list): list of start locations
    """

    # A fresh search must never reuse a caller's (or a previous call's) list.
    # Using None as the default avoids the shared-mutable-default pitfall.
    if n == 0 or codon_index is None:
        codon_index = []

    # Case standardization, done once for the whole search
    haystack = seq.upper()
    needle = codon.upper()

    # Iterate instead of recursing: one stack frame per hit would exceed
    # Python's recursion limit on sequences with many occurrences.
    start = n
    while True:
        hit = haystack.find(needle, start)

        if hit == -1:
            return codon_index

        codon_index.append(hit)

        # Advance by one (not by len(codon)) so overlapping hits are found
        start = hit + 1

In [3]:
# Parse every FASTA record and keep the sequence of the last one
genome = list(Bio.SeqIO.parse('lambda_virus.fa', 'fasta'))[-1].seq

In [4]:
# Implement naive_rc
def naive_rc(reference, target, unique_only=False):
    """
    Exact-match search for a target on both strands of a reference.

    Offsets of the target itself come first in the result, followed by
    the offsets of its reverse complement. When the target equals its
    own reverse complement, only the forward search is performed so
    that no site is counted twice.

    Args:
        reference (Seq): reference genome
        target (Seq): target sequence
        unique_only (bool): when True, the combined offsets are passed
                            through a set to drop duplicates (the
                            original ordering is not kept)

    Returns:
        occurrences (list): list of occurrences
    """

    # Forward-strand hits
    hits = find_seq(reference, target)

    # Compute the reverse complement once and reuse it below
    revcomp = target.reverse_complement()

    # Only search the other strand when it is a different sequence
    if target != revcomp:
        hits = hits + find_seq(reference, revcomp)

    return list(set(hits)) if unique_only else hits


# Question 5 requires something slightly different (mismatch tolerance)
def naive_mismatch(reference, target, n_mismatch=2):
    """
    Naive matcher that tolerates a bounded number of mismatches.

    Every alignment of the target against the reference is tried; an
    alignment is reported when the number of differing positions does
    not exceed n_mismatch. Only the given strand is examined.

    Args:
        reference (Seq): reference genome
        target (Seq): target sequence
        n_mismatch (int): maximum number of mismatching positions
                          allowed per alignment (2)

    Returns:
        occurrences (list): list of start offsets of accepted alignments
    """

    pattern_len = len(target)
    hits = []

    # Try every alignment of the target inside the reference
    for start in range(len(reference) - pattern_len + 1):
        errors = 0

        for offset in range(pattern_len):
            if reference[start + offset] != target[offset]:
                errors += 1

            # Budget exceeded: give up on this alignment
            if errors > n_mismatch:
                break
        else:
            # Loop ran to completion without exhausting the budget
            hits.append(start)

    return hits

In [5]:
# Sanity check: forward-strand hits are listed first, then reverse-complement hits
naive_rc(reference=Seq('GGGCCCGTGCAATGGG'), target=Seq('GGG'))

[0, 13, 3]

### Examples Provided in Course

In [6]:
# Example 1: CCC on the forward strand, GGG as its reverse complement
ten_as = 'A' * 10
p = 'CCC'
t = ten_as.join(['', 'CCC', 'GGG', ''])

naive_rc(Seq(t), Seq(p))

[10, 23]

In [7]:
# Example 2
p = 'CGCG'
t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as

# CGCG is its own reverse complement, so each site must be reported only once.
# Assert it: a bare comparison in the middle of a cell is evaluated and then
# silently discarded, so it checks nothing.
assert Seq(p).reverse_complement() == Seq(p)
naive_rc(Seq(t), Seq(p))

[10, 24]

In [8]:
# Example 3
!wget http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa
phix_genome = list(Bio.SeqIO.parse('phix.fa', 'fasta')).pop().seq
occurrences = naive_rc(phix_genome, Seq('ATTA'))
min(occurrences)

--2018-12-22 21:50:47--  http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa
Resolving d396qusza40orc.cloudfront.net... 52.85.83.112, 52.85.83.100, 52.85.83.50, ...
Connecting to d396qusza40orc.cloudfront.net|52.85.83.112|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5528 (5.4K) [application/octet-stream]
Saving to: ‘phix.fa’


2018-12-22 21:50:47 (3.44 MB/s) - ‘phix.fa’ saved [5528/5528]



62

In [9]:
# Example 1: one exact site plus sites with one and two mismatches
ten_as = 'A' * 10
p = 'CTGT'
t = ten_as.join(['', 'CTGT', 'CTTT', 'CGGG', ''])
naive_mismatch(t, p)

[10, 24, 38]

In [10]:
# Example 2: leftmost offset in phiX where GATTACA matches with up to 2 mismatches
p, t = 'GATTACA', str(phix_genome)

occurrences = naive_mismatch(reference=t, target=p)
min(occurrences)

10

### Quiz

In [11]:
# Load the lambda virus genome used by the quiz questions (last FASTA record)
genome = list(Bio.SeqIO.parse('lambda_virus.fa', 'fasta'))[-1].seq

In [12]:
# Question 1: number of offsets where AGGT or its reverse complement occurs
target = Seq('AGGT')
occurrences = naive_rc(genome, target)
len(occurrences)

306

In [13]:
# Question 2: TTAA equals its own reverse complement, so naive_rc only
# searches the forward strand and no site is counted twice
target = Seq('TTAA')
occurrences = naive_rc(genome, target)
len(occurrences)

195

In [14]:
# Question 3: leftmost offset of ACTAAGT or its reverse complement
target = Seq('ACTAAGT')
occurrences = naive_rc(genome, target)
min(occurrences)

26028

In [15]:
# Question 4: leftmost offset of AGTCGA or its reverse complement
target = Seq('AGTCGA')
occurrences = naive_rc(genome, target)
min(occurrences)

450

In [16]:
# Question 5: number of alignments of TTCAAGCC with at most 2 mismatches
# (naive_mismatch looks at the given strand only)
target = Seq('TTCAAGCC')
occurrences = naive_mismatch(reference=genome, target=target)
len(occurrences)

191

In [17]:
# Question 6: leftmost alignment of AGGAGGTT with at most 2 mismatches
target = Seq('AGGAGGTT')
occurrences = naive_mismatch(reference=genome, target=target)
min(occurrences)

49

In [18]:
# Question 7
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
err_genome = list(Bio.SeqIO.parse('ERR037900_1.first1000.fastq', 'fastq'))


--2018-12-22 21:50:48--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
Resolving d28rh4a8wq0iu5.cloudfront.net... 52.85.83.57, 52.85.83.80, 52.85.83.108, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|52.85.83.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 241626 (236K) [application/octet-stream]
Saving to: ‘ERR037900_1.first1000.fastq’


2018-12-22 21:50:48 (2.25 MB/s) - ‘ERR037900_1.first1000.fastq’ saved [241626/241626]



In [19]:
# Stack the per-read phred scores into one numpy array for faster processing.
# NOTE(review): a clean 2-D array assumes all reads have the same length — confirm.
import numpy as np

phred_quality = np.array([
    read.letter_annotations['phred_quality']
    for read in err_genome
])

# Average over reads (axis 0), leaving one mean score per read offset
mean_quality = np.mean(phred_quality, axis=0)

In [20]:
# Offset(s) at which the mean quality reaches its minimum
lowest = mean_quality.min()
np.nonzero(mean_quality == lowest)

(array([66]),)

In [21]:
# Mean quality at offsets 60-69, i.e. around the minimum at offset 66 found in the previous cell
mean_quality[60:70]

array([30.805, 30.666, 30.343, 30.131, 29.463, 17.891,  4.526, 17.881,
       17.865, 17.852])

In [1]:
# NOTE(review): scratch cell — a bare `[1:1:10]` is not valid Python (slice
# syntax is only allowed inside a subscript), so this cell raises SyntaxError.
# Its In [1] counter also suggests it was run out of sequence. Remove or fix
# before sharing so the notebook survives Restart & Run All.
[1:1:10]

SyntaxError: invalid syntax (<ipython-input-1-75d8630360ff>, line 1)