## Overview

This workbook contains code and notes used to answer week 3's programming assignment.

In [1]:
# Need to load the genome
from Bio.Seq import Seq
import Bio.SeqIO

# Parse the whole FASTA file and keep the sequence of its final record
genome = list(Bio.SeqIO.parse('chr1.GRCh38.excerpt.fasta', 'fasta'))[-1].seq

In [2]:
# Prefer a numpy array over a list of lists
import numpy as np

# We also need an approximate matching algorithm
# As mentioned in the programming notes, this is similar to the exact alignment algorithm
# but critically, the first row is initialized as zeros.
def approximate_match(P, T):
    """
    Find the smallest edit distance between P and any stretch of T.

    This is the edit-distance dynamic programme with one change: the
    first row of the matrix stays at zero, so an alignment of P may start
    at any offset of T at no cost. The first column still holds 0..len(P).

    Args:
        P (str): pattern to match to T
        T (str): reference sequence (e.g., genome)

    Returns:
        edit_distance (integer): minimum value found in the last row of
                                 the matrix, i.e. the best approximate
                                 match of the whole pattern
    """

    # One extra row and column for the initialization terms
    n_rows = len(P) + 1
    n_cols = len(T) + 1

    # np.zeros already provides the all-zero first row
    table = np.zeros((n_rows, n_cols), dtype=int)
    table[:, 0] = np.arange(n_rows)

    for i in range(1, n_rows):
        p_char = P[i - 1]

        for j in range(1, n_cols):
            # The diagonal step is free only when the two letters agree
            penalty = 0 if p_char == T[j - 1] else 1

            from_diag = table[i - 1, j - 1] + penalty
            from_above = table[i - 1, j] + 1
            from_left = table[i, j - 1] + 1

            table[i, j] = min(from_above, from_left, from_diag)

    # The pattern may end anywhere in T, so take the best cell of the last row
    return table[-1, :].min()

In [3]:
# Parse the FASTA excerpt again and display every record it contains
list(Bio.SeqIO.parse('chr1.GRCh38.excerpt.fasta', 'fasta'))

[SeqRecord(seq=Seq('TTGAATGCTGAAATCAGCAGGTAATATATGATAATAGAGAAAGCTATCCCGAAG...AGG', SingleLetterAlphabet()), id='CM000663.2_excerpt', name='CM000663.2_excerpt', description='CM000663.2_excerpt EXCERPT FROM CM000663.2 Homo sapiens chromosome 1, GRCh38 reference primary assembly', dbxrefs=[])]

## Example 01

Example provided in the programming reading section. The function above is tested against it to make sure it works as expected.

In [4]:
# Pattern (P) and text (T) for the worked example
P = 'GCGTATGC'
T = 'TATTGGCTATACGGTT'

# Best approximate-match distance of P anywhere within T
approximate_match(P, T)

2

## Question 01

In [5]:
# Smallest edit distance between this pattern and the loaded genome excerpt
P = 'GCTGATCGATCGTACG'
approximate_match(P, genome)

3

## Question 02

In [6]:
# Same search as the previous question, with a different pattern
P = 'GATTTACCAGATTGAG'
approximate_match(P, genome)

2

## Question 03

In [7]:
# This contains all the reads in the file, as a list of records.
# NOTE: this rebinds `genome`, which the earlier cells used for the
# reference sequence; from here on it refers to the reads.
genome = list(Bio.SeqIO.parse('ERR266411_1.for_asm.fastq', 'fastq')) # every read is loaded; no subsetting is applied here

In [8]:
# Display the first read record
genome[0]

SeqRecord(seq=Seq('TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGA...CTC', SingleLetterAlphabet()), id='ERR266411.1', name='ERR266411.1', description='ERR266411.1 HS18_09233:8:1307:10911:3848#168/1', dbxrefs=[])

In [20]:
def overlap(seq_suffix, seq_prefix, min_length=3):
    """
    Measure the exact overlap between the end of one sequence and the
    start of another. No mismatches are tolerated.

    The first min_length letters of seq_prefix are used as a seed. Each
    place the seed occurs in seq_suffix, scanning left to right, is tested
    as the start of a suffix that must equal the beginning of seq_prefix.
    The first hit is the leftmost one and therefore the longest overlap.

    Args:
        seq_suffix (str): sequence whose suffix will be compared
                          against the seq_prefix
        seq_prefix (str): sequence whose prefix will be compared
                          against the seq_suffix
        min_length (int): number of leading letters of seq_prefix used
                          as the seed

    Returns:
        overlap_length (int): length of the overlapping suffix/prefix,
                              or 0 when no overlap is found
    """

    seed = seq_prefix[:min_length]

    # Walk through every occurrence of the seed, left to right
    position = seq_suffix.find(seed, 0)

    while position != -1:

        # The whole tail from this occurrence must be a prefix of seq_prefix
        if seq_prefix.startswith(seq_suffix[position:]):
            return len(seq_suffix) - position

        position = seq_suffix.find(seed, position + 1)

    # Ran out of seed occurrences without a full suffix/prefix match
    return 0

In [21]:
# Quick check: 'GTC' ends the first string and begins the second
overlap('ATGGTC', 'GTCCCC')

3

In [66]:
# As mentioned in question 3, it is very slow to compare all possible combinations of reads from even a small
# set of reads. Instead, build an index out of the reads. To shorten our search path, we'll build a custom
# index class that will create a lookup for the desired k-mer.
#
# To do this, I modified previous work using a k-mer index.
class ReadDict(object):
    """
    Lightweight lookup from a read's ID to its sequence record.

    The original collection is kept on `reads`; `read_dict` holds the
    same records keyed by their `id` attribute. If two records share an
    ID, the later one replaces the earlier one.
    """

    def __init__(self, reads):
        """Store `reads` and build the ID-keyed dictionary from it."""

        self.reads = reads
        self.read_dict = {record.id: record for record in reads}

    def get_read(self, read_id):
        """Return the record stored under `read_id` (KeyError if absent)."""

        record = self.read_dict[read_id]
        return record

    def get_read_seq(self, read_id):
        """Return the `seq` attribute of the record stored under `read_id`."""

        record = self.read_dict[read_id]
        return record.seq

class Index(object):
    """
    k-mer index over a collection of reads.

    `index` is a plain dict that maps each k-mer (as a str) to the set
    of IDs of the reads containing that k-mer; `k` records the k-mer
    length the index was built with.
    """

    def __init__(self, reads, k):
        """Remember k and index every read in `reads`."""

        self.k = k
        self.index = self._get_read_kmers(reads, k)

    def _get_read_kmers(self, reads, k):
        """Return a dict of k-mer string -> set of IDs of reads holding it."""

        table = {}

        for record in reads:

            sequence, read_id = record.seq, record.id

            # Last offset at which a full k-mer still fits in the read
            last_start = len(sequence) - k

            for start in range(last_start + 1):

                kmer = str(sequence[start:start + k])
                table.setdefault(kmer, set()).add(read_id)

        return table


# Now, use an index and a ReadDict to do the comparisons
def overlap_fast(seq_record, index, read_dict, min_length=3):
    """
    Find the reads whose prefix exactly overlaps the suffix of seq_record.

    The k-mer index is used to shortlist the reads that contain the last
    min_length letters of the sequence; each shortlisted read is then
    confirmed with overlap(). No differences are tolerated.

    Args:
        seq_record (SeqRecord, Seq or str): read whose suffix is tested.
                          Anything with `seq` and `id` attributes is
                          treated as a record and its own ID is left out
                          of the result; a bare sequence has no ID, so
                          nothing is excluded.
        index (Index): k-mer index; must have been built with
                       k == min_length
        read_dict (ReadDict): used to fetch the sequence of each
                              candidate read by ID
        min_length (int): minimum overlap length

    Returns:
        overlap_seq (set): IDs of the reads whose prefix overlaps the
                           suffix of seq_record; empty if there are none

    Raises:
        ValueError: if the index was built with a k other than min_length
                    (its keys could never match the suffix being looked up)
    """

    if index.k != min_length:
        raise ValueError(
            'index was built with k={}, but min_length={}'.format(index.k, min_length)
        )

    # Support both records and bare sequences
    if hasattr(seq_record, 'seq'):
        seq, own_id = seq_record.seq, seq_record.id
    else:
        seq, own_id = seq_record, None

    # A read shorter than min_length cannot overlap anything by min_length
    if len(seq) < min_length:
        return set()

    # Index keys are plain strings, so look the suffix up as a string
    suffix = str(seq[len(seq) - min_length:])

    # Candidate read IDs; a k-mer missing from the index means no candidates.
    # The set difference removes the self-referential comparison.
    candidates = index.index.get(suffix, set()) - {own_id}

    # Keep only the candidates that overlap() confirms
    return {
        seq_id
        for seq_id in candidates
        if overlap(seq, read_dict.get_read_seq(seq_id), min_length) > 0
    }


def overlap_batch(index, read_dict, min_length):
    """
    Run overlap_fast for every read held in read_dict.

    Args:
        index (Index): an Index object
        read_dict (ReadDict): the reads to process, keyed by ID
        min_length (int): minimum overlap length

    Returns:
        overlap_seq (dict): maps each read ID to the set that
                            overlap_fast returns for that read, i.e. the
                            IDs of reads whose prefix overlaps its suffix.
    """

    # One entry per read, including reads with no overlaps (empty set)
    return {
        read_id: overlap_fast(record, index, read_dict, min_length)
        for read_id, record in read_dict.read_dict.items()
    }
        

### Example 01

Test case to make sure the code is working as expected.

In [91]:
# Three toy reads; each read's string doubles as its record ID
reads = [
    Bio.SeqRecord.SeqRecord(seq=seq, id=seq)
    for seq in ('ABCDEFG', 'EFGHIJ', 'HIJABC')
]

# Index and lookup are built from the same reads with the same k
min_length = 4
read_dict = ReadDict(reads)
index = Index(reads, min_length)

overlap_batch(index, read_dict, min_length)

# So, no overlaps

{'ABCDEFG': set(), 'EFGHIJ': set(), 'HIJABC': set()}

In [92]:
# Repeat with a shorter minimum overlap; the index must be rebuilt for the new k
min_length = 3
read_dict = ReadDict(reads)
index = Index(reads, min_length)

overlap_batch(index, read_dict, min_length)

{'ABCDEFG': {'EFGHIJ'}, 'EFGHIJ': {'HIJABC'}, 'HIJABC': {'ABCDEFG'}}

### Example 02

In [93]:
# Six short reads, each using its own sequence as its record ID
reads = [
    Bio.SeqRecord.SeqRecord(seq=seq, id=seq)
    for seq in ('CGTACG', 'TACGTA', 'GTACGT', 'ACGTAC', 'GTACGA', 'TACGAT')
]

min_length = 4
read_dict = ReadDict(reads)
index = Index(reads, min_length)

overlap_batch(index, read_dict, min_length)

# With a 4-letter minimum, every read except 'TACGAT' overlaps at least one other

{'CGTACG': {'GTACGA', 'GTACGT', 'TACGAT', 'TACGTA'},
 'TACGTA': {'ACGTAC', 'CGTACG'},
 'GTACGT': {'ACGTAC', 'TACGTA'},
 'ACGTAC': {'CGTACG', 'GTACGA', 'GTACGT'},
 'GTACGA': {'TACGAT'},
 'TACGAT': set()}

In [94]:
# Tighten the minimum overlap to 5 and rebuild the index to match
min_length = 5
read_dict = ReadDict(reads)
index = Index(reads, min_length)

overlap_batch(index, read_dict, min_length)

{'CGTACG': {'GTACGA', 'GTACGT'},
 'TACGTA': {'ACGTAC'},
 'GTACGT': {'TACGTA'},
 'ACGTAC': {'CGTACG'},
 'GTACGA': {'TACGAT'},
 'TACGAT': set()}

In [95]:
# Minimum overlap length; also used as the k-mer length of the index
min_length = 30

# ID -> record lookup over the same collection the index is built from
read_dict = ReadDict(genome)

# Build the index (this is costly to build initially)
index = Index(genome, min_length)

In [96]:
# Compute the overlaps for every read. The minimum overlap length is taken
# from the index itself so it always matches the k the index was built with.
overlap_seq = overlap_batch(index, read_dict, min_length=index.k)



In [97]:
# Count the edges: one edge per (read, overlapping read) pair
edge_count = sum(len(targets) for targets in overlap_seq.values())
edge_count

904746

In [98]:
# Count the reads whose suffix overlaps at least one other read
node_count = sum(1 for targets in overlap_seq.values() if targets)
node_count

7161