# Homework 4: Practical analysis with BioPython

In [2]:
import re

import Bio.SeqIO

## A real biological analysis: parsing barcodes
The reads that we just read as `seqreads_str` come from a real sequencing run of influenza virus HA and NA genes.

The sequences are as follows:

    5'-[end of HA]-AGGCGGCCGC-[16 X N]-3'
    
or 

    5'-[end of NA]-AGGCGGCCGC-[16 X N]-3'
    
The end of NA is:

    ...CACGATAGATAAATAATAGTGCACCAT
    
The end of HA is:

    ...CCGGATTTGCATATAATGATGCACCAT
    
The sequencing run reads from the reverse end of the molecules, so the first thing in the sequencing reads is the barcode followed by the constant sequence and the end of HA or NA.

In [31]:
print(len('CACGATAGATAAATAATAGTGCACCAT'))
print(len('CCGGATTTGCATATAATGATGCACCAT'))

27
27


For the homework, you are going to extend the code from real biological analysis of our FASTQ files in lectures 8 and 9.

As described in the Jupyter notebook for that lecture, the FASTQ reads can originate from **either** HA or NA, and that will be distinguished by the most 3' end of the read.
But in our example, we did not distinguish among reads matching to HA and NA, as we didn't even look far enough into the read to tell the identity.

For the homework, your goal is to write code that extends the material from lectures 8 and 9 to also distinguish between HA and NA.

Please include code to address each of the following questions. Please include code comments to explain what your code is attempting to accomplish. Don't forget to include references to the sources you used to obtain your answer, including your classmates (if you are working in groups).  

In [3]:
seqreads = list(Bio.SeqIO.parse('barcodes_R1.fastq', format='fastq'))

seqreads_str = [str(s.seq) for s in seqreads]

In [None]:
import pandas

barcode_counts_df = pandas.Series(barcode_counts).reset_index()

barcode_counts_df.columns = ['barcode', 'count']

barcode_counts_df

In [42]:

import re

import Bio.SeqIO


def reverse_complement(seq):
    """Get reverse complement of a DNA sequence.
    
    Parameters
    -----------
    seq : str
        Uppercase DNA sequence.
        
    Returns
    -------
    str
        Reverse complement of the sequence in upper case.
        
    Example
    --------
    >>> reverse_complement('ATGCAC')
    'GTGCAT'
    
    """
    rc_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    revcomplement = []
    for nt in reversed(seq.upper()):
        revcomplement.append(rc_dict[nt])
    return ''.join(revcomplement)

x = reverse_complement(seqreads_str[0]) #try to do a for loop to get the first 8 strings (try later)
#print (x)

def read_barcode(seqread, bclen, upstream='AGGCGGCCGC', HA='CCGGATTTGCATATAATGATGCACCAT'):
    """Identify barcode with known upstream sequence.
    
    Parameters
    ----------
    seqread : str
        Nucleotide sequence matching UPSTREAM-BARCODE read in reverse orientation.
    bclen : int
        Length of barcode
    upstream: str
        Sequence upstream of the barcode.
        
    Returns
    -------
    str or None
        Sequence of the barcode in the forward orientation, or `None` if no match to expected barcoded sequence.
        
    Example
    -------
    >>> read_barcode('TTTTTTTTTTTTTTTTGCGGCCGCCT', bclen=16)
    'AAAAAAAAAAAAAAAA'
        
    """
    barcode_matcher = re.compile(f'^(?P<barcode>[ACGT]{{{bclen}}})' + reverse_complement(upstream) + reserve_complement(HA))
    m = barcode_matcher.search(seqread)
    if not m:
        return None
    else:
        return reverse_complement(seqread[: bclen])

#trying to create a definition to read through seqreads_str and add to invalid_dict
def invalid_barcode(seqread, bclen = 16, upstream = 'AGGCGGCCGC'):
    """Identify barcode with known upstream sequence.
    
    Parameters
    ----------
    seqread : str
        Nucleotide sequence matching UPSTREAM-BARCODE read in reverse orientation.
    bclen : int
        Length of barcode
    upstream: str
        Sequence upstream of the barcode.
        
    Returns
    -------
    str or None
        Sequence of the barcode in the forward orientation, or `None` if no match to expected barcoded sequence.
        
    """
    invalid_dict ={}
    invalid_matcher = re.compile(f'^(?P<barcode>[ACGT]{{{bclen}}})' + reverse_complement ((?!^ABC$(upstream)))
    invalid_search = invalid_matcher.search(seqread)
    
    if invalid_search:
        invalid_dict += 1
    else:
        return None


    
    
# now read sequences and apply function
seqreads = list(Bio.SeqIO.parse('barcodes_R1.fastq', 'fastq'))
seqreads_str = [str(seqrecord.seq) for seqrecord in seqreads]

# Get the counts of all barcodes
#barcode_counts = {}
HA = {}
NA = {}



HA_count = 0
NA_count = 0
n_invalid = 0

for seq in seqreads_str:
    #seq = reverse_complement(seq)
    if reverse_complement(seq[26:53]) == 'CCGGATTTGCATATAATGATGCACCAT':
        HA_count += 1
    elif reverse_complement(seq[26:53]) == 'CACGATAGATAAATAATAGTGCACCAT':
        NA_count += 1
    else:
        n_invalid += 1
   
print(HA_count, NA_count, n_invalid)
#print(seqreads_str[0])
#print(reverse_complement(seqreads_str[0]))
#print(seqreads_str[0].find('GCGGCCGCCT')+1+len('GCGGCCGCCT'))
#print(seqreads_str[0][27:27+len('CACGATAGATAAATAATAGTGCACCAT')])
#print(reverse_complement(seqreads_str[0][26:26+len('CACGATAGATAAATAATAGTGCACCAT')]))
#print(seqreads_str[0].find('GCGGCCGCCT')+len('GCGGCCGCCT'))

#print(f"Parsed {len(seqreads_str)} sequences, of which {n_invalid} lacked valid barcodes")
#print (barcode_counts)
#print(seqreads_str[43])
#print(seqreads_str[43].find('GCGGCCGCCT'))
#print(read_barcode(seqreads_str[43], bclen=16))

n_invalid_dict = {}
n_invalid = 0

for seq in seqreads_str:
    invalid = invalid_barcode(seq, bclen=16)
    if invalid:
        n_invalid_dict[invalid] += 1



5299 3958 743


1. How many reads map to HA, and how many reads map to NA?

2. What is the HA barcode with the most counts (and how many counts)? Also answer the same question for NA.

In [44]:
HA = {} #creates empty dictionary that holds HA barcodes and values of each barcode
NA = {} # creates empty dictionary that holds NA barcodes and values of each barcode

HA_counts = 0 #starts an HA barcode counter
NA_counts = 0 #starts an NA barcode counter
n_invalid = 0 #starts an invalid counter

def read_barcode(seqread, bclen, upstream='AGGCGGCCGC', HA='CCGGATTTGCATATAATGATGCACCAT'): #defines the read_barcode function
    barcode_matcher = re.compile(f'^(?P<barcode>[ACGT]{{{bclen}}})' + reverse_complement(upstream) + reverse_complement(HA))
    #compiles a pattern that looks for the barcode, the given length (16), the reverse complement of upstream, and the reverse complement of HA gene
    m = barcode_matcher.search(seqread) #searches for specific pattern (barcode_matcher) in seqread
    if not m:
        return None
    else:
        return reverse_complement(seqread[: bclen])

seqreads = list(Bio.SeqIO.parse('barcodes_R1.fastq', 'fastq'))
seqreads_str = [str(seqrecord.seq) for seqrecord in seqreads]

for seq in seqreads_str:
    barcode = read_barcode(seq, bclen=16)
    if not barcode:
        n_invalid += 1
    elif barcode not in HA_counts:
        HA_counts[barcode] = 1
    else:
        HA_counts[barcode] += 1

TypeError: argument of type 'int' is not iterable