# Working with Dictionaries

In [None]:
from nose.tools import assert_equals, assert_true
from ipywidgets import interact
from IPython.display import HTML, display

**Exercise:** Create a dictionary variable named "sequences" that has as values sequence1 through sequence3 and as keys "seq1", "seq2", and "seq3".

In [None]:
# The "\" character can be used as a line continuation in Python, so each
# triple-quoted literal below is the value assigned on the line before it.
# The line breaks inside the literals are part of the strings ("\n" characters).
sequence1 = \
"""AGTTGTTAGTCTACGTGGACCGACAAGAACAGTTTCGAATCGGAAGCTTGCTTAACGTAGTTCTAACAGT
TTTTTATTAGAGAGCAGATCTCTGATGAACAACCAACGGAAAAAGACGGGTCGACCGTCTTTCAATATGC"""
sequence2= \
"""TGAAACGCGCGAGAAACCGCGTGTCAACTGTTTCACAGTTGGCGAAGAGATTCTCAAAAGGATTGCTTTC
AGGCCAAGGACCCATGAAATTGGTGATGGCTTTTATAGCATTCCTAAGATTTCTAGCCATACC"""
sequence3= \
"""GCAGGAATTTTGGCTAGATGGGGCTCATTCAAGAAGAATGGAGCGATCAAAGTGTTACGGGTCAAGA
AAGAAATCTCAAACATGTTGAACATAATGAACAGGAGGAAAAGATCTGTGACCATGCTCCTCATGCTGCT"""
sequence4= \
"""GCCCACAGCCCTGGCGTTCCATCTGACCACCCGAGGGGGAGAGCCGCACATGATAGTTAGCAAGCAGGAA
AGAGGAAAATCACTTTTGTTTAAGACCTCTGCAGGTGTCAACATGTGCACCCTTATTGCAATGGATTTGG
GAGAGTTATGTGAGGACACAATGACCTACAAATGCCCCCGGATCACTGAGACGGAACCAGATGACGTTGA
CTGTTGGTGCAATGCCACGGAGACATGGGTGACCTATGGAACATGTTCTCAAACTGGTGAACACCGACGA
GACAAACGTTCCGTCGCACTGGCACCACACGTAGGGCTTGGTCTAGAAACAAGAACCGAAACGTGGATGT"""


In [None]:
# Map the labels "seq1".."seq3" to the first three sequence strings.
sequences = dict(seq1=sequence1, seq2=sequence2, seq3=sequence3)

In [None]:
# assert_true's second argument is only the failure message, so the type has
# to be tested inside the asserted expression for the check to mean anything.
assert_true(isinstance(sequences, dict))
# Exactly the first three sequences are expected as entries.
assert_true("seq1" in sequences)
assert_true("seq4" not in sequences)
assert_true(sequences["seq2"]==sequence2)

In [None]:
# Build a dictionary by hand from two parallel lists: the key at position i
# is paired with the value stored at the same position.
values = [sequence1, sequence2, sequence3, sequence4]
keys = ["seq1", "seq2", "seq3", "seq4"]
dict1 = {}
for i, key in enumerate(keys):
    dict1[key] = values[i]

# Last expression of the cell, so the notebook displays the result.
dict1

In [None]:
# Print the built-in documentation for the dict type.
help(dict)

In [None]:
# zip pairs each key with the value at the same position; wrapping it in
# list() materialises the pairs so the notebook can display them.
list(zip(keys, values))

In [None]:
# dict() accepts an iterable of (key, value) pairs, so the zipped lists can be
# turned into a dictionary without an explicit loop.
dict2 = dict(zip(keys, values))
dict2

### Dictionary Comprehension

In [None]:
# A dictionary comprehension builds the mapping in a single expression;
# iterating over zip(keys, values) avoids indexing both lists by position.
{key: value for key, value in zip(keys, values)}

**Exercise:** Modify the function in the cell below so that it returns the count of how many times `kmer` occurs in `sequence`. Before counting, make sure both `sequence` and `kmer` consist only of valid DNA sequence symbols (i.e. `A`, `C`, `T`, and `G`). If either is not a proper DNA sequence, the function should fail (raise an exception).

In [None]:
def kmer_count(sequence="", kmer = "CCGATTCG"):
    """Count the non-overlapping occurrences of ``kmer`` in ``sequence``.

    Both arguments are normalised before use: newline characters are removed
    and the text is converted to uppercase, so multi-line or lowercase input
    is accepted.

    Parameters
    ----------
    sequence : str, optional
        DNA sequence to search. Defaults to the empty string.
    kmer : str, optional
        DNA sub-sequence to look for. Defaults to "CCGATTCG".

    Returns
    -------
    int
        Number of non-overlapping matches as counted by ``str.count``
        (for example "AA" is counted twice in "AAAA").

    Raises
    ------
    ValueError
        If ``kmer`` is empty after normalisation, or if either argument
        contains a character other than A, C, G or T.
    """
    valid_bases = set("ACGT")

    sequence = sequence.replace("\n", "").upper()
    kmer = kmer.replace("\n", "").upper()

    # str.count("") returns len(sequence) + 1, which is meaningless as a kmer
    # count, so an empty kmer is rejected like any other invalid input.
    if not kmer:
        raise ValueError("Invalid DNA: kmer is empty")

    if not (set(kmer) <= valid_bases and set(sequence) <= valid_bases):
        raise ValueError("Invalid DNA")

    return sequence.count(kmer)
    
        

In [None]:
# Print the documentation of the string method used to count substrings.
help(sequence1.count)

In [None]:
# Show the distinct characters in sequence1. The literal spans two lines, so
# the newline character appears in the set alongside the bases.
set(sequence1)

In [None]:
# Try the function out: count the occurrences of "ATGG" in sequence4.
kmer_count(sequence4, "ATGG")

In [None]:
# Expected number of occurrences in sequence4 for a handful of kmers,
# including one that does not occur at all.
for kmer, expected in [
    ("CTG", 7),
    ("CACACGTAGGG", 1),
    ("CAGG", 2),
    ("CAGGTTT", 0),
]:
    assert_equals(kmer_count(sequence4, kmer=kmer), expected)

## Exercise

In the following exercise we are going to use some of the [IPython widget capabilities](http://ipywidgets.readthedocs.io/en/stable/examples/Using%20Interact.html) to make an interactive tool for exploring the sequences. We want the following functionality:

### `@interact`
* We want a drop down menu where we select which sequence to explore
* We want a text box where we type in a kmer

### `kmer_present`

* If `kmer` is valid, we want to display the number of times (count) `kmer` occurs in the sequence
    * If the count is non-zero, modify the sequence so that the occurrences of `kmer` are displayed in red using
        * `<font color="red">some text</font>` (<font color="red">some text</font>).
    * Otherwise, just display the original sequence string unmodified
* If `kmer` is not valid, we should display either "NA" or a not-a-number value (e.g. `np.nan`)



In [None]:
@interact(sequence = sequences, kmer = ""
)
def kmer_present(sequence="", kmer = "CCGATTCG"):
    """Display an HTML table describing how often ``kmer`` occurs in ``sequence``.

    The table lists the sequence length (line breaks excluded), the number of
    non-overlapping occurrences reported by ``kmer_count``, and the sequence
    itself with every counted occurrence wrapped in ``<font color="red">``.
    If the kmer is empty or ``kmer_count`` raises ``ValueError``, the count is
    replaced by "Not a valid kmer" and the sequence is shown unmodified.

    Parameters
    ----------
    sequence : str
        Sequence to search. Under ``@interact`` it is selected from the
        ``sequences`` dictionary.
    kmer : str
        Sub-sequence to look for. Under ``@interact`` it is typed by the user
        and starts out empty.

    Returns
    -------
    None
        The table is rendered with ``display(HTML(...))``.
    """
    # Imported locally so the cell does not depend on an extra import cell.
    import re
    from html import escape

    kmer = kmer.replace("\n", "").upper()
    sequence = sequence.upper()

    row = '<tr><td>{0}</td><td>{1}</td></tr>\n'

    # Line breaks only lay out the sequence text; they are not bases, so they
    # must not contribute to the reported length.
    seq_length = len(sequence.replace("\n", ""))

    # The kmer is free text typed by the user, so it is escaped before being
    # placed in the markup; the same goes for a sequence that fails validation.
    kmer_label = "kmer: " + escape(kmer)
    high_sequence = escape(sequence)

    s = '<h3>KMER Values:</h3><table>\n'
    s += row.format("sequence length", seq_length)
    try:
        # An empty kmer would "match" between every pair of characters.
        if not kmer:
            raise ValueError("Empty kmer")
        count = kmer_count(sequence, kmer)
    except ValueError:
        s += row.format(kmer_label, "Not a valid kmer")
    else:
        s += row.format(kmer_label, count)
        if count > 0:
            # kmer_count ignores line breaks, so allow them between any two
            # bases of a match; this keeps the highlighted occurrences in
            # step with the displayed count.
            pattern = r"\n*".join(re.escape(base) for base in kmer)
            high_sequence = re.sub(
                pattern,
                lambda match: '<font color="red">%s</font>' % match.group(0),
                sequence,
            )
    s += '<tr><td>{0}</td><td>{1}</td></tr>'.format("sequence ", high_sequence)
    s += '</table>'

    display(HTML(s))