### Hyun-Joon Yang
### yanghyun@usc.edu
### QBIO 401
### Assignment 1

#### 1. Write a Python function that takes as input a FASTA file and returns a sequence string

In [1]:
def openFASTA(filename: str) -> str:
    """Read a FASTA file and return its sequence data as one string.

    Blank lines, header lines (starting with '>') and comment lines
    (starting with ';') are skipped. Surrounding whitespace is stripped
    from every line, and a single trailing '*' on a line is dropped.
    All remaining lines are concatenated in file order, so a file with
    several records yields one combined string.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The concatenated sequence text.

    Raises:
        OSError: If the file cannot be opened.
    """
    chunks = []
    with open(filename) as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip blank lines, '>' headers and ';' comments.
            if not line or line.startswith(('>', ';')):
                continue
            # A trailing '*' marks the end of a sequence; it is not data.
            chunks.append(line[:-1] if line.endswith('*') else line)
    # Join once at the end instead of growing a string line by line.
    return ''.join(chunks)

#### 2. Write a Python function that takes as input a sequence string and returns a list with 4 entries that are the number of A, C, G, and T in the sequence

In [2]:
def countACGT(sequence: str):
    """Tally how often each of the bases A, C, G and T occurs.

    Matching is case-sensitive; any other character is ignored.

    Args:
        sequence: The sequence text to scan.

    Returns:
        A dict with the keys 'A', 'C', 'G' and 'T' (in that order) mapped
        to their occurrence counts.
    """
    tally = dict.fromkeys('ACGT', 0)
    for symbol in sequence:
        # Anything that is not one of the four tracked bases is skipped.
        if symbol not in tally:
            continue
        tally[symbol] += 1
    return tally

#### 3. Write a Python function that takes two inputs: a sequence string and a string of two letters (e.g., “CG” or “CT”). This function returns the number of times the two letters occur consecutively in the sequence

In [3]:
def countSubstring(sequence: str, substr: str):
    """Count the occurrences of substr in sequence, overlaps included.

    Every start position is checked, so 'AA' occurs twice in 'AAA'.

    Args:
        sequence: The text to search in.
        substr: The text to search for.

    Returns:
        The number of start positions at which substr matches. This is 0
        when substr is empty or longer than sequence.
    """
    width = len(substr)
    # An empty pattern, or one longer than the sequence, never matches.
    if width == 0 or width > len(sequence):
        return 0
    last_start = len(sequence) - width
    return sum(
        1
        for start in range(last_start + 1)
        if sequence[start:start + width] == substr
    )

#### 4. For each of the two FASTA files, print the output of function #2 and function #3 with input “CG”. Compare the results and describe your finding

In [6]:
# Load both sequences once; the cells below reuse these two variables.
# The paths are relative, so both files must be in the working directory.
seq_homo = openFASTA('homo.fasta')
seq_dros = openFASTA('drosophila.fasta')

In [7]:
# Base counts (A, C, G, T) for each sequence: human first, then Drosophila.
print(countACGT(seq_homo))
print(countACGT(seq_dros))

{'A': 1773, 'C': 1139, 'G': 1410, 'T': 1751}
{'A': 2395, 'C': 1876, 'G': 1675, 'T': 1718}


In [8]:
# Number of 'CG' dinucleotides in each sequence: human first, then Drosophila.
print(countSubstring(seq_homo, 'CG'))
print(countSubstring(seq_dros, 'CG'))

99
455


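The Drosophila orthologue is longer than the human one (7,664 vs. 6,073 counted A/C/G/T bases), although the number of thymines in each is similar (1,718 vs. 1,751). The dinucleotide 'CG' also appears far more often in the Drosophila gene than in the Homo sapiens one (455 vs. 99 occurrences), a roughly 4.6-fold difference that is much larger than the difference in length alone would explain.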

#### 5. Write another Python function that takes as input a sequence string and returns a list with 16 entries that are the outputs of function #3 for all 16 possible two letter strings

In [4]:
def countAllSubstrings(sequence: str):
    """Count every two-letter combination of A, C, G and T in sequence.

    Args:
        sequence: The sequence text to scan.

    Returns:
        A dict with 16 entries, keyed 'AA', 'AC', ... 'TT' in that order,
        each mapped to the result of countSubstring for that pair.
    """
    return {
        first + second: countSubstring(sequence, first + second)
        for first in 'ACGT'
        for second in 'ACGT'
    }

#### TESTS

In [5]:
import os
import tempfile
import unittest

class Test_openFASTA(unittest.TestCase):
    """Exercise openFASTA against a small protein FASTA fixture.

    The fixture holds a '>' header, a ';' comment, three sequence lines
    (the last one ending in '*') and a trailing blank line.
    """

    def setUp(self):
        fasta = ">MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken\n"
        fasta += "; a sample sequence in FASTA format\n"
        fasta += "ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID\n"
        fasta += "FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA\n"
        fasta += "DIDGDGQVNYEEFVQMMTAK*\n\n"
        # Use a private temp file rather than a fixed name in the working
        # directory, so no existing file is overwritten and nothing is
        # left behind after the run.
        with tempfile.NamedTemporaryFile(
                'w', suffix='.fasta', delete=False) as f:
            f.write(fasta)
        self.fasta_path = f.name
        # Registered cleanups run even when a test method fails.
        self.addCleanup(os.remove, self.fasta_path)

    def test_ignoresHeader(self):
        header = ">MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken"
        sequence = openFASTA(self.fasta_path)
        self.assertNotIn(header, sequence)

    def test_ignoresComments(self):
        comment = "; a sample sequence in FASTA format"
        sequence = openFASTA(self.fasta_path)
        self.assertNotIn(comment, sequence)

    def test_ignoresAsterisk(self):
        sequence = openFASTA(self.fasta_path)
        self.assertNotEqual('*', sequence[-1])

    def test_ignoresNewline(self):
        sequence = openFASTA(self.fasta_path)
        self.assertNotIn('\n', sequence)

    def test_getsCorrectSequence(self):
        expected = 'ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID'
        expected += 'FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA'
        expected += 'DIDGDGQVNYEEFVQMMTAK'
        sequence = openFASTA(self.fasta_path)
        self.assertEqual(expected, sequence)

class Test_countACGT(unittest.TestCase):
    """Checks the per-base tallies returned by countACGT."""

    def test_default(self):
        # One of each base.
        wanted = {'A': 1, 'T': 1, 'G': 1, 'C': 1}
        self.assertEqual(wanted, countACGT('ATGC'))

    def test_long(self):
        bases = 'ATGCATGCTAGCTGATCGTGCGCTGTCGATCGATGCTAGCTATGCTGATCGATGCTAGGTAG'
        wanted = {'A': 12, 'T': 18, 'G': 19, 'C': 13}
        self.assertEqual(wanted, countACGT(bases))

    def test_unknownBase(self):
        # Letters other than A, C, G and T must not be counted.
        wanted = {'A': 1, 'T': 0, 'G': 1, 'C': 1}
        self.assertEqual(wanted, countACGT('ABCDEFG'))

    def test_blank(self):
        # An empty sequence still yields all four keys.
        wanted = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
        self.assertEqual(wanted, countACGT(''))
        
class Test_countSubstring(unittest.TestCase):
    """Checks countSubstring on empty, missing and overlapping patterns."""

    def _check(self, seq, sub, expected):
        # Shared assertion: expected count first, actual count second.
        self.assertEqual(expected, countSubstring(seq, sub))

    def test_blankSubsequence(self):
        self._check('ATGC', '', 0)

    def test_blankInputs(self):
        self._check('', '', 0)

    def test_SubsequenceLongerThanSequence(self):
        self._check('AT', 'ATGC', 0)

    def test_default(self):
        self._check('ATGC' * 5, 'TG', 5)

    def test_noAppearance(self):
        self._check('ATGC' * 5, 'CG', 0)

    def test_long(self):
        self._check('ATGCCGATATATTGCTCGGCTAGCTAAGCTAG', 'AG', 3)

    def test_sameLetters(self):
        # Overlapping matches count: 'AAAAA' gives 4 and 'AAA' gives 2.
        self._check('AAAAACAAACC', 'AA', 6)
        
class Test_countAllSubstrings(unittest.TestCase):
    """Checks that countAllSubstrings reports every A/C/G/T dinucleotide."""

    def test_hasAllCombinations(self):
        output = countAllSubstrings('')
        # Kept from the original: shows the full table in the run output.
        print(output)
        self.assertEqual(16, len(output))
        # A length check alone would accept 16 wrong keys, so compare the
        # key set against every ordered pair of the four bases.
        expected_keys = {first + second
                         for first in 'ACGT' for second in 'ACGT'}
        self.assertEqual(expected_keys, set(output))
        # An empty sequence contains no dinucleotides at all.
        self.assertTrue(all(count == 0 for count in output.values()))
        
# Run every TestCase defined above. argv=[''] keeps unittest from parsing
# the host process's own command-line arguments, and exit=False stops it
# from calling sys.exit() once the run finishes.
unittest.main(argv=[''], exit=False)

.................

{'AA': 0, 'AC': 0, 'AG': 0, 'AT': 0, 'CA': 0, 'CC': 0, 'CG': 0, 'CT': 0, 'GA': 0, 'GC': 0, 'GG': 0, 'GT': 0, 'TA': 0, 'TC': 0, 'TG': 0, 'TT': 0}



----------------------------------------------------------------------
Ran 17 tests in 0.037s

OK


<unittest.main.TestProgram at 0x186b6ff3f98>