# Introduction to Sequence Data Analysis with Python

## Agenda 
1. Validate Sequence 
2. Counting Letters / Frequency
3. % of Nucleotides 
4. Calculating `GC` Content 
5. `GC` Content of Sub-sequence
6. Calculating `AT` Content 
7. `AT` Content of Sub-sequence
8. Transcription
9. Complement 
10. Reverse Complement 
11. DNA Translation

In [1]:
seq = 'GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCA'

## Validate Sequence 

In [2]:
# counting letters 
seq.count("A")

21

In [3]:
# length of seq 
len(seq)

70

In [5]:
# percentage of nucleotide A in the sequence
(seq.count("A") / len(seq)) * 100 

30.0

In [24]:
def validtae_seq(seq):
    """Report whether a DNA sequence contains only the letters A, T, G and C.

    The check is case-insensitive: the sequence is upper-cased first.
    Returns True when every character is one of the four bases (an empty
    sequence therefore also yields True), and False otherwise.
    """
    normalized = seq.upper()
    # Add up the occurrences of each valid base; any other character leaves
    # the total short of the sequence length.
    valid_total = sum(normalized.count(base) for base in "ATGC")
    return valid_total == len(normalized)

In [28]:
# function call 
validtae_seq(seq)  

True

In [29]:
import screed 
def readFASTA(inputfile):
    """Read a FASTA file with screed and return the sequence of its last record.

    Args:
        inputfile: Path of the FASTA file passed to ``screed.open``.

    Returns:
        The ``sequence`` attribute of the final record in the file. Earlier
        records are iterated over but discarded, so the result is only
        complete for single-record files.

    Raises:
        ValueError: If the file yields no records at all.
    """
    seq = None
    # Open the file for reading; the context manager closes it afterwards.
    with screed.open(inputfile) as seqfile:
        for read in seqfile:
            # Each record overwrites the previous one; only the last survives.
            seq = read.sequence
    if seq is None:
        # Without this guard an empty file would surface as an opaque
        # UnboundLocalError on the return statement.
        raise ValueError("no sequence records found in {!r}".format(inputfile))
    return seq

In [53]:
# data read 
seqs = readFASTA("../data/Haemophilus_influenzae.fasta")

In [32]:
validtae_seq(seqs)

True

## Counting Letters / Frequency 

In [33]:
seq = 'GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCA'
# Start every nucleotide at zero, then tally them in one pass over the sequence.
base_counts = dict.fromkeys('ATGC', 0)
for base in seq:
    base_counts[base] = base_counts[base] + 1

In [34]:
base_counts

{'A': 21, 'T': 16, 'G': 13, 'C': 20}

In [35]:
base_counts.keys() 

dict_keys(['A', 'T', 'G', 'C'])

In [36]:
base_counts.values() 

dict_values([21, 16, 13, 20])

In [37]:
base_counts.items() 

dict_items([('A', 21), ('T', 16), ('G', 13), ('C', 20)])

In [38]:
# Split the frequency table into parallel lists: the bases and their counts,
# both in the dictionary's insertion order.
letters = list(base_counts.keys())
values = list(base_counts.values())
    

In [39]:
letters

['A', 'T', 'G', 'C']

In [40]:
values

[21, 16, 13, 20]

In [42]:
(21/70) * 100 

30.0

In [49]:
seq = 'GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCA'

In [51]:
def basecount(seq):
    """Return a dict mapping each symbol in ``seq`` to its number of occurrences.

    Keys appear in the order in which each symbol is first seen.
    """
    tally = {}
    for symbol in seq:
        # A symbol not seen before starts from zero.
        tally[symbol] = tally.get(symbol, 0) + 1
    return tally

In [52]:
basecount(seq) 

{'G': 13, 'T': 16, 'C': 20, 'A': 21}

In [61]:
%%timeit 
basecount(seqs) 

252 ms ± 4.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [55]:
from collections import Counter 
freqs = Counter(seq)

In [56]:
freqs

Counter({'G': 13, 'T': 16, 'C': 20, 'A': 21})

In [57]:
from collections import Counter 
freqs = Counter(seqs)

In [60]:
%%timeit
freqs

27.9 ns ± 0.377 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [62]:
from collections import Counter 
def count_fast(seq):
    """Count how often each symbol occurs in a sequence using ``Counter``.

    Args:
        seq: Iterable of hashable symbols, e.g. a DNA string.

    Returns:
        A ``collections.Counter`` mapping each symbol in ``seq`` to its count.
    """
    # Build and return the Counter for this argument, not the notebook-level
    # ``freqs`` variable computed in another cell.
    return Counter(seq)

In [64]:
%%timeit 
count_fast(seqs) 

131 ms ± 4.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [65]:
basecount(seqs[1:500]) 

{'A': 178, 'C': 70, 'G': 105, 'T': 146}

In [68]:
basecount(seqs[500:1001]) 

{'G': 89, 'C': 108, 'A': 133, 'T': 171}

In [None]:
def pc_freq(seq): 
    """Reads sequence and returns their %""" 
    