In [30]:
!pip install biopython==1.71


Collecting biopython==1.71
  Downloading biopython-1.71.tar.gz (16.0 MB)
[K     |████████████████████████████████| 16.0 MB 21.9 MB/s 
Building wheels for collected packages: biopython
  Building wheel for biopython (setup.py) ... [?25l[?25hdone
  Created wheel for biopython: filename=biopython-1.71-cp37-cp37m-linux_x86_64.whl size=2208495 sha256=386ec6b7eb3ab9b7e2ee1c8f068017a10638ed557bae7f4399095199decc205d
  Stored in directory: /root/.cache/pip/wheels/48/85/4b/a9c428fb719da72b6c4c69c85d422b019becd050358f1ff3e4
Successfully built biopython
Installing collected packages: biopython
  Attempting uninstall: biopython
    Found existing installation: biopython 1.79
    Uninstalling biopython-1.79:
      Successfully uninstalled biopython-1.79
Successfully installed biopython-1.71


In [16]:
### Learning objectives
### Manipulate Sequence Objects
### Annotate Sequence Objects
### Read/write Fasta file
### Blasting Sequences
### Multiple Sequence Alignment

In [None]:
### The Seq Object
### Biological sequences are arguably the central object in Bioinformatics. Biopython sequences are essentially strings of letters like AGTACACTGGT,
### as seen in common biological file formats.

### The Biopython Seq object is defined in the Bio.Seq module

### The Seq object is different from traditional python strings:

### 1. It supports most of the string methods, but it also comes with its own specific set of methods:
### translate() - Turns a nucleotide sequence into a protein sequence.
### reverse_complement() - Returns the reverse complement sequence.
### complement() - Returns the complement sequence.
### transcribe() - Returns the RNA sequence from a DNA sequence.
### back_transcribe() - Returns the DNA sequence from an RNA sequence.
### ungap() - Return a copy of the sequence without the gap character(s).


In [1]:
## Build a Seq from a plain string. No alphabet is passed, so the repr shows the generic Alphabet().
from Bio.Seq import Seq
my_seq = Seq("AGTACACTGGT")
my_seq

Seq('AGTACACTGGT', Alphabet())

In [29]:
## Re-create the same sequence with an explicit alphabet, declaring it as unambiguous DNA.
from Bio.Alphabet import IUPAC
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)
my_seq

Seq('AGTACACTGGT', IUPACUnambiguousDNA())

In [15]:
## The alphabet given at construction is available as the .alphabet attribute.
my_seq.alphabet

IUPACUnambiguousDNA()

In [16]:
### Seq as python strings

In [17]:
## You can treat Seq objects as if they were normal Python strings, for example getting the length or iterating over the elements

In [18]:
## Iterating over a Seq yields one letter at a time; enumerate() adds the 0-based position.
for index, letter in enumerate(my_seq):
   print("%i %s" % (index, letter))

0 A
1 G
2 T
3 A
4 C
5 A
6 C
7 T
8 G
9 G
10 T


In [19]:
## len() returns the number of letters in the sequence.
len(my_seq)

11

In [20]:
## The Seq object has a .count() method, just like a string.
## Here we count the occurrences of the single letter "A".
my_seq.count("A")

3

In [21]:
## .count() also accepts a multi-letter substring such as "GT".
my_seq.count("GT")

2

In [22]:
### Slicing a Seq object

In [23]:
## Get a slice of the sequence (positions 2 up to, but not including, 8).
## The result is again a Seq with the same alphabet.
my_seq[2:8]

Seq('TACACT', IUPACUnambiguousDNA())

In [26]:
## Concatenate sequences
## In principle you can add any two Seq objects together, just like you can with Python strings.
## But Seq objects are made for biological data, so concatenation only accepts sequences with compatible alphabets.
from Bio.Alphabet import IUPAC
p_seq = Seq("EVRNAK", IUPAC.protein)
d_seq = Seq('TACACT', IUPAC.unambiguous_dna)
## DNA + DNA works; p_seq (a protein) is not used in this cell's result.
d_seq + my_seq


Seq('TACACTAGTACACTGGT', IUPACUnambiguousDNA())

In [27]:
## Adding a protein sequence to a DNA sequence fails because their alphabets are incompatible.
## The TypeError is the point of this cell, so catch it and show the message instead of
## letting it stop a "Restart & Run All".
try:
    p_seq + my_seq
except TypeError as err:
    print("TypeError: %s" % err)

TypeError: ignored

In [31]:
## Display my_seq again for reference.
my_seq

Seq('AGTACACTGGT', IUPACUnambiguousDNA())

In [30]:
## You can use a stride of -1 to reverse the sequence, just as you would with a string
my_seq[::-1]

Seq('TGGTCACATGA', IUPACUnambiguousDNA())

In [33]:
## You can easily obtain the complement or reverse complement of a Seq object using its built-in methods.
## complement() swaps each base for its partner while keeping the original order.
my_seq.complement()

Seq('TCATGTGACCA', IUPACUnambiguousDNA())

In [35]:
## The reverse complement of a DNA string is obtained by reversing the original string
## and replacing each base with its complement.
my_seq.reverse_complement()

Seq('ACCAGTGTACT', IUPACUnambiguousDNA())

In [36]:
## Note that these methods do not work on protein sequences: trying to (reverse-)complement one raises a ValueError.
## The error is the demonstration here, so catch it and print the message rather than aborting a "Restart & Run All".
p_seq = Seq("EVRNAK", IUPAC.protein)
try:
    p_seq.reverse_complement()
except ValueError as err:
    print("ValueError: %s" % err)

ValueError: ignored

In [37]:
## Transcription, reverse transcription and translation

In [38]:
## Biologically, transcription produces the reverse complement of the template strand, with uracil in place of thymine (TCAG → CUGA), to give the RNA.

In [39]:
## However, in Biopython and bioinformatics in general, we typically work directly with the 
## coding strand because this means we can get the mRNA sequence just by switching T → U

In [42]:
## Let's do a simple transcription of our sequence:

In [43]:
## Our DNA sequence, shown again before transcribing it.
my_seq

Seq('AGTACACTGGT', IUPACUnambiguousDNA())

In [41]:
## transcribe() returns the RNA version of the DNA sequence (every T replaced by U).
r_seq = my_seq.transcribe()
r_seq

Seq('AGUACACUGGU', IUPACUnambiguousRNA())

In [46]:
## And a reverse transcription of the resulting sequence:
## As you can see, all this does is switch T -> U or U -> T and adjust the alphabet

In [53]:
## back_transcribe() converts the RNA back into the original DNA sequence.
r_seq.back_transcribe()

Seq('AGTACACTGGT', IUPACUnambiguousDNA())

In [54]:
## Now let’s translate this mRNA into the corresponding protein sequence

In [55]:
## NOTE: r_seq has 11 bases, which is not a multiple of three. Only the three complete codons
## are translated (giving 'STL'); the trailing 'GU' is left out, and Biopython may warn about the partial codon.
## This also rebinds p_seq, which previously held the protein 'EVRNAK'.
p_seq = r_seq.translate()
p_seq



Seq('STL', IUPACProtein())

In [48]:
##Could you generate the mRNA from this template strand sequence: 'TACCGGTAACATTACCCGGCGACTTTCCCACGGGCTATC' ?

In [49]:
## complement() is applied without reversing, then transcribe() switches T -> U.
## NOTE(review): this presumably assumes the template strand is written 3'->5', so that the plain
## complement reads 5'->3'; the result starts with AUG and ends with UAG — confirm the intended orientation.
reverse_template_dna = Seq('TACCGGTAACATTACCCGGCGACTTTCCCACGGGCTATC', IUPAC.unambiguous_dna)
reverse_template_dna.complement().transcribe()

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())