In [2]:
import os
import numpy as np
import pandas as pd
from Bio.pairwise2 import format_alignment
from Bio.SubsMat import MatrixInfo 
from Bio import pairwise2
from Bio import SeqIO, SearchIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.PhyloXML import Phylogeny
from Bio import Phylo

from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='white')
# Notebook-level switch; it is not read anywhere in this part of the notebook.
# NOTE(review): presumably toggles the BLAST lookup in a later section - confirm against its use.
blast_id = True



![](https://i.imgur.com/jJmBNXq.png)

# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>1 |</span></b> Background</div>

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>1.1 |</span></b> Biopython</b></p>
</div>

### <b><span style='color:#F55AA2'>BIOPYTHON</span></b>

- The Biopython Project is an international association of developers of freely available Python tools for computational molecular biology. 
- Basically, the goal of Biopython is to make it as easy as possible to use Python for bioinformatics by creating high-quality, reusable modules and classes. 
- Biopython features include parsers for various Bioinformatics file formats (**<span style='color:#F55AA2'>BLAST</span>**, **<span style='color:#F55AA2'>Clustalw</span>**, **<span style='color:#F55AA2'>FASTA</span>**, **<span style='color:#F55AA2'>Genbank</span>**, ...) and access to <b>online services</b> (**<span style='color:#F55AA2'>NCBI</span>**, Expasy, ...)
- Interfaces to common and not-so-common programs (Clustalw, DSSP, MSMS...), a standard sequence class, various clustering modules, a KD tree data structure etc. and even documentation.

### <b><span style='color:#F55AA2'>NOTEBOOK AIM</span></b>

- In this notebook, we'll be covering very simple, beginner entry operations one can encounter when starting out in the field of <b><span style='color:#F55AA2'>bioinformatics</span></b>.
- <b><span style='color:#F55AA2'>BioPython</span></b> is a very convenient tool we can utilise when working with <b><span style='color:#F55AA2'>Sequences</span></b> & <b><span style='color:#F55AA2'>Alignments</span></b> of <b><span style='color:#F55AA2'>DNA/Protein</span></b> based sequences.
- Whilst it can be interesting to work with your own classes that you can modify to your needs, it's much easier and less of a hassle to work in BioPython. 
- It is [well documented](https://biopython.org) & is quite popular, so there are plenty of [examples](http://diverge.hunter.cuny.edu/~weigang/BioPython.html) that you can come across & that will potentially help you understand how you can utilise this tool

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>1.2 |</span></b> Biological Sequence Alphabets</b></p>
</div>

Two sets of alphabets are used in bioinformatics, **<mark style="background-color:#5D2ECC;color:white;border-radius:5px;opacity:0.7">Nucleotides</mark>** & **<mark style="background-color:#5D2ECC;color:white;border-radius:5px;opacity:0.7">Amino Acids</mark>**

**<mark style="background-color:#323232;color:white;border-radius:5px;opacity:0.7">ABC (I/II)</mark>** **<mark style="background-color:#F55AA2;color:white;border-radius:5px;opacity:0.7">Nucleic Acids</mark>**

> - Among <b><span style='color:#5D2ECC'>molecules with a biological role</span></b>, we can find <b><span style='color:#5D2ECC'>nucleic acids</span></b>. 
> - Nucleic acids encode and express the genetic code that is kept within the cell. 
- There are two major types of <b><span style='color:#5D2ECC'>nucleic acids</span></b>: 
> - <b>DeoxyriboNucleic Acid (DNA)</b>
> - <b>RiboNucleic Acid (RNA)</b> (Obtainable via transcription)
- DNA contains the information necessary to build a cell, and keep it functioning. 
- In <b><span style='color:#5D2ECC'>eukaryotic</span></b> cells, DNA will be found in the nucleus, whilst in the <b><span style='color:#5D2ECC'>prokaryotic</span></b> cells, it will be found in the cytoplasm. 
- <b><span style='color:#5D2ECC'>IUPAC</span></b> defines the full list of nucleotides as shown in the table below, with <b>A,T,G,C</b> being the main four:
- Another type of nucleotide list often used is **[IUB Ambiguity Codes](http://biocorp.ca/IUB.php)**, which we use later in the notebook as well

**<mark style="background-color:#323232;color:white;border-radius:5px;opacity:0.7">ABC (II/II)</mark>** **<mark style="background-color:#F55AA2;color:white;border-radius:5px;opacity:0.7">Amino Acids</mark>**
- **<span style='color:#5D2ECC'>Amino acids</span>**: 
> The **<span style='color:#5D2ECC'>building blocks of proteins</span>**, which are <b>macromolecules</b> that perform most of the functions inside a cell
- Proteins have a **<span style='color:#5D2ECC'>broad range of functions</span>**, spanning from **<span style='color:#5D2ECC'>catalytic</span>** to **<span style='color:#5D2ECC'>structural functions</span>**:

> - **<span style='color:#5D2ECC'>Enzymes</span>** : Type of abundant proteins that promote chemical reactions and convert certain molecules into other types of molecules required for the functioning of the cell.
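> - **<span style='color:#5D2ECC'>Carbohydrates</span>** : Not proteins, but a separate class of biomolecule found in the cell; they serve as energy storage, both for immediate and long term energy demands.
> - **<span style='color:#5D2ECC'>Lipids</span>**: Also a separate class of biomolecule rather than proteins; they are part of the plasma membrane and are involved in signaling and energy storage.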
- The cell also contains other components of varying complexity. Of importance: 
> - <b>Mitochondria</b> & the <b>Chloroplasts</b> : Organelles involved in the production of energy. 
> - <b>Ribosomes</b> : Large and complex molecular machines composed of ribosomal RNA and proteins; they are required to assemble proteins and play a central role in the flow of genetic information.

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>1.3 |</span></b> DNA Strands</b></p>
</div>

### <b><span style='color:#F55AA2'>COMPLEMENTARY STRANDS IN DNA</span></b>

- DNA is a molecule composed of **<span style='color:#5D2ECC'>two complementary strands</span>** that form and stick together due to the connections established between the nucleotides in both strands. 

> - This is made possible by the chemical phenomenon where **<span style='color:#5D2ECC'>Adenine (A)</span>** bonds only with **<span style='color:#5D2ECC'>Thymine (T)</span>** nucleotides, through two hydrogen bonds. 
> - Similarly, **<span style='color:#5D2ECC'>Guanine (G)</span>** bonds only with **<span style='color:#5D2ECC'>Cytosine (C)</span>** nucleotides, through three hydrogen bonds.

### <b><span style='color:#F55AA2'>REVERSE COMPLEMENT</span></b>

- This results in **<span style='color:#5D2ECC'>two complementary</span>** and **<span style='color:#5D2ECC'>anti-parallel strands</span>** (connected in opposite directions), if we know the nucleotide sequence in one of the strands, we can get the sequence in the opposite strand by taking the complement of its nucleotides, which are also read backwards, thus we have the **<span style='color:#5D2ECC'>reverse complement</span>** of the other strand.
- It has become a **<span style='color:#5D2ECC'>standard to describe the DNA through only one</span>** of the strands, due to this **<span style='color:#5D2ECC'>complementarity</span>** using <b>[A,T,G,C]</b>.
- The existence of these two strands is essential in order to **<span style='color:#5D2ECC'>pass on genetic information</span>** to new cells and **<span style='color:#5D2ECC'>produce proteins</span>**.

# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>2 |</span></b> Sequence Definition</div>

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>2.1 |</span></b> Python Strings to Sequences</b></p>
</div>

### <b><span style='color:#F55AA2'>SMALLER SEQUENCES</span></b>

- When sequences are quite short (eg. when we are looking at part of the entire DNA sequence), we can probably just define them using the **<span style='color:#F55AA2'>string</span>** format.
- To use some operations with this sequence in <b>biopython</b> (eg. sequence alignment), we need to <b><span style='color:#F55AA2'>create an instance</span></b> of the <b>Seq</b> class for each sequence.
- In previous biopython versions, we had to specify the <b><span style='color:#F55AA2'>sequence type</span></b>, (eg. <code>IUPAC.protein</code>), however this is not necessary in recent versions.

In [15]:
# Plain Python strings holding a DNA fragment and a protein fragment.
n = 'ATGACGGATCAGCCGCAAGCGGAATTGGCGTTTACGTACGATGCGCCGTAA'  # nucleotide sequence
aa = 'MMMELQHQRLMALAGQLQLESLISAAPALSQQAVDQEWSYMDFLEHLLHE' # protein sequence

# Wrap each string in a Seq instance, then show both, one per line.
seq_n, seq_aa = Seq(n), Seq(aa)
print(seq_n, seq_aa, sep='\n')

ATGACGGATCAGCCGCAAGCGGAATTGGCGTTTACGTACGATGCGCCGTAA
MMMELQHQRLMALAGQLQLESLISAAPALSQQAVDQEWSYMDFLEHLLHE


### <b><span style='color:#F55AA2'>SIMPLE SEQUENCE OPERATION</span></b>

- Since we don't have <code>.alphabet</code> details, you'll have to know the <b>sequence type</b> you are using, so it makes sense to give them names accordingly.
- An example operation you could apply on a <b>nucleotide sequence</b> would be the **<span style='color:#F55AA2'>reverse_complement</span>**, which is an operation applied to a <b><span style='color:#F55AA2'>DNA</span></b> based sequence.

In [17]:
# Reverse complement of the nucleotide sequence: a meaningful DNA operation.
rc_n = seq_n.reverse_complement()
print(rc_n)

# The same call on the protein sequence is not rejected, because a Seq no longer
# carries an alphabet; the letters are simply mapped and reversed.
rc_aa = seq_aa.reverse_complement()
print(rc_aa)

# The banner below is Spanish for "The following should give an ERROR!".
# It lines the protein up against its (biologically meaningless) reverse complement.
print("\n\nLo siguiente debería dar un ERROR!", "\n", seq_aa, "\n", "|"*len(seq_aa), "\n", rc_aa, sep="")

TTACGGCGCATCGTACGTAAACGCCAATTCCGCTTGCGGCTGATCCGTCAT
EDLLDELFHKRSWEQHBTQQSLTPTTSILSELQLQCTLTKLYQDQLEKKK


Lo siguiente debería dar un ERROR!
MMMELQHQRLMALAGQLQLESLISAAPALSQQAVDQEWSYMDFLEHLLHE
||||||||||||||||||||||||||||||||||||||||||||||||||
EDLLDELFHKRSWEQHBTQQSLTPTTSILSELQLQCTLTKLYQDQLEKKK


<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>2.2 |</span></b> Sequence Annotation Objects</b></p>
</div>

### <b><span style='color:#F55AA2'>DEFINING MORE DETAIL ABOUT OUR SEQUENCE</span></b>
- Sequence files obtained from databases provide the option to store more <b>detailed information</b> about the sequence, not just the <b>sequence</b> alone.
- In BioPython, we can use <code>SeqRecord</code> objects with our defined <code>Seq</code> object instances & define additional information about our sequence(s).
- Two commonly added detailed information about a sequence:
  > - <b><span style='color:#F55AA2'>Annotations</span></b> ( Extra metadata information about the sequence )
  > - <b><span style='color:#F55AA2'>Features</span></b> ( Extra information about the sequence content, which is extractable ) 

### <b><span style='color:#F55AA2'>SEQRECORD</span> CLASS CONTENT</b>

<b>SeqRecord</b> | [BioPython](https://biopython.org/wiki/SeqRecord)

> SeqRecord objects are used in Biopython to hold a sequence (as a Seq object) with identifiers (ID and name), description and optionally annotation and sub-features.

- <code>SeqRecord</code> is the class we'd use when we want to work with <code>Seq</code> objects that have more information about the sequence, grouped all in one spot.
- If we wanted to do <b>multiple sequence alignment</b>, we have to wrap each individual sequence in a <code>SeqRecord</code> object, which requires just the <b>id</b> to be defined.

<code>SeqRecord</code> objects can store:
> - <code>seq</code> - The sequence itself ( <code>Seq()</code> objects )
> - <code>id</code> - Sequence identifier
> - <code>name</code> - Sequence name 
> - <code>description</code> - Description of the sequence
> - <code>annotations</code> - Global annotations for the whole sequence (<b>dic</b>)
> - <code>features</code> - Structured features (<b>lst</b> of <code>SeqFeature</code> objects)
> - <code>letter_annotations</code> - Annotations for each letter (position) in the sequence
> - <code>dbxrefs</code> - References to databases

### <b><span style='color:#F55AA2'>SEQRECORD ANNOTATIONS</span> & </b> <b><span style='color:#F55AA2'>SAVING SEQUENCE FILE</span></b>
- As we saw, <code>SeqRecord</code> can hold information about different <b><span style='color:#F55AA2'>annotations</span></b> for the sequence in question.
- Let's define some basic annotations for <code>SeqRec</code> below, which we <b>can subsequently save</b> using <code>SeqIO</code> using the <b><span style='color:#F55AA2'>minimalistic</span></b> <b>FASTA</b> or <b><span style='color:#F55AA2'>detailed</span></b> <b>Genbank</b> formats.
- We'll look at which <code>annotations</code> data are typically added in <b>Section 3</b>, where we'll load and look at <b>real sequence</b> files.

In [19]:
# Wrap the nucleotide Seq from section 2.1 in a SeqRecord carrying an id,
# a description and a molecule-type annotation.
seq_record = SeqRecord(seq_n)

seq_record.id = "ABC12345"
seq_record.description = 'Neucleotide sequence'
seq_record.annotations['molecule_type'] = 'DNA'

# SeqIO.write opens the target path for writing but does not create missing
# folders, so make sure the output directory exists before saving (a fresh
# kernel would otherwise stop here with FileNotFoundError).
os.makedirs("./kaggle/working", exist_ok=True)

# Save the same record twice: minimal FASTA and detailed GenBank.
# SeqIO.write returns the number of records written; the last call is the cell output.
SeqIO.write(seq_record,"./kaggle/working/my_seq.fasta","fasta")
SeqIO.write(seq_record,"./kaggle/working/my_seq.gb","genbank")

1

In [27]:
# FASTA stores only a header line and the sequence, so the annotations
# dictionary comes back empty after a round trip.
read_seq1 = SeqIO.read('./kaggle/working/my_seq.fasta','fasta')
print('FASTA Annotations:', read_seq1.annotations, sep='\n')

# The GenBank file, in contrast, brings the annotations back.
read_seq2 = SeqIO.read('./kaggle/working/my_seq.gb','genbank')
print('GenBank Annotations', read_seq2.annotations, sep='\n')

FASTA Annotations:
{}
GenBank Annotations
{'molecule_type': 'DNA', 'data_file_division': 'UNK', 'date': '01-JAN-1980', 'accessions': ['ABC12345'], 'keywords': [''], 'source': '', 'organism': '.', 'taxonomy': []}


### <b><span style='color:#F55AA2'>SEQRECORD FEATURES</span></b>

- <b>Real sequences</b> can be very long & contain useful <b>subset sequences</b> that one might be interested in.
- <b><span style='color:#F55AA2'>Feature descriptions</span></b> can be quite useful; if we want to define subsets of the <b>sequence</b> to which a <b>feature</b> is applicable.
- More detailed information about the <code>SeqFeature</code> class | [BioPython Documentation](https://biopython.org/docs/1.75/api/Bio.SeqFeature.html)

<b>Some</b> <code>SeqFeature</code> object attributes:
> - <code>location</code> - Indicator of which region the annotation applies ( <code>FeatureLocation</code> object )
> - <code>type</code> - Feature type ( <code>string</code> )
> - <code>qualifiers</code> - Additional Info ( <code>dict</code> )



In [28]:
from Bio import SeqFeature

# Define the two ends of a feature location using fuzzy (non-exact) positions
s0 = SeqFeature.AfterPosition(10) # fuzzy start: somewhere after position 10 (printed as ">10")
s1 = SeqFeature.BetweenPosition(40,left=35,right=40) # fuzzy end: somewhere between 35 and 40 (printed as "(35^40)")
location = SeqFeature.FeatureLocation(s0,s1) # combine start and end into one feature location
print(f'Feature Location Notation: {location}')

Feature Location Notation: [>10:(35^40)]


In [29]:
# Example for a gene feature

unitig = Seq('GTGCGACAGCAAAGTCCAAACCAGCGTCCCCGCC')

# The strand is a property of the location, so it is set on FeatureLocation.
# Passing strand= directly to SeqFeature is deprecated in recent Biopython
# releases, whereas FeatureLocation(start, end, strand) works in old and new
# versions alike.
feature = SeqFeature.SeqFeature(
    SeqFeature.FeatureLocation(5, 10, strand=-1),  # positions 5..10 on the minus strand
    type='gene',                                   # the specified type of the feature
)

# Extracting a minus-strand feature returns the reverse complement of the slice
feature_seq = feature.extract(unitig)
print(feature_seq)

GCTGT


We can of course add this feature to the <code>SeqRecord</code> class & save, subsequently recalling the feature again.

In [32]:
# Build the annotated record in one step: identifier, description,
# molecule-type annotation and the gene feature defined above.
seq_record = SeqRecord(
    unitig,
    id="ABC12345",
    description='Neisseria gonorrhoeae unitig',
    annotations={'molecule_type': 'DNA'},
    features=[feature],
)

In [33]:
# Round-trip the record through a GenBank file and show that the feature survives
SeqIO.write(seq_record,"./kaggle/working/my_seq.gb","genbank")
read_seq2 = SeqIO.read('./kaggle/working/my_seq.gb','genbank')
print(read_seq2.features)  # list of SeqFeature objects read back from the file

# Pick the first entry of the feature list
feature = read_seq2.features[0]

# Extract the feature from the record itself (not from its .seq), so the
# result is printed as a record summary rather than as a bare sequence
feature_seq = feature.extract(read_seq2)
print('\nExtracted Feature:', feature_seq, sep='\n')

[SeqFeature(FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1), type='gene')]

Extracted Feature:
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 1
Seq('GCTGT')


# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>3 |</span></b> Reading Sequence Files</div>

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>3.1 |</span></b> Common Formats</b></p>
</div>

### <b><span style='color:#F55AA2'>TWO MAIN FORMATS</span></b>

- Two formats are commonly used for sequence storage, introduced in the previous section.
- In <b>BioPython</b>, <b><span style='color:#F55AA2'>FASTA</span></b> & <b><span style='color:#F55AA2'>GenBank</span></b> file formats can be read using BioPython's <code>Bio.SeqIO</code> class.

### <b><span style='color:#F55AA2'>SIMPLE OR COMPLEX FORMAT</span></b>

- <b>FASTA</b> is quite a <b>generic format</b> & can be used to store <b>multiple sequences</b> & even <b>alignments</b> as well.
- <b>FASTA</b> is definitely less detailed & very minimalistic when it comes to content, usually only containing a <b>description</b> & <b>sequence</b>.
- <b>GenBank</b> files on the other hand tend to be very detailed when it comes to the <b>sequence description</b>.

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>3.2 |</span></b> FASTA Format</b></p>
</div>

Snippet from **[FASTA format](https://en.wikipedia.org/wiki/FASTA_format)**:
>In bioinformatics and biochemistry, the FASTA format is a text-based format for representing either <b>nucleotide sequences</b> or <b>amino acid (protein) sequences</b>, in which nucleotides or amino acids are represented using single-letter codes. The format also allows for <b>sequence names</b> and <b>comments</b> to precede the sequences. <br>
> The format originates from the FASTA software package, but has now become a near universal standard in the field of bioinformatics.

### <b><span style='color:#F55AA2'>READING SINGLE SEQUENCE FILES</span></b>
- The FASTA format can actually contain <b>multiple sequences</b> in one file & we need to read them slightly differently.
- <code>SeqIO.read()</code> is used for reading <b><span style='color:#F55AA2'>single sequence</span></b> files, else we get an error <code>More than one record found in handle</code>.

In [None]:
# FASTA file containing a single sequence

single_fasta = '/kaggle/input/biopython-genbank/example.fasta' # nucleotide seq
fasta_n = SeqIO.read(single_fasta,'fasta')
seq_fasta = fasta_n.seq # keep the bare sequence for the processing examples further down

# Record summary as printed by Biopython
print('FASTA Content:', fasta_n, sep='\n')

# Length of the record = number of nucleotides
print(f'\nNumber of Nucleotides: {len(fasta_n)}')

# Feature list and annotations dictionary of the FASTA record
print('\nFeatures & Annotations:', fasta_n.features, fasta_n.annotations, sep='\n')

### <b><span style='color:#F55AA2'>READING MULTIPLE SEQUENCE FILES</span></b>
<code>SeqIO.parse()</code> is used for reading <b><span style='color:#F55AA2'>multiple sequence</span></b> files, which returns <b>an iterator</b>, that can be used to cycle through and get the sequences.

In [None]:
''' FASTA formats w/ multile sequences '''

multi_fasta = '/kaggle/input/biopython-genbank/NC_005816.faa' # protein seq
iter_multi_fasta_aa = SeqIO.parse(multi_fasta,'fasta') # returns an iterator over the records in the file

# Drain the iterator into a list so the records can be counted and indexed.
# NOTE(review): the loop variable `seq_aa` reuses the name bound to the protein
# Seq in section 2.1; after this cell `seq_aa` refers to the last record read here.
lst_fasta_aa = []
for seq_aa in iter_multi_fasta_aa:
    lst_fasta_aa.append(seq_aa)
    
# Report how many records were read and show the first one
print(f'Number of Sequences read: {len(lst_fasta_aa)}')
print(f'\nFirst Sequence: \n{lst_fasta_aa[0]}')

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>3.3 |</span></b> GenBank Format</b></p>
</div>

Snippet from [Bacterial Genomes Bioinformatics](https://www.futurelearn.com/info/courses/bacterial-genomes-bioinformatics/0/steps/47012)

> The Genbank format allows for the storage of information in addition to a DNA/protein sequence. It <b>holds much more information</b> than the FASTA format. Formats similar to Genbank have been developed by ENA (EMBL format) and by DDBJ (DDBJ format).

**COMPATIBILITY WITH DATABASES**

> Primary databases have developed highly structured data file formats that enable the storage of all of these additional data that accompany the otherwise “naked” DNA sequence encoded in a FASTA file. The strict layout is necessary for the file to be compatible with a range of computer programs. Each of the three primary databases have their own sequence file format layout. However, all of them contain almost the same fields and the same information, making them interchangeable.

- Like <b><span style='color:#F55AA2'>FASTA</span></b>, <b><span style='color:#F55AA2'>GenBank</span></b> files can store more than one sequence & are read in exactly the same way as shown above using class <code>SeqIO</code>. (<code>.read</code> & <code>.parse</code>)
- However as indicated above, they tend to contain much more in depth information about the sequence, content of the nature storable in the <code>SeqRecord</code> class object.
- As we can see below, they can store both <b><span style='color:#F55AA2'>Annotations</span></b> & <b><span style='color:#F55AA2'>Features</span></b> ( both of which we looked at in <b>Section 2.2</b> ), which makes this format very useful.

In [None]:
file_gb = '/kaggle/input/biopython-genbank/example.gb'

# Read the GenBank record; the format is more comprehensive than FASTA.
gb = SeqIO.read(file_gb,'genbank')
seq_gb = gb.seq # store sequence

# Heading/value pairs, printed in order: each heading on its own line,
# followed by the corresponding value.
gb_overview = (
    ('Print Seq (preview)', gb.seq[:40]),
    ('\nLength of Sequence:', len(gb.seq)),
    ('\nRecord ID:', gb.id),
    ('\nName:', gb.name),
    ('\nDescription:', gb.description),
    ('\nNumber of Annotations:', len(gb.annotations)),
    ('\nNumber of Features:', len(gb.features)),
)
for gb_heading, gb_value in gb_overview:
    print(gb_heading)
    print(gb_value)

### <b><span style='color:#F55AA2'>ANNOTATION CONTENT</span></b>
- We can visualise the <b><span style='color:#F55AA2'>annotation</span></b> content (if it is present) using <code>.annotations</code>.
- Being a <b>dictionary</b>, we can see what sort of annotations are present by calling <code>.annotations.keys()</code>.

In [None]:
# The annotations attribute is a dictionary of properties describing the sequence
print('Annotations dictionary:\n', gb.annotations, sep='\n')

# Its keys tell us which kinds of annotation are present
print('\nKeys:', gb.annotations.keys(), sep='\n')

In [None]:
# Look up individual entries of the annotations dictionary
print('\nGet specific parts of the annotation:\n')

# (heading, dictionary key) pairs, shown in this order
annotation_headings = (
    ('Taxonomy:\n', 'taxonomy'),
    ('Source:\n', 'source'),
    ('Date:\n', 'date'),
    ('gi number:\n', 'gi'),
)
for annotation_heading, annotation_key in annotation_headings:
    print(annotation_heading)
    print(gb.annotations[annotation_key])

### <b><span style='color:#F55AA2'>SHOWING FEATURES CONTENT</span></b>
- If there are any <b><span style='color:#F55AA2'>features</span></b> present in the file, we can get the <b>list</b> by calling <code>.features</code> ( What we defined in <b>Section 2.2</b> as well )
- <b>Subsets</b> of the <b>sequence</b> can be extracted by calling <code>.extract()</code> on the <code>SeqFeature</code> objects in that <b>list</b>.

In [None]:
# Preview only the first five entries of the feature list
# (the original note gives a total of 41 features for this file)
gb.features[:5]

### <b><span style='color:#F55AA2'>EXPLORING THE "GENE" FEATURE TYPE</span></b>
- One of the features <code>.type</code> that is used in <code>.features</code> is <b><span style='color:#F55AA2'>"gene"</span></b>.
- We can, for example, cycle through all features & get all the annotated <b>genes</b> in the whole sequence.
- Exploring these genes, we can get their <b><span style='color:#F55AA2'>locus tag</span></b> (from <b>qualifiers</b>), <b><span style='color:#F55AA2'>database reference</span></b> (from <b>qualifiers</b>),  <b><span style='color:#F55AA2'>strand</span></b> & <b><span style='color:#F55AA2'>location</span></b>.
- We can apply an operation called <b>translate</b> (<b>Section 4.3</b>) to try to <b>find the proteins</b> that the gene features encode

In [None]:
# Keep only the features whose type is 'gene'
gene_features = [candidate for candidate in gb.features if candidate.type == 'gene']

print(f'Number of gene features: {len(gene_features)}')
gene_features

In [None]:
# Gene qualifiers: show the qualifiers attached to the first gene feature
# (the loop below reads its 'locus_tag' and 'db_xref' entries)
gene_features[0].qualifiers

In [None]:
# One summary line per gene: locus tag and database reference (both from the
# qualifiers), plus strand and location.
# The strand is read from the location object: the SeqFeature.strand shortcut
# is deprecated in recent Biopython releases, while location.strand is
# available in old and new versions and yields the same value.
for feat in gene_features:
    locus_tag = feat.qualifiers['locus_tag']
    db_xref = feat.qualifiers['db_xref']
    print(f"locus tag: {locus_tag}, database ref: {db_xref}, strand: {feat.location.strand}, location: {feat.location}")

In [None]:
# Cut each gene out of the genome and translate it as a complete coding
# sequence using the Bacterial codon table.
protein_feat = [
    gene.extract(gb.seq).translate(table='Bacterial',cds=True)
    for gene in gene_features
]

protein_feat # list of protein sequences

### <b><span style='color:#F55AA2'>EXPLORING "CDS" FEATURE TYPE</span></b>
- Another <code>.type</code> that is used in <code>.features</code> is <b><span style='color:#F55AA2'>"CDS"</span></b> (<b>Coding Sequence</b>)
- <b>CDS</b> <code>qualifiers</code> contain quite a bit more content, including the <b>translation</b> sequence, so we can compare it with our translation of the <b>gene</b> feature, since both types are referring to the same parts of the sequence.

In [None]:
# Keep only the features whose type is 'CDS' (coding sequence)
CDS_features = [candidate for candidate in gb.features if candidate.type == 'CDS']

print(f"Number of CDS features: {len(CDS_features)}")
CDS_features

In [None]:
# Qualifiers of the first CDS feature: list the keys, then pretty-print everything
first_cds_qualifiers = CDS_features[0].qualifiers
print(f'CDS Qualifier Keys: {first_cds_qualifiers.keys()}\n')

print('Showing First CDS Feature')
pprint(first_cds_qualifiers) # the qualifiers mapping

In [None]:
# Same content as above, one "name : entries" line per qualifier
for qualifier_name, qualifier_entries in CDS_features[0].qualifiers.items():
    print(qualifier_name, ':', qualifier_entries)

In [None]:
# Compare the translation stored in the first CDS feature's 'translation'
# qualifier with our own translation of the first gene feature.
# The two should be identical, in which case the cell evaluates to True.
Seq(CDS_features[0].qualifiers['translation'][0]) == protein_feat[0]

# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>4 |</span></b> Sequence Processing</div>

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>4.1 |</span></b> Basic Sequence Processing Operations</b></p>
</div>

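Biopython <code>Seq</code> objects behave much like Python <b><span style='color:#F55AA2'>strings</span></b>, so most of the standard string operations (counting, slicing, membership tests, searching, concatenation) can be used on them.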

In [None]:
# Basic operations: string-style methods applied to a Seq

motif = 'ATA'
print(
    seq_fasta.count('G'),    # number of guanine nucleotides
    seq_fasta[1:3],          # a specific slice of the sequence
    motif in seq_fasta,      # is the motif part of the sequence?
    seq_fasta.find(motif),   # index of the first occurrence of the motif
    sep='\n',
)

# Two short sequences created with Seq(), joined with +
seq1, seq2 = Seq('MEVRNAKS'), Seq('GHERWKY')

print(seq1 + seq2)

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>4.2 |</span></b> Transcription & Reverse Complement Strand</b></p>
</div>

### <b><span style='color:#F55AA2'>TRANSCRIPTION</span></b>

The process of <b><span style='color:#F55AA2'>transcription from DNA</span></b> to <b><span style='color:#F55AA2'>RNA</span></b>, is a critical step in the process of protein synthesis:

(I) Process that occurs within the nucleus of a cell; we obtain a __Mature messenger RNA, mRNA.__ <br>
(II) mRNA is then transferred to the cytoplasm, where it will be used by a cellular machine, to guide the production of a protein.
- Given a DNA sequence, transcription is the first step in <b>obtaining a Protein</b>.
- The nucleotide sequence of a gene from one of the DNA strands is transcribed ( copied into a complementary molecule of RNA )
- The complementarity of the code allows us to recover the information contained in the DNA sequence; the process is performed by an enzyme, RNA polymerase.
- Stabilisation of elements at the end of a molecule is performed by different protein complexes.

### <b><span style='color:#F55AA2'>REVERSE COMPLEMENT</span></b> 
- DNA has <b>two complementary strands</b>. 
- Due to the complementarity of the DNA strands, usually only one of the strands is provided in a sequence file obtained from databases.
- The second strand to the input <b>DNA sequence</b> can be obtained by calling the <code>.reverse_complement()</code> function.

In [None]:
''' Transcription & Reverse Complement '''

# All three lines describe the same FASTA sequence. The transcription was
# previously taken from the GenBank sequence (seq_gb), so it was not the
# transcript of the DNA printed on the first line.
print(f'DNA Sequence: {seq_fasta[:100]}')
print(f'Reverse Complement: {seq_fasta.reverse_complement()[:100]}')
print(f'Transcription: {seq_fasta.transcribe()[:100]}')

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>4.3 |</span></b> Translation</b></p>
</div>

### <b><span style='color:#F55AA2'>TRANSLATION & CODON TABLES</span></b> 

#### **TRANSLATION**

Snippet from [genome.gov](https://www.genome.gov/genetics-glossary/Translation)

> Translation is the process of translating the sequence of a <b><span style='color:#F55AA2'>messenger RNA</span></b> (mRNA) molecule to a <b><span style='color:#F55AA2'>sequence of amino acids</span></b> during protein synthesis. The <b><span style='color:#F55AA2'>genetic code</span></b> describes the relationship between the sequence of base pairs in a gene and the corresponding amino acid sequence that it encodes. In the cell cytoplasm, the ribosome reads the sequence of the mRNA in groups of three bases to assemble the protein.

- Translation accepts a DNA sequence and converts it into a <b><span style='color:#F55AA2'>protein</span></b> sequence in the reading frame you specify.

#### **CODON TABLES**

A <b><span style='color:#F55AA2'>codon</span></b> is a sequence of <b><span style='color:#F55AA2'>three DNA/RNA nucleotides</span></b> that corresponds with a <b><span style='color:#F55AA2'>specific amino acid</span></b> or <b><span style='color:#F55AA2'>stop signal/codon(*)</span></b> during protein synthesis

Snippet from [Khan Academy](https://www.khanacademy.org/science/ap-biology/gene-expression-and-regulation/translation/a/the-genetic-code-discovery-and-properties)

> The full set of relationships between <b><span style='color:#F55AA2'>codons</span></b> and amino acids (or stop signals) is called the <b><span style='color:#F55AA2'>genetic code</span></b>. The genetic code is often summarized in a table. 

There are a number of <b><span style='color:#F55AA2'>codon tables</span></b>, a few mentioned below:
> - Standard  (SGC)
> - Vertebrate Mitochondrial
> - Bacterial

In [None]:
from Bio.Data import CodonTable 

# Let's take a look at one type of table (SGC)
# The table is looked up by name among the unambiguous-DNA codon tables.
table = CodonTable.unambiguous_dna_by_name['Standard']
print(table)

In [None]:
# Getting a specific conversion
# forward_table is indexed by the codon string; "CTA" is the codon looked up here.
table.forward_table["CTA"]

In [None]:
''' Translation for both DNA & RNA '''
#  translate codon into an aminoacid

# A short RNA sequence translated with the default table
rna_seq = Seq('AUGCGUUUAACU')
print(f'translation RNA: {rna_seq.translate()}')

# Translate DNA sequence using two different tables (SGC & Vertebrate Mitochondrial)
# seq_gb is not defined in this cell; it comes from an earlier cell of the notebook.
# Only the first 100 residues of each translation are printed.
print(f'translation DNA: {seq_gb.translate()[:100]}')
print(f'translation DNA (alternative): {seq_gb.translate(table="Vertebrate Mitochondrial")[:100]}') 

### <b><span style='color:#F55AA2'>REALISTIC EXAMPLE - PROTEIN IDENTIFICATION</span></b> 

- We can of course load something more realistic, an entire <b>genome</b> sequence as well & apply translation as we try to understand what proteins are encoded in the genome.
- Let's obtain the largest <b>amino acid</b> chain in the sequence data.

In [None]:
# Read the single-record FASTA genome; keep both the record and its raw sequence
virus_fna = '/kaggle/input/coronavirus-genome-sequence/MN908947.fna'
virus_SeqRec = SeqIO.read(virus_fna, 'fasta')
virus_Seq = virus_SeqRec.seq

# Translate the whole record in one pass
translation = virus_SeqRec.translate()
print(f'Translation length: {len(translation)}')

# Split the translation at stop codons ('*') and keep only chains of at least
# 20 amino acids, which this notebook treats as candidate functional proteins.
proteins = [
    str(chain)
    for chain in translation.seq.split('*')
    if len(chain) > 19
]

# Tabulate the chains with their lengths and keep the ten longest
df = pd.DataFrame({'protein_chains': proteins})
df['length'] = df['protein_chains'].apply(len)
df = df.sort_values(by=['length'], ascending=False)[:10]
display(df.head())

# Pick the single longest chain (first column of the top row)
one_large_protein = df.nlargest(1, 'length')
single_prot = one_large_protein.iloc[0, 0]

# Store it as a one-record FASTA file; the BLAST section reads it back later
with open("/kaggle/working/protein.fasta", "w") as file:
    file.write(">large protein\n" + single_prot)

# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>5 |</span></b> Pairwise Sequence Alignment</div>

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>5.1 |</span></b> Sequence Similarity</b></p>
</div>

### <b><span style='color:#F55AA2'>GENERATING A HYPOTHESIS</span></b> 

- We can generate a hypothesis about the biological function based on <b><span style='color:#F55AA2'>sequence similarity</span></b>.
- If two sequences show high degree of <b><span style='color:#F55AA2'>similarity (score)</span></b>, then they have a high chance of being __homologous__, sharing similar functions. 

### <b><span style='color:#F55AA2'>ISSUES IN COMPARING SEQUENCES</span></b> 

If we try to compare sequences one by one, we might run into issues since:

- Two similar sequences often have gone through  <b><span style='color:#F55AA2'>biological mutation</span></b>, which can change nucleotides, thus changing their structure.
- Thus we cannot simply rely on a comparison of nucleotides that are __in the same order__. 
- This process is even more complicated by nucleotide  <b><span style='color:#F55AA2'>insertions</span></b> &  <b><span style='color:#F55AA2'>deletions</span></b>. 

As a result, we need to resort to a procedure known as <b><span style='color:#F55AA2'>Pairwise Sequence Alignment (PSA)</span></b>.

### <b><span style='color:#F55AA2'>TYPES OF SEQUENCE ALIGNMENT</span></b> 

There are two main groups of Sequence Alignment:
- <b><span style='color:#F55AA2'>Global sequence alignment</span></b> (we are interested in aligning the entire sequences)
- <b><span style='color:#F55AA2'>Local sequence alignment</span></b> (we aim to find good alignments of adjacent nucleotides of both of the sequences, whilst ignoring the rest)

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>5.2 |</span></b> Biopython Specifics</b></p>
</div>

### <b><span style='color:#F55AA2'>CHOOSING THE TYPE OF ALIGNMENT</span></b> 

- <code>Bio.pairwise2</code> has implementations of <b><span style='color:#F55AA2'>Dynamic Programming (DP)</span></b> Algorithms for __Pairwise Sequence Alignment__.
- Alignment Type option is written after <code>pairwise2.align.X</code>, where <code>.X</code> is replaced by the type; <code>.global</code> and <code>.local</code>.
- After <code>.local</code> or <code>.global</code>, two letters are written to indicate which parameters it takes; eg. <code>.localXY</code>

### <b><span style='color:#F55AA2'>OPTIONS FOR X & Y</span></b> 

- The <b><span style='color:#F55AA2'>first (X)</span></b> indicates __a substitution matrix__ or __the parameters for matches/mismatches__
- The <b><span style='color:#F55AA2'>second (Y)</span></b> indicates the parameters for the penalty of the gap.

#### **THE SPECIFICS**

Some more specifics<b><span style='color:#F55AA2'> (first letter)</span></b>:
> - If __x__ is provided, a <b>match score of 1</b> & a <b>mismatch score of 0</b> are used for the alignment. <br>
> - If __m__ is provided, the function will allow us to <b>define a match/mismatch score</b> using appropriate parameters. <br>
> - If __d__ is provided, we can pass on a **dictionary** to the function, **defining a full substitution matrix**. <br>

Some more specifics <b><span style='color:#F55AA2'>(second letter)</span></b>:
> - If __x__ is provided, no gap penalties are imposed (g=0) <br>
> - If __s__ is provided, we can define an affine gap penalty model (separate gap-open and gap-extension penalties) <br>

### <b><span style='color:#F55AA2'>INPUT SEQUENCES</span></b> 
- Inputs can be both <b>string</b> formats or defined <b>Seq</b> class instances.

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>5.3 |</span></b> Pairwise Sequence Alignment</b></p>
</div>

### <b><span style='color:#F55AA2'>GLOBAL NUCLEOTIDE</span> SEQUENCE ALIGNMENT</b> 
- Let's try an example, aligning <code>seq1</code> & <code>seq2</code>, both are <b>nucleotide</b> sequences.
- We are looking at <b>global alignment</b> (.global), with a match <b>score of 1</b> & <b>mismatch score of 0</b> (so first letter is x) & <b>no gap penalty imposed</b> (second letter is x).

In [None]:
''' Global Alignment of two DNA sequences'''

# Plain-string nucleotide sequences (also reused by the local DNA example)
seq1 = 'ATGGCAGATAGA'
seq2 = 'ATAGAGAATAG'

# Wrap both strings as Seq instances
cseq1, cseq2 = Seq(seq1), Seq(seq2)

# Global alignment with the "xx" scheme:
# match score = 1, mismatch = 0, and no gap penalties
GDNA = pairwise2.align.globalxx(cseq1, cseq2)
print(f'# alternative optimal alignments: {len(GDNA)}')

# Show every alignment that was returned
for aln in GDNA:
    print(format_alignment(*aln))

### <b><span style='color:#F55AA2'>GLOBAL PROTEIN</span> SEQUENCE ALIGNMENT</b> 
- Let's try another example, aligning <code>pseq1</code> & <code>pseq2</code>, both are <b>protein</b> sequences.
- We are looking at <b>global alignment</b> (.global), using the <b>BLOSUM62</b> match/mismatch scores (first letter d) & a <b>gap penalty</b> imposed (second letter s)

In [None]:
''' Global Alignment of two protein sequences ''' 
# - Global Alignment Problem of Protein Sequences
# - Using the BLOSUM62 substitution matrix, opening gap penalty -4, extension penalty -1
    
pseq1 = "EVSAW"
pseq2 = "KEVLA"

sm = MatrixInfo.blosum62 # load BLOSUM62 Substitution Matrix
# globalds: "d" = scores taken from the matrix `sm`, "s" = gap open/extend penalties given explicitly
lPRT = pairwise2.align.globalds(pseq1,pseq2,sm,-4,-1)
for i in lPRT:
    print(format_alignment(*i))

### <b><span style='color:#F55AA2'>LOCAL NUCLEOTIDE</span> SEQUENCE ALIGNMENT</b> 
- Let's try another example, aligning nucleotide sequences <code>seq1</code> & <code>seq2</code>, this time <b>locally</b>.
- Local alignment (.local), <b>match/mismatch</b> score (first letter m), gap penalty imposed (second letter s)

In [None]:
''' Local Alignment of two DNA sequences ''' 
# Local Alignment Problem of DNA Sequences
# Match score = 3, mismatch score = -2, constant gap penalty g=-3 x2
# (the four numbers passed after the sequences are, in order: match, mismatch,
#  gap-open and gap-extend; seq1/seq2 are the strings from the global DNA example)

lDNA = pairwise2.align.localms(seq1,seq2, 3,-2,-3,-3)
for i in lDNA: 
    print(format_alignment(*i))

### <b><span style='color:#F55AA2'>LOCAL PROTEIN </span> SEQUENCE ALIGNMENT</b> 
- Again aligning <code>pseq1</code> & <code>pseq2</code>, both are <b>protein</b> sequences, <b>locally</b>.
- We are looking at <b>local alignment</b> (.local), using the <b>BLOSUM62</b> match/mismatch scores (first letter d) & a <b>gap penalty</b> imposed (second letter s).

In [None]:
''' Local Alignment of Protein Sequences '''
# - Local Alignment Problem of Protein Sequences
# - Using the BLOSUM62 substitution matrix, opening gap penalty -4, extension penalty -1
# - Reuses pseq1, pseq2 and the matrix `sm` defined in the global protein example

pPROT = pairwise2.align.localds(pseq1,pseq2,sm,-4,-1)
for i in pPROT:
    print(format_alignment(*i))

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>5.4 |</span></b> Unitig Sequence Alignment</b></p>
</div>

### <b><span style='color:#F55AA2'>FINDING THE BEST ALIGNMENT IN A LIST</span></b> 

- We can utilise <code>pairwise2.align</code> in a variety of situations, eg. cycle through a list & find the best alignment case.
- We have two lists filled with <b>unitigs</b> (nucleotide sequences); <code>lst1</code> & <code>lst2</code> ( you can also change the input to a sequence in <code>get_bestalign</code>)
- Using the [gonorrhoea](https://www.kaggle.com/nwheeler443/gono-unitigs) dataset, we can compare <b>a sequence from one list</b> and try to find the best alignment in the second list.
- In this case, we can see that the alignment is not ideal at all, resulting in lots of gaps for the input sequence & the best alignment score is due to the longer matching sequence size.

In [None]:
# class used to define the feature matrix & target variable
class data:
    """Loader for the gono-unitigs dataset: metadata plus per-phenotype unitig matrices.

    Attributes set by ``__init__``:
        df       -- metadata table read from ``metadata.csv`` (first column as index)
        features -- column labels of the metadata table
        targets  -- names of the phenotype columns used in this notebook

    Attributes set by ``get_case``:
        lst_files -- every file path found under ``/kaggle/input``
        train     -- unitig feature matrix, restricted to samples that have a label
        target    -- phenotype values for exactly the samples kept in ``train``
    """

    def __init__(self):
        self.df = pd.read_csv('../input/gono-unitigs/metadata.csv', index_col=0)
        self.features = self.df.columns
        self.targets = ['azm_sr','cip_sr','cro_sr','cfx_sr','tet_sr','pen_sr']

    def get_case(self,phenotype=None):
        """Load the unitig matrix for ``phenotype`` and align it with the metadata.

        Parameters
        ----------
        phenotype : str
            Name of a metadata column (e.g. one of ``self.targets``). It also
            selects the ``<phenotype>_gwas_filtered_unitigs.Rtab`` file to read.

        Raises
        ------
        ValueError
            If ``phenotype`` is left as ``None``; without this check the call
            failed later inside pandas with a much less helpful error.
        """
        if phenotype is None:
            raise ValueError(
                f'a phenotype is required, e.g. one of {self.targets}'
            )

        # keep only the samples that have a measurement for this phenotype
        _metadata = self.df
        _metadata = _metadata.dropna(subset=[phenotype])
        _metadata = _metadata[phenotype] # choose target variable

        prefix = '../input/gono-unitigs/'
        suffix = '_gwas_filtered_unitigs.Rtab'

        # store all file pathways
        self.lst_files = []
        for dirname, _, filenames in os.walk('/kaggle/input'):
            for filename in filenames:
                self.lst_files.append(os.path.join(dirname, filename))

        # unitig feature matrix for phenotype
        tdf = pd.read_csv(prefix + phenotype + suffix, sep=" ", index_col=0, low_memory=False)
        tdf = tdf.T # transpose so rows line up with the metadata index (pattern_id = sample_id)
        tdf = tdf[tdf.index.isin(_metadata.index)] # keep only common rows, ie. that have a resistance measure
        self.train = tdf
        self.target = _metadata[_metadata.index.isin(tdf.index)]

In [None]:
gono = data()

# list of nucleotides
# get_case replaces gono.train each time it is called, so the column labels
# (the unitig sequences) are copied into a list before loading the next phenotype.
gono.get_case('azm_sr')
azm_unitigs = gono.train.columns.tolist() # list of unitigs

print('AZM unitigs:')
print(azm_unitigs[:3])

# List of nucleotides
gono.get_case('cfx_sr')
cfx_unitigs = gono.train.columns.tolist() # list of unitigs

print('\nCFX unitigs:')
print(cfx_unitigs[:3])

In [None]:
# Select one unitig in lst1 and find the best alignment in lst2
def get_bestalign(lst1,ids,lst2):
    """Locally align ``lst1[ids]`` against every unitig in ``lst2`` and keep the best.

    Parameters
    ----------
    lst1 : sequence of str
        Unitigs from which the query is picked.
    ids : int
        Index of the query unitig inside ``lst1``.
    lst2 : iterable of str
        Candidate unitigs; entries containing a comma are skipped.

    Returns
    -------
    list or None
        The alignments returned by ``pairwise2.align.localxx`` for the candidate
        whose first alignment has the highest score (the earliest candidate wins
        ties), or ``None`` if no candidate scores above 0.
    """
    # drop the entries that contain a comma
    candidates = [unitig for unitig in lst2 if ',' not in unitig]

    # main unitig used for comparison
    n1 = lst1[ids]
    print('Finding Best Alignment Score:')
    print(f'For Sequence: {n1}\n')

    # Cycle through all remaining unitigs, reporting every new best score
    max_score = 0
    best_ali = None
    for ii, unitig in enumerate(candidates):
        ALI = pairwise2.align.localxx(n1, unitig)
        if not ALI:
            # No alignment was returned for this pair; indexing ALI[0] would
            # raise IndexError, so skip the candidate instead.
            continue
        lscore = ALI[0].score
        if lscore > max_score:
            print(ii, unitig, lscore)
            max_score = lscore
            best_ali = ALI

    return best_ali

# Query: the AZM unitig at index 10, compared against all CFX unitigs
best_lalign = get_bestalign(azm_unitigs,10,cfx_unitigs)

In [None]:
# Show the first alignment of the best-scoring candidate found above
print(format_alignment(*best_lalign[0]))

# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>6 |</span></b> Multiple Sequence Alignment</div>

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>6.1 |</span></b> SeqRecord</b></p>
</div>

### <b><span style='color:#F55AA2'>FOR ALIGNMENT SEQUENCES</span></b> 
- We looked at <code>SeqRecord</code> earlier, a sequence read from a file is already in this format so we can instantly pass them to the alignment class.
- If we define sequences from <b>strings</b> we'll need to define a <code>Seq</code> instance & subsequently <code>SeqRecord</code> (only needing <code>id</code>)
- To align <b><span style='color:#F55AA2'>Multiple Sequences</span></b> together ( <b>More than two</b> ), we can use the <code>.Align</code> class & call <code>MultipleSeqAlignment</code>
- However unlike <code>Bio.pairwise2</code>, we can't use direct <b>strings</b> in the alignment class input & need to define a <code>Seq</code> object for a particular sequence, followed by a defined <b>SeqRecord</b> instance that has its defined <b>sequence ID</b>.

In [None]:
from Bio.SeqRecord import SeqRecord

# String format of alignment
# The three strings already contain '-' gap characters and all have the same length (30).
aa1 = "MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW"
aa2 = "MH--IFIYQIGYALKSGYIQSIRSPEY-NW"
aa3 = "MHQAIFI-QIGYALKSGY-QSIRSPEYDNW"

# Sequence Class instances
seq_aa1 = Seq(aa1)
seq_aa2 = Seq(aa2)
seq_aa3 = Seq(aa3)
 
# Create a sequence record w/ a defined ID
seqr1 = SeqRecord(seq_aa1,id="seq1")
seqr2 = SeqRecord(seq_aa2,id="seq2")
seqr3 = SeqRecord(seq_aa3,id="seq3")

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>6.2 |</span></b> MultipleSeqAlignment</b></p>
</div>

### <b><span style='color:#F55AA2'>PUTTING IT TOGETHER</span></b> 

- Upon defining the sequence records of all the sequences we want to align, we can call <b><span style='color:#F55AA2'>MultipleSeqAlignment</span></b> with the records as inputs.
- We can also do simple operations on these sequences as they are grouped together, as they are essentially <b>strings</b> in alignment.

In [None]:
from Bio.Align import MultipleSeqAlignment as MSA

# Multiple Sequence Alignment
# built from the three SeqRecord objects defined in the previous cell
alin = MSA([seqr1, seqr2, seqr3])
# rows 0-1 (first two records), all columns
sub_alin = alin[0:2,:]

In [None]:
print(alin[1]) # Get the 2nd sequence in the alignment
print(alin[:,2]) # Get the 3rd column in the alignment
print(alin[:,3:7])  # Get multiple columns in alignment (4th to 7th)
print(alin[0].seq[:3]) # first 3 columns of seq1
print(alin[1:3,5:12]) # sequences 2 and 3; 6th to 12th column

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>6.3 |</span></b> Alignment Formats</b></p>
</div>

### <b><span style='color:#F55AA2'>DEFINING AN ALIGNMENT</span></b> 
- Usually alignments are generated by <b><span style='color:#F55AA2'>MultipleSeqAlignment</span></b>, however we can create a blank alignment by passing a blank list to <code>MSA</code>.
- We can also add new sequences by using <code>.add_sequence</code>.

In [None]:
# Three nucleotide strings to be placed in one alignment
n1 = 'ACCGGTCAGGGCGACGGGTTTGCGCCCTTTGAT'
n2 = 'ACCGGTCATTTCGACGGGTTTGCGGCCTTTGCT'
n3 = 'CCAGGTCAGCTCGACGGGTTGGCGGCCTTTGCT'

# Start from an empty alignment and add the sequences one by one;
# the first argument of add_sequence is the label, the second the sequence string.
align_n = MSA([])
align_n.add_sequence('n1',n1)
align_n.add_sequence('n2',n2)
align_n.add_sequence('n3',n3)
print(align_n)

### <b><span style='color:#F55AA2'>READING ALIGNMENT DATA</span></b> 

- As with the previously mentioned <b>FASTA</b> & <b>GENBANK</b> formats in <code>SeqIO</code>. There are formats used for <b><span style='color:#F55AA2'>saving</span></b> & <b><span style='color:#F55AA2'>loading alignments</span></b> as well.
- As indicated in the <code>AlignIO</code> [section](https://biopython.org/wiki/AlignIO), <b><span style='color:#F55AA2'>clustal</span></b> is one of such formats & is often used. 

In [None]:
from Bio import AlignIO
# Read one alignment stored in clustal format; alin2 is reused by the phylogenetics section
alin2 = AlignIO.read("/kaggle/input/biopython-genbank/PF05371_seed.aln", "clustal")

### <b><span style='color:#F55AA2'>DISPLAY SEQRECORD DATA</span></b> 
Once the <b>clustal</b> format is read, we are dealing with <b>SeqRecord</b> data as we defined earlier.

In [None]:
# Get the length of the alignment (column size)
print("Size (length):", alin2.get_alignment_length())

# Print the SeqRecord data
# NOTE: the loop variable `record` stays defined after the loop and is
# reassigned by the FASTA-loading cell in the BLAST section.
for record in alin2:
    print(f'Sequence: {record.seq}, ID: {record.id}')

In [None]:
# adjust alignment content & write if need be 
# sub_alin2 keeps the first four records (all columns) of alin2
sub_alin2 = alin2[0:4,:]
# NOTE(review): the file is named sub_alin2.fasta, but the full `alin2` is what gets
# written here, not `sub_alin2`. The parsimony section later re-reads this file to build
# its starting tree and then searches on `alin2`, so confirm which alignment is intended
# before changing either side.
AlignIO.write(alin2, "/kaggle/working/sub_alin2.fasta", "fasta")

### <b><span style='color:#F55AA2'>CONVERTING FORMATS</span></b> 
<b>FASTA</b> is quite a generic format, which aside from storing sequence data (which we saw above), can also store alignment data.

In [None]:
# Write the alignment that was read from the clustal file, this time in FASTA format
AlignIO.write(alin2, "example_alin.fasta", "fasta")

# Or directly convert it
# (same output file name as above, so this call overwrites the file just written)
AlignIO.convert("/kaggle/input/biopython-genbank/PF05371_seed.aln", "clustal", "example_alin.fasta", "fasta")

### <b><span style='color:#F55AA2'>VISUALISING ALIGNMENTS WITH BOKEH</span></b> 
- It can be useful to <b><span style='color:#F55AA2'>colour code</span></b> our alignment data, so we can observe any variation in our sequence more easily. 
- We can use the <b>Bokeh</b> library to plot the sequence data using <b>gridplots</b> ( thanks to [Understanding phylogenetic trees](https://www.kaggle.com/nwheeler443/understanding-phylogenetic-trees/notebook) )

In [None]:
import panel as pn
import panel.widgets as pnw
pn.extension()
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.models.glyphs import Text, Rect
from bokeh.layouts import gridplot

def view_alignment(aln, fontsize="9pt", plot_width=800):
    """Bokeh sequence alignment view.

    Draws every symbol of ``aln`` as a monospace letter over a coloured
    rectangle, one row per record, with horizontal panning.

    Parameters
    ----------
    aln : iterable of records exposing ``.seq`` and ``.id``
        The alignment to draw. The grid width is the length of the first
        sequence, so all sequences are expected to have that length.
    fontsize : str
        Font size used for the letters.
    plot_width : int
        Width passed to the Bokeh figure.

    Returns
    -------
    The object returned by ``gridplot`` wrapping the single figure.
    """
    # make sequence and id lists from the aln object
    seqs = [rec.seq for rec in aln]
    ids = [rec.id for rec in aln]
    text = [i for s in seqs for i in s]
    colors = get_colors(seqs)
    N = len(seqs[0])
    S = len(seqs)

    # one (x, y) grid point per symbol: x is the column centre, y the row index
    x = np.arange(0.5, N + 0.5)
    y = np.arange(0, S, 1)
    xx, yy = np.meshgrid(x, y)
    gx = xx.ravel()
    gy = yy.flatten()
    # use recty for rect coords with an offset
    recty = gy + .5

    # now we can create the ColumnDataSource with all the arrays
    source = ColumnDataSource(dict(x=gx, y=gy, recty=recty, text=text, colors=colors))
    plot_height = S * 15 + 50
    # view_range is for the close up view: at most the first 100 columns
    view_range = (0, min(N, 100))

    # sequence text view with ability to scroll along x axis
    # NOTE(review): `plot_width`/`plot_height` are the older Bokeh keyword names;
    # newer Bokeh releases use `width`/`height` - confirm against the installed version.
    p = figure(title=None, plot_width=plot_width, plot_height=plot_height,
               x_range=view_range, y_range=ids, tools="xpan,reset",
               min_border=0, toolbar_location='below')
    glyph = Text(x="x", y="y", text="text", text_align='center', text_color="black",
                 text_font="monospace", text_font_size=fontsize)
    rects = Rect(x="x", y="recty", width=1, height=1, fill_color="colors",
                 line_color=None, fill_alpha=0.4)
    p.add_glyph(source, glyph)
    p.add_glyph(source, rects)

    # hide the grid and the y-axis ticks
    p.grid.visible = False
    p.xaxis.major_label_text_font_style = "bold"
    p.yaxis.minor_tick_line_width = 0
    p.yaxis.major_tick_line_width = 0

    p = gridplot([[p]], toolbar_location='below')
    return p

# Mapping Function for nucleotide sequences
def get_colors(seqs):
    """Make colors for the bases in the sequences.

    Parameters
    ----------
    seqs : iterable of sequences (strings or Seq-like objects)
        Sequences whose symbols are coloured, flattened row by row.

    Returns
    -------
    list of str
        One colour name per symbol. A/T/G/C and the gap '-' have fixed colours
        (matched case-insensitively); any other symbol falls back to 'gray'
        instead of raising KeyError.
    """
    clrs = {'A':'red','T':'green','G':'orange','C':'blue','-':'white'}
    fallback = 'gray'  # used for symbols outside the table (e.g. 'N' or amino acids)
    return [
        clrs.get(str(symbol).upper(), fallback)
        for seq in seqs
        for symbol in seq
    ]

In [None]:
# Render the three-sequence nucleotide alignment (align_n) and show it through a Panel pane
p = view_alignment(align_n, fontsize="9pt", plot_width=800)
pn.pane.Bokeh(p)

# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>7 |</span></b> BLAST</div>

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>7.1 |</span></b> NCBI Database Online Queries</b></p>
</div>

### <b><span style='color:#F55AA2'>USING DATABASES TO UNDERSTAND OUR SEQUENCE</span></b> 

- There are ways to do <b>Sequence Alignment</b> via websites like [ebi.ac.uk](https://www.ebi.ac.uk/Tools/sss/), which uses an approach called <b><span style='color:#F55AA2'>BLAST</span></b>.
- <b><span style='color:#F55AA2'>Basic Local Alignment Search Tool (BLAST)</span></b> uses the same alignment methods as implemented in <b>Biopython</b>, except the comparison sequences are called from a database.
- BioPython also includes a <b>BLAST</b> module that can be used to search for similar sequences, like we did in <b>Section 4</b> & <b>5</b>; <code>Bio.Blast</code>.
- The alignment requests are <b>made remotely</b>, and it can involve waiting time as our request is processed, however using BLAST locally is also an option in Biopython.

### <b><span style='color:#F55AA2'>BIOPYTHON BLAST CLASSES</span></b> 

- <code>NCBIWWW</code> : Code to invoke the NCBI BLAST server over the internet (uses [nih.gov](https://blast.ncbi.nlm.nih.gov/Blast.cgi))
- <code>NCBIXML</code> : Code to work with the BLAST XML output. (we need to interpret the received results)

### <b><span style='color:#F55AA2'>SENDING A QUERY</span></b> 

- <code>NCBIWWW.qblast</code> is used to send a request to NCBI BLAST.
- The main parameters used with the <code>NCBIWWW.qblast function</code>: | ([All parameter inputs](https://biopython.org/docs/1.75/api/Bio.Blast.NCBIWWW.html))
> - The function takes in the <code>program option</code> (<b><span style='color:#F55AA2'>"blastn"</span></b>,<b><span style='color:#F55AA2'>"blastp"</span></b>,<b><span style='color:#F55AA2'>"blastx"</span></b>,<b><span style='color:#F55AA2'>"tblastn"</span></b>,<b><span style='color:#F55AA2'>"tblastx"</span></b>)
> - The <code>database</code> to search (eg. <b><span style='color:#F55AA2'>"nr"</span></b>,<b><span style='color:#F55AA2'>"nt"</span></b>, <b><span style='color:#F55AA2'>"pdb"</span></b>)
> - The <code>sequence</code> to be used in the query ( can search by the <b>gi identifier</b> if you have it )

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>7.2 |</span></b> Python Strings to Sequences</b></p>
</div>

### <b><span style='color:#F55AA2'>LOADING THE SEQUENCE</span></b> 
- We'll use a <b>FASTA</b> file whose description contains the actual sequence identification, so we'll be able to see if we have a hit in the database of existing sequences.
- The actual description is not used in the alignment process, it's only there for us to visually confirm we have a match in the database.

In [None]:
# File which we want to align
# Description is included so we can compare results
file_fasta = '/kaggle/input/biopython-genbank/NC_005816.fna'

# Pass the path itself rather than an open() handle, so no file handle is
# left open by this cell.
record = SeqIO.read(file_fasta, format='fasta')
print(record)
print(f'Length of Nucleotide Sequence: {len(record)}')

### <b><span style='color:#F55AA2'>BLAST QUERY & SAVING CONTENT</span></b> 
- Our sequence (let's say we didn't know the NCBI <b>gi</b> identifier) is made up of <b>nucleotide</b> content, so we'll use <code>'blastn'</code> option when calling <code>.qblast()</code>
- The query can be <b><span style='color:#F55AA2'>saved to a file</span></b> as we would with a normal python file. The content is saved using <code>.read()</code> and saved with the <b>xml</b> extension.

In [None]:
if(blast_id):
    
    # BLAST query instance
    # 'blastn' program against the 'nt' database, with the record sent as FASTA text
    result_handle = NCBIWWW.qblast('blastn','nt',record.format('fasta'))

    # Read the file (call only once)
    # .read() is called a single time and its content is written straight to disk.
    # NOTE(review): the next cell parses a copy under /kaggle/input, not this
    # freshly written /kaggle/working file - confirm that is intended.
    with open("/kaggle/working/NC_005816_blast.xml", "w") as save_to:
        save_to.write(result_handle.read())
        result_handle.close()

### <b><span style='color:#F55AA2'>READING BLAST SEARCH RESULT CONTENT</span></b> 
- When we want to read the <b>BLAST</b> search results, we can open it as we would any other file with python's <code>open()</code>.
- We need to <b>parse</b> the stored file (<code>NCBIXML.parse</code>), which returns the <code>Record</code> objects, which we can read through using <code>for</code> loops.

In [None]:
if(blast_id):
    # Parse inside a `with` block so the XML file is closed afterwards, and
    # store the records in a list: NCBIXML.parse produces them lazily from the
    # open file, while a list can be looped over again if a later cell is re-run.
    with open('/kaggle/input/biopython-genbank/NC_005816_blast.xml','r') as result_handle:
        blast_records = list(NCBIXML.parse(result_handle))

In [None]:
if blast_id:

    e_thresh = 0.00  # display only perfect matches
    len_id = 5       # show only 5 results

    for blast_record in blast_records:  # go through all blast records
        # only the first `len_id` alignments of each record are reported
        for jj, alignment in enumerate(blast_record.alignments[:len_id]):
            print(f'\nAlignment #{jj}')
            # report the HSPs whose e-value does not exceed the threshold
            passing_hsps = (hsp for hsp in alignment.hsps if hsp.expect <= e_thresh)
            for hsp in passing_hsps:
                print('***** Alignment *****')
                print(f'sequence: {alignment.title}')
                print(f'length: {alignment.length}')
                print(f'e value: {hsp.expect}')

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>7.3 |</span></b> BLAST Protein Sequence Query</b></p>
</div>

### <b><span style='color:#F55AA2'>READING BLAST SEARCH RESULT CONTENT</span></b> 
- In <b>Section 4.3</b>, we identified the longest <b>amino acid</b> chain in the genome in question, let's search for it using <b>BLAST</b>.
- Instead of using <code>NCBIXML.parse</code>, we can also use <code>SearchIO</code> to read our <b>BLAST XML</b> query results.
- For <b>Proteins</b>, we'll need to use <code>"blastp"</code> in our <code>NCBIWWW.qblast</code> query & search in the <code>pdb</code> database.

In [None]:
def read_blast(result_handle,hit_id=0,show_top=5):
    """Print a summary of a BLAST XML result and the details of one hit.

    Parameters
    ----------
    result_handle : handle or file name
        BLAST XML output, passed on to ``SearchIO.read`` with the
        ``"blast-xml"`` format.
    hit_id : int
        Index of the hit whose details are printed.
    show_top : int
        Number of leading hits included in the summary that is printed first.

    Returns
    -------
    None. Everything is printed.
    """
    blast_qresult = SearchIO.read(result_handle,"blast-xml")
    print(blast_qresult[0:show_top])

    # fetch the id, description, evalue, bitscore & alignment
    print(f'Showing BLAST query result #{hit_id}')
    seqid = blast_qresult[hit_id]  # the selected hit
    # Use the first HSP of that hit. Indexing the hit with `hit_id` again would
    # pick the wrong HSP, or raise IndexError, for any hit_id other than 0.
    details = seqid[0]

    print(f'\nSequence ID: {seqid.id}')
    print(f'Description: {seqid.description}')
    print(f'e-value: {details.evalue}')
    print(f'Bit Score: {details.bitscore}')

    print('\nShowing Alignment:')
    print(f"\nalignment:\n{details.aln}")

In [None]:
# read file
# protein.fasta is the longest amino-acid chain written out in the translation section
if(blast_id):
    c19_protein = SeqIO.read("/kaggle/working/protein.fasta", "fasta")
    # protein query ('blastp') against the 'pdb' database; this is a remote request
    result_handle = NCBIWWW.qblast("blastp","pdb",c19_protein.seq)
    read_blast(result_handle) # read qblast result & show first hit only

# <div style="padding: 30px;color:white;margin:10;font-size:80%;text-align:left;display:fill;border-radius:10px;background-color:#F1C40F;overflow:hidden;background-image: url(https://i.imgur.com/tNNAhbu.png)"><b><span style='color:white'>8 |</span></b> Phylogenetic Analysis</div>

### <b><span style='color:#F55AA2'>SEQUENCE EVOLUTION</span></b> 

- One of the more practical modules in BioPython, <code>Bio.Phylo</code>, allows us to understand the relationships between different <b>aligned sequences</b>.
- **<span style='color:#F55AA2'>Phylogenetic analysis</span>** - concerned with determining how a given <b>set of sequences</b> (alignment) has evolved from a <b>common ancestor</b> through the process of natural evolution.
- The process of evolution is depicted in the form of **<span style='color:#F55AA2'>an evolutionary tree</span>** & bifurcations represent <b>events of mutation</b> from a common ancestor, that gives rise to <b>branches</b>.
- A **<span style='color:#F55AA2'>Phylogenetic tree</span>** will represent known sequences in its leaves & internal tree nodes will represent common ancestors of the sequence below it.

### <b><span style='color:#F55AA2'>USEFUL MATERIAL</span></b> 

- [<b>www <span style='color:#F55AA2'> // </span> PHYLOGENETIC TREES LECTURE</b>](https://www.ncbi.nlm.nih.gov/CBBresearch/Przytycka/download/lectures/PCB_Lect11_Phylogen_Trees.pdf)
- [<b>www <span style='color:#F55AA2'> // </span> INTERFACING WITH PAML</b>](https://biopython.org/wiki/PAML)
- [<b>www <span style='color:#F55AA2'> // </span> WORKING WITH PHYLOGENETIC TREES</b>](https://biopython.org/wiki/Phylo)
- [<b>www <span style='color:#F55AA2'> // </span> PHYLOGENETIC COOKBOOK</b>](https://biopython.org/wiki/Phylo_cookbook)

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>8.1 |</span></b> Tree Construction</b></p>
</div>

### <b><span style='color:#F55AA2'>BIOPYTHONS IMPLEMENTATION</span></b> 

- <code>Bio.Phylo.TreeConstruction</code> - Biopython provides several tree construction algorithm implementations in pure python.
- There are a couple of approaches used to construct a <b><span style='color:#F55AA2'>phylogenetic tree</span></b>:
> - <b><span style='color:#F55AA2'>Distance Matrix based</span></b> (<b>implemented in biopython</b>)
> - <b><span style='color:#F55AA2'>Parsimony</span></b> (<b>implemented in biopython</b>)
> - Maximum Likelihood based ( Accessible via <b>external library wrapper</b> )

### <b><span style='color:#F55AA2'>EXTERNAL LIBRARY WRAPPERS</span></b> 

> - <code>Bio.Phylo.PhymlCommandline</code>
> - <code>Bio.Phylo.PAML</code>

### <b>APPROACH 1 - <span style='color:#F55AA2'>DISTANCE MATRIX BASED APPROACH</span></b> 

In [None]:
# Recall Alignment
# alin2 is the alignment read with AlignIO.read in the alignment-formats section
print(alin2,'\n')

In [None]:
# Recall Alignment
print(alin2,'\n')

''' Calculate Distance Matrix'''
# From the raw MSA (multiple sequence alignment)
# The ‘identity’ model can be used both for DNA and protein sequences

print('Distance Matrix')
calculator = DistanceCalculator('identity') # create calculator
dist_mat = calculator.get_distance(alin2) # get the distance matrix of a given alignment object
print(dist_mat)

In [None]:
''' Distance Tree Constructor '''
# 'nj' / 'upgma' tree construction approaches 
# 'nj' is the method selected here; the constructor reuses the 'identity'
# calculator created in the previous cell.

constructor = DistanceTreeConstructor(calculator,'nj') # initialise
tree = constructor.build_tree(alin2) # main algorithm
print(tree)

In [None]:
# Plot Tree 
# Draw onto an explicit matplotlib axes so the figure size can be set
fig,axes = plt.subplots(1,1,figsize=(12,6))
# Convert the constructed tree into a Phylogeny object before drawing
treep = Phylogeny.from_tree(tree)
Phylo.draw(treep,axes=axes)

### <b>APPROACH 2 - <span style='color:#F55AA2'>PARSIMONY BASED APPROACH</span></b> 

In [None]:
''' Create a starting tree '''

# sub_alin2 =  "/kaggle/working/sub_alin2.fasta"
# Re-read the FASTA alignment written earlier in the notebook
sub_alin2 = AlignIO.read("/kaggle/working/sub_alin2.fasta", "fasta")

# Build a neighbour-joining tree from identity distances; it is handed to the
# parsimony constructor as the starting tree. dist_mat is not used further in this cell.
calculator = DistanceCalculator('identity') # create calculator
dist_mat = calculator.get_distance(sub_alin2) # get the distance matrix of a given alignment object
constructor = DistanceTreeConstructor(calculator,'nj') # initialise
tree_0 = constructor.build_tree(sub_alin2) # main algorithm

In [None]:
# Recall Alignment
# (the full alin2 alignment, which the parsimony search below is run on)
print(alin2,'\n')

In [None]:
from Bio.Phylo.TreeConstruction import ParsimonyScorer, NNITreeSearcher,ParsimonyTreeConstructor

# Scorer -> searcher -> constructor: the searcher uses the scorer, and the
# constructor starts its search from tree_0 (the NJ tree built two cells above).
scorer = ParsimonyScorer()
searcher = NNITreeSearcher(scorer)
constructor = ParsimonyTreeConstructor(searcher,tree_0)
pars_tree = constructor.build_tree(alin2)
print(pars_tree)

In [None]:
# Plot Tree 
# Same plotting steps as for the distance-based tree, now for the parsimony tree
fig,axes = plt.subplots(1,1,figsize=(12,6))
treep = Phylogeny.from_tree(pars_tree)
Phylo.draw(treep,axes=axes)

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#F1A424'>8.2 |</span></b> Tree Visualisation</b></p>
</div>

In [None]:
# Write a small Newick tree to disk; the `with` block closes the file even if
# the write fails.
with open("/kaggle/working/example.dnd", "w") as simple:
    simple.write('(((A,B),(C,D),(E,F,G)));')

# Read it back as a tree object (this rebinds `tree` from the earlier section)
tree = Phylo.read('/kaggle/working/example.dnd','newick')
print(tree)

In [None]:
# Plain-text rendering of the example tree read from example.dnd
Phylo.draw_ascii(tree)

In [None]:
# Convert the Newick file to phyloXML, then parse the new file back
Phylo.convert('/kaggle/working/example.dnd','newick',
              '/kaggle/working/tree.xml','phyloxml')
# Phylo.parse is iterated because a file may hold more than one tree
trees = Phylo.parse('/kaggle/working/tree.xml','phyloxml')
for t in trees:
    print(t)

In [None]:
# Convert the example tree to a Phylogeny object (coloured in the next cell) and draw it
treep = Phylogeny.from_tree(tree)
Phylo.draw(treep)

In [None]:
# Colour the root clade
treep.root.color = 'gray'
# Common ancestor of the terminals named 'E' and 'F'
mrca = treep.common_ancestor({'name':'E'},{'name':'F'})
mrca.color = 'salmon'
# Index through nested clades: child 0 of the root, then its child 1
# (presumably the (C,D) clade of the example Newick tree - verify against the drawing)
treep.clade[0,1].color = 'blue'
Phylo.draw(treep)