<a href="https://colab.research.google.com/github/collvey/Biopython/blob/main/NCBI_Entrez_Alignment_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Biopython

In [2]:
!pip install biopython



# Source

The examples below are adapted from the Biopython Tutorial and Cookbook: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf

# Import Modules

# Investigation

In [16]:
from Bio import Entrez

Entrez.email = "collvey.veritas@gmail.com" # Always tell NCBI who you are

# Call EInfo without arguments and parse the reply; the parsed record
# holds the list of available database names under "DbList".
handle = Entrez.einfo()
try:
    record = Entrez.read(handle)
finally:
    # Close the handle even if parsing raises, so the connection is not leaked.
    handle.close()

In [19]:
# Display the parsed EInfo result. Only the last expression of a cell is
# rendered, so the record itself (keys included) is the single expression here.
record

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}

In [20]:
# Ask EInfo for the metadata describing the "pubmed" database.
handle = Entrez.einfo(db="pubmed")
try:
    record = Entrez.read(handle)
finally:
    # Always release the handle; the original cell never closed it.
    handle.close()

# Show the human-readable description of the database.
record["DbInfo"]["Description"]

'PubMed bibliographic record'

In [21]:
# Search PubMed for entries whose title contains "biopython";
# retmax is the upper bound on how many IDs the search returns.
handle = Entrez.esearch(db="pubmed", term="biopython[title]", retmax="40")
try:
    record = Entrez.read(handle)
finally:
    # Always release the handle; the original cell never closed it.
    handle.close()

In [22]:
# Display the parsed ESearch response (hit count, ID list, query translation).
record

{'Count': '3', 'RetMax': '3', 'RetStart': '0', 'IdList': ['34434786', '22909249', '19304878'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'biopython[title]', 'Field': 'title', 'Count': '3', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'biopython[title]'}

In [24]:
# PubMed IDs to upload to the NCBI history server in one request.
id_list = ["19304878", "18606172", "16403221", "16377612", "14871861", "14630660"]

# EPost expects the IDs as a single comma-separated string.
epost_handle = Entrez.epost("pubmed", id=",".join(id_list))
try:
    search_results = Entrez.read(epost_handle)
finally:
    # The original passed the handle straight into Entrez.read and never
    # closed it; close it explicitly so the connection is not leaked.
    epost_handle.close()

# Keep the session identifiers returned by EPost.
webenv = search_results["WebEnv"]
query_key = search_results["QueryKey"]

In [27]:
# Display the parsed EPost response, which carries the QueryKey and WebEnv values.
search_results

{'QueryKey': '1', 'WebEnv': 'MCID_621c54edc58355509175abb2'}

In [28]:
# Fetch the summary of one NLM Catalog entry by its ID.
handle = Entrez.esummary(db="nlmcatalog", id="101660833")
try:
    record = Entrez.read(handle)
finally:
    # Always release the handle; the original cell never closed it.
    handle.close()

# The parsed result is a list of summaries; take the first main-title entry
# of the first summary and print its ID and title.
info = record[0]["TitleMainList"][0]
print("Journal info\nid: {}\nTitle: {}".format(record[0]["Id"], info["Title"]))

Journal info
id: 101660833
Title: IEEE transactions on computational imaging.


In [30]:
# Display the complete parsed ESummary result for the journal entry.
record

[{'Item': [], 'Id': '101660833', 'Host': '', 'NLMUniqueID': '101660833', 'AuthorList': [{'CollectiveName': 'Institute of Electrical and Electronics Engineers.', 'LastName': '', 'ForeName': '', 'NameQualifier': '', 'OtherInformation': '', 'TitleAssociatedWithName': '', 'AIID': ''}], 'PublicationInfoList': [{'Imprint': 'Piscataway, NJ : IEEE', 'Place': 'Piscataway, NJ :', 'Publisher': 'IEEE', 'DateIssued': '', 'Edition': '', 'DatesOfSerialPublication': 'Began with Vol. 1, issue 1 (March 2015).'}], 'ResourceInfoList': [{'TypeOfResource': 'Serial', 'ResourceUnit': 'Remote electronic resource, Text'}], 'TitleMainList': [{'SortTitle': 'ieee transactions on computational imaging', 'Title': 'IEEE transactions on computational imaging.', 'AIID': ''}], 'TitleOtherList': [], 'ISSN': '', 'ISBN': ''}]

In [31]:
# Download the GenBank flat-file text for accession EU490707 and print it as is.
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
try:
    print(handle.read())
finally:
    # Always release the handle; the original cell never closed it.
    handle.close()

LOCUS       EU490707                1302 bp    DNA     linear   PLN 26-JUL-2016
DEFINITION  Selenipedium aequinoctiale maturase K (matK) gene, partial cds;
            chloroplast.
ACCESSION   EU490707
VERSION     EU490707.1
KEYWORDS    .
SOURCE      chloroplast Selenipedium aequinoctiale
  ORGANISM  Selenipedium aequinoctiale
            Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
            Spermatophyta; Magnoliopsida; Liliopsida; Asparagales; Orchidaceae;
            Cypripedioideae; Selenipedium.
REFERENCE   1  (bases 1 to 1302)
  AUTHORS   Neubig,K.M., Whitten,W.M., Carlsward,B.S., Blanco,M.A., Endara,L.,
            Williams,N.H. and Moore,M.
  TITLE     Phylogenetic utility of ycf1 in orchids: a plastid gene more
            variable than matK
  JOURNAL   Plant Syst. Evol. 277 (1-2), 75-84 (2009)
REFERENCE   2  (bases 1 to 1302)
  AUTHORS   Neubig,K.M., Whitten,W.M., Carlsward,B.S., Blanco,M.A.,
            Endara,C.L., Williams,N.H. and Moore,M.J.
  TIT

In [38]:
from Bio import SeqIO

# Fetch accession EU490707 in GenBank text format and parse it into record1.
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
try:
    record1 = SeqIO.read(handle, "genbank")
finally:
    # Close the handle even if parsing raises, so the connection is not leaked.
    handle.close()


In [39]:
# Fetch accession EU490708 in GenBank text format and parse it into record2.
handle = Entrez.efetch(db="nucleotide", id="EU490708", rettype="gb", retmode="text")
try:
    record2 = SeqIO.read(handle, "genbank")
finally:
    # Close the handle even if parsing raises, so the connection is not leaked.
    handle.close()

In [45]:
# Length reported by the record parsed from EU490707.
len(record1)

1302

In [46]:
# Length of the sequence held by record1.
len(record1.seq)

1302

In [50]:
# Length of the sequence held by record2 (EU490708).
len(record2.seq)

1302

In [41]:
# Display the sequence of record1 (the repr is abbreviated for long sequences).
record1.seq

Seq('ATTTTTTACGAACCTGTGGAAATTTTTGGTTATGACAATAAATCTAGTTTAGTA...GAA')

In [42]:
# Display the sequence of record2 (the repr is abbreviated for long sequences).
record2.seq

Seq('ATTTTTTACGAACCTGTAGAAATTATTGGTTATGACAATAAATCTAGTTTAGTA...GAA')

In [43]:
# NOTE(review): Bio.pairwise2 is reported as deprecated in recent Biopython
# releases in favour of Bio.Align.PairwiseAligner — confirm against the
# installed version before relying on it.
from Bio import pairwise2

# Globally align the two fetched sequences. The result is a list of
# alignments that later cells index and format.
# Presumably "xx" selects the simplest scoring (match = 1, no mismatch or
# gap penalties) — verify against the pairwise2 documentation.
alignments = pairwise2.align.globalxx(record1.seq, record2.seq)

In [53]:
# Display the first alignment in the list as a raw Alignment tuple.
alignments[0]

Alignment(seqA='ATTTTTTACGAACCTGTG-GAAATTT-TTGGTTATGACAATAAATCTAGTTTAGTACTTGTGAAACGTTTAATTACTCGAATGTATCAACAGAATT-T-TTTGATTTCTTCGGTT-AATGATTCTAACCAAAAA-GGATTTTGGGGGCACAAGCAT-TT-TTTTTCTTCTCATTTTT-CTTCTCAAATGGTATCAGAAGGT-TTTGGAGTCATTCTGGAAATTCCATTCTCGTCGCA-ATTAGTATCTTCTC-TTGAAGAAAAAAA-AATACCAAAATA-TCAGAATTTACGATCTATTCATTCAATATTTCCCTTTTTAGAAG-AC-AAATTTT-T-ACATTTGAA-TTATGTGTCAGATCTACTAATACCCCATCCCATCCATCTGGAAATCTTGGTTCAAATCCTTCAATGCC-GGATCAAG-GATGTTCCTTCTTTGCATTTATTGCGATTGCTTTTC--CACGAATATCATAATTTGA--ATAGTCTCA-TTACTTCAAAGAAATTC-ATTTACGC-CTTTTCAAAAAGAAAGAAAAGATTCCTTT-GGTTAC-TAT-ATAATTCTTATGTATATGAATGCGAATATCTATTCCA-GTTTCTTCGTAAACAGTCTTCTTATTTACGATCAAC-ATCTTCTGGAGTC-TTTCTTGAGCGAACACATTTA-TATGT-AAAAATAGAAC-ATCTTC-TAGTAGTGTGTTGTAATTCTTTTCAGAGGATCCTATGCT-TTC-TCAAG-GATCCTTTCATG-CATTATGTTCGATATCAAGGAAAAGCAATTCTGGCTTCAAAGGGAACTCTTATTCTGATGAAGAAATGGAAATTTCATCTTGTGAATTTTTGGCAATCTTATTTTCACTTTTGGTCTCAACCGT-ATAGGATTC-ATATAAAGCAATTATCC-AACTATTCCTTCTCTTTTCTGGGGTATTTTTCAAGTGTACTAGAAAA-TCA-TTTGGTAGTAAGAAATC

In [54]:
# Display the second alignment in the list for comparison with the first.
alignments[1]

Alignment(seqA='ATTTTTTACGAACCTGTGGAAATTT-TTGGTTATGACAATAAATCTAGTTTAGTACTTGTGAAACGTTTAATTACTCGAATGTATCAACAGAATT-T-TTTGATTTCTTCGGTT-AATGATTCTAACCAAAAA-GGATTTTGGGGGCACAAGCAT-TT-TTTTTCTTCTCATTTTT-CTTCTCAAATGGTATCAGAAGGT-TTTGGAGTCATTCTGGAAATTCCATTCTCGTCGCA-ATTAGTATCTTCTC-TTGAAGAAAAAAA-AATACCAAAATA-TCAGAATTTACGATCTATTCATTCAATATTTCCCTTTTTAGAAG-AC-AAATTTT-T-ACATTTGAA-TTATGTGTCAGATCTACTAATACCCCATCCCATCCATCTGGAAATCTTGGTTCAAATCCTTCAATGCC-GGATCAAG-GATGTTCCTTCTTTGCATTTATTGCGATTGCTTTTC--CACGAATATCATAATTTGA--ATAGTCTCA-TTACTTCAAAGAAATTC-ATTTACGC-CTTTTCAAAAAGAAAGAAAAGATTCCTTT-GGTTAC-TAT-ATAATTCTTATGTATATGAATGCGAATATCTATTCCA-GTTTCTTCGTAAACAGTCTTCTTATTTACGATCAAC-ATCTTCTGGAGTC-TTTCTTGAGCGAACACATTTA-TATGT-AAAAATAGAAC-ATCTTC-TAGTAGTGTGTTGTAATTCTTTTCAGAGGATCCTATGCT-TTC-TCAAG-GATCCTTTCATG-CATTATGTTCGATATCAAGGAAAAGCAATTCTGGCTTCAAAGGGAACTCTTATTCTGATGAAGAAATGGAAATTTCATCTTGTGAATTTTTGGCAATCTTATTTTCACTTTTGGTCTCAACCGT-ATAGGATTC-ATATAAAGCAATTATCC-AACTATTCCTTCTCTTTTCTGGGGTATTTTTCAAGTGTACTAGAAAA-TCA-TTTGGTAGTAAGAAATCA

In [55]:
# Unpack the first alignment's fields into format_alignment and print the
# resulting text rendering.
print(pairwise2.format_alignment(*alignments[0]))

ATTTTTTACGAACCTGTG-GAAATTT-TTGGTTATGACAATAAATCTAGTTTAGTACTTGTGAAACGTTTAATTACTCGAATGTATCAACAGAATT-T-TTTGATTTCTTCGGTT-AATGATTCTAACCAAAAA-GGATTTTGGGGGCACAAGCAT-TT-TTTTTCTTCTCATTTTT-CTTCTCAAATGGTATCAGAAGGT-TTTGGAGTCATTCTGGAAATTCCATTCTCGTCGCA-ATTAGTATCTTCTC-TTGAAGAAAAAAA-AATACCAAAATA-TCAGAATTTACGATCTATTCATTCAATATTTCCCTTTTTAGAAG-AC-AAATTTT-T-ACATTTGAA-TTATGTGTCAGATCTACTAATACCCCATCCCATCCATCTGGAAATCTTGGTTCAAATCCTTCAATGCC-GGATCAAG-GATGTTCCTTCTTTGCATTTATTGCGATTGCTTTTC--CACGAATATCATAATTTGA--ATAGTCTCA-TTACTTCAAAGAAATTC-ATTTACGC-CTTTTCAAAAAGAAAGAAAAGATTCCTTT-GGTTAC-TAT-ATAATTCTTATGTATATGAATGCGAATATCTATTCCA-GTTTCTTCGTAAACAGTCTTCTTATTTACGATCAAC-ATCTTCTGGAGTC-TTTCTTGAGCGAACACATTTA-TATGT-AAAAATAGAAC-ATCTTC-TAGTAGTGTGTTGTAATTCTTTTCAGAGGATCCTATGCT-TTC-TCAAG-GATCCTTTCATG-CATTATGTTCGATATCAAGGAAAAGCAATTCTGGCTTCAAAGGGAACTCTTATTCTGATGAAGAAATGGAAATTTCATCTTGTGAATTTTTGGCAATCTTATTTTCACTTTTGGTCTCAACCGT-ATAGGATTC-ATATAAAGCAATTATCC-AACTATTCCTTCTCTTTTCTGGGGTATTTTTCAAGTGTACTAGAAAA-TCA-TTTGGTAGTAAGAAATCAAATGCTAGAGAATTC

In [44]:
# Display the entire alignment list.
# NOTE(review): this renders the repr of every alignment (838 of them in the
# saved run), which produces a very large output; consider showing a slice.
alignments

[Alignment(seqA='ATTTTTTACGAACCTGTG-GAAATTT-TTGGTTATGACAATAAATCTAGTTTAGTACTTGTGAAACGTTTAATTACTCGAATGTATCAACAGAATT-T-TTTGATTTCTTCGGTT-AATGATTCTAACCAAAAA-GGATTTTGGGGGCACAAGCAT-TT-TTTTTCTTCTCATTTTT-CTTCTCAAATGGTATCAGAAGGT-TTTGGAGTCATTCTGGAAATTCCATTCTCGTCGCA-ATTAGTATCTTCTC-TTGAAGAAAAAAA-AATACCAAAATA-TCAGAATTTACGATCTATTCATTCAATATTTCCCTTTTTAGAAG-AC-AAATTTT-T-ACATTTGAA-TTATGTGTCAGATCTACTAATACCCCATCCCATCCATCTGGAAATCTTGGTTCAAATCCTTCAATGCC-GGATCAAG-GATGTTCCTTCTTTGCATTTATTGCGATTGCTTTTC--CACGAATATCATAATTTGA--ATAGTCTCA-TTACTTCAAAGAAATTC-ATTTACGC-CTTTTCAAAAAGAAAGAAAAGATTCCTTT-GGTTAC-TAT-ATAATTCTTATGTATATGAATGCGAATATCTATTCCA-GTTTCTTCGTAAACAGTCTTCTTATTTACGATCAAC-ATCTTCTGGAGTC-TTTCTTGAGCGAACACATTTA-TATGT-AAAAATAGAAC-ATCTTC-TAGTAGTGTGTTGTAATTCTTTTCAGAGGATCCTATGCT-TTC-TCAAG-GATCCTTTCATG-CATTATGTTCGATATCAAGGAAAAGCAATTCTGGCTTCAAAGGGAACTCTTATTCTGATGAAGAAATGGAAATTTCATCTTGTGAATTTTTGGCAATCTTATTTTCACTTTTGGTCTCAACCGT-ATAGGATTC-ATATAAAGCAATTATCC-AACTATTCCTTCTCTTTTCTGGGGTATTTTTCAAGTGTACTAGAAAA-TCA-TTTGGTAGTAAGAAAT

In [49]:
# Number of alignments returned by the global alignment.
len(alignments)

838