In [33]:
'''

Retrieve Genbank entries from the nucleotide database at NCBI.

-----------------------------------------------------------
(c) 2013 Allegra Via and Kristian Rother
    Licensed under the conditions of the Python License

    This code appears in section 20.4.3 of the book
    "Managing Biological Data with Python".
-----------------------------------------------------------

Modified by Gail Rosen for Biological Data Science Workshop on 6/16/21
'''

from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna, generic_protein

In [34]:
# Contact address that Biopython's Entrez module sends along with NCBI requests.
Entrez.email = "glr26@drexel.edu"

In [35]:
# Starts out empty; no active statement in the cells shown here adds to or reads it.
all_records = []

In [44]:
# Search the NCBI nucleotide database with a combination of keywords:
# cox1/coxi entries, organism field "fungi", sequence-length field 450:30000,
# excluding records matching "complete genome", "whole genome" or "partial".
handle1 = Entrez.esearch(
    db="nucleotide",
    term=(
        '(cox1 OR coxi) AND fungi[ORGN] AND 450:30000[SLEN] '
        'NOT "complete genome" NOT partial NOT "whole genome"'
    ),
    retmax=200,  # ask for at most 200 matching IDs
)
try:
    records = Entrez.read(handle1)
finally:
    # Always release the connection, even when parsing the reply fails.
    handle1.close()

# IDs of the matching records; the fetch cells further down reuse `ids_list`.
ids_list = records['IdList']
print(len(ids_list))
print("These are the ID Numbers chosen: ", ids_list)

134
These are the ID Numbers chosen:  ['2027921680', '1419039330', '1833534572', '1069865990', '1418980243', '1383863664', '1383821680', '389646782', '212544747', '1778663160', '1701862139', '1678300228', '1655371837', '1655371828', '1655371806', '1624799536', '46496691', '38353103', '38352297', '30401551', '38356031', '1423539693', '1422746175', '1422724147', '1422630398', '1419170847', '1419157519', '1419134766', '1419086786', '1419056480', '1419006388', '1390379229', '815889166', '685417770', '1195855765', '1044897081', '1044897080', '48526543', '48526539', '2738523', '294042', '12751551', '971493373', '971493372', '971493371', '971493370', '971493369', '971493368', '971493367', '971493366', '971493365', '971493364', '971493363', '971493362', '971493361', '971493360', '971493359', '13639', '686482741', '667481203', '390607153', '374110326', '219964435', '219964458', '219964457', '219964456', '219964455', '219964453', '219964451', '219964449', '219964446', '219964444', '219964442', '

In [87]:
# Retrieve the GenBank entries for the IDs in `ids_list` as XML and parse them
# into `my_genbank_records`, a sequence of dictionary-like records that the
# following cells index by position and by 'GBSeq_*' key.
handle = Entrez.efetch(
    db="nucleotide",
    id=ids_list,
    rettype="gb",
    retmode="xml",
    retmax=len(ids_list),
)
try:
    my_genbank_records = Entrez.read(handle)
finally:
    # Close the handle whether or not parsing succeeded.
    handle.close()

In [88]:
# Show the organism and primary accession number of the first record retrieved.
print(
    "My first fungi: ",
    my_genbank_records[0]['GBSeq_organism'],
    "and its accession number: ",
    my_genbank_records[0]['GBSeq_primary-accession'],
)

My first fungi:  Aspergillus fijiensis CBS 313.89 and its accession number:  XM_040944842


In [89]:
# To go further than raw GenBank output, convert every parsed GenBank record
# into a SeqRecord that can be written as FASTA: the sequence becomes the Seq,
# the primary accession the id, and the definition line the description.
# NOTE(review): assumes every record carries a 'GBSeq_sequence' key — confirm.
my_fasta_records = [
    SeqRecord(
        Seq(gb_entry['GBSeq_sequence']),
        id=gb_entry['GBSeq_primary-accession'],
        description=gb_entry['GBSeq_definition'],
    )
    for gb_entry in my_genbank_records
]

In [90]:
# Write all converted records into a single FASTA file, "my_seqs.fa".
# The `with` block closes the file even if SeqIO.write raises part-way through.
with open("my_seqs.fa", "w") as one_file:
    SeqIO.write(my_fasta_records, one_file, "fasta")

In [91]:
# To save the entries as a GenBank file it is easier to fetch them again as
# plain text and stream them straight into "my_seqs.gb".
#
# The fetch happens before the output file is opened, so a failed request does
# not leave behind an empty/truncated file. The parser is passed inline rather
# than assigned to `my_genbank_records`, so that name keeps holding the parsed
# XML records and the earlier conversion cell can still be re-run.
handle = Entrez.efetch(
    db="nucleotide",
    id=ids_list,
    rettype="gbwithparts",
    retmode="text",
    retmax=len(ids_list),
)
try:
    with open("my_seqs.gb", "w") as one_file:
        # SeqIO.parse yields records one at a time; SeqIO.write consumes them here.
        SeqIO.write(SeqIO.parse(handle, "genbank"), one_file, "gb")
finally:
    # Close the network handle whether or not writing succeeded.
    handle.close()