In [4]:
import pandas as pd
from Bio import Entrez, SeqIO

In [2]:
# Contact e-mail registered on the Bio.Entrez module for the E-utilities
# calls made by the helper functions in this notebook.
# NOTE(review): this is a hard-coded personal address; consider reading it
# from an environment variable or a config cell before sharing the notebook.
Entrez.email = 'chen1i6c04@gmail.com'

def einfo(db=''):
    """List the Entrez databases, or print the search fields of one of them.

    Parameters
    ----------
    db : str, optional
        Name of an Entrez database. When empty (the default), no specific
        database is inspected.

    Returns
    -------
    The ``'DbList'`` entry of the parsed EInfo record when ``db`` is empty.
    Otherwise ``None``: the ``Name`` and ``Description`` of every entry in
    the database's ``FieldList`` are printed, tab-separated, one per line.
    """
    with Entrez.einfo(db=db) as handle:
        info = Entrez.read(handle)

    # No database requested: hand back the list of database names.
    if not db:
        return info['DbList']

    # A database was requested: print its searchable fields instead.
    for field in info['DbInfo']['FieldList']:
        print(field['Name'], field['Description'], sep='\t')

def egquery(term, db):
    """Return the hit count for ``term`` in a single Entrez database.

    Runs a global query via ``Entrez.egquery`` and scans the rows of
    ``'eGQueryResult'`` for the first one whose ``'DbName'`` equals ``db``.

    Parameters
    ----------
    term : str
        Search term passed to ``Entrez.egquery``.
    db : str
        Database name to look up among the result rows.

    Returns
    -------
    int or None
        The matching row's ``'Count'`` converted to ``int``, or ``None``
        when no row is reported for ``db``.
    """
    with Entrez.egquery(term=term) as handle:
        result = Entrez.read(handle)

    # Lazily evaluated, so only the first matching row is ever converted.
    matching_counts = (
        int(row["Count"])
        for row in result["eGQueryResult"]
        if row["DbName"] == db
    )
    return next(matching_counts, None)

def esearch(term, db, retmax=10000):
    """Search an Entrez database and return the ``'IdList'`` of the result.

    Parameters
    ----------
    term : str
        Query string passed to ``Entrez.esearch``.
    db : str
        Name of the Entrez database to search.
    retmax : int, optional
        Forwarded to ``Entrez.esearch`` as ``retmax`` (default 10000).
        NOTE(review): presumably caps the number of IDs returned, so larger
        result sets would be truncated without warning; confirm against the
        E-utilities documentation.

    Returns
    -------
    The ``'IdList'`` entry of the parsed search record.
    """
    with Entrez.esearch(db=db, term=term, retmax=retmax) as handle:
        search_result = Entrez.read(handle)

    id_list = search_result['IdList']
    return id_list

def efetch(uid, db, rettype='xml', retmode="xml"):
    """Download records from an Entrez database and return the raw response.

    Parameters
    ----------
    uid : str or list of str
        Record identifier(s), passed to ``Entrez.efetch`` as ``id``.
    db : str
        Name of the Entrez database to fetch from.
    rettype : str, optional
        Forwarded to ``Entrez.efetch`` as ``rettype`` (default ``'xml'``).
    retmode : str, optional
        Forwarded to ``Entrez.efetch`` as ``retmode`` (default ``"xml"``).

    Returns
    -------
    Whatever ``handle.read()`` yields for the response; no parsing is done.
    """
    request = dict(db=db, id=uid, rettype=rettype, retmode=retmode)
    with Entrez.efetch(**request) as handle:
        return handle.read()

In [3]:
# Print the searchable fields of the 'protein' database (field name and
# description, tab-separated) to see which tags can be used in queries.
einfo('protein')

ALL	All terms from all searchable fields
UID	Unique number assigned to each sequence
FILT	Limits the records
WORD	Free text associated with record
TITL	Words in definition line
KYWD	Nonstandardized terms provided by submitter
AUTH	Author(s) of publication
JOUR	Journal abbreviation of publication
VOL	Volume number of publication
ISS	Issue number of publication
PAGE	Page number(s) of publication
ORGN	Scientific and common names of organism, and all higher levels of taxonomy
ACCN	Accession number of sequence
PACC	Does not include retired secondary accessions
GENE	Name of gene associated with sequence
PROT	Name of protein associated with sequence
ECNO	EC number for enzyme or CAS registry number
PDAT	Date sequence added to GenBank
MDAT	Date of last update
SUBS	CAS chemical name or MEDLINE Substance Name
PROP	Classification by source qualifiers and molecule type
SQID	String identifier for sequence
GPRJ	BioProject
SLEN	Length of sequence
MLWT	Molecular Weight
FKEY	Feature annotated on sequenc

In [6]:
# FASTA file whose record IDs are used to build the accession query.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook can be re-run elsewhere.
file = '/media/GenomicResearch/Issue/20220110_CPfVfsDetected/Exotoxin.faa'

In [10]:
# Build one query that matches any of the FASTA record IDs: each ID is tagged
# with the [ACCN] field and the clauses are OR-ed together.
term = ' OR '.join(
    f'{record.id}[ACCN]' for record in SeqIO.parse(file, 'fasta')
)

In [11]:
# Resolve the accession query to the ID list returned by the 'protein'
# database search, then display it.
uids = esearch(term, 'protein')
uids

['490591301', '489563104', '499270857', '499319092', '489565498', '489554297', '224980825', '929032', '414655']

In [13]:
# Fetch all matched protein records in one request, asking for
# rettype 'fasta_cds_na' in text mode; the raw response is kept unparsed.
fetch_results = efetch(uids, 'protein', 'fasta_cds_na', 'text')

In [14]:
# Display the raw response for inspection.
# NOTE(review): this echoes the entire payload; showing a slice would keep
# the notebook output short.
fetch_results

'\n\n\n\n\n\n>lcl|FJ189503.1_cds_ACN73257.1_1 [protein=NetB] [protein_id=ACN73257.1] [location=255..1223] [gbkey=CDS]\nTTGAAAAGATTAAAAATTATTTCAATTACACTAGTTCTTACAAGTGTAATTAGTACAAGCCTTTTTTCAA\nCTCAAACTCAAGTTTTTGCAAGTGAATTAAATGACATAAACAAAATTGAGTTGAAAAATCTAAGTGGAGA\nAATAATAAAAGAAAATGGAAAGGAAGCTATTAAATATACTTCTAGTGATACCGCTTCACATAAAGGTTGG\nAAGGCAACTTTAAGTGGAACATTTATTGAAGATCCTCATTCTGATAAGAAAACTGCTTTATTAAATTTAG\nAAGGATTTATACCTTCTGATAAACAGATTTTTGGTTCTAAATATTACGGAAAAATGAAATGGCCTGAAAC\nTTATAGAATTAATGTAAAAAGTGCTGATGTAAATAATAATATAAAAATAGCAAATTCTATTCCTAAAAAT\nACTATAGATAAAAAAGATGTATCTAATTCAATTGGTTATTCTATAGGCGGTAATATATCTGTTGAAGGAA\nAAACTGCTGGTGCTGGAATAAATGCTTCATATAATGTCCAAAATACTATAAGCTATGAACAACCTGATTT\nTAGAACAATTCAAAGAAAAGATGATGCAAATTTAGCATCATGGGATATAAAATTTGTTGAGACTAAGGAC\nGGTTATAATATAGATTCTTATCATGCTATTTATGGAAATCAATTATTCATGAAATCAAGATTGTATAATA\nATGGTGATAAAAATTTCACAGATGATAGAGATTTATCAACATTAATTTCTGGTGGATTTTCACCCAATAT\nGGCTTTAGCATTAACAGCACCTAAAAATGCTAAAGAATCTGTAATAATAGTTGAATATCAAAGATTTGAT\nAATGACTATATTTTAAA

In [25]:
# Fetch records for a long list of accessions in batches, so each search
# term stays at a manageable length.
#
# NOTE(review): `accession_list` is not defined in any cell visible in this
# notebook, so this cell relies on hidden kernel state and will fail after
# Restart & Run All. Define it explicitly in an earlier cell.
batch = 100  # number of accessions OR-ed together per search

# Collect the per-batch responses and join them once at the end instead of
# growing a string with `+=` on every iteration.
fetched_chunks = []
for start in range(0, len(accession_list), batch):
    end = start + batch
    accessions = accession_list[start:end]
    term = ' OR '.join(accessions)
    uids = esearch(term, 'protein')
    if not uids:
        # Nothing matched in this batch; skip it rather than issuing a
        # fetch request that carries no IDs.
        continue
    fetched_chunks.append(efetch(uids, 'protein', 'fasta_cds_na', 'text'))

fetch_result = "".join(fetched_chunks)

In [26]:
# Save the concatenated fetch output to disk as a single text file.
# NOTE(review): absolute, machine-specific output path.
with open('/media/NGS/Data_Analysis/20210220_Campylobacter_resistance/cmeB/campylobacter_cmeB.fna', 'w') as out_handle:
    out_handle.write(fetch_result)