In [1]:
import xml.etree.cElementTree as ET
import pandas as pd
from Bio import Entrez
import io

In [2]:
# Contact address configured on Bio.Entrez; it applies to every Entrez call made below.
# NOTE(review): a personal e-mail address is hard-coded here -- consider reading it
# from an environment variable or a config cell before sharing this notebook.
Entrez.email = 'chen1i6c04@gmail.com'

def einfo(db):
    """Print the name and description of each search field of an Entrez database.

    Parameters
    ----------
    db : str
        Database name forwarded to ``Entrez.einfo``.

    Returns
    -------
    None
        One line per entry of ``record['DbInfo']['FieldList']`` is printed
        as ``<Name> <Description>``.
    """
    with Entrez.einfo(db=db) as handle:
        info = Entrez.read(handle)

    field_list = info['DbInfo']['FieldList']
    for field in field_list:
        name, description = field['Name'], field['Description']
        print(name, description)

def egquery(term, db='sra'):
    """Return the hit count that a global Entrez query reports for one database.

    Parameters
    ----------
    term : str
        Search term forwarded to ``Entrez.egquery``.
    db : str, default 'sra'
        Value compared against each result row's ``"DbName"``.

    Returns
    -------
    int or None
        ``int(row["Count"])`` of the first row whose ``"DbName"`` equals ``db``;
        ``None`` when no row matches.
    """
    with Entrez.egquery(term=term) as handle:
        result = Entrez.read(handle)

    # Only the first matching row is consulted, exactly like an early return
    # from a loop over the rows.
    matching_rows = (entry for entry in result["eGQueryResult"] if entry["DbName"] == db)
    first_match = next(matching_rows, None)
    if first_match is None:
        return None
    return int(first_match["Count"])

def esearch(term, db="sra", retmax=10000):
    """Search an Entrez database and return the ``IdList`` of the parsed response.

    Parameters
    ----------
    term : str
        Search term forwarded to ``Entrez.esearch``.
    db : str, default "sra"
        Database name forwarded to ``Entrez.esearch``.
    retmax : int, default 10000
        ``retmax`` value forwarded to ``Entrez.esearch``.

    Returns
    -------
    The ``'IdList'`` entry of the record produced by ``Entrez.read``.
    """
    search_handle = Entrez.esearch(db=db, term=term, retmax=retmax)
    with search_handle as handle:
        parsed = Entrez.read(handle)
    id_list = parsed['IdList']
    return id_list

def efetch(uid, db, rettype='xml', retmode="xml"):
    """Fetch records from an Entrez database and return the raw response payload.

    Parameters
    ----------
    uid
        Identifier(s) passed to ``Entrez.efetch`` as ``id``.
    db : str
        Database name forwarded to ``Entrez.efetch``.
    rettype : str, default 'xml'
        ``rettype`` value forwarded to ``Entrez.efetch``.
    retmode : str, default "xml"
        ``retmode`` value forwarded to ``Entrez.efetch``.

    Returns
    -------
    Whatever ``handle.read()`` yields for the response; the payload is not parsed here.
    """
    request = dict(db=db, id=uid, rettype=rettype, retmode=retmode)
    with Entrez.efetch(**request) as handle:
        # The handle is still closed on the way out of the ``with`` block.
        return handle.read()

In [3]:
# Print the searchable fields (name and description) of the 'sra' database;
# the field names are the tags usable in square brackets inside a search term.
einfo('sra')

ALL All terms from all searchable fields
UID Unique number assigned to publication
FILT Limits the records
ACCN Accession number of sequence
TITL Words in definition line
PROP Classification by source qualifiers and molecule type
WORD Free text associated with record
ORGN Scientific and common names of organism, and all higher levels of taxonomy
AUTH Author(s) of publication
PDAT Date sequence added to GenBank
MDAT Date of last update
GPRJ BioProject
BSPL BioSample
PLAT Platform
STRA Strategy
SRC Source
SEL Selection
LAY Layout
RLEN Percent of aligned reads
ACS Access is public or controlled
ALN Percent of aligned reads
MBS Size in megabases


In [38]:
# Search term combining organism [ORGN], platform [PLAT], strategy [STRA],
# source [SRC] and a last-update date range [MDAT] (start and end are the same day).
# The variable name is kept as-is because the following cells read it.
patterm = "Salmonella enterica[ORGN] AND illumina[PLAT] AND wgs[STRA] AND genomic[SRC] AND 2021/06/25:2021/06/25[MDAT]"

In [39]:
# Number of hits reported for the term in the default 'sra' database.
# egquery returns None when no result row matches the database name.
count = egquery(patterm)
count

92

In [5]:
# Retrieve the matching ids, asking for as many as the hit count reported.
# NOTE(review): the recorded outputs disagree (count shows 92, len(idlist) shows 8120)
# and the execution counts are out of order, so these cells were presumably run
# against different queries -- restart the kernel and run all cells to confirm.
idlist = esearch(patterm, retmax=count)
len(idlist)

8120

In [6]:
# Download run info as CSV text, one efetch request per slice of at most
# `batch_size` ids; each response is kept as its own element of `fetch_results`.
batch_size = 10000
batch_starts = range(0, len(idlist), batch_size)
fetch_results = [
    efetch(idlist[start:start + batch_size], 'sra', 'runinfo', 'csv')
    for start in batch_starts
]

In [7]:
# Parse every fetched CSV payload into its own DataFrame (one frame per batch).
run_info = [
    pd.read_csv(csv_buffer)
    for csv_buffer in map(io.StringIO, fetch_results)
]

In [8]:
# Stack the per-batch frames row-wise into one frame with a fresh 0..n-1 index.
# NOTE(review): `run_info` is rebound from a list of frames to a single frame here,
# so this cell is not safely re-runnable on its own -- re-run the parsing cell first.
run_info = pd.concat(run_info, axis=0, ignore_index=True)

In [9]:
# De-duplicate rows, keeping the first occurrence of each.
# The previous `keep=False` discarded *every* copy of a duplicated row, so any
# record that appeared more than once was removed from the table entirely.
# NOTE(review): if `keep=False` was deliberately used to strip rows that are junk
# whenever they repeat, filter those rows explicitly instead of relying on it.
run_info = run_info.drop_duplicates(keep='first')

In [14]:
# Write the table to CSV without the index column.
# NOTE(review): the file name says Vibrio_cholerae while the search term in this
# notebook queries Salmonella enterica -- confirm which organism this file should hold.
# NOTE(review): absolute path; consider a configurable output directory.
run_info.to_csv(
    '/media/NGS/Data_Analysis/NCBI/Run_Info/Vibrio_cholerae_Run_info.csv',
    index=False,
)