In [1]:
!pip install requests pandas numpy

Collecting pandas
  Downloading pandas-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl.metadata (18 kB)
Collecting numpy
  Downloading numpy-1.26.3-cp311-cp311-macosx_10_9_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m408.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m246.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hDownloading numpy-1.26.3-cp311-cp311-macosx_10_9_x86_64.whl (20.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.6/20.6 MB[0m [31m335.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hDownloading tzdata-2023.4-py2.py3-none-any.whl (346 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.6/346.6 kB[

In [2]:
import requests
import pandas as pd
import numpy as np

In [3]:
# Common URL prefix for the NCBI E-utilities endpoints; the helper functions in
# this notebook append the script name (esearch.fcgi / efetch.fcgi) and the
# query string to it.
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

In [45]:
def search_pubdb(database, query, ret_type='text', timeout=30):
    """Send an esearch request to Entrez and return the raw HTTP response.

    The request always sets ``usehistory=y``.

    Args:
        database: Database name, inserted as-is into the ``db`` parameter.
        query: Search term, inserted as-is into the ``term`` parameter. The
            callers in this notebook pass it already percent-encoded
            (``%5b`` / ``%5d`` for square brackets); nothing is encoded here.
        ret_type: Value inserted as-is into the ``rettype`` parameter.
        timeout: Seconds to wait for the server. When exceeded, ``requests``
            raises a timeout exception instead of blocking indefinitely.

    Returns:
        The ``requests`` response object. The HTTP status is not checked
        here; callers are expected to inspect ``status_code`` themselves.
    """
    url = BASE_URL + f"esearch.fcgi?db={database}&rettype={ret_type}&term={query}&usehistory=y"
    # requests has no default timeout, so a stalled connection would otherwise
    # hang the kernel forever.
    res = requests.get(url, timeout=timeout)
    return res

In [46]:
# Run the search against the "nucleotide" database. The term is passed already
# percent-encoded: %5b and %5d are "[" and "]", i.e. txid9606[Organism:noexp].
resp = search_pubdb("nucleotide", "txid9606%5bOrganism:noexp%5d")  # alternative example term: science%5bjournal%5d+AND+breast+cancer+AND+2008%5bpdat%5d

In [47]:
# Show the raw XML body of the esearch reply.
resp.text

'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">\n<eSearchResult><Count>28490156</Count><RetMax>20</RetMax><RetStart>0</RetStart><QueryKey>1</QueryKey><WebEnv>MCID_65a2ac9a51064d455000193d</WebEnv><IdList>\n<Id>2647008342</Id>\n<Id>2647008340</Id>\n<Id>2647008182</Id>\n<Id>2646764794</Id>\n<Id>2646764786</Id>\n<Id>2646764778</Id>\n<Id>2646764736</Id>\n<Id>2646712570</Id>\n<Id>2646294547</Id>\n<Id>2645214823</Id>\n<Id>2645213667</Id>\n<Id>2645213665</Id>\n<Id>2645211384</Id>\n<Id>2645211382</Id>\n<Id>2645168897</Id>\n<Id>2645167875</Id>\n<Id>2645167860</Id>\n<Id>2645091689</Id>\n<Id>2645091687</Id>\n<Id>2645091685</Id>\n</IdList><TranslationSet/><TranslationStack>   <TermSet>    <Term>txid9606[Organism:noexp]</Term>    <Field>Organism</Field>    <Count>28490156</Count>    <Explode>N</Explode>   </TermSet>   <OP>GROUP</OP>  </TranslationStack><QueryTranslation>txi

In [26]:
import xml.etree.ElementTree as ET

In [49]:
# Parse the esearch XML; `tree` is the root element of the reply.
tree = ET.fromstring(resp.text)

In [50]:
# Check the tag name of the root element.
tree.tag

'eSearchResult'

In [51]:
# List the direct children of the root (tag name and attribute dict) to see
# which fields the reply contains and in what order.
for child in tree:
    print(child.tag, child.attrib)

Count {}
RetMax {}
RetStart {}
QueryKey {}
WebEnv {}
IdList {}
TranslationSet {}
TranslationStack {}
QueryTranslation {}


In [54]:
# Print the text of every <Id> element inside <IdList>. The elements are
# selected by tag name instead of by child position, and the loop variable is
# named id_elem so the builtin `id` stays usable in later cells.
for id_elem in tree.findall("IdList/Id"):
    print(id_elem.text)

2647008342
2647008340
2647008182
2646764794
2646764786
2646764778
2646764736
2646712570
2646294547
2645214823
2645213667
2645213665
2645211384
2645211382
2645168897
2645167875
2645167860
2645091689
2645091687
2645091685


In [56]:
# Show the WebEnv value of the reply, looked up by tag name rather than by
# child position.
tree.findtext("WebEnv")

'MCID_65a2ac9a51064d455000193d'

In [92]:
import re
from tqdm.notebook import tqdm

In [98]:
def get_ids_list(database, query, batch_size=100, timeout=60):
    """Search Entrez and write every efetch batch for the result to a file.

    Calls ``search_pubdb``, pulls ``WebEnv``, ``QueryKey`` and ``Count`` out of
    the XML reply with regular expressions, then requests efetch in steps of
    ``batch_size`` and writes each response body to ``ids_file.txt`` (the file
    is opened in "w" mode, so an existing file is overwritten).

    Args:
        database: Database name used for both the search and the efetch calls.
        query: Search term, handed unchanged to ``search_pubdb``.
        batch_size: Number of records requested per efetch call (``retmax``)
            and the step between successive ``retstart`` values.
        timeout: Seconds to wait for each efetch response.

    Returns:
        ``True`` after all batches were written. ``None`` if the search did
        not return HTTP 200 or its reply lacks WebEnv, QueryKey or Count.

    Raises:
        requests.HTTPError: If an efetch request returns an error status.

    NOTE(review): despite the name, no list of ids is returned, and efetch is
    asked for ``rettype=genbankfull`` -- confirm that this is the intended
    output format.
    """
    res = search_pubdb(database, query)
    if res.status_code != 200:
        return None
    xm_str = res.text
    web_env_match = re.search(r'<WebEnv>(\S+)<\/WebEnv>', xm_str)
    query_key_match = re.search(r'<QueryKey>(\d+)<\/QueryKey>', xm_str)
    count_match = re.search(r'<Count>(\d+)<\/Count>', xm_str)

    web = web_env_match.group(1) if web_env_match else None
    key = query_key_match.group(1) if query_key_match else None
    count = count_match.group(1) if count_match else None

    # Echo the extracted history-server values for the reader.
    print("WebEnv:", web)
    print("QueryKey:", key)
    print("Count:", count)

    # Without all three values the efetch loop cannot be built. Stop here,
    # before the output file is truncated, instead of failing on int(None) or
    # sending the literal text "None" to the server.
    if web is None or key is None or count is None:
        return None

    # Loop through batches
    with open("ids_file.txt", "w") as out_file:
        for retstart in tqdm(range(0, int(count), batch_size)):
            # Fetch from the same database that was searched.
            efetch_url = f"{BASE_URL}efetch.fcgi?db={database}&WebEnv={web}"
            efetch_url += f"&query_key={key}&retstart={retstart}"
            efetch_url += f"&retmax={batch_size}&rettype=genbankfull&retmode=text"
            print(efetch_url)

            # A timeout keeps one stalled request from blocking the whole run.
            efetch_res = requests.get(efetch_url, timeout=timeout)
            # Fail loudly rather than writing an error page into the data file.
            efetch_res.raise_for_status()
            out_file.write(efetch_res.text)
    return True

In [99]:
# Start the download for the human (txid9606) nucleotide query. With the
# reported Count of 28490156 and the default batch_size of 100 this is about
# 285k efetch requests, so the run is very long.
# NOTE(review): get_ids_list returns True or None, not a list of ids, so the
# variable name is misleading.
ids_list = get_ids_list("nucleotide", "txid9606%5bOrganism:noexp%5d")

WebEnv: MCID_65a2c99326257d11036ed6b2
QueryKey: 1
Count: 28490156


  0%|          | 0/284902 [00:00<?, ?it/s]

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&WebEnv=MCID_65a2c99326257d11036ed6b2&query_key=1&retstart=0&retmax=100&rettype=genbankfull&retmode=text
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&WebEnv=MCID_65a2c99326257d11036ed6b2&query_key=1&retstart=100&retmax=100&rettype=genbankfull&retmode=text
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&WebEnv=MCID_65a2c99326257d11036ed6b2&query_key=1&retstart=200&retmax=100&rettype=genbankfull&retmode=text
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&WebEnv=MCID_65a2c99326257d11036ed6b2&query_key=1&retstart=300&retmax=100&rettype=genbankfull&retmode=text
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&WebEnv=MCID_65a2c99326257d11036ed6b2&query_key=1&retstart=400&retmax=100&rettype=genbankfull&retmode=text
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&WebEnv=MCID_65a2c99326257d11036ed6b2&que

KeyboardInterrupt: 

In [81]:
# NOTE(review): stale cell -- its execution count (81) predates the current
# get_ids_list definition (98). That definition returns True or None, so on a
# fresh top-to-bottom run len() would raise TypeError here.
len(ids_list)

40