In [1]:
# Pinned to the release recorded in this cell's output so reruns are reproducible;
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install biopython==1.84

Collecting Biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Biopython
Successfully installed Biopython-1.84


In [5]:
import time
from Bio import Entrez
import pandas as pd
import urllib.error

# Provide your email address to the Entrez system.
# NOTE: the value below is a placeholder; replace it with a real contact
# address before running the queries in this notebook.
Entrez.email = "youremail@example.com"

def esearch(query, retmax=10000):
    """Search PubMed for ``query`` and return the parsed ESearch result.

    Args:
        query: Search term, forwarded to Entrez as ``term``.
        retmax: Maximum number of IDs requested in this single search.

    Returns:
        The object produced by ``Entrez.read`` for the search response.
        The script in this file reads its ``'Count'`` and ``'IdList'`` entries.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; the handle is
        closed before the exception propagates.
    """
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=retmax,  # upper bound on returned IDs
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails; the original
        # code never closed it.
        handle.close()

def efetch(id_list):
    """Fetch details of articles using a list of PubMed IDs.

    Args:
        id_list: Sized collection of PubMed IDs. Each element is converted
            with ``str`` before joining, so integer IDs are accepted as well
            as strings.

    Returns:
        The object produced by ``Entrez.read`` for the fetch response.
        The script in this file iterates its ``'PubmedArticle'`` entry.

    Raises:
        Whatever ``Entrez.efetch`` / ``Entrez.read`` raise; the handle is
        closed before the exception propagates.
    """
    print(f"Fetching {len(id_list)} IDs")
    ids = ','.join(str(pmid) for pmid in id_list)
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails; the original
        # code never closed it.
        handle.close()



def extract_title_and_abstract(paper):
    """Return ``(title, abstract)`` for one parsed PubMed article record.

    Args:
        paper: Mapping for a single article, indexed as
            ``paper['MedlineCitation']['Article'][...]``.

    Returns:
        A 2-tuple. The title is the ``ArticleTitle`` value, or ``"NA"`` when
        it cannot be looked up. The abstract is every ``AbstractText``
        section joined with a single space (the original kept only the first
        section), or ``"NA"`` when the abstract is missing or empty.
    """
    # Only lookup failures are treated as "missing"; a bare ``except`` would
    # also swallow KeyboardInterrupt and SystemExit.
    lookup_errors = (KeyError, IndexError, TypeError)

    try:
        title = paper['MedlineCitation']['Article']['ArticleTitle']
    except lookup_errors:
        title = "NA"

    try:
        sections = paper['MedlineCitation']['Article']['Abstract']['AbstractText']
        if isinstance(sections, str):
            # A lone string would otherwise be joined character by character.
            sections = [sections]
        abstract = ' '.join(str(section) for section in sections) or "NA"
    except lookup_errors:
        abstract = "NA"

    return title, abstract


if __name__ == '__main__':
    term = "CRISPR gene editing"

    # Initialize variables
    title_list, abstract_list = [], []
    batch_size = 50  # Number of IDs sent per efetch request
    sleep_time = 10  # Seconds to sleep before every efetch request
    max_retries = 5  # Maximum number of attempts for a failing batch

    # Single search to get all PubMed IDs for the query
    initial_results = esearch(term)
    total_count = int(initial_results['Count'])
    id_list = initial_results['IdList']
    print(f"Total number of articles: {total_count}")
    if len(id_list) < total_count:
        # 'Count' is the number of matches; 'IdList' holds only the IDs
        # actually returned, so the rest would otherwise be dropped silently.
        print(f"Warning: only {len(id_list)} of {total_count} IDs were returned; "
              "the remaining articles will not be fetched.")

    # Process the list of IDs in batches using efetch
    for start in range(0, len(id_list), batch_size):
        batch_ids = id_list[start:start+batch_size]  # Get a chunk of IDs

        retries = 0
        while retries < max_retries:
            # Sleep before making the efetch request to avoid rate limits
            time.sleep(sleep_time)

            try:
                # Only the network call is retried; parsing happens below.
                papers = efetch(batch_ids)
            except urllib.error.URLError as e:
                # URLError is the base class of HTTPError, so connection-level
                # failures are retried too instead of aborting the whole run.
                retries += 1
                print(f"Request error occurred: {e}, retrying... ({retries}/{max_retries})")
                if retries == max_retries:
                    print(f"Max retries reached for batch {batch_ids}. Skipping this batch.")
                continue

            # Process the fetched papers
            for paper in papers['PubmedArticle']:
                title, abstract = extract_title_and_abstract(paper)
                title_list.append(title)
                abstract_list.append(abstract)

            # Batch succeeded; leave the retry loop
            break

    # Create a DataFrame from the collected titles and abstracts
    data = list(zip(title_list, abstract_list))
    df = pd.DataFrame(data, columns=['Title', 'Abstract'])
    df.to_csv('pubmed_articles.csv', index=False)

    # Print the DataFrame to check the result
    print(df)


Total number of IDs returned: 2371
Total number of articles: 2371
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 50 IDs
Fetching 21 IDs
                                                  Title  \
0     Understanding the virulence of Streptococcus s...   
1     Streptococcus suis biofilm: regulation, dr