In [1]:
# Author: Henry Cope
import requests
import csv
import html
import urllib.parse

# Search expression in human-readable form. The broader variant is kept for reference:
# query = "('space flight' OR 'spaceflight' OR 'space station') AND ('omics' OR 'genetic*' OR 'epigenetic*' OR 'genom*' OR 'transcriptom*' OR 'epigenom*' OR 'microbiom*' OR 'proteom*' OR 'metabolom*')"
query = "('spaceflight' AND 'omics')"

# Accumulator: one dict per parsed record, filled by the scraping loop
papers = []

print(f"Scraping pubmed with query: {query}")

# Percent-encode the query (spaces become '+') so it can be placed in a URL
encoded_query = urllib.parse.quote_plus(query)

# Download the search results one page at a time and parse each page's
# PubMed-format text into dicts that are appended to `papers`.
# Each field starts with a tag such as 'TI  -'; long values wrap onto
# continuation lines indented by six spaces.

# Seconds to wait for the server before giving up on a page. Without a timeout
# a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# Tags that can occur several times per record -> key of the list they fill.
# Keywords ('OT') and grant identifiers ('GR') are sometimes missing.
REPEATED_TAGS = {
    'AU  -': 'Authors',
    'OT  -': 'Keywords',
    'PT  -': 'Publication Type',
    'GR  -': 'Grants',
}

page = 1
while True:
    url = f"https://pubmed.ncbi.nlm.nih.gov/?term={encoded_query}&format=pubmed&size=200&page={page}"

    # Make a request. A network error ends the scrape the same way a non-200
    # status does, so everything collected so far is still kept.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print("FAIL:" + url)
        print(f"Request error: {error}")
        break
    if response.status_code != 200:
        print("FAIL:" + url)
        break
    print("SUCCESS:" + url)

    # Force encoding and split into lines
    response.encoding = 'utf-8-sig'
    lines = response.text.split('\n')

    paper = {}
    # Key ('Title' or 'Abstract') whose value may continue on following lines,
    # or None when no multi-line field is open.
    open_field = None
    new_papers_found = False

    for line in lines:
        line = html.unescape(line)
        tag = line[:5]
        value = line[6:].strip()

        # NOTE(review): substring match rather than startswith - presumably
        # because the first record can share its line with page markup; confirm.
        if 'PMID- ' in line:  # PMID: start of a new record
            new_papers_found = True
            if paper:
                papers.append(paper)
                paper = {}
            paper['PMID'] = line.split('- ')[1].strip()
            open_field = None
        elif tag == 'TI  -':  # Title
            paper['Title'] = value
            open_field = 'Title'
        elif tag == 'AB  -':  # Abstract
            paper['Abstract'] = value
            open_field = 'Abstract'
        elif tag in REPEATED_TAGS:  # Authors, keywords, publication types, grants
            paper.setdefault(REPEATED_TAGS[tag], []).append(value)
            open_field = None
        elif tag == 'DP  -':  # Date: keep only the first four characters (the year)
            paper['Publication Year'] = line[6:10].strip()
            open_field = None
        elif tag == 'JT  -':  # Journal
            paper['Journal Title'] = value
            open_field = None
        elif line.startswith('      '):
            # Continuation line: extend the open title/abstract, if any
            if open_field is not None:
                paper[open_field] += ' ' + line.strip()
        else:
            # Any other line (an unhandled tag, a blank line, markup) ends the
            # open field. Otherwise the continuation lines of an unhandled
            # multi-line field would be appended to the abstract or title.
            open_field = None

    # Add the last paper
    if paper:
        papers.append(paper)

    if not new_papers_found:
        print("No more papers found.")
        break

    page += 1  # Go to the next page

# Collapse every multi-valued field into a single '; '-separated string
MULTI_VALUE_FIELDS = ('Authors', 'Keywords', 'Publication Type', 'Grants')
for record in papers:
    record.update({
        name: '; '.join(record[name])
        for name in MULTI_VALUE_FIELDS
        if name in record
    })

# Keep only records whose publication type mentions 'Journal Article'
# (ignore preprints, news articles...)
filtered_papers = []
for record in papers:
    publication_type = record.get('Publication Type', '')
    if 'Journal Article' in publication_type:
        filtered_papers.append(record)

# Column order of the output file; a record lacking a key gets an empty cell
columns = ['PMID', 'Title', 'Abstract', 'Authors', 'Keywords', 'Publication Year', 'Journal Title', 'Publication Type', 'Grants']

# 'utf-8-sig' writes a byte-order mark at the start of the file
with open('literature.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=columns)
    writer.writeheader()
    for record in filtered_papers:
        writer.writerow(record)

# Summary of the run
total = len(filtered_papers)
print(f"Found {total} papers")
print("Data saved to 'literature.csv'")

Scraping pubmed with query: ('spaceflight' AND 'omics')
SUCCESS:https://pubmed.ncbi.nlm.nih.gov/?term=%28%27spaceflight%27+AND+%27omics%27%29&format=pubmed&size=200&page=1
FAIL:https://pubmed.ncbi.nlm.nih.gov/?term=%28%27spaceflight%27+AND+%27omics%27%29&format=pubmed&size=200&page=2
Found 58 papers
Data saved to 'literature.csv'
