In [1]:
import os
import xml.etree.ElementTree as ET

from Bio import Entrez
from Bio import Medline

In [2]:
def _element_text(element, default):
    """Return all text inside ``element``, including nested tags, or ``default``."""
    if element is None:
        return default
    # ``.text`` stops at the first child tag (<i>, <sub>, ...); itertext() does not.
    text = "".join(element.itertext()).strip()
    return text or default


def _parse_pubmed_articles(xml_data):
    """Parse one EFetch XML payload into ``{pmid: {'title': ..., 'abstract': ...}}``."""
    records = {}
    root = ET.fromstring(xml_data)
    for article in root.findall('.//PubmedArticle'):
        pmid = _element_text(article.find('.//PMID'), 'No PMID')
        title = _element_text(article.find('.//ArticleTitle'), 'No Title')
        # An abstract may be split into several AbstractText sections; keep them all.
        sections = [
            "".join(section.itertext()).strip()
            for section in article.findall('.//Abstract/AbstractText')
        ]
        abstract = " ".join(part for part in sections if part) or 'No Abstract'
        records[pmid] = {'title': title, 'abstract': abstract}
    return records


def fetch_pubmed_data(pubmed_ids, batch_size=200):
    """Download PubMed records and return their titles and abstracts.

    The IDs are sent to EFetch in batches of ``batch_size`` instead of one
    request, so a long ID list is retrieved piece by piece and a failure only
    loses the batch it happened in.

    Args:
        pubmed_ids: Iterable of PubMed IDs (each item is converted with ``str``).
        batch_size: Maximum number of IDs per EFetch request (at least 1).

    Returns:
        dict mapping each PMID to ``{'title': str, 'abstract': str}``. Missing
        fields are replaced by 'No PMID', 'No Title' or 'No Abstract'. Errors
        are printed rather than raised, so the result may be partial or empty.
    """
    Entrez.email = "she4@mdanderson.org"  # Provide your email here
    data = {}
    try:
        ids = [str(pmid) for pmid in (pubmed_ids or [])]
    except TypeError as e:
        # Non-iterable input: report it the same best-effort way as fetch errors.
        print(f"An error occurred: {e}")
        return data
    if not ids:
        # Nothing to request; do not call EFetch with an empty id list.
        return data

    step = max(1, int(batch_size))
    for start in range(0, len(ids), step):
        batch = ids[start:start + step]
        try:
            # Fetch the PubMed records in XML format
            handle = Entrez.efetch(db="pubmed", id=",".join(batch), rettype="medline", retmode="xml")
            try:
                xml_data = handle.read()
            finally:
                # Release the connection even when reading fails.
                handle.close()
            data.update(_parse_pubmed_articles(xml_data))
        except Exception as e:
            print(f"An error occurred while fetching records {start + 1}-{start + len(batch)}: {e}")
    return data

def _single_line(value):
    """Return ``value`` as text with any line breaks replaced by spaces."""
    if value is None:
        return ""
    return " ".join(str(value).splitlines())


def save_pubtator_files(data, output_dir="."):
    """Write one ``<pmid>.txt`` file per article with a title and an abstract line.

    Each file holds two lines, ``<pmid>|t|<title>`` and ``<pmid>|a|<abstract>``.
    Line breaks inside a field are flattened to spaces so that every field
    stays on its own single line, and a ``None`` field is written as empty.

    Args:
        data: dict mapping PMID to a dict with 'title' and 'abstract' keys.
        output_dir: Directory to write into; created if missing. Defaults to
            the current working directory.
    """
    if not data:
        print("No data to save.")
        return
    output_dir = output_dir or "."
    os.makedirs(output_dir, exist_ok=True)
    for pmid, content in data.items():
        title = _single_line(content['title'])
        abstract = _single_line(content['abstract'])
        path = os.path.join(output_dir, f"{pmid}.txt")
        # Explicit UTF-8: the text may contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(path, 'w', encoding="utf-8") as file:
            file.write(f"{pmid}|t|{title}\n")
            file.write(f"{pmid}|a|{abstract}\n")

def search_pubmed(keywords, start_year, end_year, retmax=10000):
    """Search PubMed for given keywords and return a list of PMIDs.

    The search is restricted by publication date to ``start_year``..``end_year``.

    Args:
        keywords: Query string passed to ESearch as ``term``.
        start_year: First publication year to include.
        end_year: Last publication year to include.
        retmax: Maximum number of PMIDs to request. Defaults to 10000, the
            largest value the NCBI E-utilities documentation lists for
            ESearch; larger requests are not expected to yield more IDs.

    Returns:
        The ``IdList`` entry of the parsed ESearch response. A warning is
        printed when the reported match count exceeds the number of PMIDs
        returned, i.e. when the result is truncated.
    """
    Entrez.email = "she4@mdanderson.org"  # Provide your email here
    handle = Entrez.esearch(db="pubmed",
                            term=keywords,
                            datetype="pdat",  # 'pdat' for publication date
                            mindate=str(start_year),
                            maxdate=str(end_year),
                            retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when the response cannot be parsed.
        handle.close()

    id_list = record['IdList']
    try:
        total = int(record.get('Count', len(id_list)))
    except (TypeError, ValueError):
        total = len(id_list)
    if total > len(id_list):
        print(f"Warning: query matched {total} records but only {len(id_list)} PMIDs were returned.")
    return id_list

In [3]:
# Natural killer cell corpus: search PubMed for 2020-2024, download the matching
# records, and write one <pmid>.txt file per article into the directory below.
import os

os.chdir("/Volumes/she4/knowledgegraph/biobert_aioner/AIONER/example/input_more/")

keywords = "cancer immunology natural killer cells"  # free-text PubMed query
pmids = pubmed_ids = search_pubmed(keywords, start_year=2020, end_year=2024)

pubmed_data = fetch_pubmed_data(pubmed_ids)
save_pubtator_files(pubmed_data)

In [3]:
# T cell corpus: search PubMed for 2020-2024, download the matching records,
# and write one <pmid>.txt file per article into the directory below.
import os

os.chdir("/Volumes/she4/knowledgegraph/biobert_aioner/AIONER/example/Tcell/")

keywords = "cancer immunology T cells"  # free-text PubMed query
pmids = pubmed_ids = search_pubmed(keywords, start_year=2020, end_year=2024)

pubmed_data = fetch_pubmed_data(pubmed_ids)
save_pubtator_files(pubmed_data)

In [4]:
# B cell corpus: search PubMed for 2020-2024, download the matching records,
# and write one <pmid>.txt file per article into the directory below.
import os

os.chdir("/Volumes/she4/knowledgegraph/biobert_aioner/AIONER/example/Bcell/")

keywords = "cancer immunology B cells"  # free-text PubMed query
pmids = pubmed_ids = search_pubmed(keywords, start_year=2020, end_year=2024)

pubmed_data = fetch_pubmed_data(pubmed_ids)
save_pubtator_files(pubmed_data)

In [5]:
# Macrophage corpus: search PubMed for 2020-2024, download the matching records,
# and write one <pmid>.txt file per article into the directory below.
import os

os.chdir("/Volumes/she4/knowledgegraph/biobert_aioner/AIONER/example/Macrophage/")

keywords = "cancer immunology Macrophages"  # free-text PubMed query
pmids = pubmed_ids = search_pubmed(keywords, start_year=2020, end_year=2024)

pubmed_data = fetch_pubmed_data(pubmed_ids)
save_pubtator_files(pubmed_data)