In [28]:
import time
from urllib.parse import quote

import pandas as pd
import requests
from Bio import Entrez
from keybert import KeyBERT

In [3]:
# Configure Entrez
# NOTE(review): a personal address is hard-coded here; consider reading it from an
# environment variable before sharing or committing the notebook.
Entrez.email = "christian.goldoni@gmail.com"

In [7]:
# Function to search PubMed
def search_pubmed(query, retmax=10000):
    """Run an ESearch against the "pubmed" database and return the matching IDs.

    Args:
        query: Search expression passed to ESearch as ``term``.
        retmax: Maximum number of IDs to request. Defaults to 10000, the value
            that used to be hard-coded.

    Returns:
        The ``"IdList"`` entry of the parsed ESearch result.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        results = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    return results["IdList"]

# Function to fetch article details
def fetch_article_details(ids):
    """Fetch and parse the PubMed records for the given IDs.

    Args:
        ids: PubMed IDs passed to EFetch as ``id``.

    Returns:
        The parsed EFetch result, as returned by ``Entrez.read``.
    """
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="xml", retmode="text")
    try:
        articles = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    return articles

def get_first_author_affiliation(article):
    """Return the first listed affiliation of the article's first author.

    Args:
        article: Mapping with a ``"MedlineCitation"`` entry, as found in the
            ``"PubmedArticle"`` list of a parsed EFetch result.

    Returns:
        The ``"Affiliation"`` string of the first ``"AffiliationInfo"`` entry of
        the first author, or ``""`` when the article has no authors or the
        first author has no affiliation.

    Raises:
        KeyError: If ``article`` has no ``"MedlineCitation"`` entry.
    """
    author_list = article["MedlineCitation"].get("Article", {}).get("AuthorList", [])

    # AuthorList is normally a list, but a single author mapping is accepted too.
    if isinstance(author_list, list):
        if not author_list:
            return ""
        first_author = author_list[0]
    else:
        first_author = author_list

    # Anything that is not mapping-like cannot carry affiliation info.
    if not hasattr(first_author, "get"):
        return ""

    # Covers both a missing key and a present-but-empty list.
    affiliations = first_author.get("AffiliationInfo") or []
    if not affiliations:
        return ""
    return affiliations[0].get("Affiliation", "")

In [8]:
# Function to get the article's DOI
def get_DOI(article):
    """Return the first article ID whose ``IdType`` attribute is ``"doi"``.

    Looks through ``article["PubmedData"]["ArticleIdList"]`` (both keys are
    optional) and returns the first matching entry converted to ``str``.
    Returns ``None`` when no entry matches.
    """
    id_entries = article.get("PubmedData", {}).get("ArticleIdList", [])
    doi_values = (
        str(entry)
        for entry in id_entries
        if entry.attributes.get("IdType") == "doi"
    )
    return next(doi_values, None)

# Function to look up the number of citations through the Crossref REST API
def get_citation_count(doi):
    """Return the Crossref "is-referenced-by-count" for a DOI.

    Args:
        doi: DOI of the article. May be ``None`` or empty when the article has no DOI.

    Returns:
        The citation count reported by Crossref (0 when the field is absent), or
        the string ``"Error: Unable to retrieve citation count"`` when there is
        no DOI, the request fails, or Crossref answers with a non-200 status.
    """
    error_message = "Error: Unable to retrieve citation count"

    # Without a DOI the lookup cannot succeed, so skip the request entirely.
    if not doi:
        return error_message

    # Percent-encode the DOI so characters such as '#' or '?' stay in the URL path.
    encoded_doi = quote(str(doi), safe="/")
    url = f"https://api.crossref.org/works/{encoded_doi}"
    try:
        # The timeout keeps one unresponsive request from stalling the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Report transport failures the same way as a non-200 reply.
        return error_message

    if response.status_code == 200:
        data = response.json()
        return data.get("message", {}).get("is-referenced-by-count", 0)
    return error_message

In [45]:
# Initialize KeyBERT once at module level; get_keywords() below reuses this instance
kw_model = KeyBERT()

# Function to get keywords through KeyBERT
def get_keywords(text):
    """Extract up to five one- or two-word keyphrases from ``text``.

    Calls the module-level ``kw_model`` with English stop words removed and
    returns the first element of each result, joined with ``"; "``.
    """
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=5,
    )
    phrases = []
    for scored in scored_phrases:
        phrases.append(scored[0])
    return "; ".join(phrases)

In [46]:
# Define search query
query = "University Colorado cancer center [ad]"

# Search PubMed with refined query
print("Searching PubMed...")
ids = search_pubmed(query)
print(f"Found {len(ids)} articles.")

# Column names (and order) of the resulting DataFrame
columns = ["PMID","DOI", "Title", "Journal", "PubDate", "PubTypes", "Authors", "Abstract", "Citations", "Keywords"]
data = []


def _extract_metadata(article):
    """Flatten one parsed PubMed article record into a dict keyed by ``columns``."""
    citation = article["MedlineCitation"]
    article_info = citation.get("Article", {})
    journal = article_info.get("Journal", {})

    doi = get_DOI(article)
    # AbstractText is a list and a structured abstract has one entry per section.
    # Join every entry so Abstract and Keywords cover the whole text, not just the
    # first section. An absent or empty list yields "".
    abstract_text = " ".join(
        str(section)
        for section in article_info.get("Abstract", {}).get("AbstractText", [])
    )

    return {
        "PMID": citation["PMID"],
        "DOI": doi,
        "Title": article_info.get("ArticleTitle", ""),
        "Journal": journal.get("Title", ""),
        "PubDate": journal.get("JournalIssue", {}).get("PubDate", {}).get("Year", ""),
        "PubTypes": "; ".join(
            str(pubtype) for pubtype in article_info.get("PublicationTypeList", [])
        ),
        "Authors": "; ".join(
            f"{author.get('LastName', '')} {author.get('ForeName', '')}"
            for author in article_info.get("AuthorList", [])
        ),
        "Abstract": abstract_text,
        "Citations": get_citation_count(doi),
        "Keywords": get_keywords(abstract_text),
    }


# Fetch article details in batches
batch_size = 100
for start in range(0, len(ids), batch_size):
    end = start + batch_size
    print(f"Fetching articles {start + 1}-{min(end, len(ids))}...")
    batch_ids = ids[start:end]
    articles = fetch_article_details(batch_ids)

    for article in articles["PubmedArticle"]:
        try:
            data.append(_extract_metadata(article))
        except Exception as e:
            # Best effort: report the failure and continue with the next article.
            print(f"Error processing article: {e}")

    # Pause to respect NCBI rate limits
    time.sleep(1)

# Create the DataFrame; `columns` fixes the column order
df = pd.DataFrame(data, columns=columns)

print("Done.")

Searching PubMed...
Found 1764 articles.
Fetching articles 1-100...
Fetching articles 101-200...
Fetching articles 201-300...
Fetching articles 301-400...
Fetching articles 401-500...
Fetching articles 501-600...
Fetching articles 601-700...
Fetching articles 701-800...
Fetching articles 801-900...
Fetching articles 901-1000...
Fetching articles 1001-1100...
Fetching articles 1101-1200...
Fetching articles 1201-1300...
Fetching articles 1301-1400...
Fetching articles 1401-1500...
Fetching articles 1501-1600...
Fetching articles 1601-1700...
Fetching articles 1701-1764...
Done.


In [50]:
# Save the full metadata table as CSV in the current working directory
df.to_csv('CU_Cancer_Center_PubMed.csv')

In [18]:
# Append one column per author to the table. The Authors string was built with a
# "; " separator, so split on that same separator to avoid a leading space in
# every author after the first.
SplitAuthors = df.join(df['Authors'].str.split('; ', expand=True))

In [21]:
# Save the table with the per-author columns as CSV
SplitAuthors.to_csv('CU_CC_SplitAuthors.csv')

In [22]:
# Append one column per publication type to the table. The PubTypes string was built
# with a "; " separator, so split on that same separator to avoid a leading space in
# every type after the first.
SplitTypes = df.join(df['PubTypes'].str.split('; ', expand=True))

In [24]:
# Save the table with the per-publication-type columns as CSV
SplitTypes.to_csv('CU_CC_SplitTypes.csv')

In [48]:
# Append one column per keyword to the table. get_keywords() joins keyphrases with
# "; ", so split on that same separator to avoid a leading space in every keyword
# after the first.
SplitKeywords = df.join(df['Keywords'].str.split('; ', expand=True))

In [49]:
# Save the table with the per-keyword columns as CSV
SplitKeywords.to_csv('CU_CC_SplitKeywords.csv')