In [22]:
# Install and import necessary libraries

!python3 -m pip install pandas openpyxl metapub transformers torch torchvision tensorflow ragas==0.3.7 langchain-openai langchain openai tiktoken python-dotenv tf-keras scikit-learn biopython beautifulsoup4 lxml textstat openai nest_asyncio
import pandas as pd
import json
from metapub import PubMedFetcher
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from functools import reduce
import os
from datetime import datetime
from dotenv import load_dotenv
from openai import OpenAI
from Bio import Entrez
from bs4 import BeautifulSoup
from ragas.metrics import faithfulness
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from datasets import Dataset
import csv

import time
import textstat
from pathlib import Path

import nest_asyncio
nest_asyncio.apply()

from openai import AsyncOpenAI
import tqdm
import asyncio




In [23]:
# Load environment variables from your .env file, or export your api key
# load_dotenv("")
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("API key not found.")
else:
    print(f"API Key Loaded: {openai_api_key}...")

# OpenAI client
os.environ["OPENAI_API_KEY"] = openai_api_key
client = OpenAI(api_key=openai_api_key)

# Set location of configuration file, to load configuration variables
with open("./config.json", "r") as config_file:
    config = json.load(config_file)

num_of_articles = config["num_of_articles"]
max_summary_length = config["max_summary_length"]
min_summary_length = config["min_summary_length"]

# Set an Entrez email within the config file
Entrez.email = config["entrez_email"]
Entrez.api_key = config["entrez_api_key"]
print(f"Entrez email set: {Entrez.email}")
print(f"Entrez API key: {Entrez.api_key}")
print(f"Lay Summarizer Configuration: \n Number of articles: {num_of_articles} \n Maximum word length: {max_summary_length} \n Minimum word length: {min_summary_length}")

API Key Loaded: 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...
Entrez email set: t2yu@oicr.on.ca
Entrez API key: da059acf5271585c53851508e6054b7f0807
Lay Summarizer Configuration: 
 Number of articles: 1 
 Maximum word length: 350 
 Minimum word length: 250


In [24]:
# Fetch full texts 
def fetch_full_texts(pmids, fetcher):
    time.sleep(1) 
    full_texts, titles = {}, {}

    print(f"Retrieved PMIDs: {pmids}")
    print(f"Number of PMIDs retrieved: {len(pmids)}")

    for pmid in pmids:
        try:
            article = fetcher.article_by_pmid(pmid)
            titles[pmid] = article.title
            
            pmcid = article.pmc
            if pmcid:
                handle = Entrez.efetch(db="pmc", id=pmcid, rettype="xml", retmode="xml")
                xml_data = handle.read()
                handle.close()

                soup = BeautifulSoup(xml_data, features="xml")

                paragraphs = soup.find_all("p")
                sections = soup.find_all("sec")

                text_chunks = [p.get_text() for p in paragraphs]
                if not text_chunks and sections:
                    # Fallback to extracting from section text
                    text_chunks = [s.get_text() for s in sections]

                full_text = "\n".join(p.get_text() for p in paragraphs)
                full_texts[pmid] = full_text if full_text.strip() else "Full text unavailable"
                
            else:
                full_texts[pmid] = "Full text unavailable"
        except Exception as e:
            print(f"Error fetching full text for PMID {pmid}: {e}")
            titles[pmid] = titles.get(pmid, "Unavailable")
            full_texts[pmid] = "Full text unavailable"
    return titles, full_texts
    

def fetch_abstracts(pmids, fetcher):
    abstracts = {}
    titles = {}
    
    for pmid in pmids:
        try:
            article = fetcher.article_by_pmid(pmid)
            titles[pmid] = article.title
            abstracts[pmid] = article.abstract if article.abstract else "Abstract unavailable"
        except Exception as e:
            print(f"Error fetching abstract for PMID {pmid}: {e}")
            titles[pmid] = titles.get(pmid, "Unavailable")
            abstracts[pmid] = "Abstract unavailable"
    return titles, abstracts

async_client = AsyncOpenAI(api_key=openai_api_key)

async def async_summarize(pmid, text, max_length):
    if not text or text == "Full text unavailable":
        return pmid, "No text available"

    prompt = (
        f"Summarize this article in 250 to {max_length} words for a 2nd grade lay audience using simple words. "
        f"The tone should be casual. Avoid emotive or negative language such as battle or fight. "
        f"Keep words under 4 syllables and sentences under 10 words: {text}"
    )

    try:
        response = await async_client.chat.completions.create(
            model="gpt-5-mini", #"gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes medical research."},
                {"role": "user", "content": prompt}
            ],
            temperature=1,
        )
        content = response.choices[0].message.content.strip()
        return pmid, content
    except Exception as e:
        return pmid, f"Error summarizing: {e}"

async def gpt4_summarize_async(texts, max_length, concurrency=15):
    semaphore = asyncio.Semaphore(concurrency)

    async def limited_summary(pmid, text):
        async with semaphore:
            return await async_summarize(pmid, text, max_length)

    tasks = [limited_summary(pmid, text) for pmid, text in texts.items()]
    results = await asyncio.gather(*tasks)
    return dict(results)


def compute_readability(summary):
    if summary and summary != "No text available":
        flesch_score = textstat.flesch_reading_ease(summary)
        grade_level = textstat.flesch_kincaid_grade(summary)
        return flesch_score, grade_level
    else:
        return None, None


def compute_ragas_faithfulness(summaries, references, prompt):
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    print(evaluator_llm)
    data = {
        "question": [],
        "answer": [],
        "contexts": []
    }
    valid_pmids = []

    for pmid, summary in summaries.items():
        reference = references.get(pmid)
        if (
            summary 
            and reference 
            and summary != "No text available" 
            and reference != "Full text unavailable"
        ):
            data["question"].append(prompt)
            data["answer"].append(summary)
            data["contexts"].append([reference])
            valid_pmids.append(pmid)
            print("RAGAS data:", data)

    
    dataset = Dataset.from_dict(data)
    results = evaluate(dataset, metrics=[faithfulness], llm=evaluator_llm)
    print("RAGAS results:", results)
    return dict(zip(valid_pmids, results["faithfulness"]))




In [6]:

# Use if you need to query for PMIDS
if "queries" in config:

    for query in config["queries"]:
            keyword = query["keyword"]
            output_file = query["output_file"]

            fetch = PubMedFetcher()
            pmids = fetch.pmids_for_query(f"{keyword} AND pubmed pmc open access[filter]", retmax=num_of_articles)


            print(pmids)

            #writes the queried PMIDs to a txt file
            with open("pmids_queried.txt", "w") as f:
                for pmid in pmids:
                    f.write(pmid + "\n")

            
else:
    print("No queries found in the configuration file.")

['39119834']


In [7]:
# Use if you have a file of PMIDs to summarize and calculate metrics
if "pmid_file" in config:
    pmid_file = config["pmid_file"]
    with open(pmid_file, "r") as f:
        pmids = [line.strip() for line in f.readlines()]
    print(f"Loaded {len(pmids)} PMIDs from {pmid_file}")


In [7]:

# Prints article information for pmids retrieved
for pmid in pmids:
    article = fetch.article_by_pmid(pmid)
    print(f"PMID {pmid}: {article.title}")
    print(f"Authors: {', '.join([str(a) for a in article.authors])}")
    print(f"Journal: {article.journal}")
    print("---")

#Generates fulltext-based summaries
titles, full_texts = fetch_full_texts(pmids, fetch)
filtered_texts = {pmid: text for pmid, text in full_texts.items() if text != "Full text unavailable"}
full_text_summaries = await gpt4_summarize_async(filtered_texts, max_length=max_summary_length)

#Generates abstract-based summaries
titles, abstracts = fetch_abstracts(pmids, fetch)
abstract_summaries = await gpt4_summarize_async(abstracts, max_length=max_summary_length)

links = {pmid: f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" for pmid in pmids}


# Calculates similarity between abstract vs full text summaries
# Define the evaluate_similarity function
def evaluate_similarity(abstract_summaries, full_text_summaries):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([abstract_summaries, full_text_summaries])
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0][0]

similarity_scores = {
    pmid: evaluate_similarity(abstract_summaries[pmid], full_text_summaries[pmid])
    for pmid in abstract_summaries
    if abstract_summaries[pmid] != "No text available"
    and full_text_summaries.get(pmid)
    and full_text_summaries[pmid] != "No text available"
}

# Calculate readability scores for abstract summaries
abstract_readability = {
    pmid: compute_readability(abstract_summaries[pmid])
    for pmid in abstract_summaries
}

# Calculate readability scores for full text summaries
full_text_readability = {
    pmid: compute_readability(full_text_summaries[pmid])
    for pmid in full_text_summaries
}


# Full text availability
full_text_availability = {
    pmid: "Available" if full_texts[pmid] != "Full text unavailable" else "Unavailable"
    for pmid in pmids
}

# Define the prompt used for summarization
ragas_prompt = ("Summarize this article in 250 to 350 words for a 2nd grade lay audience using simple words."
                "The tone should be casual."
                "Avoid emotive or negative language such as battle or fight."
                "Keep words under 4 syllables and sentences under 10 words."
                )

# Compute RAGAS faithfulness
abstract_ragas_faithfulness = compute_ragas_faithfulness(abstract_summaries, abstracts, ragas_prompt)
fulltext_ragas_faithfulness = compute_ragas_faithfulness(full_text_summaries, full_texts, ragas_prompt)


# Prepare DataFrames
Title = pd.DataFrame(list(titles.items()), columns=["pmid", "Title"])
Link = pd.DataFrame(list(links.items()), columns=["pmid", "Link"])
Abstract = pd.DataFrame(list(abstracts.items()), columns=["pmid", "Abstract"])
Abstract = Abstract.assign(
    Abstract=lambda df: df["Abstract"].str.replace("\n", " ")
)
FullTextFlag = pd.DataFrame(list(full_text_availability.items()), columns=["pmid", "FullText_Availability"])
Abstract_Summary = pd.DataFrame(list(abstract_summaries.items()), columns=["pmid", "Abstract_Summary"])
Full_Text_Summary = pd.DataFrame(list(full_text_summaries.items()), columns=["pmid", "Full_Text_Summary"])
Similarity = pd.DataFrame(list(similarity_scores.items()), columns=["pmid", "Abstract_vs_FullText_Similarity"])

Abstract_Readability = pd.DataFrame([
    {"pmid": pmid, 
    "Abstract_Flesch_Reading_Ease": scores[0], 
    "Abstract_Flesch_Kincaid_Grade": scores[1]}
    for pmid, scores in abstract_readability.items()
])


Full_Text_Readability = pd.DataFrame([
    {"pmid": pmid, 
    "FullText_Flesch_Reading_Ease": scores[0], 
    "FullText_Flesch_Kincaid_Grade": scores[1]}
    for pmid, scores in full_text_readability.items()
])

# RAGAS faithfulness scores
Abstract_RAGAS_Faithfulness = pd.DataFrame(list(abstract_ragas_faithfulness.items()), columns=["pmid", "Abstract_RAGAS_Faithfulness"])
FullText_RAGAS_Faithfulness = pd.DataFrame(list(fulltext_ragas_faithfulness.items()), columns=["pmid", "FullText_RAGAS_Faithfulness"])


# Combine all DataFrames
data_frames = [
    Title, 
    Link, 
    Abstract, 
    FullTextFlag, 
    Abstract_Summary, 
    Full_Text_Summary, 
    Similarity,
    Abstract_Readability,
    Full_Text_Readability,
    Abstract_RAGAS_Faithfulness,
    FullText_RAGAS_Faithfulness
]

# Combine all DataFrames
df_full = reduce(lambda left, right: pd.merge(left, right, on="pmid", how="outer"), data_frames)

# Print Availability Counts
print(f"Full Text Availability for keyword '{keyword}':")
availability_counts = df_full["FullText_Availability"].value_counts()
print("Full Text Availability Counts:")
print(availability_counts)

# Filter to keep only articles with full text available
df_full = df_full[df_full["FullText_Availability"] == "Available"]

# Save the DataFrame to a CSV file with a timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_name, _ = os.path.splitext(output_file)
output_file_with_timestamp = f"{base_name}_{timestamp}.csv"

# Specify the output directory
output_dir = Path("./output")
output_dir.mkdir(parents=True, exist_ok=True)
full_output_path = output_dir / output_file_with_timestamp
df_full.to_csv(full_output_path, index=False)

print(f"Data for keyword '{keyword}' saved to: {output_file_with_timestamp}")

PMID 39119834: Methuosis Inducer SGI-1027 Cooperates with Everolimus to Promote Apoptosis and Pyroptosis by Triggering Lysosomal Membrane Permeability in Renal Cancer.
Authors: Luo Y, Guan B, Deng X, Bai P, Huang H, Miao C, Sun A, Li Z, Yang D, Wang X, Shao Z, Wu Y, Xing J, Chen B, Wang T
Journal: Adv Sci (Weinh)
---
Retrieved PMIDs: ['39119834']
Number of PMIDs retrieved: 1


2025-11-11 00:14:22 ML8136-T2YU.local httpx[7485] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-11 00:14:40 ML8136-T2YU.local httpx[7485] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


TypeError: llm_factory() got an unexpected keyword argument 'client'

In [25]:
test = compute_ragas_faithfulness(abstract_summaries, abstracts, ragas_prompt)
print(client)

  evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))


LangchainLLMWrapper(langchain_llm=ChatOpenAI(...))
RAGAS data: {'question': ['Summarize this article in 250 to 350 words for a 2nd grade lay audience using simple words.The tone should be casual.Avoid emotive or negative language such as battle or fight.Keep words under 4 syllables and sentences under 10 words.'], 'answer': ['This work looks at drugs for kidney cancer.  \nOne drug, everolimus, helps at first.  \nBut it can stop working over time.  \nThe team looked for new ways to help.  \nThey tested a second drug, SGI-1027.  \nSGI-1027 can change how DNA acts.  \nIt makes cells fill with big bubbles.  \nThe bubbles can make the cell break.  \nThe team found SGI-1027 helps everolimus.  \nThe two drugs work well when used together.  \nThey slow tumor cell growth and move.  \nThey cut how cells spread and invade.  \nThe mix makes cells die in two ways.  \nOne way is a calm, planned shut down.  \nThe other way makes cells swell and pop.  \nA protein called GSDME joins the pop way.  \nKid

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]2025-11-11 00:29:31 ML8136-T2YU.local httpx[7485] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-11 00:30:06 ML8136-T2YU.local httpx[7485] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating: 100%|██████████| 1/1 [00:44<00:00, 44.20s/it]


RAGAS results: {'faithfulness': 0.9333}
<openai.OpenAI object at 0x34def3890>
