In [1]:
!pip install PyPDF2



In [2]:
!pip install openai==0.28



In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [19]:
import os
import re
import requests
from PyPDF2 import PdfReader
import pandas as pd
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Increase the max_length limit
nlp.max_length = 2000000  # Increased to 2,000,000

# List of PDF URLs from the GitHub repository
pdf_urls = [
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Anhydrous%20sodium%20thiosulfate-Recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Baricitinib-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Bevacizumab%20gamma-recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Dupilumab-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Lorlatinib-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Lu%20vipivotide-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Pembrolizumab-recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Tixagevimab-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-crizotinib-recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-eplontersen-recommend.pdf"
]

def download_file(url, local_filename):
    # Download the file from `url` and save it locally under `local_filename`
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

def read_pdf(filepath):
    with open(filepath, 'rb') as file:
        reader = PdfReader(file)
        full_text = ""
        for page in reader.pages:
            full_text += page.extract_text()
        return full_text

def extract_paediatric_population(filepath):
    text = read_pdf(filepath)
    doc = nlp(text)

    # Check if paediatric is mentioned
    if re.search(r'paediatric', text, re.IGNORECASE):
        return "yes"
    else:
        return "no"



                                            Filename The paediatric population
0  committee-papers-Anhydrous%20sodium%20thiosulf...                       yes
1   committee-papers-Baricitinib-not%20recommend.pdf                       yes
2  committee-papers-Bevacizumab%20gamma-recommend...                        no
3     committee-papers-Dupilumab-not%20recommend.pdf                        no
4    committee-papers-Lorlatinib-not%20recommend.pdf                        no
5  committee-papers-Lu%20vipivotide-not%20recomme...                        no
6       committee-papers-Pembrolizumab-recommend.pdf                        no
7   committee-papers-Tixagevimab-not%20recommend.pdf                       yes
8          committee-papers-crizotinib-recommend.pdf                       yes
9         committee-papers-eplontersen-recommend.pdf                        no


In [22]:
def extract_meta_analysis(filepath):
    text = read_pdf(filepath)

    # Find the "Meta-analysis" section
    meta_analysis_section = re.search(r'Meta-analysis', text, re.IGNORECASE)
    if meta_analysis_section:
        # Extract text from the "Meta-analysis" section onwards
        section_text = text[meta_analysis_section.start():]

        # Find the end of the section (assuming the next section starts with another heading)
        next_section = re.search(r'\n[A-Z][^\n]*\n', section_text)
        if next_section:
            section_text = section_text[:next_section.start()]

        # Use re.findall to get all matches of sentences containing "meta-analysis"
        meta_analysis_sentences = re.findall(r'([^.]*meta-analysis[^.]*\.)', section_text, re.IGNORECASE)

        # Check each matched sentence for "no" or "not"
        for sentence in meta_analysis_sentences:
            if re.search(r'\bno\b|\bnot\b', sentence, re.IGNORECASE):
                return "no"

        # If there are any matched sentences and none contain "no" or "not", return "yes"
        if meta_analysis_sentences:
            return "yes"



    #return "no Meta-Analysis mention"


In [24]:
def extract_number_of_patients(filepath):
    text = read_pdf(filepath)

    # Find all instances of N= followed by a number
    matches = re.findall(r'N\s*=\s*(\d+)', text, re.IGNORECASE)

    # Sum up all the numbers found
    number_of_patients = sum(int(match) for match in matches)

    return number_of_patients



In [25]:


def process_pdfs(pdf_urls):
    results = []
    for url in pdf_urls:
        filename = os.path.basename(url)
        local_filepath = download_file(url, filename)
        paediatric_population = extract_paediatric_population(local_filepath)
        rct_number = extract_rct_number(local_filepath)
        meta_analysis = extract_meta_analysis(local_filepath)
        observational_studies = extract_observational_studies(local_filepath)
        number_of_patients=extract_number_of_patients(local_filepath)
        results.append([filename, paediatric_population, rct_number, meta_analysis, observational_studies,number_of_patients ])

    df = pd.DataFrame(results, columns=['Filename', 'The paediatric population', 'How many RCT', 'Meta-analysis', 'Observational Studies','number_of_patients'])
    return df

In [26]:


if __name__ == "__main__":
    df = process_pdfs(pdf_urls)
    print(df)

                                            Filename  \
0  committee-papers-Anhydrous%20sodium%20thiosulf...   
1   committee-papers-Baricitinib-not%20recommend.pdf   
2  committee-papers-Bevacizumab%20gamma-recommend...   
3     committee-papers-Dupilumab-not%20recommend.pdf   
4    committee-papers-Lorlatinib-not%20recommend.pdf   
5  committee-papers-Lu%20vipivotide-not%20recomme...   
6       committee-papers-Pembrolizumab-recommend.pdf   
7   committee-papers-Tixagevimab-not%20recommend.pdf   
8          committee-papers-crizotinib-recommend.pdf   
9         committee-papers-eplontersen-recommend.pdf   

  The paediatric population How many RCT                   Meta-analysis  \
0                       yes            2  no Meta-analysis section found   
1                       yes            4  no Meta-analysis section found   
2                        no            1  no Meta-analysis section found   
3                        no            2  no Meta-analysis section found   
4  

In [42]:
df.to_csv("output.csv", index=False)

In [27]:
!openai migrate


usage: openai [-h] [-v] [-b API_BASE] [-k API_KEY] [-o ORGANIZATION] {api,tools,wandb} ...
openai: error: argument {api,tools,wandb}: invalid choice: 'migrate' (choose from 'api', 'tools', 'wandb')


In [3]:
import os
os.environ['OPENAI_API_KEY'] = "sk-svcacct-38lVsQJmAnpHYYGlwHLRfemYH6XipJ83Bn4AHnE19lo8NioSbNQshLh_0Ie6gFHT3BlbkFJgi4LuRlCOMye2qnlGRAGY4_JQCoLousyuBpUsrPte0HftMthAZWRdreWw1EFRdQA"


In the soft glow of the moonlit forest, Luna the unicorn gently closed her eyes, knowing that her shimmering horn would guide the dreams of all the woodland creatures to a land of endless rainbows and starlit adventures.


In [18]:

import os
import re
from PyPDF2 import PdfReader
import openai
def read_pdf(filepath):
    with open(filepath, "rb") as file:
        reader = PdfReader(file)
        full_text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                full_text += page_text
        return full_text

# Extract paragraphs with "adverse reaction(s)"
def extract_adverse_reactions_paragraphs(text):
    pattern = re.compile(r"([^.]*\badverse reactions?\b[^.]*\.)", re.IGNORECASE)
    matches = pattern.findall(text)
    return matches

# Summarize & rank adverse reactions using ChatGPT
def summarize_adverse_reactions(paragraphs):
    combined_text = "\n".join(paragraphs)
    prompt = (
        "From the text below, extract only the types of adverse reactions mentioned. "
        "Then rank them on a scale from 1 to 10, where 1 is the least severe and 10 is the most severe. "
        "Return the only one number of the output by average each types of adverse reactions mentioned as a list in the format:\n"
        "\"Adverse Reaction: Rank\"\n"
        "Do not include any explanation or extra text. Just return the ranked list.\n\n"
        f"{combined_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0.5,
    )

    # Return only the clean, structured ranked list
    return response["choices"][0]["message"]["content"].strip()

if __name__ == "__main__":
    pdf_path = "/content/committee-papers-Anhydrous sodium thiosulfate-Recommend.pdf"
    text = read_pdf(pdf_path)

    adverse_reactions_paragraphs = extract_adverse_reactions_paragraphs(text)

    if adverse_reactions_paragraphs:
        ranked_adverse_reactions = summarize_adverse_reactions(adverse_reactions_paragraphs)
        print("Ranked Adverse Reactions:")
        print(ranked_adverse_reactions)
    else:
        print("No adverse reactions found in the document.")



Ranked Adverse Reactions:
Adverse Reaction: 5


In [51]:
def read_pdf(filepath):
    with open(filepath, "rb") as file:
        reader = PdfReader(file)
        full_text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                full_text += page_text
        return full_text

# Extract the full section titled "Interpretation of clinical effectiveness and safety evidence"
def extract_interpretation_section(text):
    match = re.search(
        r"(?s)(Interpretation of clinical effectiveness and safety evidence.*?)(?:\n[A-Z][A-Z\s\-]{5,}\n|\Z)",
        text, re.IGNORECASE
    )
    if match:
        return match.group(1)
    return ""

# Ask GPT to find and rank uncertainties in the section
def summarize_uncertainties_from_interpretation_section(section_text):
    prompt = (
        "You are given the full section titled 'Interpretation of clinical effectiveness and safety evidence'. "
        "From this section, extract the main clinical uncertainties that are either explicitly stated or implied. "
        "Then rank each uncertainty from 1 to 10, where:\n"
        "- 1 = minimal or negligible uncertainty\n"
        "- 10 = very high uncertainty that significantly impacts clinical conclusions\n\n"
        "Return only one ranked in total in the format:\n"
        "Clinical Uncertainty: Rank\n\n"
        "Do not include explanations or additional text. Here is the section:\n\n"
        f"{section_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=700,
        temperature=0.5,
    )

    return response["choices"][0]["message"]["content"].strip()

# Main execution
if __name__ == "__main__":
    pdf_path = "/content/committee-papers-Anhydrous sodium thiosulfate-Recommend.pdf"
    full_text = read_pdf(pdf_path)

    interpretation_section = extract_interpretation_section(full_text)

    if interpretation_section:
        ranked_uncertainties = summarize_uncertainties_from_interpretation_section(interpretation_section)
        print("Ranked Clinical Uncertainties:")
        print(ranked_uncertainties)
    else:
        print("Section 'Interpretation of clinical effectiveness and safety evidence' not found in the document.")

FileNotFoundError: [Errno 2] No such file or directory: '/content/committee-papers-Anhydrous sodium thiosulfate-Recommend.pdf'

In [19]:
def read_pdf(filepath):
    with open(filepath, "rb") as file:
        reader = PdfReader(file)
        full_text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                full_text += page_text
        return full_text

# Extract the full section titled "Interpretation of clinical effectiveness and safety evidence"
def extract_interpretation_section(text):
    match = re.search(
        r"(?s)(Interpretation of clinical effectiveness and safety evidence.*?)(?:\n[A-Z][A-Z\s\-]{5,}\n|\Z)",
        text,
        re.IGNORECASE,
    )
    if match:
        return match.group(1)
    return ""

# Ask GPT to evaluate and rank Quality of Evidence from that section
def summarize_quality_of_evidence(section_text):
    prompt = (
        "You are given the full section titled 'Interpretation of clinical effectiveness and safety evidence'. "
        "Based on the content, determine the overall **quality of clinical evidence** presented in this section. "
        "Rank the quality on a scale from 1 to 10, where:\n"
        "- 1 = very poor or unreliable evidence\n"
        "- 10 = very strong, high-quality, reliable evidence\n\n"
        "Return only one ranked result in the following format:\n"
        "Quality of Evidence: Rank\n\n"
        "Do not include any explanation or extra text. Here is the section:\n\n"
        f"{section_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300,
        temperature=0.5,
    )

    return response["choices"][0]["message"]["content"].strip()

# Main execution
if __name__ == "__main__":
    pdf_path = "/content/committee-papers-Anhydrous sodium thiosulfate-Recommend.pdf"
    full_text = read_pdf(pdf_path)

    interpretation_section = extract_interpretation_section(full_text)

    if interpretation_section:
        quality_rank = summarize_quality_of_evidence(interpretation_section)
        print("Quality of Evidence:")
        print(quality_rank)
    else:
        print("Section 'Interpretation of clinical effectiveness and safety evidence' not found in the document.")

Quality of Evidence (based on interpretation section):
Quality of Evidence: 7


In [22]:
def extract_unmet_need_paragraphs(text):
    # Split text into paragraphs (you can adjust splitting if needed)
    paragraphs = text.split("\n")
    unmet_need_paragraphs = [p for p in paragraphs if re.search(r"\bunmet need\b", p, re.IGNORECASE)]
    return unmet_need_paragraphs
def rank_unmet_need(paragraphs):
    if not paragraphs:
        return "No paragraph discussing 'unmet need' was found."

    combined_text = "\n".join(paragraphs)
    prompt = (
        "You are given several paragraphs discussing the 'unmet need' for a treatment or condition. "
        "Based on the content, assess how severe or significant the unmet need is. "
        "Rank the severity from 1 to 10, where:\n"
        "- 1 = the need is mostly met or only minor need remains\n"
        "- 10 = the need is entirely unmet and very significant\n\n"
        "Return your result in the format:\n"
        "Unmet Need Severity: Rank\n\n"
        "Do not explain anything else. Here is the content:\n\n"
        f"{combined_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300,
        temperature=0.5,
    )

    return response["choices"][0]["message"]["content"].strip()
if __name__ == "__main__":
    pdf_path = "/content/committee-papers-Anhydrous sodium thiosulfate-Recommend.pdf"
    full_text = read_pdf(pdf_path)

    # --- Extract and Rank Unmet Need ---
    unmet_need_paragraphs = extract_unmet_need_paragraphs(full_text)
    unmet_need_rank = rank_unmet_need(unmet_need_paragraphs)
    print("\n📊 Unmet Need Severity:")
    print(unmet_need_rank)


📊 Unmet Need Severity:
Unmet Need Severity: 9


In [29]:
def extract_rct_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    rct_sentences = [s for s in sentences if re.search(r'\bRCTs?\b', s, re.IGNORECASE)]
    return rct_sentences

# Use OpenAI to extract the number of RCTs
def get_rct_number_from_gpt(rct_sentences):
    if not rct_sentences:
        return "No RCT sentences found."

    combined = "\n".join(rct_sentences)
    prompt = (
        "The following sentences are extracted from a scientific report. "
        "Based on these sentences, how many randomized controlled trials (RCTs) are used in experiements? Not the number of time RCTs appear in the PDFs, don't add up numbers "
        "Return only the number as an integer. If no number is found, return 0.\n\n"
        f"{combined}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=10,
        temperature=0,
    )

    result = response["choices"][0]["message"]["content"].strip()
    return result

# Main execution
if __name__ == "__main__":
    pdf_path = "/content/committee-papers-Baricitinib-not recommend.pdf"
    full_text = read_pdf(pdf_path)

    rct_sentences = extract_rct_sentences(full_text)
    rct_number = get_rct_number_from_gpt(rct_sentences)

    print("Number of RCTs mentioned in the PDF:")
    print(rct_number)

Number of RCTs mentioned in the PDF:
12


In [30]:
def extract_observational_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s for s in sentences if re.search(r'\bobservational stud(?:y|ies)\b', s, re.IGNORECASE)]

# Use OpenAI API to extract number of observational studies
def get_observational_study_count(sentences):
    if not sentences:
        return "0"

    combined_text = "\n".join(sentences)
    prompt = (
        "The following sentences are from a clinical document. "
        "Based on these sentences, how many observational studies are mentioned in total? "
        "Return only the number as an integer. If no number is mentioned, return NA.\n\n"
        f"{combined_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=10,
        temperature=0,
    )

    return response.choices[0].message.content.strip()

# Main
if __name__ == "__main__":
    pdf_path = "/content/committee-papers-Anhydrous sodium thiosulfate-Recommend.pdf"
    full_text = read_pdf(pdf_path)
    obs_sentences = extract_observational_sentences(full_text)
    obs_count = get_observational_study_count(obs_sentences)

    print("Number of Observational Studies Mentioned:")
    print(obs_count)

📊 Number of Observational Studies Mentioned:
2


code for 10 PDFs

In [58]:

import openai
import os
import re
import requests
from PyPDF2 import PdfReader
import pandas as pd
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Increase the max_length limit
nlp.max_length = 2000000
os.environ['OPENAI_API_KEY'] = "sk-svcacct-38lVsQJmAnpHYYGlwHLRfemYH6XipJ83Bn4AHnE19lo8NioSbNQshLh_0Ie6gFHT3BlbkFJgi4LuRlCOMye2qnlGRAGY4_JQCoLousyuBpUsrPte0HftMthAZWRdreWw1EFRdQA"

pdf_urls = [
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Anhydrous%20sodium%20thiosulfate-Recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Baricitinib-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Bevacizumab%20gamma-recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Dupilumab-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Lorlatinib-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Lu%20vipivotide-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Pembrolizumab-recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-Tixagevimab-not%20recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-crizotinib-recommend.pdf",
    "https://github.com/engie4800/dsi-capstone-spring-2025-JnJ/raw/main/sampledocs/committee-papers-eplontersen-recommend.pdf"
]

def download_file(url, local_filename):
    # Download the file from `url` and save it locally under `local_filename`
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

# Read PDF
def read_pdf(filepath):
    with open(filepath, "rb") as file:
        reader = PdfReader(file)
        return " ".join([page.extract_text() for page in reader.pages if page.extract_text()])

# Feature extractors (same as before)
def extract_paediatric_population(text):
    return "yes" if re.search(r'paediatric', text, re.IGNORECASE) else "no"

def extract_meta_analysis(text):
    match = re.search(r'Meta-analysis', text, re.IGNORECASE)
    if match:
        section_text = text[match.start():]
        next_section = re.search(r'\n[A-Z][^\n]*\n', section_text)
        if next_section:
            section_text = section_text[:next_section.start()]
        meta_sentences = re.findall(r'([^.]*meta-analysis[^.]*\.)', section_text, re.IGNORECASE)
        for sentence in meta_sentences:
            if re.search(r'\bno\b|\bnot\b', sentence, re.IGNORECASE):
                return "no"
        return "yes" if meta_sentences else "no"
    return "no"

def extract_number_of_patients(text):
    matches = re.findall(r'N\s*=\s*(\d+)', text, re.IGNORECASE)
    return sum(int(m) for m in matches)

def extract_adverse_reactions_paragraphs(text):
    return re.findall(r"([^.]*\badverse reactions?\b[^.]*\.)", text, re.IGNORECASE)

def summarize_adverse_reactions(paragraphs):
    if not paragraphs:
        return "NA"
    prompt = (
        "From the text below, extract only the types of adverse reactions mentioned. "
        "Then rank them on a scale from 1 to 10, where 1 is the least severe and 10 is the most severe. "
        "Return the average of all ranks as a single number only.\n\n"
        + "\n".join(paragraphs)
    )
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10,
        temperature=0,
    )
    return response.choices[0].message.content.strip()

def extract_interpretation_section(text):
    match = re.search(
        r"(?s)(Interpretation of clinical effectiveness and safety evidence.*?)(?:\n[A-Z][A-Z\s\-]{5,}\n|\Z)",
        text,
        re.IGNORECASE,
    )
    if match:
        return match.group(1)
    return ""

# Ask GPT to evaluate and rank Quality of Evidence from that section
def summarize_quality_of_evidence(section_text):
    prompt = (
        "You are given the full section titled 'Interpretation of clinical effectiveness and safety evidence'. "
        "Based on the content, determine the overall **quality of clinical evidence** presented in this section. "
        "Rank the quality on a scale from 1 to 10, where:\n"
        "1 = very poor or unreliable evidence\n"
        "10 = very strong, high-quality, reliable evidence\n\n"
        "Return only one ranked result \n"
        "Return only Rank number\n\n"
        "Do not include any explanation or extra text.\n\n"
        f"{section_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300,
        temperature=0.5,
    )

    return response["choices"][0]["message"]["content"].strip()


def summarize_uncertainties_from_interpretation_section(section_text):
    prompt = (
        "You are given the full section titled 'Interpretation of clinical effectiveness and safety evidence'. "
        "From this section, extract the main clinical uncertainties that are either explicitly stated or implied. "
        "Then rank each uncertainty from 1 to 10, where:\n"
        "- 1 = minimal or negligible uncertainty\n"
        "- 10 = very high uncertainty that significantly impacts clinical conclusions\n\n"
        "Return only one ranked in total \n"
        "Do not include explanations or additional text. Here is the section:\n\n"
        f"{section_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=700,
        temperature=0.5,
    )

    return response["choices"][0]["message"]["content"].strip()

def extract_unmet_need_paragraphs(text):
    return [p for p in text.split("\n") if re.search(r"\bunmet need\b", p, re.IGNORECASE)]

def rank_unmet_need(paragraphs):
    if not paragraphs:
        return "NA"
    prompt = (
        "You are given several paragraphs discussing the 'unmet need' for a treatment or condition. "
        "Based on the content, assess how severe or significant the unmet need is. "
        "Rank the severity from 1 to 10, where:\n"
        "- 1 = the need is mostly met or only minor need remains\n"
        "- 10 = the need is entirely unmet and very significant\n\n"
        "Return only Rank number\n\n"
        "Do not explain anything else. Here is the content:\n\n" + "\n".join(paragraphs)
    )
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10,
        temperature=0,
    )
    return response.choices[0].message.content.strip()

def extract_rct_sentences(text):
    return [s for s in re.split(r'(?<=[.!?])\s+', text) if re.search(r'\bRCTs?\b', s, re.IGNORECASE)]

def get_rct_number_from_gpt(sentences):
    if not sentences:
        return "NA"
    prompt = (
        "How many RCTs are used in the study? Return only the number.\n\n" + "\n".join(sentences)
    )
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10,
        temperature=0,
    )
    return response.choices[0].message.content.strip()

def extract_observational_sentences(text):
    return [s for s in re.split(r'(?<=[.!?])\s+', text) if re.search(r'\bobservational stud(?:y|ies)\b', s, re.IGNORECASE)]

def get_observational_study_count(sentences):
    if not sentences:
        return "NA"

    combined_text = "\n".join(sentences)
    prompt = (
        "The following sentences are from a clinical document. "
        "Based on these sentences, how many observational studies are mentioned in total? "
        "Return only the number as an integer. If no number is mentioned, return NA.\n\n"
        f"{combined_text}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=10,
        temperature=0,
    )

    return response.choices[0].message.content.strip()


In [59]:
data = []
for url in pdf_urls:
    try:
        filename = os.path.basename(url)
        local_filepath = download_file(url, filename)
        print(f"Processing: {filename}")
        text = read_pdf(local_filepath)
        interpretation = extract_interpretation_section(text)
        row = {
            "Filename": filename,
            "Paediatric Population": extract_paediatric_population(text),
            "Meta-analysis": extract_meta_analysis(text),
            "Number of Patients": extract_number_of_patients(text),
            "Adverse Reaction Severity": summarize_adverse_reactions(extract_adverse_reactions_paragraphs(text)),
            "Clinical Uncertainty Score": summarize_uncertainties_from_interpretation_section(interpretation),
            "Quality of Evidence Score": summarize_quality_of_evidence(interpretation),
            "Unmet Need": rank_unmet_need(extract_unmet_need_paragraphs(text)),
            "Number of RCTs": get_rct_number_from_gpt(extract_rct_sentences(text)),
            "Number of Observational Studies": get_observational_study_count(extract_observational_sentences(text))
        }
        data.append(row)
    except Exception as e:
        print(f"Failed to process {filename}: {e}")

# ... (rest of your code)

Processing: committee-papers-Anhydrous%20sodium%20thiosulfate-Recommend.pdf
Processing: committee-papers-Baricitinib-not%20recommend.pdf
Processing: committee-papers-Bevacizumab%20gamma-recommend.pdf
Processing: committee-papers-Dupilumab-not%20recommend.pdf
Processing: committee-papers-Lorlatinib-not%20recommend.pdf
Processing: committee-papers-Lu%20vipivotide-not%20recommend.pdf
Processing: committee-papers-Pembrolizumab-recommend.pdf
Processing: committee-papers-Tixagevimab-not%20recommend.pdf
Processing: committee-papers-crizotinib-recommend.pdf
Processing: committee-papers-eplontersen-recommend.pdf


In [60]:
df = pd.DataFrame(data)
df

Unnamed: 0,Filename,Paediatric Population,Meta-analysis,Number of Patients,Adverse Reaction Severity,Clinical Uncertainty Score,Quality of Evidence Score,Unmet Need,Number of RCTs,Number of Observational Studies
0,committee-papers-Anhydrous%20sodium%20thiosulf...,yes,no,10934,7.0,8,8,10,2,3.0
1,committee-papers-Baricitinib-not%20recommend.pdf,yes,no,82567,5.5,7,7,10,13,34.0
2,committee-papers-Bevacizumab%20gamma-recommend...,no,no,13370,5.5,"I'm sorry, but I can't extract or analyze text...",7,9,22,
3,committee-papers-Dupilumab-not%20recommend.pdf,no,no,25209,5,"I'm sorry, but it appears that the section tit...",7,9,7,6.0
4,committee-papers-Lorlatinib-not%20recommend.pdf,no,yes,12016,7.5,8,7,10,10,1.0
5,committee-papers-Lu%20vipivotide-not%20recomme...,no,no,55005,Lymphopenia\n\nRanking: 5,8,7,10,20,3.0
6,committee-papers-Pembrolizumab-recommend.pdf,no,no,18062,5.0,8,7,8,Five,1.0
7,committee-papers-Tixagevimab-not%20recommend.pdf,yes,no,196557,7.0,7,7,9,1,6.0
8,committee-papers-crizotinib-recommend.pdf,yes,yes,8415,3.0,"I'm sorry, but it seems that you haven't provi...",7,1,0,3.0
9,committee-papers-eplontersen-recommend.pdf,no,no,14576,6.5,"I'm sorry, but it seems that you have not prov...",7,9,2,


In [61]:
df.to_csv("committee_papers_summary.csv")



In [57]:
import os
print(os.getcwd())

/content
