In [1]:
#%pip install -U -q "google-genai==1.7.0"
#%pip install dotenv
#%pip install pypdf2 requests beautifulsoup4 google-api-python-client

In [2]:
from google import genai
import google.genai as genai
from google.genai import types
from dotenv import load_dotenv
import os
from IPython.display import HTML, Markdown, display
import os
import time
import requests
import io
import PyPDF2
from requests.sessions import Session
from googleapiclient.discovery import build
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [3]:
# Pull secrets from the local env file into the process environment.
load_dotenv(dotenv_path="keys.env")

# Gemini API key and the shared client reused by the helper functions below.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
client = genai.Client(api_key=GOOGLE_API_KEY)

# NCCN login credentials (either may be None when not configured).
NCCN_username = os.environ.get("NCCN_username")
NCCN_password = os.environ.get("NCCN_password")

In [4]:
# Configure the Google Generative AI API
def setup_genai(api_key=None):
    """
    Return the Gen AI client and the model name the chatbot should use.

    Args:
        api_key: Optional API key. When provided, a new client is created with
            this key; otherwise the module-level ``client`` is reused.

    Returns:
        Tuple of (client, selected_model_name) or (None, None) if setup fails
    """
    try:
        selected_model = "gemini-2.5-pro-exp-03-25"
        # Honour the documented override instead of silently ignoring it.
        active_client = genai.Client(api_key=api_key) if api_key else client
        print(f"Selected model: {selected_model}")
        return active_client, selected_model
    except Exception as e:
        print(f"Error setting up GenAI: {e}")
        return None, None

In [5]:
def get_nccn_pdf(cancer_type, timeout=30):
    """
    Attempt to retrieve the NCCN PDF for a specific cancer type using credentials from env file.

    The function logs in to nccn.org with the module-level ``NCCN_username`` /
    ``NCCN_password``, looks for a link whose text matches ``cancer_type`` on
    the Category 1 page, follows it, and downloads the first ``.pdf`` link
    found inside the page's ``pdfList`` element.

    Args:
        cancer_type: The type of cancer (matched case-insensitively against link text)
        timeout: Seconds to wait on each HTTP request before giving up

    Returns:
        The PDF content (bytes) or None if retrieval fails
    """
    # Use the global NCCN credentials
    if not NCCN_username or not NCCN_password:
        print("Cannot access NCCN resources: missing credentials")
        return None

    # Normalise once. A blank value must be rejected here: an empty string is a
    # substring of every link text and would otherwise match the first link.
    target = (cancer_type or "").strip().lower()
    if not target:
        print("Cannot search NCCN guidelines: no cancer type provided")
        return None

    print(f"Attempting to retrieve NCCN guidelines for {cancer_type}...")
    print(f"Using NCCN credentials: {NCCN_username[:2]}*** (password hidden)")

    # Create a session to maintain cookies
    session = requests.Session()

    try:
        # Set headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.nccn.org/',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        session.headers.update(headers)

        # Step 1: Visit main page to get initial cookies
        main_response = session.get('https://www.nccn.org/', timeout=timeout)
        if main_response.status_code != 200:
            print(f"Failed to access NCCN main page: Status {main_response.status_code}")
            return None

        # Step 2: Visit login page to get form details
        login_url = 'https://www.nccn.org/login'
        login_response = session.get(login_url, timeout=timeout)
        if login_response.status_code != 200:
            print(f"Failed to access login page: Status {login_response.status_code}")
            return None

        # Parse the login page to extract any required tokens
        soup = BeautifulSoup(login_response.text, 'html.parser')
        login_form = soup.find('form', {'action': lambda x: x and '/login' in x.lower()})

        if not login_form:
            print("Could not find login form on the page")
            return None

        # Credentials plus whatever hidden fields the form carries (e.g. CSRF tokens)
        form_data = {
            'Username': NCCN_username,
            'Password': NCCN_password
        }
        for hidden_input in login_form.find_all('input', {'type': 'hidden'}):
            if hidden_input.get('name'):
                form_data[hidden_input.get('name')] = hidden_input.get('value', '')

        # Step 3: Submit login form
        login_post_url = login_form.get('action')
        if not login_post_url:
            login_post_url = login_url
        elif not login_post_url.startswith('http'):
            # Handle relative URLs
            login_post_url = urljoin('https://www.nccn.org', login_post_url)

        print(f"Submitting login to: {login_post_url}")
        login_submit = session.post(
            login_post_url, data=form_data, allow_redirects=True, timeout=timeout
        )

        # Still on a login URL, or an explicit error message, means the login failed
        if 'login' in login_submit.url.lower() or 'incorrect username or password' in login_submit.text.lower():
            print("Login failed. Check your NCCN credentials.")
            return None

        print("Login successful!")

        # Step 4: Search for the cancer type guidelines on the Category 1 page
        category_1_url = "https://www.nccn.org/guidelines/category_1"
        print(f"Searching for guidelines on: {category_1_url}")

        category_1_response = session.get(category_1_url, timeout=timeout)
        if category_1_response.status_code != 200:
            print(f"Failed to access Category 1 page: Status {category_1_response.status_code}")
            return None

        category_1_soup = BeautifulSoup(category_1_response.text, 'html.parser')

        # Find all links on the page
        all_links = category_1_soup.find_all('a', href=True)

        # 1. Prioritize EXACT match (link text equals cancer_type)
        cancer_page_link = None
        for link in all_links:
            if link.text.strip().lower() == target:
                cancer_page_link = link
                break

        if not cancer_page_link:
            # 2. If no exact match, fall back to a substring match
            for link in all_links:
                if target in link.text.strip().lower():
                    cancer_page_link = link
                    break

        if not cancer_page_link:
            print(f"No link found for {cancer_type} on Category 1 page.")
            return None

        cancer_page_url = cancer_page_link['href']
        if not cancer_page_url.startswith('http'):
            cancer_page_url = urljoin('https://www.nccn.org', cancer_page_url)

        print(f"Navigating to cancer-specific page: {cancer_page_url}")

        # Step 5: Navigate to the cancer-specific page
        cancer_page_response = session.get(cancer_page_url, timeout=timeout)
        if cancer_page_response.status_code != 200:
            print(f"Failed to access cancer-specific page: Status {cancer_page_response.status_code}")
            return None

        cancer_page_soup = BeautifulSoup(cancer_page_response.text, 'html.parser')

        # Step 6: Find the PDF link within the "pdfList"
        pdf_list = cancer_page_soup.find('ul', class_='pdfList')
        if not pdf_list:
            print("Could not find 'pdfList' on cancer-specific page.")
            return None

        # Case-insensitive so that ".PDF" links are not missed
        pdf_link = pdf_list.find('a', href=lambda x: x and x.lower().endswith('.pdf'))
        if not pdf_link:
            print("Could not find PDF link within 'pdfList'.")
            return None

        pdf_url = pdf_link['href']
        if not pdf_url.startswith('http'):
            pdf_url = urljoin('https://www.nccn.org', pdf_url)

        print(f"Found PDF URL: {pdf_url}")

        # Step 7: Download the PDF
        print(f"Attempting to download PDF from: {pdf_url}")
        # Add a slight delay to avoid overloading the server
        time.sleep(1)

        pdf_response = session.get(pdf_url, timeout=timeout)
        if pdf_response.status_code != 200:
            print(f"Failed to download PDF: Status {pdf_response.status_code}")
            return None

        # A login wall or error page comes back as HTML; only accept a real PDF
        if 'application/pdf' not in pdf_response.headers.get('Content-Type', ''):
            print(f"Response is not a PDF. Content-Type: {pdf_response.headers.get('Content-Type')}")
            return None

        print(f"Successfully downloaded PDF guidelines ({len(pdf_response.content)} bytes)")
        return pdf_response.content

    except Exception as e:
        # Includes request timeouts and connection errors
        print(f"Error retrieving NCCN guidelines: {e}")
        return None
    finally:
        # Always close the session to free resources
        session.close()
        print("Session closed properly")

def extract_text_from_pdf(pdf_content):
    """
    Extract text content from a PDF file.

    Args:
        pdf_content: Raw bytes of the PDF document

    Returns:
        The concatenated text of all pages that could be read, or "" when the
        document itself cannot be opened.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        page_texts = []
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            try:
                # extract_text() may yield None/"" (e.g. image-only pages);
                # treat that as an empty page instead of failing the document.
                page_texts.append(page.extract_text() or "")
            except Exception as page_error:
                # One unreadable page should not discard the rest of the text.
                print(f"Skipping page {page_number}: {page_error}")
        return "".join(page_texts)
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

In [6]:
def generate_content_with_model(client, model_name, prompt, use_google_search=False):
    """
    Send a single prompt through a fresh chat session and return the reply text.

    Args:
        client: Object exposing ``chats.create(model=...)``
        model_name: Name of the model the chat session is created with
        prompt: Prompt text to send
        use_google_search: When True, first try sending with a Google Search
            tool configuration; if that attempt raises, resend the plain prompt

    Returns:
        ``response.text`` when the response has a ``text`` attribute, else the
        text of the first entry in ``response.parts``, else ``str(response)``.
        If anything raises, a fixed "Error generating response..." message is
        returned instead.
    """
    try:
        session = client.chats.create(model=model_name)

        if not use_google_search:
            # Plain generation, no tools attached
            reply = session.send_message(prompt)
        else:
            try:
                search_config = types.GenerateContentConfig(
                    tools=[types.Tool(google_search=types.GoogleSearch())],
                    temperature=0.2,
                )
                reply = session.send_message(message=prompt, config=search_config)
                print("Using Google Search for enhanced response")
            except Exception as search_error:
                print(f"Google Search tool failed: {search_error}")
                print("Falling back to standard chat output.")
                reply = session.send_message(prompt)

        # Preferred shape: a response exposing .text
        if hasattr(reply, 'text'):
            return reply.text

        # Alternative shape: a non-empty .parts sequence
        if hasattr(reply, 'parts') and len(reply.parts) > 0:
            return reply.parts[0].text

        # Unknown shape: list the public attributes, then stringify
        print(f"Unexpected chat response format: {reply}")
        for public_name in (name for name in dir(reply) if not name.startswith('_')):
            print(f"Response has attribute: {public_name}")
        return str(reply)
    except Exception as e:
        print(f"Chat error: {e}")
        return "Error generating response. Please try again later."
        
def extract_relevant_info(search_results):
    """
    Extract relevant information from search results.

    Args:
        search_results: Iterable of dicts with ``title`` and ``snippet`` keys
            and, optionally, a ``link`` to fetch page content from

    Returns:
        A single string with the title and snippet of every result and, when
        the linked page could be fetched, the text of its first five paragraphs.
    """
    sections = []

    for result in search_results:
        sections.append(f"Title: {result['title']}\n")
        sections.append(f"Snippet: {result['snippet']}\n\n")

        # Page content is a best-effort extra; a result without a link is fine
        link = result.get("link")
        if not link:
            continue

        try:
            response = requests.get(link, timeout=5)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                paragraphs = soup.find_all("p")
                content = "\n".join([p.get_text() for p in paragraphs[:5]])  # Get first 5 paragraphs
                sections.append(f"Content: {content}\n\n")
        except Exception as fetch_error:
            # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the loop, and say why the page was skipped.
            print(f"Skipping page content for {link}: {fetch_error}")

    return "".join(sections)

In [7]:
def detect_cancer_type(client, query, model_name="gemini-1.5-pro-latest"):  # Or your preferred model
    """
    Detects the cancer type from a query, using an LLM to resolve ambiguities,
    and simplifies the cancer type to match NCCN document categories,
    without hardcoding abbreviations, using the generate_content_with_model helper.

    The model's reply is cleaned (surrounding whitespace and quotes removed)
    and, when it matches a known category case-insensitively, returned with
    the category's canonical spelling.

    Args:
        client: The genai Client object
        query: The user's query string
        model_name: The name of the language model to use for cancer type detection

    Returns:
        The simplified cancer type as a string, or None if detection fails,
        the model answers "None"/"Other", or the helper reports an error
    """
    try:
        # NCCN Category List (from user)
        nccn_categories = [
            "Acute Lymphoblastic Leukemia",
            "Acute Myeloid Leukemia",
            "Ampullary Adenocarcinoma",
            "Anal Carcinoma",
            "Basal Cell Skin Cancer",
            "B-Cell Lymphomas",
            "Biliary Tract Cancers",
            "Bladder Cancer",
            "Bone Cancer",
            "Breast Cancer",
            "Castleman Disease",
            "Central Nervous System Cancers",
            "Cervical Cancer",
            "Chronic Lymphocytic Leukemia/Small Lymphocytic Lymphoma",
            "Chronic Myeloid Leukemia",
            "Colon Cancer",
            "Dermatofibrosarcoma Protuberans",
            "Esophageal and Esophagogastric Junction Cancers",
            "Gastric Cancer",
            "Gastrointestinal Stromal Tumors",
            "Gestational Trophoblastic Neoplasia",
            "Hairy Cell Leukemia",
            "Head and Neck Cancers",
            "Hepatobiliary Cancers",
            "Hepatocellular Carcinoma",
            "Histiocytic Neoplasms",
            "Hodgkin Lymphoma",
            "Kaposi Sarcoma",
            "Kidney Cancer",
            "Melanoma: Cutaneous",
            "Melanoma: Uveal",
            "Merkel Cell Carcinoma",
            "Mesothelioma: Peritoneal",
            "Mesothelioma: Pleural",
            "Multiple Myeloma",
            "Myelodysplastic Syndromes",
            "Myeloid/Lymphoid Neoplasms with Eosinophilia and Tyrosine Kinase Gene Fusions",
            "Myeloproliferative Neoplasms",
            "Neuroblastoma",
            "Neuroendocrine and Adrenal Tumors",
            "Non-Small Cell Lung Cancer",
            "Occult Primary",
            "Ovarian Cancer/Fallopian Tube Cancer/Primary Peritoneal Cancer",
            "Pancreatic Adenocarcinoma",
            "Pediatric Acute Lymphoblastic Leukemia",
            "Pediatric Aggressive Mature B-Cell Lymphomas",
            "Pediatric Central Nervous System Cancers",
            "Pediatric Hodgkin Lymphoma",
            "Penile Cancer",
            "Primary Cutaneous Lymphomas",
            "Prostate Cancer",
            "Rectal Cancer",
            "Small Bowel Adenocarcinoma",
            "Small Cell Lung Cancer",
            "Soft Tissue Sarcoma",
            "Squamous Cell Skin Cancer",
            "Systemic Light Chain Amyloidosis",
            "Systemic Mastocytosis",
            "T-Cell Lymphomas",
            "Testicular Cancer",
            "Thymomas and Thymic Carcinomas",
            "Thyroid Carcinoma",
            "Uterine Neoplasms",
            "Vaginal Cancer",
            "Vulvar Cancer",
            "Waldenström Macroglobulinemia/Lymphoplasmacytic Lymphoma",
            "Wilms Tumor (Nephroblastoma)"
        ]

        # LLM-based cancer type resolution
        prompt = f"""
        The user has provided the following query: "{query}".

        Identify the specific cancer type being discussed in the query. 
        If the query uses abbreviations or ambiguous terms, resolve them to the full cancer type name.
        Determine the most appropriate NCCN category for the identified cancer type, based on the following categories:
        {chr(10).join([f"- {category}" for category in nccn_categories])}

        If the identified cancer type does not fall into any of these categories, respond with "Other".

        Provide ONLY the matching NCCN category name exactly as written above, or "Other". Do not include any other text.
        """

        # Use the helper function to generate content
        raw_answer = generate_content_with_model(client, model_name, prompt)

        # The helper signals failure with a fixed sentence instead of raising;
        # that sentence must never be treated as a cancer type.
        if not raw_answer or "Error generating response" in raw_answer:
            return None

        # Model replies typically end with a newline and are sometimes quoted.
        cancer_type = raw_answer.strip().strip("\"'`").strip()
        if not cancer_type or cancer_type.lower() in ("none", "other"):
            return None

        # Prefer the canonical category spelling when the reply matches one.
        canonical_names = {category.lower(): category for category in nccn_categories}
        return canonical_names.get(cancer_type.lower(), cancer_type)

    except Exception as e:
        print(f"Error detecting cancer type: {e}")
        return None
        
def create_answer_from_pdf_content(query, pdf_text, client, model_name, max_content_length=30000):
    """
    Generate an answer based on PDF content using the LLM.

    Args:
        query: The user's question
        pdf_text: Text extracted from the guideline PDF
        client: Client object passed through to generate_content_with_model
        model_name: Model name passed through to generate_content_with_model
        max_content_length: Maximum number of characters of ``pdf_text`` placed
            in the prompt; ``None`` disables truncation. Defaults to 30000.

    Returns:
        The model's answer, or None when there is no text to search, the model
        call failed, or the model reported INFORMATION_NOT_FOUND.
    """
    # Nothing to ground an answer on; skip the model call entirely.
    if not pdf_text:
        return None

    # Truncate content if too long for the chosen model's context window
    if max_content_length is not None and len(pdf_text) > max_content_length:
        pdf_text = pdf_text[:max_content_length]

    prompt = f"""
    Based on the following NCCN Guidelines content, please answer this question about cancer:
    
    Question: {query}
    
    NCCN Guidelines Content:
    {pdf_text}
    
    If the information to answer the question is not found in the provided content, respond with "INFORMATION_NOT_FOUND".
    """

    # No need for Google Search here as we're using the PDF content
    answer = generate_content_with_model(client, model_name, prompt, use_google_search=False)

    if not answer or "Error generating response" in answer or "INFORMATION_NOT_FOUND" in answer:
        return None

    return answer

In [8]:
def cancer_info_chatbot(query, api_key=None):
    """
    Main chatbot function that processes cancer queries with enhanced web search and few-shot prompting.

    Flow: detect the cancer type, try to answer from the matching NCCN
    guideline PDF, and fall back to a few-shot prompt with Google Search
    enabled when the PDF route yields nothing.

    Args:
        query: The user's question about cancer
        api_key: Optional API key to override the one from environment variables

    Returns:
        Dict with keys "source", "cancer_type" and "answer". "source" is
        "NCCN Guidelines", "AI Knowledge with Web Search", or "Error".
    """
    # Set up the GenAI client and get selected model name
    client, model_name = setup_genai(api_key=api_key)

    if not client or not model_name:
        return {
            "source": "Error",
            "cancer_type": None,
            "answer": "Failed to set up the AI service. Please check your API key."
        }

    # Step 1: Detect cancer type from query
    cancer_type = detect_cancer_type(client, query, model_name)
    print(f"Detected cancer type: {cancer_type}")

    # Step 2: Try to retrieve NCCN PDF if cancer type is detected
    pdf_content = None
    if cancer_type:
        pdf_content = get_nccn_pdf(cancer_type)

    # Step 3: If we have PDF content, extract text and try to answer the question
    if pdf_content:
        pdf_text = extract_text_from_pdf(pdf_content)
        print(f"Extracted {len(pdf_text)} characters of text from PDF")

        if pdf_text:
            pdf_answer = create_answer_from_pdf_content(query, pdf_text, client, model_name)
            if pdf_answer:
                return {
                    "source": "NCCN Guidelines",
                    "cancer_type": cancer_type,
                    "answer": pdf_answer
                }

    # Step 4: Only reached when the NCCN approach failed. Build a few-shot
    # prompt and let the model answer with Google Search enabled.
    # NOTE: each example's question, answer and closing referral must describe
    # the same cancer; a mismatched example teaches the model to drift off-topic.
    prompt = f"""
    As a helpful medical information assistant, provide accurate information about cancer-related questions, using the following examples as a guide:

    **Examples:**

    **Question:** What are appropriate treatments for double-hit lymphoma after salvage chemotherapy failure?
    **Answer:** Double-hit lymphoma (DHL) is an aggressive type of B-cell non-Hodgkin lymphoma characterized by rearrangements in the MYC gene and BCL2 
    and/or BCL6 genes. These genetic abnormalities lead to a poorer prognosis compared to other aggressive lymphomas.
    Here's a breakdown of treatment approaches after salvage chemotherapy failure:
    - The Challenge of Salvage Therapy
    - Treatment Options After Salvage Failure
    - Initial Treatment Strategies (Context for Salvage Failure)
    - Prognosis
    - Important considerations
    NCCN guidelines provide detailed recommendations for treatment.  Consult with a medical oncologist for personalized advice.

    **Question:** What mutations predict adverse outcomes for HER2-directed therapy in colon cancer?
    **Answer:** Okay, I will provide information about mutations that predict adverse outcomes for HER2-directed therapy in colon cancer, incorporating details about treatment approaches, staging, and prognosis where relevant.
    HER2 (Human Epidermal Growth Factor Receptor 2) alterations, including gene amplification and mutations, are emerging biomarkers in colon cancer that can predict response to HER2-directed therapies. 
    While HER2 is well-established as a target in breast and gastric cancers, its role in colorectal cancer is still being defined.
    - Mutations Predicting Adverse Outcomes
    - Current Treatment Approaches
    - Prognosis
    - Important Considerations
    Regular follow-up is important to monitor for recurrence.  See a gastroenterologist or oncologist for detailed information.

    **End of Examples.**

    Now, answer the following question, using your knowledge and search capabilities to find the most up-to-date and accurate information:

    **Question:** {query}
    {f"Cancer type: {cancer_type}" if cancer_type else ""}

    Include information about current treatment approaches, staging, and prognosis if relevant.
    Base your answer on reliable medical sources (like NCCN, ASCO, ACS, NIH, or medical journals).
    Consider citing your sources and provide concise, well-structured information.
    Always recommend consulting healthcare professionals for personalized medical advice.
    """

    # Enable Google Search for this query to get the most up-to-date information
    print("Searching for information on Google...\n")
    answer = generate_content_with_model(client, model_name, prompt, use_google_search=True)

    if not answer or "Error generating response" in answer:
        answer = "Unable to generate a response. Please consult a healthcare professional for accurate information."

    return {
        "source": "AI Knowledge with Web Search",
        "cancer_type": cancer_type,
        "answer": answer
    }

In [9]:
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output

# Full-width, multi-line box where the user types the question
text_area = widgets.Textarea(
    layout=widgets.Layout(width='100%', height='150px'),
    placeholder='Enter your cancer-related question here...',
    value='',
    description='',
    disabled=False,
)

# Button that triggers the chatbot for the current text
button = widgets.Button(
    layout=widgets.Layout(width='200px'),
    description='Submit Question',
    tooltip='Click to submit your question',
    button_style='primary',
)

# Container that receives everything the click handler displays
output_area = widgets.Output()

# Define what happens when the button is clicked
def on_button_click(b):
    """
    Click handler: run the chatbot on the text area's content and render the result.

    Everything is displayed inside ``output_area``. Blank input produces a
    prompt to enter a question; any exception from the chatbot or rendering is
    shown as an error message and printed.

    Args:
        b: The button instance passed by the widget framework (unused)
    """
    with output_area:
        clear_output()
        query = text_area.value

        # Guard: nothing but whitespace was entered
        if not query.strip():
            display(Markdown("**Please enter a question.**"))
            return

        # Interim message while the (slow) chatbot call runs
        display(Markdown("**Processing your question...**"))

        try:
            response = cancer_info_chatbot(query)

            # Replace the interim message with the result
            clear_output()

            report = "".join([
                f"Source:{response['source']}  \n",
                f"Detected Cancer Type:{response['cancer_type'] or 'Not specified'}\n\n",
                f"Answer:\n{response['answer']}",
            ])
            display(Markdown(report))
        except Exception as e:
            clear_output()
            display(Markdown("**Error:**" + str(e)))
            print(e)  # Print full error for debugging

# Register the click handler on the submit button
button.on_click(on_button_click)

# Render the input box, the button and the output container, in that order
display(text_area, button, output_area)

Textarea(value='', layout=Layout(height='150px', width='100%'), placeholder='Enter your cancer-related questio…

Button(button_style='primary', description='Submit Question', layout=Layout(width='200px'), style=ButtonStyle(…

Output()