## NCCN Guideline Chat Query Interface

In [19]:
#!pip install PyPDF
#!pip install pdfplumber
#%pip install langchain
#%pip install langchain-community
#%pip install langchain-openai
#%pip install langchain-chroma
#%pip install fastapi
#%pip install pypdf
#%pip install gradio
#%pip install dotenv
%pip install -U duckduckgo-search

Defaulting to user installation because normal site-packages is not writeable
Collecting duckduckgo-search
  Downloading duckduckgo_search-7.5.5-py3-none-any.whl.metadata (17 kB)
Collecting primp>=0.14.0 (from duckduckgo-search)
  Downloading primp-0.14.0-cp38-abi3-macosx_11_0_arm64.whl.metadata (13 kB)
Collecting lxml>=5.3.0 (from duckduckgo-search)
  Downloading lxml-5.3.1-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.7 kB)
Downloading duckduckgo_search-7.5.5-py3-none-any.whl (20 kB)
Downloading lxml-5.3.1-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m31m17.9 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading primp-0.14.0-cp38-abi3-macosx_11_0_arm64.whl (2.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: primp, lxml, duckduckgo-search


### Import the necessary libraries

In [3]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import langchain
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
#from sklearn.manifold import TSNE
import numpy as np
#import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain.tools import DuckDuckGoSearchRun
from langchain.document_loaders import DirectoryLoader, PyPDFLoader, PDFPlumberLoader, UnstructuredPDFLoader, PDFMinerLoader
from pypdf.errors import PdfReadError

Matplotlib is building the font cache; this may take a moment.


### Load the API keys for OpenAI and HuggingFace

In [5]:
load_dotenv(dotenv_path="keys.env")
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

### Define the OpenAI LLM that will be used.  Set the name of the database

In [6]:
# Use a low cost model from OpenAI will be sufficient

MODEL = "gpt-4o-mini"
db_name = "nccn_guidelines_db"

### Load in the PDF documents from the NCCN folder. A variety of PDF loaders will stand ready if there is a problem with one of them. 

In [7]:
# Read in documents

folder = "NCCN"  # Your folder containing PDFs

pdf_files = [f for f in os.listdir(folder) if f.endswith('.pdf')]

def add_metadata(doc, doc_type):
    doc.metadata["doc_type"] = doc_type
    return doc

# With thanks to CG and Jon R, students on the course, for this fix needed for some users 
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

def try_multiple_loaders(file_path):
    loaders = [
        (PyPDFLoader, "PyPDFLoader"),
        (PDFPlumberLoader, "PDFPlumberLoader"),
        (UnstructuredPDFLoader, "UnstructuredPDFLoader"),
        (PDFMinerLoader, "PDFMinerLoader")
    ]
    
    for LoaderClass, loader_name in loaders:
        try:
            loader = LoaderClass(file_path)
            docs = loader.load()
            print(f"Success with {loader_name} for {file_path}")
            return docs
        except Exception as e:
            print(f"{loader_name} failed: {str(e)}")
            continue
    
    return None  # If all loaders fail
    

documents = []
for pdf_file in pdf_files:
    full_path = os.path.join(folder, pdf_file)
    docs = try_multiple_loaders(full_path)
    if docs:
        doc_type = pdf_file  # or pdf_file.replace('.pdf', '') if you don't want the extension
        documents.extend([add_metadata(doc, doc_type) for doc in docs])
    else:
        print(f"All loaders failed for {pdf_file}")

print ("PDF ingestion completed.")

Success with PyPDFLoader for NCCN/cml2025.pdf
Success with PyPDFLoader for NCCN/anal2025.pdf
Success with PyPDFLoader for NCCN/biliary2025.pdf
Success with PyPDFLoader for NCCN/myeloma2025.pdf
Success with PyPDFLoader for NCCN/cervical2025.pdf
Success with PyPDFLoader for NCCN/uveal2025.pdf
Success with PyPDFLoader for NCCN/gist2025.pdf
Success with PyPDFLoader for NCCN/mpn2025.pdf
Success with PyPDFLoader for NCCN/cll_sll2025.pdf
Success with PyPDFLoader for NCCN/mds2025.pdf
Success with PyPDFLoader for NCCN/ovarian2025.pdf
Success with PyPDFLoader for NCCN/hcc2025.pdf
Success with PyPDFLoader for NCCN/neuroendocrine2025.pdf
Success with PyPDFLoader for NCCN/squamous2025.pdf
Success with PyPDFLoader for NCCN/colon2025.pdf
Success with PyPDFLoader for NCCN/thyroid2025.pdf
Success with PyPDFLoader for NCCN/small_bowel2024.pdf
Success with PyPDFLoader for NCCN/uterine2025.pdf
Success with PyPDFLoader for NCCN/kaposi2025.pdf
Success with PyPDFLoader for NCCN/sclc2025.pdf
Success with PyPD

### Here is where we define the chunk size.

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=60, separators=["\n\n", "\n", ".", ", "])
chunks = text_splitter.split_documents(documents)

### Looks at the "doc_type' of the metadata and identifies document types, such as PDF (or TXT). If PDF files were in the input stack, then one should see PDF files. This is quality control.

In [9]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(doc_types)}")

Document types found: basal2025.pdf, anal2025.pdf, bladder2025.pdf, histiocytic_neoplasms2025.pdf, cml2025.pdf, esophageal2025.pdf, aml2025.pdf, cervical2025.pdf, small_bowel2024.pdf, colon2025.pdf, all2024.pdf, cns2025.pdf, mcc2025.pdf, NHL_B-cell2025.pdf, vulvar2024.pdf, pancreatic2025.pdf, gastric2025.pdf, kidney2025.pdf, prostate2025.pdf, cll_sll2025.pdf, bone2025.pdf, uterine2025.pdf, gist2025.pdf, gtn2025.pdf, head-and-neck2025.pdf, kaposi2025.pdf, rectal2025.pdf, ovarian2025.pdf, squamous2025.pdf, neuroendocrine2025.pdf, mds2025.pdf, ampullary2025.pdf, uveal2025.pdf, sarcoma2025.pdf, breast2025.pdf, biliary2025.pdf, hodgkins2025.pdf, NHL_T-cell2025.pdf, mpn2025.pdf, occult2025.pdf, thyroid2025.pdf, nsclc2025.pdf, hcc2025.pdf, waldenstroms2025.pdf, myeloma2025.pdf, sclc2025.pdf, testicular2025.pdf, thymic2025.pdf, cutaneous_melanoma2025.pdf, vaginal2025.pdf


### This code displays unique filenames instead of document types. 

In [10]:
unique_files = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Files processed: {', '.join(unique_files)}")
print(f"Total unique files: {len(unique_files)}")

Files processed: basal2025.pdf, anal2025.pdf, bladder2025.pdf, histiocytic_neoplasms2025.pdf, cml2025.pdf, esophageal2025.pdf, aml2025.pdf, cervical2025.pdf, small_bowel2024.pdf, colon2025.pdf, all2024.pdf, cns2025.pdf, mcc2025.pdf, NHL_B-cell2025.pdf, vulvar2024.pdf, pancreatic2025.pdf, gastric2025.pdf, kidney2025.pdf, prostate2025.pdf, cll_sll2025.pdf, bone2025.pdf, uterine2025.pdf, gist2025.pdf, gtn2025.pdf, head-and-neck2025.pdf, kaposi2025.pdf, rectal2025.pdf, ovarian2025.pdf, squamous2025.pdf, neuroendocrine2025.pdf, mds2025.pdf, ampullary2025.pdf, uveal2025.pdf, sarcoma2025.pdf, breast2025.pdf, biliary2025.pdf, hodgkins2025.pdf, NHL_T-cell2025.pdf, mpn2025.pdf, occult2025.pdf, thyroid2025.pdf, nsclc2025.pdf, hcc2025.pdf, waldenstroms2025.pdf, myeloma2025.pdf, sclc2025.pdf, testicular2025.pdf, thymic2025.pdf, cutaneous_melanoma2025.pdf, vaginal2025.pdf
Total unique files: 50


### A more comprehensive summary of the loading and chunking of the files. 

In [11]:
# First, let's see what files we started with
print(f"Total PDFs found: {len(pdf_files)}")
print(f"PDF files: {', '.join(pdf_files)}")

# After processing, let's see what made it into documents
doc_sources = set(doc.metadata['source'] for doc in documents)
print(f"\nSuccessfully processed documents: {len(doc_sources)}")
print(f"Sources: {', '.join(doc_sources)}")

# After chunking, let's see what we have
chunk_sources = set(chunk.metadata['source'] for chunk in chunks)
print(f"\nDocuments that were chunked: {len(chunk_sources)}")
print(f"Chunked sources: {', '.join(chunk_sources)}")

# Create sets of files
original_files = set(pdf_files)
processed_files = set(os.path.basename(source) for source in doc_sources)

# Find failed files
failed_files = original_files - processed_files
print("\nFiles that failed to process:")
if not failed_files:  # This checks if the set is empty
    print("No failed files. All were successfully processed.")
else:
    for file in failed_files:
        print(f"- {file}")

Total PDFs found: 50
PDF files: cml2025.pdf, anal2025.pdf, biliary2025.pdf, myeloma2025.pdf, cervical2025.pdf, uveal2025.pdf, gist2025.pdf, mpn2025.pdf, cll_sll2025.pdf, mds2025.pdf, ovarian2025.pdf, hcc2025.pdf, neuroendocrine2025.pdf, squamous2025.pdf, colon2025.pdf, thyroid2025.pdf, small_bowel2024.pdf, uterine2025.pdf, kaposi2025.pdf, sclc2025.pdf, bladder2025.pdf, bone2025.pdf, nsclc2025.pdf, vaginal2025.pdf, esophageal2025.pdf, NHL_B-cell2025.pdf, cutaneous_melanoma2025.pdf, sarcoma2025.pdf, kidney2025.pdf, all2024.pdf, head-and-neck2025.pdf, aml2025.pdf, vulvar2024.pdf, gtn2025.pdf, waldenstroms2025.pdf, rectal2025.pdf, mcc2025.pdf, prostate2025.pdf, testicular2025.pdf, thymic2025.pdf, basal2025.pdf, breast2025.pdf, hodgkins2025.pdf, NHL_T-cell2025.pdf, histiocytic_neoplasms2025.pdf, pancreatic2025.pdf, ampullary2025.pdf, gastric2025.pdf, occult2025.pdf, cns2025.pdf

Successfully processed documents: 50
Sources: NCCN/biliary2025.pdf, NCCN/mpn2025.pdf, NCCN/nsclc2025.pdf, NCCN/sa

### Here each chunk of text is mapped into a vector that represents the meaning of the text. This is known as an embedding, and OpenAI's embedding method will be used. The vectors will be stored in a vector database.

In [12]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLLite

from tqdm import tqdm  # for progress bar

def create_vectorstore_in_batches(chunks, embedding, batch_size=100, db_name="db"):
    total_batches = (len(chunks) + batch_size - 1) // batch_size  # Calculate total number of batches
    
    # Initialize the vector store with the first batch
    first_batch = chunks[:batch_size]
    vectorstore = Chroma.from_documents(
        documents=first_batch,
        embedding=embedding,
        persist_directory=db_name,
        collection_metadata={"hnsw:space": "cosine"}
    )
    
    # Process the remaining chunks in batches
    with tqdm(total=total_batches-1, desc="Processing batches", ncols=150) as pbar:
        for i in range(batch_size, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            vectorstore.add_documents(documents=batch)
            pbar.update(1)
    
    return vectorstore

# Use it like this:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = create_vectorstore_in_batches(chunks, embeddings, batch_size=100, db_name=db_name)

Processing batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 584/584 [17:38<00:00,  1.81s/it]


### Chroma creates collections in the vector database the holds information, such as the embeddings (vectors), associated metadata, documents/text, and IDs for each entry.

In [13]:
collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 175,491 vectors with 3,072 dimensions in the vector store


## Create the chat model using OpenAI or a local open-source model. Create the retriever and conversation chain. 

In [14]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
#llm = ChatOpenAI(temperature=0.7, model_name='llama3.3', base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever(search_kwargs={"k":5})

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


### Check to see if the answer to a query is inconclusive

In [15]:
def is_inconclusive_answer(answer):
    """Check if the answer is inconclusive or indicates lack of knowledge."""
    low_confidence_phrases = [
        "i don't know", 
        "i don't have", 
        "i cannot", 
        "i can't provide", 
        "i do not have", 
        "unable to find", 
        "no information",
        "not mentioned",
        "cannot determine",
        "doesn't mention",
        "does not mention",
        "no specific information",
        "no data",
        "insufficient information"
    ]
    
    answer_lower = answer.lower()
    return any(phrase in answer_lower for phrase in low_confidence_phrases)

### Run a quick test with a simple query.

In [16]:
from IPython.display import Markdown
# Let's try a simple question

query = "If a breast cancer patient fails fulvestrant, what would be suitable options for next-line therapy?"
result = conversation_chain.invoke({"question": query})
display(Markdown(result["answer"]))

For a breast cancer patient who has failed fulvestrant, suitable options for next-line therapy may include a CDK 4/6 inhibitor (such as palbociclib, ribociclib, or abemaciclib) in combination with another hormone therapy, or for those with tumor PIK3CA mutations, fulvestrant with alpelisib. Additionally, everolimus can be considered in combination with either an aromatase inhibitor (AI), tamoxifen, or fulvestrant. Monotherapy with fulvestrant could also be an option. It is important to consult with a healthcare provider for personalized treatment recommendations.

### Create a chat function. If the answer is not in the NCCN guidelines, we will acknowledge this and search the Internet as a fallback.

In [21]:
def chat(question, history):
    # First try getting answer from RAG system
    result = conversation_chain.invoke({"question": question})
    answer = result["answer"]
    
    # If the answer seems inconclusive, try DuckDuckGo
    if is_inconclusive_answer(answer):
        try:
            # Initialize DuckDuckGo search
            search = DuckDuckGoSearchRun()
            
            # Perform the search
            search_results = search.run(question)
            
            # Use the LLM to formulate an answer based on search results
            enhanced_prompt = f"""
            The original question was: {question}
            
            Based on the available medical documents, I couldn't find a conclusive answer.
            However, I found the following information from a web search:
            
            {search_results}
            
            Please provide a helpful response based on this information. 
            Start your response with "The answer to the query could not be found in the NCCN guidelines. But based on web search results:" to clearly indicate this information comes from outside the NCCN guidelines.
            """
            
            enhanced_result = llm.invoke(enhanced_prompt)
            
            # Add search results to memory to maintain conversation context
            memory.chat_memory.add_user_message(question)
            memory.chat_memory.add_ai_message(enhanced_result.content)
            
            return enhanced_result.content
        except Exception as e:
            # If search fails, return the original answer with a note
            return f"{answer}\n\nNote: I tried to search for additional information online, but encountered an error: {str(e)}"
    
    # If the answer is conclusive, just return it
    return answer

### Use Gradio for a chat user interface.

In [22]:
view = gr.ChatInterface(chat, 
    type="messages",
    css="""textarea {
        font-size: 18px !important;
    }
    .message-wrap textarea {
        font-size: 18px !important;
    }
    .message-content {
        font-size: 18px !important;
    }
    .chatbot.prose {
        font-size: 18px !important;
    }
    .svelte-7ddecg {
        font-size 18px !important;
    }
    """).launch(inbrowser=True)

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.
