<a href="https://colab.research.google.com/github/erikrosen01/Document-RAG/blob/main/Document_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import os
%pip install langchain
%pip install -U langchain-community
%pip install openai
%pip install faiss-cpu # gpu-version seems to not work
%pip install tiktoken
%pip install PyMuPDF
%pip install pypdf
%pip install sentence-transformers
%pip install huggingface_hub
import faiss
import pickle
import numpy as np
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatHuggingFace
from huggingface_hub import login
from langchain.schema import Document
from langchain.chains import RetrievalQA
from tqdm import tqdm
import langchain.evaluation.qa as qa
from langchain.evaluation import load_evaluator
# from langchain.evaluation.qa import QAEvaluator

# import tkinter as tk
# from tkinter import filedialog, messagebox

# Never hardcode the key in the notebook: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Load documents
def load_documents(file_paths):
    """Load every file in ``file_paths`` and return one flat list of documents.

    Paths whose name ends in ".pdf" (case-insensitive) are read with
    ``PyPDFLoader``; every other path is read with ``TextLoader``.

    Args:
        file_paths: Iterable of path strings.

    Returns:
        A list with the documents of all files, in input order. Each
        document's ``metadata["source"]`` is set to the path it came from.
    """
    loaded = []
    for file_path in file_paths:
        is_pdf = file_path.lower().endswith(".pdf")
        loader_cls = PyPDFLoader if is_pdf else TextLoader
        for page in loader_cls(file_path).load():
            # Record the originating file so answers can cite their source.
            page.metadata["source"] = file_path
            loaded.append(page)
    return loaded


# Indexing documents
def create_vectorstore(documents, chunk_size=500, chunk_overlap=50):
    """Split ``documents`` into chunks, embed them and index them in FAISS.

    Args:
        documents: Documents to index (as returned by ``load_documents``).
        chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The FAISS vector store built from the chunks with ``OpenAIEmbeddings``.

    Raises:
        ValueError: If splitting yields no chunks, e.g. because no documents
            were loaded. Failing here gives a readable message instead of an
            error from deep inside index construction.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        raise ValueError(
            "Cannot build a vector store: no text chunks were produced from the given documents."
        )
    embeddings = OpenAIEmbeddings()
    return FAISS.from_documents(chunks, embeddings)

# Question-Answering with RAG
def create_rag_pipeline(vectorstore):
    """Build a ``RetrievalQA`` chain that answers from ``vectorstore``.

    The chain uses a ``ChatOpenAI`` model named "gpt-4" and the store's
    default retriever, and is configured to return the source documents
    alongside the answer.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``.
    """
    return RetrievalQA.from_chain_type(
        ChatOpenAI(model_name="gpt-4"),
        retriever=vectorstore.as_retriever(),
        return_source_documents=True,
    )

# Hallucination detection
def check_hallucination(query, retrieved_docs, generated_answer):
    """Grade ``generated_answer`` against the text of the retrieved documents.

    The page contents of ``retrieved_docs`` are joined with newlines and
    handed to a "qa" evaluator as the reference answer, so the grade reflects
    whether the generated answer agrees with what was retrieved.

    Args:
        query: The user's question.
        retrieved_docs: Documents exposing ``page_content``.
        generated_answer: The answer text to grade.

    Returns:
        The evaluator's ``evaluate`` output for the single example.
    """
    grader = load_evaluator("qa", llm=ChatOpenAI())
    context = "\n".join(doc.page_content for doc in retrieved_docs)
    return grader.evaluate(
        examples=[{"query": query, "answer": context}],
        predictions=[{"prediction": generated_answer}],
        question_key="query",
        answer_key="answer",
        prediction_key="prediction",
    )


# get all file_paths (specifically pdfs)
def get_file_paths(dir=None):
    """Collect the paths of the files to index.

    Args:
        dir: Directory to list. When ``None``, the working directory is
            walked recursively and only PDF files are returned.

    Returns:
        A list of path strings. With ``dir=None``: every file under "." whose
        name ends in ".pdf", matched case-insensitively so the result agrees
        with how ``load_documents`` recognises PDFs. With ``dir`` given: every
        regular file directly inside ``dir``; sub-directories are skipped
        because the document loaders can only open files.
    """
    if dir is None:
        return [
            os.path.join(root, name)
            for root, _subdirs, names in os.walk(".")
            for name in names
            if name.lower().endswith(".pdf")
        ]

    candidates = (os.path.join(dir, name) for name in os.listdir(dir))
    return [path for path in candidates if os.path.isfile(path)]

# let user manually click all files
def select_files_graphical(): # not available in colab
    """Open a file-selection dialog and return the paths the user picked.

    tkinter is imported inside the function instead of at the top of the
    notebook: this helper is marked as unavailable in Colab, and a lazy import
    keeps the rest of the notebook importable there.

    Returns:
        Whatever ``filedialog.askopenfilenames`` returns for the selection.
    """
    import tkinter as tk
    from tkinter import filedialog

    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialog should show
    try:
        return filedialog.askopenfilenames(title="Select Files to Upload")
    finally:
        root.destroy()  # release the hidden window once the dialog closes



In [1]:
# Run all time-consuming steps in a separate cell to improve iteration speed.
file_paths = get_file_paths()
if not file_paths:
    # Stop here with a readable message; an index cannot be built from zero
    # documents, so continuing would only fail further down.
    raise FileNotFoundError("No files selected. Add at least one PDF to the working directory and re-run this cell.")


documents = load_documents(file_paths)
vectorstore = create_vectorstore(documents) # for more advanced applications it should probably be saved to disk and loaded
qa_chain = create_rag_pipeline(vectorstore)

In [None]:
# Main function
def main(documents=documents, vectorsote=vectorstore, qa_chain=qa_chain):
    """Run an interactive question loop over the RAG pipeline.

    Reads questions from standard input until the user types 'exit'. For each
    question it prints the generated answer, grades it with
    ``check_hallucination`` and lists the retrieved sources only when the
    grade is "CORRECT".

    Args:
        documents: Unused; kept for backward compatibility.
        vectorsote: Unused; kept (spelling included) so existing keyword
            callers keep working.
        qa_chain: Chain called as ``qa_chain(query)``; its result must provide
            the 'result' and 'source_documents' keys.

    Note:
        The default arguments are evaluated when this cell runs, so re-run
        this cell after rebuilding ``qa_chain``.
    """
    while True:
        # Trim the input so that "exit " still quits and blank lines are detectable.
        query = input("Ask a question (or type 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            break
        if not query:
            # Nothing was asked; do not spend a model call on an empty prompt.
            continue

        result = qa_chain(query)  # returns a dict with 'result' and 'source_documents'

        generated_answer = result["result"]
        print("\nAnswer:", generated_answer)

        retrieved_docs = result["source_documents"]

        hallucination_result = check_hallucination(query, retrieved_docs, generated_answer)
        # The grade is model-generated text, so drop stray whitespace before comparing.
        verdict = str(hallucination_result[0]["results"]).strip()
        print("\nHallucination Check:", verdict)

        if verdict == "CORRECT":
            print("Sources:")
            for i, doc in enumerate(retrieved_docs, start=1):
                print(f"[{i}] {doc.metadata.get('source', 'Unknown source')} - Snippet starts with: {doc.page_content[:50]}...")


if __name__ == "__main__":
    main()

Ask a question (or type 'exit' to quit): how has the performance highlights changed between 2013 and 2015

Answer: The text doesn't provide specific information about the change in performance highlights between 2013 and 2015.

Hallucination Check: INCORRECT
Ask a question (or type 'exit' to quit): exit
