In [2]:
import logging
import os
import pickle
from transformers import AutoModelForCausalLM, pipeline, AutoTokenizer
import faiss 
import transformers
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import streamlit as st
from streamlit_chat import message

In [3]:
# Logging setup: every record carries a timestamp and its severity level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Hugging Face model identifiers
EMBEDDING_MODEL_NAME = "thenlper/gte-small"  # chunk tokenizer default and embedding model
READER_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"  # default LLM used to write the answer

# Locations on disk (relative to the notebook's working directory)
PDF_FILE_PATH = "data/raw/TA-9-2024-0138_EN.pdf"
FAISS_INDEX_PATH = "embeddings/knowledge_vector_database.faiss"  # pickled vector store

In [4]:

# Function to split the document into chunks
def split_document_into_chunks(file_path: str, chunk_size: int, tokenizer_name: str = EMBEDDING_MODEL_NAME):
    """
    Read a PDF from disk and cut its pages into token-bounded chunks.

    Args:
        file_path (str): Location of the PDF to load.
        chunk_size (int): Upper bound on the length of a chunk, counted in
            tokens of the chosen tokenizer. Neighbouring chunks overlap by
            10% of this value.
        tokenizer_name (str): Hugging Face tokenizer used to measure chunk length.

    Returns:
        The chunks produced by the splitter, or None (after logging an error)
        when `file_path` is not an existing file.
    """
    # Nothing to load: report it and let the caller decide what to do
    source_is_missing = not os.path.isfile(file_path)
    if source_is_missing:
        logging.error(f"The file '{file_path}' does not exist.")
        return None

    # The loader output is treated as the list of pages of the document
    pages = PyPDFLoader(file_path).load()
    logging.info(f"The document has been loaded successfully. Total number of pages: {len(pages)}.")

    # Chunk length is measured with the tokenizer, not in raw characters
    length_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": int(chunk_size * 0.1),  # 10% overlap between chunks
        "add_start_index": True,
        "strip_whitespace": True,
    }
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        length_tokenizer, **splitter_options
    )

    chunks = text_splitter.split_documents(pages)
    logging.info(f"The document has been split into {len(chunks)} chunks.")

    return chunks

In [5]:
# Function to generate embeddings for the document chunks
def generate_embeddings(chunks: list):
    """
    Generate embeddings for the given document chunks and store them using FAISS (uses the nearest neighbor search algorithm).

    The embedding model named by EMBEDDING_MODEL_NAME runs on CPU with
    multi-process encoding and normalized output vectors; the resulting
    store is built with the cosine distance strategy.

    Args:
        chunks (list): List of document chunks to generate embeddings for.
            Must contain at least one chunk.

    Returns:
        FAISS index containing the document embeddings.

    Raises:
        ValueError: If `chunks` is empty or None. Failing here avoids loading
            the embedding model only to hit an opaque error while the index
            is being built from zero vectors.
    """
    if not chunks:
        raise ValueError("Cannot generate embeddings: no document chunks were provided.")

    # NOTE(review): the embedding model is stored inside the returned vector
    # store, so `multi_process=True` also applies to later query embedding
    # (the run logs show a worker pool starting at retrieval time) - confirm
    # that this is intended.
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": "cpu"},  # Use CPU for embeddings
        encode_kwargs={"normalize_embeddings": True}
    )
    logging.info(f"Embedding model '{EMBEDDING_MODEL_NAME}' loaded successfully.")

    knowledge_vector_database = FAISS.from_documents(
        chunks, embedding_model, distance_strategy=DistanceStrategy.COSINE
    )
    logging.info("Embeddings generated successfully.")

    return knowledge_vector_database

In [6]:
# Function to save the entire knowledge vector database to a file
def save_knowledge_vector_database(knowledge_vector_database, file_path):
    """
    Pickle the knowledge vector database to `file_path`.

    The parent directory of `file_path` is created when it does not exist,
    so the function works without the caller preparing the folder first.

    Args:
        knowledge_vector_database: The object to persist (any picklable object).
        file_path: Destination file; an existing file is overwritten.

    Returns:
        None
    """
    # A bare file name has no parent component; os.makedirs("") would fail
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(knowledge_vector_database, f)
    logging.info(f"Knowledge vector database saved to {file_path}")

In [7]:
# Make sure the output folder for the knowledge vector database is present
os.makedirs(os.path.dirname(FAISS_INDEX_PATH), exist_ok=True)

# Split the PDF, then embed and persist the chunks; stop if splitting failed
chunks = split_document_into_chunks(PDF_FILE_PATH, chunk_size=256)
if chunks is None:
    logging.error("Failed to split the document into chunks.")
else:
    # Embed every chunk into a vector store
    knowledge_vector_database = generate_embeddings(chunks)
    # Write the whole vector store to disk for the retrieval step
    save_knowledge_vector_database(knowledge_vector_database, FAISS_INDEX_PATH)

2024-10-30 07:16:07,500 - INFO - The document has been loaded successfully. Total number of pages: 459.
2024-10-30 07:16:09,268 - INFO - The document has been split into 814 chunks.
2024-10-30 07:16:09,335 - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
2024-10-30 07:16:11,702 - INFO - Embedding model 'thenlper/gte-small' loaded successfully.
2024-10-30 07:16:11,717 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2024-10-30 07:16:11,717 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu
2024-10-30 07:17:04,824 - INFO - Embeddings generated successfully.
2024-10-30 07:17:04,981 - INFO - Knowledge vector database saved to embeddings/knowledge_vector_database.faiss


In [8]:
# Function to initialize the reader model
def initialize_reader_model(model_name: str = READER_MODEL_NAME):
    """
    Initialize the LLM model for text generation.

    Args:
        model_name (str): The name of the model to use for the LLM. Both the
            tokenizer and the model weights are loaded from this name.

    Returns:
        A tuple `(reader_llm, tokenizer)`: the HuggingFace text-generation
        pipeline and the tokenizer it was built with.
    """
    # Load from `model_name` so the argument is honoured and the log line
    # below reports the model that was really loaded.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

    reader_llm = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,  # sampling is on, so answers differ between runs
        temperature=0.7,
        repetition_penalty=1.2,
        return_full_text=False,  # return only the generated continuation, not the prompt
    )
    logging.info(f"Reader LLM model '{model_name}' initialized successfully.")
    return reader_llm, tokenizer

In [9]:
# Function to load the entire knowledge vector database from a file
def load_knowledge_vector_database(file_path):
    """
    Load a pickled knowledge vector database from `file_path`.

    SECURITY: `pickle.load` can execute arbitrary code contained in the file.
    Only load files that this notebook (or another trusted source) produced.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or None (after logging an error) when
        `file_path` is not an existing regular file.
    """
    # isfile rather than exists: a directory at this path must take the
    # logged "missing" branch instead of crashing inside open()
    if not os.path.isfile(file_path):
        logging.error(f"Knowledge vector database file {file_path} does not exist.")
        return None

    with open(file_path, 'rb') as f:
        knowledge_vector_database = pickle.load(f)
    logging.info(f"Knowledge vector database loaded from {file_path}")
    return knowledge_vector_database

In [10]:
# Function to retrieve relevant documents from the knowledge base
def retrieve_relevant_docs(query: str, knowledge_vector_database, k: int = 5):
    """
    Look up the chunks most similar to the query and format them as context.

    Args:
        query (str): The user query.
        knowledge_vector_database: Vector store exposing `similarity_search(query=..., k=...)`.
        k (int): How many documents to ask the store for.

    Returns:
        A tuple `(retrieved_docs, context)`: the documents returned by the
        store, and a single string that lists each document's text under a
        "Document <i>:::" heading (numbering starts at 0).
    """
    logging.info(f"Starting retrieval for query: {query}")
    retrieved_docs = knowledge_vector_database.similarity_search(query=query, k=k)

    # Header first, then one numbered section per retrieved document
    sections = ["\nExtracted documents:\n"]
    for position, document in enumerate(retrieved_docs):
        sections.append(f"Document {position}:::\n{document.page_content}\n")

    return retrieved_docs, "".join(sections)

In [11]:
# Function to generate the final answer using the retrieved documents and LLM
def generate_answer_from_docs(query: str, context: str, reader_llm, tokenizer, max_new_tokens=512):
    """
    Ask the reader LLM to answer the query from the retrieved context.

    Args:
        query (str): The user query.
        context (str): The text of the retrieved documents.
        reader_llm: Callable text generation pipeline; called with the
            rendered prompt, `truncation=True` and `max_new_tokens`.
        tokenizer: Tokenizer whose `apply_chat_template` renders the chat
            messages into a single prompt string.
        max_new_tokens (int): Maximum number of tokens for the generated answer.

    Returns:
        The `generated_text` field of the first result returned by `reader_llm`.
    """
    # System instructions: answer only from the supplied context
    system_text = (
        "Using the information contained in the context,\n"
        "give a comprehensive answer to the question.\n"
        "Respond only to the question asked, response should be concise and relevant to the question.\n"
        "Provide the number of the source document when relevant.\n"
        "If the answer cannot be deduced from the context, do not give an answer."
    )

    # User turn: the leading spaces are part of the prompt text and are kept unchanged
    user_text = (
        "Context:\n"
        f"            {context}\n"
        "            ---\n"
        "            Now here is the question you need to answer:\n"
        "\n"
        f"            Question: {query}"
    )

    chat_messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

    # Render the messages with the model's chat template, ending with the assistant prompt
    prompt_text = tokenizer.apply_chat_template(
        chat_messages, tokenize=False, add_generation_prompt=True
    )

    # Run generation and pick the text of the first candidate
    outputs = reader_llm(prompt_text, truncation=True, max_new_tokens=max_new_tokens)
    return outputs[0]['generated_text']

In [13]:
# Load the knowledge vector database
knowledge_vector_database = load_knowledge_vector_database(FAISS_INDEX_PATH)
if knowledge_vector_database is None:
    logging.error("Failed to load the knowledge vector database.")
    # `exit` is the interactive helper installed by `site`/IPython; inside a
    # Jupyter kernel it requests a shutdown instead of stopping this cell.
    # Raising SystemExit aborts the cell here and keeps exit status 1 when
    # the code is run as a plain script.
    raise SystemExit(1)

# Initialize the reader model
reader_llm, tokenizer = initialize_reader_model()

# Define the query
query = "What is the purpose of this Regulation?"

# Retrieve relevant documents
retrieved_docs, context = retrieve_relevant_docs(query, knowledge_vector_database)

# Generate the answer
answer = generate_answer_from_docs(query, context, reader_llm, tokenizer)
print("Answer:", answer)

2024-10-30 07:21:15,186 - INFO - Knowledge vector database loaded from embeddings/knowledge_vector_database.faiss
2024-10-30 07:21:16,387 - INFO - Reader LLM model 'Qwen/Qwen2.5-1.5B-Instruct' initialized successfully.
2024-10-30 07:21:16,396 - INFO - Starting retrieval for query: What is the purpose of this Regulation?
2024-10-30 07:21:16,397 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2024-10-30 07:21:16,398 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Answer: The purpose of this Regulation includes protecting natural persons, fostering democratic principles, ensuring legal governance, promoting employment growth through innovative applications of artificial intelligence technology, encouraging sustainable practices like environmental conservation, and establishing trustworthiness standards across Europe's digital landscape. Additionally, the regulation aims at enhancing cybersecurity protections during its execution phases such as inspections, prosecutions, safeguarding against breaches, maintaining operational integrity regarding personal privacy rights, managing sensitive governmental matters involving public safety concerns, preserving internal organizational dynamics under national laws compliance requirements, and overseeing international collaborations where EU entities operate abroad via cross-border agreements aimed at upholding regulatory consistency worldwide.
