In [16]:
# General purpose libraries
import logging
import os

# Document loading and splitting
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Tokenizer for the text processing
from transformers import AutoTokenizer

# Embeddings and vector storage
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

import faiss 

In [17]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [18]:
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

In [19]:
# Path to the document 
file_path = "../data/raw/TA-9-2024-0138_EN.pdf" 

# Function to split the document into chunks
def split_document_into_chunks(file_path: str, chunk_size: int, tokenizer_name: str = EMBEDDING_MODEL_NAME):
    """
    Load a document and split it into smaller chunks for processing.

    Args:
        file_path (str): Path to the document file.
        chunk_size (int): The maximum size of each chunk (number of tokens).
        tokenizer_name (str): The name of the tokenizer to use for splitting the document.

    Returns:
        List of split document chunks.
    """
    # Check if the document file exists
    if not os.path.isfile(file_path):
        logging.error(f"The file '{file_path}' does not exist.")
        return None

    # Load the document using PyPDFLoader
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    logging.info(f"The document has been loaded successfully. Total number of pages: {len(pages)}.")

    # Initialize a text splitter
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size * 0.1),  # 10% overlap between chunks
        add_start_index=True, 
        strip_whitespace=True
    )

    chunks = text_splitter.split_documents(pages)
    logging.info(f"The document has been split into {len(chunks)} chunks.")

    return chunks

# Split the document into chunks
chunk_size = 128  
chunks = split_document_into_chunks(file_path, chunk_size)

2024-10-22 09:22:34,898 - INFO - The document has been loaded successfully. Total number of pages: 459.
2024-10-22 09:22:38,792 - INFO - The document has been split into 1391 chunks.


In [20]:
# Function to generate embeddings for the document chunks
def generate_embeddings(chunks):
    """
    Generate embeddings for the given document chunks and store them using FAISS (uses the nearest neighbor search algorithm).
    
    Args:
        chunks (list): List of document chunks to generate embeddings for.
        
    Returns:
        FAISS index containing the document embeddings.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": "cpu"},  # Use CPU for embeddings
        encode_kwargs={"normalize_embeddings": True}
    )
    logging.info(f"Embedding model '{EMBEDDING_MODEL_NAME}' loaded successfully.")

    KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
        chunks, embedding_model, distance_strategy=DistanceStrategy.COSINE
    )
    logging.info("Embeddings generated successfully.")

    return KNOWLEDGE_VECTOR_DATABASE

# Generate embeddings for the document chunks
if chunks is not None:
    KNOWLEDGE_VECTOR_DATABASE = generate_embeddings(chunks)
else:
    logging.error("Chunks not generated. Please check the document splitting process.")

2024-10-22 09:22:38,808 - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
2024-10-22 09:22:40,946 - INFO - Embedding model 'thenlper/gte-small' loaded successfully.
2024-10-22 09:22:40,961 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2024-10-22 09:22:40,961 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu
2024-10-22 09:23:59,172 - INFO - Embeddings generated successfully.


In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

READER_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, device_map="auto", torch_dtype="auto")

# Initialize the pipeline for text generation using the new model
READER_LLM = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,  
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.2,
    return_full_text=False,
)



In [22]:
user_query = "What is the purpose of this Regulation?"
print(f"\nStarting retrieval for {user_query=}...")
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)
print(
    "\n==================================Top document=================================="
)
print(retrieved_docs[0].page_content)
print("==================================Metadata==================================")
print(retrieved_docs[0].metadata)

2024-10-22 09:24:01,521 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2024-10-22 09:24:01,521 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu



Starting retrieval for user_query='What is the purpose of this Regulation?'...

as required by this Regulation.
{'source': '../data/raw/TA-9-2024-0138_EN.pdf', 'page': 74, 'start_index': 1570}


In [23]:
retrieved_docs_text = [
    doc.page_content for doc in retrieved_docs
]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join(
    [f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)]
)

final_prompt = f"""
Context:
{context}

Question: {user_query}
"""

# Redact an answer
generated_text = READER_LLM(final_prompt, truncation=True, max_new_tokens=1000)[0]["generated_text"]

# Split the generated text at "Answer:"
parts = generated_text.split("Answer:")

# Extract the main answer
if len(parts) > 1:
    answer = parts[1].split("\n")[0].strip()
else:
    answer = "No answer found."

# Extract additional information if available
additional_info = "\n".join(parts[1].split("\n")[1:]).strip() if len(parts) > 1 else ""

print("Answer:", answer)
print("Additional Information:", additional_info)

Both `max_new_tokens` (=1000) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: Improve the functioning of the internal market and promote the uptake of human-centric and trustworthy artificial intelligence (AI)
Additional Information: User generated context please try again

To extract "The purpose" part directly related to your query:

**Purpose:** Improve the functioning of the internal market and promote the uptake of human-centric and trustworthy artificial intelligence (AI).

This summary captures what was extracted about the general provisions but focuses specifically on identifying the regulation's primary objective stated at its outset. If you have additional questions regarding these points or need further clarification, feel free to ask! 

### Explanation:

- **Improvement of Internal Market Functioning**: Ensures smooth operation across different sectors without unnecessary barriers.
  
- **Promotion of Uptake of Human-Centric AI Systems**: Encourages adoption of AI technologies tailored towards ethical standards and benefits users' well-being 