In [1]:
import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch



  from tqdm.autonotebook import tqdm, trange


In [2]:
# 1. Split the document
loader = PyPDFLoader("data/raw/TA-9-2024-0138_EN.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
split_docs = text_splitter.split_documents(documents)


In [3]:
# 2. Generate and store the embeddings using FAISS
model = SentenceTransformer('all-MiniLM-L6-v2')
# Prepare FAISS index
embedding_dim = model.get_sentence_embedding_dimension()  # Get the embedding dimension
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance index for similarity search




In [4]:
# Generate embeddings and add to the FAISS index
embeddings = []
for doc in split_docs:
    embedding = model.encode(doc.page_content)
    embeddings.append(embedding)

embeddings = np.array(embeddings).astype('float32')  # Convert to float32
index.add(embeddings)  # Add embeddings to the FAISS index

In [5]:
# 3. Create a Retriever
def retrieve_documents(query, top_k=3):
    # 1. Create the embedding for the query
    query_embedding = model.encode(query).astype('float32')  # Prepare query embedding

    # 2. Search for the top_k closest embeddings in the FAISS index
    distances, indices = index.search(query_embedding.reshape(1, -1), top_k)

    # 3. Format the results
    results = []
    for i, idx in enumerate(indices[0]):
        if idx != -1:  # Verify if the index is valid
            # Access the content of the Document object
            results.append((split_docs[idx].page_content, distances[0][i]))

    return results


In [10]:
from transformers import pipeline
#4 - Load the question-answering pipeline
qa_pipeline = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')



In [11]:
def generate_answer(context, question):
    # Ajuste o prompt para ser claro e fácil de entender
    input_text = f"You are a helpful assistant. Answer the question based on the context below. Context: {context}\nQuestion: {question}\nAnswer:"

    # Passing the question and context to the QA pipeline
    results = qa_pipeline(question=question, context=context)
    
    # A resposta estará no campo 'answer' e o score na chave 'score'
    answer = results['answer']
    score = results['score']  # Captura o score do modelo
    return answer, score  # Retorna a resposta e o score

In [12]:
# 5- Define the RAG chain
def retrieve_and_generate_answer(question):
    # Step 1: Retrieve the relevant chunks
    retrieved_docs = retrieve_documents(question)
    
    # Debugging: Check the structure of the retrieved documents
    print("Retrieved Documents:", retrieved_docs)  # Check what is returned

    # Join the content of the documents into a single context
    context = " ".join([doc[0] for doc in retrieved_docs])  # Access the first element of each tuple (the content)

    # Step 2: Generate the answer using the context and question
    answer, score = generate_answer(context, question)  # Agora captura o score também

    
    return answer, score  # Retorna a resposta e o score


In [13]:
# 6. Invoke the RAG chain with a sample question
question = "What is the main topic of the article Artificial Inteligence Act about?"
answer, score = retrieve_and_generate_answer(question)

print("Question:", question)
print("Answer:", answer)
print("Score:", score)  # Print the scores for each document

Retrieved Documents: [('6. The Commission shall adopt implementing acts setting out the detailed arrangements \nand the conditions for the evaluations, including the detailed arrangements for involving \nindependent experts, and the procedure for the selection thereof. Those implementing \nacts shall be adopted in accordance with the examination procedure referred to in \nArticle 98(2).\n7. Prior to requesting access to the general-purpose AI model concerned, the AI Office may', 0.8358497), ('carrying out their tasks and activities. They shall neither seek nor take instructions from \nanyone when exercising their tasks under paragraph 3. Each expert shall draw up a \ndeclaration of interests, which shall be made publicly available. The AI Office shall \nestablish systems and procedures to actively manage and prevent potential conflicts of \ninterest.\n5. The implementing act referred to in paragraph 1 shall include provisions on the', 0.8473401), ('Intelligence Act)\n(Text with EEA rel

In [14]:
question = "What is the purpose of this Regulation?"
answer, score = retrieve_and_generate_answer(question)

print("Question:", question)
print("Answer:", answer)
print("Score:", score)  # Print the scores for each document

Retrieved Documents: [('purposes of the effective enforcement of this Regulation:\n(a) any reference to an economic operator under Regulation (EU) 2019/1020 shall be \nunderstood as including all operators identified in Article 2(1) of this Regulation;\n(b) any reference to a product under Regulation (EU) 2019/1020 shall be understood as \nincluding all AI systems falling within the scope of this Regulation.', 0.7531761), ('(b) the effective implementation of this Regulation, in particular for the purposes of \ninspections, investigations or audits; ▌\n(c) public and national security interests;\n(d) the conduct of criminal or administrative proceedings;\n(e) information classified pursuant to Union or national law.\n2. The authorities involved in the application of this Regulation pursuant to paragraph 1 \nshall request only data that is strictly necessary for the assessment of the risk posed by', 0.78608364), ('Whereas:\n(1) The purpose of this Regulation is to improve the functionin

In [15]:
question = "What do I, as a Customer, need to know about Artifical Intelligence?"
answer, score = retrieve_and_generate_answer(question)

print("Question:", question)
print("Answer:", answer)
print("Score:", score)  # Print the scores for each document

Retrieved Documents: [('Providers and deployers of AI systems shall take measures to ensure, to their best extent, a \nsufficient level of AI literacy of their staff and other persons dealing with the operation and use \nof AI systems on their behalf, taking into account their technical knowledge, experience, \neducation and training and the context the AI systems are to be used in, and considering the \npersons or groups of persons on whom the AI systems are to be used.', 1.0787231), ('including by requesting documentation and information, by conducting evaluations, as \nwell as by requesting measures from providers of general-purpose AI models. When \nconducting evaluations, in order to make use of independent expertise, the AI Office \nshould be able to involve independent experts to carry out the evaluations on its behalf. \nCompliance with the obligations should be enforceable, inter alia, through requests to', 1.0811049), ('(20) In order to obtain the greatest benefits from AI sy