## **A simple Q&A application over a PDF data source**

### **Installing Required Libraries**

In [None]:
%pip install langchain langchain-google-genai langchain-community faiss-cpu PyPDF2

In [None]:
%pip install --upgrade pip

### **API Key Configuration**

In [3]:
import os
from getpass import getpass

# Secrets must not be hardcoded in a notebook: anyone who can read the file
# (or its version history) can read and abuse them. Any key that was ever
# stored in the notebook this way should be revoked and replaced.
#
# Use the key from the environment when it is already set; otherwise prompt
# for it without echoing the input, so it never ends up in the saved notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

### **Basic RAG**

In [4]:
# Libraries
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
import PyPDF2

In [7]:
# Location of the PDF file used as the knowledge source
path = "/content/document_arera.pdf"

# Open the PDF in binary mode; the extraction stays inside the `with` block
# so the file is still open while the pages are being read
with open(path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Extract the text of every page, terminate each page with a newline,
    # and concatenate the pages in order into a single string
    document = "".join(
        page.extract_text() + "\n" for page in pdf_reader.pages
    )

In [8]:
# Show the extracted text as the cell output to sanity-check the PDF extraction
document

'\nThe Italian Regulatory Authority for Energy, \nNetworks and Environment (Autorità di Rego-\nlazione per Energia Reti e Ambiente - ARERA), \nestablished by Law No. 481 of 14 November 1995 \nand fully operational since 1997, carries out regu-\nlatory and monitoring activities in the sectors of \nelectricity, natural gas, water services, district he-\nating and municipal waste. \nARERA operates in full autonomy within the fra-\nmework of the general policy guidelines formula-\nted by the Italian Government, Parliament and the \nEuropean Union. \nThe Authority is a collegial body, made up of five \nmembers (including the President) chosen from \namong highly qualified professionals.  The procedure for appointing ARERA’s five Board \nmembers requires a broad institutional consensus \nin order to safeguard the principle of independen-\nce. They are appointed by decree of the President \nof the Republic, at the resolution of the Council \nof Ministers, following a proposal both from the \n

In [9]:
# Split the extracted text into chunks that are small enough to be embedded
# and retrieved individually.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Target upper bound, in characters, for each chunk
    length_function=len,  # Measure chunk length as a plain character count
    is_separator_regex=False,  # Treat the separators below as literal strings
    # Separators are tried in order: paragraph break, line break, space, and
    # finally any character. With only "\n\n" in the list, a stretch of text
    # that contains no blank line cannot be split any further and can end up
    # far larger than chunk_size; the finer fallbacks keep chunks in bounds.
    separators=["\n\n", "\n", " ", ""],
)

# Wrap each chunk in a Document object; `texts` is the list that gets indexed
texts = text_splitter.create_documents([document])

# Embedding model used to turn each piece of text into a numeric vector
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Embed every chunk and store the resulting vectors in a FAISS index,
# which is what the similarity search runs against
vector_store = FAISS.from_documents(documents=texts, embedding=embeddings)

In [10]:
# Print the list of chunk Documents produced by the splitter to inspect how the text was cut
print(texts)

[Document(metadata={}, page_content='\nThe Italian Regulatory Authority for Energy, \nNetworks and Environment (Autorità di Rego-\nlazione per Energia Reti e Ambiente - ARERA), \nestablished by Law No. 481 of 14 November 1995 \nand fully operational since 1997, carries out regu-\nlatory and monitoring activities in the sectors of \nelectricity, natural gas, water services, district he-\nating and municipal waste. \nARERA operates in full autonomy within the fra-\nmework of the general policy guidelines formula-\nted by the Italian Government, Parliament and the \nEuropean Union. \nThe Authority is a collegial body, made up of five \nmembers (including the President) chosen from \namong highly qualified professionals.  The procedure for appointing ARERA’s five Board \nmembers requires a broad institutional consensus \nin order to safeguard the principle of independen-\nce. They are appointed by decree of the President \nof the Republic, at the resolution of the Council \nof Ministers, f

In [13]:
# Instruction text sent to the model. The {question} and {context}
# placeholders are substituted later, when the prompt is filled in.
template = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know.\n\n"
    "Question: {question}\n\n"
    "Context: {context}\n\n"
    "Answer (english):"
)

# Build a reusable prompt object from the template text; its {question} and
# {context} placeholders are filled in when the prompt is invoked
prompt = ChatPromptTemplate.from_template(template)

# Configure the Gemini chat model that writes the final answer:
#   model       - the "gemini-1.5-flash" model
#   temperature - 0 reduces randomness in the output; on its own it does not
#                 guarantee that answers are faithful to the context
#   max_tokens  - None, i.e. no explicit cap on the response length is set here
#   timeout     - None, i.e. no explicit request timeout is set here
#   max_retries - retry a failed request up to 2 times
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [17]:
# The user's question, in natural language
question = "What is the goal of ARERA?"

# Fetch the 2 chunks from the index that are most similar to the question
retrieved_docs = vector_store.similarity_search(query=question, k=2)

In [20]:
# Merge the retrieved chunks into one context string, with a blank line
# between consecutive chunks
docs_content = "\n\n".join([chunk.page_content for chunk in retrieved_docs])

# Fill the prompt template's placeholders with the question and the context
prompt_inputs = {"question": question, "context": docs_content}
messages = prompt.invoke(prompt_inputs)

# Send the completed prompt to the chat model and keep its reply
response = llm.invoke(messages)

In [21]:
# Show only the text of the model's reply (the `content` attribute of the response)
response.content

"ARERA's goals include promoting competition and efficiency in public utility services, defining and updating a transparent tariff system, protecting user and consumer interests, and harmonizing the economic and financial objectives of public utilities with social and environmental protection goals.  It also aims to promote infrastructure investments, ensure transparency of service conditions, and increase consumer protection, awareness, and information."