<a href="https://colab.research.google.com/github/eduardd76/RAG-naive/blob/main/RAG_Fundamentals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install langchain openai faiss-cpu

In [None]:
!pip install -U langchain-community

In [None]:
!pip install tiktoken

# Import necessary modules

In [10]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# 1. Prepare some example documents

In [11]:
# Three short OSPF design notes, each wrapped in its own Document so that it
# can be embedded and retrieved independently.
documents = [
    Document(page_content=text)
    for text in (
        (
            "OSPF (Open Shortest Path First) is a link-state routing protocol that uses a hierarchical area design. "
            "Area 0 is the backbone, and all other areas should connect to it to ensure optimal routing updates."
        ),
        (
            "When designing a multi-site OSPF deployment, it's recommended to have smaller areas for sites that "
            "are geographically distant. This reduces the size of the link-state database (LSDB) and improves convergence."
        ),
        (
            "Summarization between OSPF areas helps minimize routing table size and reduces unnecessary route advertisements. "
            "Ensure each site has a well-defined IP addressing scheme for effective summarization."
        ),
    )
]

In [12]:
import os

from google.colab import userdata

# Read the OpenAI key stored under the name 'openai_api_key' via google.colab.userdata,
# so the key never has to be written into the notebook itself.
openai_api_key = userdata.get('openai_api_key')

# Also publish it through the environment, in addition to passing it explicitly below.
os.environ['OPENAI_API_KEY'] = openai_api_key

# 2. Create a vector store from documents using OpenAI embeddings and FAISS

In [7]:
# Embedding model used to turn each document into a vector.
# NOTE(review): the recorded run printed a warning that points at this line — check
# whether this class is deprecated in the installed LangChain version.
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
)


  embeddings = OpenAIEmbeddings(openai_api_key = openai_api_key)


In [13]:
# Embed the example documents and index the resulting vectors in a FAISS store.
vectorstore = FAISS.from_documents(
    documents,
    embeddings,
)

# 3. Create a retriever from the vector store

In [14]:
# Similarity-search retriever over the vector store; k=2 asks for the two closest documents.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=2),
)

# 4. Set up the OpenAI LLM and build a RetrievalQA chain

In [15]:
# OpenAI completion model; temperature=0 is used to keep the output as repeatable as possible.
llm = OpenAI(
    temperature=0,
    openai_api_key=openai_api_key,
)

  llm = OpenAI(openai_api_key = openai_api_key, temperature=0)  # Using a deterministic output by setting temperature=0


In [16]:
# Tie the retriever and the LLM together into a single question-answering chain.
# chain_type="stuff" puts all retrieved documents into one prompt context.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
)

# 5. Ask a query and get the result

In [17]:
# Question to answer from the indexed example documents.
query = "What is OSPF?"

# Chain.run() is deprecated in LangChain (the recorded run emitted a warning for it);
# invoke() is the replacement. RetrievalQA takes the question under the "query" key
# and returns the generated answer under the "result" key.
result = qa_chain.invoke({"query": query})["result"]

print("Query:", query)
print("Answer:", result)

  result = qa_chain.run(query)


Query: What is OSPF?
Answer:  OSPF is a link-state routing protocol that uses a hierarchical area design.


**USE CASE 2: Let's use Wikipedia as the source of data**

In [18]:
!pip install  wikipedia-api

Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15384 sha256=c1d0adc230d2e8e66d2ea13a776638c91e36560f09553d5034131ac8f74f9aaf
  Stored in directory: /root/.cache/pip/wheels/0b/0f/39/e8214ec038ccd5aeb8c82b957289f2f3ab2251febeae5c2860
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [19]:
from langchain.document_loaders import WikipediaLoader

In [20]:
!pip install wikipedia

Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [21]:
import wikipedia

In [22]:
# Look up candidate Wikipedia page titles for the topic of interest
# (change the search term to explore a different subject).
search_results = wikipedia.search("OSPF")
print(f"Search results: {search_results}")

Search results: ['Open Shortest Path First', 'IS-IS', 'Link-state routing protocol', 'Link-state advertisement', 'Interior gateway protocol', 'Stub network', 'Administrative distance', 'Routing Information Protocol', 'Bidirectional Forwarding Detection', 'Virtual IP address']


In [23]:
# Use the top-ranked hit as the example page; nothing is selected when the search came back empty.
if search_results:
    page_title = next(iter(search_results))
    print(f"Selected page title: {page_title}")

Selected page title: Open Shortest Path First


# 6. Load a Wikipedia page as a document

In [24]:
# Load from Wikipedia using the title selected from the search results above, instead of
# repeating it as a hard-coded string (which silently ignored a changed search term).
# Note: the recorded run loaded 25 documents for this query, i.e. more than one single page.
loader = WikipediaLoader(page_title)
documents = loader.load()

In [25]:
# Report how many documents the Wikipedia loader returned.
print("Loaded", len(documents), "document(s) from Wikipedia.")

Loaded 25 document(s) from Wikipedia.


In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 7. Chunk the document(s) into smaller pieces

In [27]:
# Splitter that cuts text into pieces of at most 500 characters, with 50 characters
# shared between neighbouring pieces (tune both values as needed).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

In [28]:
# Break the loaded Wikipedia documents into smaller chunks and report how many were produced.
chunked_documents = text_splitter.split_documents(documents)
print("Split documents into", len(chunked_documents), "chunks.")

Split documents into 222 chunks.


# 8. Create a vector store from the chunked documents using OpenAI embeddings and FAISS

In [29]:
import os

from google.colab import userdata

# Same key setup as in the first use case: read the key stored under 'openai_api_key'
# via google.colab.userdata and expose it through the environment as well.
openai_api_key = userdata.get('openai_api_key')
os.environ['OPENAI_API_KEY'] = openai_api_key

In [30]:
# Embedding model used to vectorise the Wikipedia chunks.
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
)

# Embed every chunk and index the vectors in a new FAISS store
# (this replaces the store built for the first use case).
vectorstore = FAISS.from_documents(
    chunked_documents,
    embeddings,
)

# 9. Create a retriever from the vector store

In [31]:
# Similarity-search retriever over the Wikipedia chunks; k=5 asks for the five closest chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

In [32]:
# OpenAI completion model; temperature=0 is used to keep the output as repeatable as possible.
llm = OpenAI(
    temperature=0,
    openai_api_key=openai_api_key,
)

# Question-answering chain that feeds the retrieved chunks to the LLM as one context ("stuff").
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
)

# 10. Ask a query and get the result

In [33]:
# Question to answer from the indexed Wikipedia chunks.
query = "What is OSPF?"

# Chain.run() is deprecated in LangChain; invoke() is the replacement.
# RetrievalQA takes the question under the "query" key and returns the
# generated answer under the "result" key.
result = qa_chain.invoke({"query": query})["result"]

print("Query:", query)
print("Answer:", result)

Query: What is OSPF?
Answer:  OSPF is an interior gateway protocol (IGP) for routing Internet Protocol (IP) packets within a single routing domain, such as an autonomous system. It gathers link state information from available routers and constructs a topology map of the network, which is presented as a routing table to the internet layer for routing packets by their destination IP address. It supports both IPv4 and IPv6 networks and is commonly used in large enterprise networks.


USE CASE 3: Let's use a local PDF as the data source

In [34]:
from google.colab import files
from langchain.document_loaders import PyPDFLoader

In [35]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.2.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.2.0-py3-none-any.whl (298 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m194.6/298.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.7/298.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.2.0


# 11. Upload the PDF from your local computer

In [36]:
# Ask the user to pick a file; the returned mapping is keyed by the uploaded file names.
print("Please upload your PDF file...")
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded (e.g. the dialog was dismissed),
# instead of the opaque IndexError that indexing an empty key list would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a PDF file.")

# Assuming one PDF was uploaded, use the first file name.
pdf_filename = next(iter(uploaded))
print(f"Uploaded file: {pdf_filename}")

Please upload your PDF file...


Saving OSPF.pdf to OSPF.pdf
Uploaded file: OSPF.pdf


# 12. Load the PDF document using PyPDFLoader

In [37]:
# Read the uploaded PDF into LangChain documents and report how many were produced
# (the recorded run produced 4 documents for OSPF.pdf).
loader = PyPDFLoader(pdf_filename)
documents = loader.load()
print("Loaded", len(documents), "document(s) from the PDF.")

Loaded 4 document(s) from the PDF.


# 13. Chunk the document(s) into smaller pieces

In [38]:
# Splitter producing chunks of at most 500 characters with a 50-character overlap;
# adjust chunk_size and chunk_overlap as needed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

# Break the PDF documents into chunks and report how many were produced.
chunked_documents = text_splitter.split_documents(documents)
print("Split documents into", len(chunked_documents), "chunks.")

Split documents into 19 chunks.


# 14. Create a vector store from the chunked documents using OpenAI embeddings and FAISS

In [39]:
# Embedding model used to vectorise the PDF chunks.
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
)

# Embed every chunk and index the vectors in a new FAISS store.
vectorstore = FAISS.from_documents(
    chunked_documents,
    embeddings,
)

# 15. Create a retriever from the vector store

In [40]:
# The retriever is the object in charge of fetching the relevant chunks;
# k=2 asks for the two chunks most similar to the query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=2),
)

# 16. Set up the OpenAI LLM and build a RetrievalQA chain

In [41]:
# OpenAI completion model; temperature=0 is used to keep the output as repeatable as possible.
llm = OpenAI(
    temperature=0,
    openai_api_key=openai_api_key,
)

# from_chain_type is a class method that creates the RetrievalQA chain by uniting retriever + LLM.
# chain_type="stuff" means the retrieved chunks are concatenated into one context.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
)

# Alternative, left disabled: the map-reduce chain type.
# qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="map_reduce", retriever=retriever)
# In the MAP phase the LLM processes each chunk individually to produce an intermediate
# outcome; the REDUCE phase aggregates those outcomes to create the final answer.

# 17. Ask a query and get the result

In [43]:
# Question to answer from the indexed PDF chunks.
query = "What is OSPF?"

# Chain.run() is deprecated in LangChain; invoke() is the replacement.
# RetrievalQA takes the question under the "query" key and returns the
# generated answer under the "result" key.
result = qa_chain.invoke({"query": query})["result"]

print("Query:", query)
print("Answer:", result)

Query: What is OSPF?
Answer:  OSPF is a link-state routing protocol used to dynamically determine the best routing paths in IP networks. It is designed by the IETF and is used to distribute routing information within a single autonomous system. Unlike distance-vector protocols, OSPF relies on a more efficient link-state algorithm and supports larger and more complex networks.
