<a href="https://colab.research.google.com/github/dietmarja/LLM-Elements/blob/main/RAG/RAG_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To extend the code so that it becomes an example of Retrieval-Augmented Generation (RAG), we need to incorporate a language model that can generate text based on the retrieved documents. We will use LangChain to facilitate this process.

The extended code proceeds in the following steps:

1. Install the necessary packages
2. Import the necessary modules
3. Set up Pinecone and the Hugging Face embeddings
4. Extract text from the PDFs and store it in Pinecone
5. Perform RAG with the help of LangChain

In [None]:
!pip install -q transformers pinecone-client torch langchain langchain-community langchain-huggingface pymupdf sentence-transformers



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.6/315.6 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [None]:


import fitz  # PyMuPDF
from transformers import AutoModel, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer
import pinecone
from pinecone import Pinecone, ServerlessSpec
import torch
from langchain.chains import LLMChain, RetrievalQA
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone


import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pinecone
import torch
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone



In [None]:
# Set up the Pinecone client and (re)create the index used by this notebook.
import os
from pinecone import Pinecone, ServerlessSpec

from google.colab import userdata

# Read the API key from Colab's secret store so that no credential is written
# into the notebook source.
# NOTE: any key that was previously hard-coded in this cell must be treated as
# leaked and should be revoked/rotated in the Pinecone console.
pinecone_api_key = userdata.get('PINECONE_API_KEY')

# Create a Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index that will hold the page embeddings
index_name = "docs-quickstart-index"

# Check if the index exists and delete it if it does, so every run starts
# from an empty index
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("Existing index deleted")

# Create the index
pc.create_index(
    name=index_name,
    dimension=384,  # must equal the length of the embedding vectors that are upserted
    metric="euclidean",  # Pinecone also offers "cosine" and "dotproduct"
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

print("Index created successfully")

# Connect pc to the index
index = pc.Index(index_name)
print("Connected to the index")

Existing index deleted
Index created successfully
Connected to the index


In [None]:
# Load the embedding model (a sentence-transformers checkpoint wrapped in
# HuggingFaceEmbeddings; this wrapper exposes no separate tokenizer object)
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Load the generation model and its tokenizer from the same seq2seq checkpoint
generation_model_name = "t5-small"
generation_model = AutoModelForSeq2SeqLM.from_pretrained(generation_model_name)
generation_tokenizer = AutoTokenizer.from_pretrained(generation_model_name)


In [None]:
# Functions to deal with text and embeddings

# Function for simple embeddings via a Hugging Face model
def embed_text(text):
    """Return the embedding of ``text`` computed by ``embedding_model.embed_query``."""
    vector = embedding_model.embed_query(text)
    return vector

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Collect the text of every page of a PDF.

    Opens ``pdf_path`` with PyMuPDF and returns a list of
    ``(page_num, text)`` tuples in document order, with page numbers
    starting at 0.
    """
    with fitz.open(pdf_path) as pdf:
        return [(idx, pg.get_text()) for idx, pg in enumerate(pdf)]

# Function to fetch document embeddings from Pinecone
def fetch_document_embedding(doc_id):
    """Fetch the stored vector values for ``doc_id`` from the Pinecone index.

    Raises ``KeyError`` if the fetch response holds no entry for ``doc_id``.
    """
    fetched = index.fetch(ids=[doc_id])
    record = fetched['vectors'][doc_id]
    return record['values']

# Function to store document in Pinecone
def store_document(doc_id, text):
    """Embed ``text`` and upsert it into the Pinecone index under ``doc_id``.

    The raw text is attached to the vector as metadata under the ``"text"``
    key so that it can be returned alongside query matches.

    Args:
        doc_id: Identifier of the vector in the index.
        text: The text to embed and store.
    """
    embedding = embed_text(text)
    # Pinecone takes metadata as the third element of each vector record
    # (id, values, metadata); upsert() has no separate `metadata` argument.
    index.upsert(vectors=[(doc_id, embedding, {"text": text})])


In [None]:
# Paths to the two PDF files to index (relative paths, so the files must be
# present in the notebook's working directory)
pdf_path_1 = "Attention_Paper.pdf"
pdf_path_2 = "Lora_Paper.pdf"


In [None]:
# Extract the per-page text from both PDFs; each variable holds whatever list
# extract_text_from_pdf returns for that file
texts_1 = extract_text_from_pdf(pdf_path_1)
texts_2 = extract_text_from_pdf(pdf_path_2)

In [None]:
# Store each page as a separate vector in Pinecone.
# Each item must be a (page_num, text) pair; the vector id combines the
# document label and the page number.
for page_num, text in texts_1:
    doc_id = f"doc_1_page_{page_num}"
    store_document(doc_id, text)

for page_num, text in texts_2:
    doc_id = f"doc_2_page_{page_num}"
    store_document(doc_id, text)

ValueError: too many values to unpack (expected 2)

In [None]:
# NEW

In [6]:
# Install necessary libraries if not already installed
!pip install -q torch transformers sentence-transformers PyMuPDF langchain-community

# Imports
import fitz  # PyMuPDF
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Hugging Face token.
# Read it from the environment instead of embedding a literal credential in
# the notebook. Any token that was previously committed here must be treated
# as leaked and should be revoked in the Hugging Face account settings.
# When HF_TOKEN is unset the value is None.
import os

hf_token = os.environ.get("HF_TOKEN")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of each page of the PDF at ``pdf_path``.

    The result is a list of strings, one per page, in document order.
    """
    with fitz.open(pdf_path) as pdf:
        return [pg.get_text() for pg in pdf]

# Paths to your PDF files (relative paths, so the files must be present in
# the notebook's working directory)
pdf_path_1 = "Attention_Paper.pdf"
pdf_path_2 = "Lora_Paper.pdf"

# Extract text from PDFs (one list of page strings per document)
texts_1 = extract_text_from_pdf(pdf_path_1)
texts_2 = extract_text_from_pdf(pdf_path_2)

# Concatenate all text from each document to form a single string,
# separating consecutive pages with one space
full_text_1 = " ".join(texts_1)
full_text_2 = " ".join(texts_2)

# Load the generation model and tokenizer with token
generation_model_name = "t5-large"  # Use a larger model for better performance
# `token` is the current keyword for passing a Hub access token to
# from_pretrained; it replaces the deprecated `use_auth_token`.
generation_model = AutoModelForSeq2SeqLM.from_pretrained(generation_model_name, token=hf_token)
generation_tokenizer = AutoTokenizer.from_pretrained(generation_model_name, token=hf_token)

# Function to generate a summary for the document
def generate_summary(text, max_length=200, chunk_size=512):
    """Summarize ``text`` with the module-level generation model.

    The text is prefixed with ``"summarize: "``, tokenized with truncation
    to at most ``chunk_size`` tokens, and fed to ``generation_model.generate``
    with ``max_length`` as the generation length limit.

    Returns the first generated sequence, decoded with special tokens
    skipped.
    """
    prompt = f"summarize: {text}"
    encoded = generation_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=chunk_size,
    )
    generated = generation_model.generate(encoded.input_ids, max_length=max_length)
    return generation_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to summarize long documents in chunks
def summarize_long_text(text, chunk_size=512, max_length=200):
    """Summarize a long text piece by piece, then summarize the joined result.

    Args:
        text: The text to summarize.
        chunk_size: Number of *characters* per piece. Pieces are cut by plain
            slicing, so a word may be split across two pieces.
        max_length: Passed to ``generate_summary`` as the generation length
            limit for both the per-piece pass and the final pass.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or contains only
        whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would silently yield no pieces at all.
        raise ValueError("chunk_size must be a positive integer")
    # Nothing to summarize: do not run the model on a bare "summarize: " prompt.
    if not text or not text.strip():
        return ""
    text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    chunk_summaries = [generate_summary(chunk, max_length=max_length) for chunk in text_chunks]
    combined_summary = " ".join(chunk_summaries)
    # Summarize the combined summary for coherence
    return generate_summary(combined_summary, max_length=max_length)

# Generate summaries for each document (each call runs the model once per
# text chunk and once more on the joined chunk summaries)
summary_1 = summarize_long_text(full_text_1)
summary_2 = summarize_long_text(full_text_2)

# Print the summaries, each under a heading naming its paper
print(f"Summary of Attention Paper:\n{summary_1}\n")
print(f"Summary of Lora Paper:\n{summary_2}\n")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Summary of Attention Paper:
transformer is a new simple network architecture based solely on attention mechanisms. it is superior in quality while being more parallelizable and requiring less time to train. transformer can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs.

Summary of Lora Paper:
deploying indepen- dent instances of fine-tuned models, each with 175B parameters is prohibitively expensive. a large-scale, pre-trained language model is usually adapted to multiple down- stream applications. a low rank suffices even when the full rank (i.e., d) is as high as 12,288, making LoRA both storage- and compute-efficient.

