<a href="https://colab.research.google.com/github/charank45/RAG-pipeline/blob/main/RAGPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Model and Tokeniser Setup

Load the pretrained BART model (`facebook/bart-large-cnn`) and its tokeniser from Hugging Face.

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the tokenizer and the conditional-generation model from the
# 'facebook/bart-large-cnn' checkpoint; later cells use them for summarisation.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')



In [None]:
import torch


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Move the model to the selected device. Being the cell's last expression,
# the returned module is also displayed as the cell output.
model.to(device)


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

## Loading the document to summarise

In [None]:
pip install pymupdf



In [None]:
import fitz  # PyMuPDF

# Read the plain text of up to the first 150 pages. The context manager
# closes the PDF once the text has been extracted, and the pages are joined
# in one pass instead of growing a string with repeated +=.
with fitz.open('/content/ai pdf 2 team 2.pdf') as doc:
    text = "".join(page.get_text() for page in doc[:150])


## Text Preprocessing & Chunking

In [None]:
import nltk
# Fetch NLTK's 'punkt' tokenizer data (used for sentence splitting below).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
nltk.data.clear_cache()  # Drop NLTK's cache of already-loaded resources
nltk.download('punkt')  # Re-run the 'punkt' download; reports "already up-to-date" when present


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import nltk
# Fetch the 'punkt_tab' resource as well (presumably what the installed
# NLTK version's sent_tokenize looks for — confirm).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize

# Tokenizing text into sentences
sentences = sent_tokenize(text)

chunks = []
current_chunk = ""

# Greedily pack whole sentences into chunks that stay under BART's
# 1024-token input limit. The length check is made on the exact string that
# would become the chunk (including the joining space).
for sentence in sentences:
    candidate = f"{current_chunk} {sentence}".strip()
    # An empty running chunk always accepts the sentence, even an over-long
    # one (it is truncated at encode time); this avoids emitting "" chunks.
    if not current_chunk or len(tokenizer.encode(candidate)) < 1024:
        current_chunk = candidate
    else:
        chunks.append(current_chunk)
        current_chunk = sentence.strip()

# Append the last chunk, unless the document produced no text at all.
if current_chunk:
    chunks.append(current_chunk)

## Summarize Each Chunk

In [None]:
# Summarise every chunk independently and collect the decoded summaries.
summaries = []
for chunk in chunks:
    # Encode the chunk (truncated to at most 1024 tokens) and move the ids to the model's device.
    inputs = tokenizer.encode(chunk, return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Beam search with 4 beams, generating between 200 and 500 tokens.
    # NOTE(review): min_length=200 also applies to very short chunks — confirm this is intended.
    summary_ids = model.generate(
        inputs,
        max_length=500,
        min_length=200,
        num_beams=4,
        early_stopping=True
    )
    # Only the first returned sequence is decoded.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)


## Meta Summarisation

In [None]:
combined_summary = " ".join(summaries)

# Second-pass ("meta") summarisation.
# The final encode below truncates at 1024 tokens, so a longer combined
# summary would silently lose everything past that point. Condense it first:
# pack its sentences into groups that fit, summarise each group, and repeat
# until the text fits. When it already fits, this loop does not run and the
# result is the same as a single direct pass.
META_MAX_INPUT_TOKENS = 1024
meta_input_text = combined_summary
while len(tokenizer.encode(meta_input_text)) > META_MAX_INPUT_TOKENS:
    sentence_groups = []
    current_group = ""
    for meta_sentence in sent_tokenize(meta_input_text):
        candidate_group = f"{current_group} {meta_sentence}".strip()
        if not current_group or len(tokenizer.encode(candidate_group)) < META_MAX_INPUT_TOKENS:
            current_group = candidate_group
        else:
            sentence_groups.append(current_group)
            current_group = meta_sentence.strip()
    if current_group:
        sentence_groups.append(current_group)

    # A single group cannot be split further; let the final pass truncate it.
    if len(sentence_groups) < 2:
        break

    condensed_parts = []
    for sentence_group in sentence_groups:
        group_ids = tokenizer.encode(
            sentence_group, return_tensors="pt", truncation=True, max_length=META_MAX_INPUT_TOKENS
        ).to(device)
        group_summary_ids = model.generate(group_ids, max_length=150, num_beams=4, early_stopping=True)
        condensed_parts.append(tokenizer.decode(group_summary_ids[0], skip_special_tokens=True))
    meta_input_text = " ".join(condensed_parts)

# Final pass over text that now fits the input limit (truncation kept as a safety net).
input_ids = tokenizer.encode(meta_input_text, return_tensors="pt", truncation=True, max_length=1024).to(device)
final_summary_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
final_summary = tokenizer.decode(final_summary_ids[0], skip_special_tokens=True)


## Final Summary

In [None]:
# Show the summary produced by the meta-summarisation step.
print("Final Summary:\n", final_summary)


Final Summary:
 Artificial intelligence was founded as an academic discipline in 1956. Funding and interest vastly increased after 2012 when deep learning outperformed previous AItechniques. By the early 2020s, billions of dollars were being invested in AI and the field experienced rapid ongoing progress. The goals of AI research are centered around particular goals and the use of particular tools.


## Summarising each page separately

In [None]:
import fitz  # PyMuPDF

In [None]:
## Extract Text with Formatting Info (using PyMuPDF)
def extract_text_with_headings(pdf_path, max_pages=10):
    """Extract per-page text from a PDF, marking likely headings with ``**``.

    A line is treated as a heading when any of its spans uses a font whose
    name contains "bold", or when the average span font size exceeds 12.
    Such lines are wrapped in ``**...**``; lines that are empty after
    stripping are kept as empty strings and never wrapped, so no stray
    ``****`` markers are produced.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        max_pages: Maximum number of leading pages to read.

    Returns:
        A list with one string per page; each string is the page's lines
        joined with newlines.
    """
    all_pages = []

    # The context manager closes the document once extraction is finished.
    with fitz.open(pdf_path) as doc:
        for page in doc[:max_pages]:
            page_lines = []

            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text lines and are skipped.
                for line in block.get("lines", []):
                    spans = line["spans"]
                    line_text = "".join(span["text"] for span in spans).strip()
                    font_sizes = [span["size"] for span in spans]
                    is_bold = any("bold" in span["font"].lower() for span in spans)

                    avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
                    # Assume headings have a large font or are bold.
                    is_heading = is_bold or avg_font_size > 12
                    tag = "**" if is_heading and line_text else ""
                    page_lines.append(f"{tag}{line_text}{tag}")

            all_pages.append("\n".join(page_lines))

    return all_pages

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
from nltk.tokenize import sent_tokenize

In [None]:
# Load the BART tokenizer and model again and rebind `tokenizer`, `model`
# and `device`; this repeats the setup done at the top of the notebook.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Last expression of the cell: moves the model and displays its structure.
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [None]:
##Summarize Each Page Separately
def summarize_text(text):
    """Summarise ``text`` with the notebook-level BART ``model``/``tokenizer``.

    The text is split into sentences, the sentences are greedily packed into
    chunks that stay under 1024 tokens, each chunk is summarised separately,
    and the chunk summaries are joined with single spaces.

    Args:
        text: The text to summarise.

    Returns:
        The joined chunk summaries, or ``""`` when ``text`` yields no
        sentences (the model is then not called at all).
    """
    max_input_tokens = 1024

    chunks = []
    current_chunk = ""
    for sentence in sent_tokenize(text):
        # Measure the exact string that would become the chunk.
        candidate = f"{current_chunk} {sentence}".strip()
        # An empty running chunk always accepts the sentence (an over-long
        # sentence is truncated at encode time), so no "" chunk is emitted.
        if not current_chunk or len(tokenizer.encode(candidate)) < max_input_tokens:
            current_chunk = candidate
        else:
            chunks.append(current_chunk)
            current_chunk = sentence.strip()
    if current_chunk:
        chunks.append(current_chunk)

    summaries = []
    for chunk in chunks:
        inputs = tokenizer.encode(
            chunk, return_tensors="pt", truncation=True, max_length=max_input_tokens
        ).to(device)
        summary_ids = model.generate(inputs, max_length=256, min_length=80, num_beams=4, early_stopping=True)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return " ".join(summaries)


In [None]:
# Extract heading-annotated text for the leading pages of the PDF
# (extract_text_with_headings is called with its default page limit).
pdf_path = "/content/ai pdf 2 team 2.pdf"
pages = extract_text_with_headings(pdf_path)

# Summarise each page on its own and label it with its 1-based page number.
page_summaries = []
for i, page_text in enumerate(pages):
    summary = summarize_text(page_text)
    page_summaries.append(f"📄 **Page {i+1} Summary:**\n{summary}\n")

# Print all page summaries, separated by a newline.
final_output = "\n".join(page_summaries)
print(final_output)

📄 **Page 1 Summary:**
Artificial intelligence is the ability to complete any task performed by a human on an at least equal level. The field has gone through multiple cycles ofoptimism throughout its history. Funding and interest vastly increased after 2012 when deep learning outperformed previous AI techniques. By the early 2020s, billions of dollars were being invested in AI and the field experienced rapid ongoing progress in what has become known as the AI boom.

📄 **Page 2 Summary:**
The general problem of simulating (or creating) intelligence has been broken into subproblems. These consist of specific traits or capabilities that researchers expect an intelligent system to display. The traits described below have received the most attention and cover the scope of AI****research. These include reasoning and problem-solving, as well as regulatory policies to ensure the safety and benefits of the technology.

📄 **Page 3 Summary:**
An "agent" is anything that perceives and takes action

In [None]:
pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
from transformers import AutoTokenizer
import numpy as np

# Load the tokenizer of the BGE-small embedding model; it is used only to
# measure and cut chunks in tokens.
bge_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")

# Token-level chunking
def chunk_text_fixed_token(text, chunk_size=350, stride=50):
    """Split ``text`` into overlapping windows of at most ``chunk_size`` tokens.

    Tokenisation uses the notebook-level ``bge_tokenizer``. Consecutive
    windows start ``chunk_size - stride`` tokens apart, so ``stride`` is the
    number of tokens two neighbouring chunks share.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk (must be > 0).
        stride: Overlap between neighbouring chunks, ``0 <= stride < chunk_size``.

    Returns:
        The decoded chunk strings, in document order.

    Raises:
        ValueError: If ``chunk_size``/``stride`` would not advance the window.
    """
    if chunk_size <= 0 or not 0 <= stride < chunk_size:
        raise ValueError("chunk_size must be positive and 0 <= stride < chunk_size")

    step = chunk_size - stride
    tokens = bge_tokenizer.encode(text, truncation=False)

    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(bge_tokenizer.decode(window, skip_special_tokens=True))
        # Once a window reaches the end of the text, any further window
        # would lie entirely inside this one's overlap; stop here instead of
        # emitting that redundant fragment.
        if start + chunk_size >= len(tokens):
            break
    return chunks

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
from sentence_transformers import SentenceTransformer

# Load the BGE-small sentence-embedding model; the embedding and retrieval
# helpers below use it to embed chunks and queries.
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Generate embeddings
def embed_chunks(chunks):
    """Embed ``chunks`` with the notebook-level ``embed_model``.

    The model is asked for normalized embeddings
    (``normalize_embeddings=True``); its result is returned unchanged.
    """
    return embed_model.encode(chunks, normalize_embeddings=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
import faiss

def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.
            It is converted to a C-contiguous float32 array, the layout
            FAISS expects, so float64 arrays and nested lists are accepted.

    Returns:
        A ``faiss.IndexFlatL2`` with all vectors added.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

In [None]:
def retrieve(query, index, chunks, embed_model, top_k=3):
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: The question text.
        index: Index exposing ``search(vectors, k)`` built over ``chunks``.
        chunks: Chunk strings, in the order they were added to ``index``.
        embed_model: Model exposing ``encode(texts, normalize_embeddings=...)``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings in the order returned by the index.
        Fewer are returned when fewer chunks exist; an empty list when
        there are no chunks or ``top_k <= 0``.
    """
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = embed_model.encode([query], normalize_embeddings=True)
    _, neighbor_ids = index.search(np.array(query_embedding), k)
    # FAISS reports missing neighbours as -1; indexing chunks[-1] would
    # silently return the last chunk, so out-of-range ids are dropped.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]

In [None]:
def generate_answer(context_chunks, question):
    """Generate an answer to ``question`` from the retrieved ``context_chunks``.

    The chunks are joined with spaces and placed after the question in a
    single prompt, which is encoded (truncated to 1024 tokens) and passed to
    the notebook-level ``model`` with 4-beam search; the first generated
    sequence is decoded and returned.
    """
    context = " ".join(context_chunks)
    encoded_prompt = tokenizer.encode(
        f"question: {question} context: {context}",
        return_tensors='pt',
        max_length=1024,
        truncation=True,
    )
    generated = model.generate(
        encoded_prompt.to(device),
        max_length=256,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Build the retrieval index over the combined chunk summaries
# (the original document text could be indexed instead).
raw_chunks = chunk_text_fixed_token(combined_summary)  # or use original text
chunk_embeddings = embed_chunks(raw_chunks)
index = build_faiss_index(np.array(chunk_embeddings))

# Retrieve the chunks closest to the question and generate an answer from them.
user_question = "what are goals of AI?"
retrieved_chunks = retrieve(user_question, index, raw_chunks, embed_model)
answer = generate_answer(retrieved_chunks, user_question)

print("🔍 Answer:", answer)

Token indices sequence length is longer than the specified maximum sequence length for this model (816 > 512). Running this sequence through the model will result in indexing errors


🔍 Answer:  artificial intelligence was founded as an academic discipline in 1956. by the early 2020s, billions of dollars were being invested in ai. The field experienced rapid ongoing progress in what has become known as the ai boom. The goals of ai research are centered around particular goals and the use of particular tools.


In [None]:
# 1. Imports
from transformers import BartTokenizer, BartForConditionalGeneration, AutoTokenizer
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
import nltk; nltk.download('punkt')
import numpy as np
import faiss
import torch

# 2. Load models
# The embedding model and its tokenizer come from the same BGE-small
# checkpoint; the tokenizer is used for token-based chunking.
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
embedding_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
# BART (CNN summarisation checkpoint) generates the answers.
qa_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
qa_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Run generation on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qa_model.to(device)

# 3. Chunking function (token-based)
def chunk_text_fixed_token(text, chunk_size=350, stride=50):
    """Split ``text`` into overlapping windows of at most ``chunk_size`` tokens.

    Tokenisation uses the notebook-level ``embedding_tokenizer``.
    Consecutive windows start ``chunk_size - stride`` tokens apart, so
    ``stride`` is the number of tokens two neighbouring chunks share.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk (must be > 0).
        stride: Overlap between neighbouring chunks, ``0 <= stride < chunk_size``.

    Returns:
        The decoded chunk strings, in document order.

    Raises:
        ValueError: If ``chunk_size``/``stride`` would not advance the window.
    """
    if chunk_size <= 0 or not 0 <= stride < chunk_size:
        raise ValueError("chunk_size must be positive and 0 <= stride < chunk_size")

    step = chunk_size - stride
    tokens = embedding_tokenizer.encode(text, truncation=False)

    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(embedding_tokenizer.decode(window, skip_special_tokens=True))
        # Once a window reaches the end of the text, any further window
        # would lie entirely inside this one's overlap; stop here instead of
        # emitting that redundant fragment.
        if start + chunk_size >= len(tokens):
            break
    return chunks

# 4. Embedding and FAISS setup
def embed_chunks(chunks):
    """Embed ``chunks`` with the notebook-level ``embed_model``.

    The model is asked for normalized embeddings
    (``normalize_embeddings=True``); its result is returned unchanged.
    """
    chunk_vectors = embed_model.encode(chunks, normalize_embeddings=True)
    return chunk_vectors

def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.
            It is converted to a C-contiguous float32 array, the layout
            FAISS expects, so float64 arrays and nested lists are accepted.

    Returns:
        A ``faiss.IndexFlatL2`` with all vectors added.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index

# 5. Retrieval
def retrieve(query, index, chunks, embed_model, top_k=3):
    """Return the chunks whose embeddings are closest to ``query``.

    Also prints the first 60 characters of each returned chunk as a
    debugging aid.

    Args:
        query: The question text.
        index: Index exposing ``search(vectors, k)`` built over ``chunks``.
        chunks: Chunk strings, in the order they were added to ``index``.
        embed_model: Model exposing ``encode(texts, normalize_embeddings=...)``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings in the order returned by the index.
        Fewer are returned when fewer chunks exist; an empty list when
        there are no chunks or ``top_k <= 0``.
    """
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = embed_model.encode([query], normalize_embeddings=True)
    _, neighbor_ids = index.search(np.array(query_embedding), k)
    # FAISS reports missing neighbours as -1; indexing chunks[-1] would
    # silently return the last chunk, so out-of-range ids are dropped.
    results = [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]
    print(f"🔎 Top Chunks for '{query}': {[chunk[:60] for chunk in results]}")  # Debug
    return results

# 6. QA generation
def generate_answer(context_chunks, question):
    """Generate an answer to ``question`` from the retrieved ``context_chunks``.

    The chunks are joined with spaces and placed after the question in a
    single prompt, which is encoded with ``qa_tokenizer`` (truncated to 1024
    tokens) and passed to ``qa_model`` with 4-beam search; the first
    generated sequence is decoded and returned.
    """
    context = " ".join(context_chunks)
    encoded_prompt = qa_tokenizer.encode(
        f"question: {question} context: {context}",
        return_tensors='pt',
        truncation=True,
        max_length=1024,
    )
    generated = qa_model.generate(
        encoded_prompt.to(device),
        max_length=256,
        num_beams=4,
        early_stopping=True,
    )
    answer_text = qa_tokenizer.decode(generated[0], skip_special_tokens=True)
    return answer_text

# 7. Load your PDF and prepare text
import fitz  # PyMuPDF
# The context manager closes the PDF after the text has been read; the pages
# are joined in one pass instead of growing a string with repeated +=.
with fitz.open("/content/ai pdf 2 team 2.pdf") as doc:
    text = "".join(page.get_text() for page in doc[:10])  # or [:150] depending on size

# 8. Sentence split + chunk
# Re-joining the sentences with single spaces rebinds `text` to a one-line
# version of the document before it is cut into token windows.
sentences = sent_tokenize(text)
text = " ".join(sentences)
raw_chunks = chunk_text_fixed_token(text)

# 9. Embed and index
chunk_embeddings = embed_chunks(raw_chunks)
index = build_faiss_index(np.array(chunk_embeddings))

# 10. Ask questions!
# Interactive loop: read a question, retrieve the closest chunks, generate
# and print an answer. Typing "exit" (any case, surrounding spaces ignored)
# ends the loop.
while True:
    question = input("❓ Ask a question (or type 'exit'): ")
    if question.lower().strip() == "exit":
        break
    retrieved = retrieve(question, index, raw_chunks, embed_model)
    answer = generate_answer(retrieved, question)
    print("💬 Answer:", answer)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Token indices sequence length is longer than the specified maximum sequence length for this model (2653 > 512). Running this sequence through the model will result in indexing errors


❓ Ask a question (or type 'exit'): what is gpt
🔎 Top Chunks for 'what is gpt': ['computation in other symbolic programming languages. [ 84 ] ', '] current models and services include gemini ( formerly bard', '##tion [ f ] unless restricted to small domains called " mic']
💬 Answer: Gpt generative pre - trained transformers ( gpt) are large language models that generate text based on the semantic relationships between words in sentences. text - based gpt models are pretrained on a large corpus of text that can be from the internet. current models and services include gemini ( formerly bard ), chatgpt, grok, claude, copilot, and llama.
❓ Ask a question (or type 'exit'): exit


In [None]:
!pip install gradio



In [None]:
import gradio as gr

def toggle_question(choice):
    """Return a Gradio update that shows the target only in "Q&A" mode."""
    show_question = (choice == "Q&A")
    return gr.update(visible=show_question)

In [None]:
def process_pdf(pdf_file, mode, question):
    """Gradio callback: summarise an uploaded PDF or answer a question about it.

    Only the first 10 pages of the PDF are read.

    Args:
        pdf_file: The uploaded file — either a path string or an object
            whose ``.name`` attribute is the path; ``None`` when nothing
            was uploaded.
        mode: ``"Page-by-page summary"`` or ``"Q&A"``.
        question: The user's question; only used in ``"Q&A"`` mode.

    Returns:
        The text to show in the output box: page summaries, an answer, or a
        message starting with "❌" describing what is missing or invalid.
    """
    # Validate the cheap inputs first so that a missing upload, an unknown
    # mode, or an empty question yields a message instead of a traceback.
    if pdf_file is None:
        return "❌ Please upload a PDF file first."
    if mode not in ("Page-by-page summary", "Q&A"):
        return "❌ Invalid mode selected."
    if mode == "Q&A" and not (question or "").strip():
        return "❌ Please enter a question for Q&A mode."

    import fitz
    from nltk.tokenize import sent_tokenize
    import numpy as np  # Used to hand the embeddings to FAISS

    # Accept both a plain path string and a file-like object exposing .name.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Step 1: Read up to 10 pages from the PDF; the context manager closes it.
    with fitz.open(pdf_path) as doc:
        pages = [page.get_text() for page in doc[:10]]

    if mode == "Page-by-page summary":
        summaries = []
        for i, page_text in enumerate(pages):
            if page_text.strip():
                summary = summarize_text(page_text)
            else:
                # Do not send blank pages to the summarisation model.
                summary = "(no extractable text on this page)"
            summaries.append(f"📄 Page {i+1} Summary:\n{summary}\n")
        return "\n".join(summaries)

    # mode == "Q&A"
    full_text = " ".join(pages)
    if not full_text.strip():
        return "❌ No extractable text found in this PDF."

    prepared_text = " ".join(sent_tokenize(full_text))

    chunks = chunk_text_fixed_token(prepared_text)
    embeddings = embed_chunks(chunks)
    index = build_faiss_index(np.array(embeddings))

    retrieved = retrieve(question, index, chunks, embed_model)  # embed_model must be loaded
    answer = generate_answer(retrieved, question)
    return f"💬 Answer:\n{answer}"

In [None]:
# Build the UI: file upload, mode selector, optional question box, run
# button and output box, wired to process_pdf.
with gr.Blocks() as demo:
    gr.Markdown("## 📘 AI PDF Analyzer — Summarize or Ask Questions")

    pdf_input = gr.File(label="📎 Upload a 10-page PDF document", type="filepath") # type was changed from 'file' to 'filepath'

    mode_selector = gr.Radio(["Page-by-page summary", "Q&A"], label="Choose an option", value="Page-by-page summary")

    # The question box starts hidden; toggle_question shows it only in Q&A mode.
    question_input = gr.Textbox(label="❓ Your Question (if Q&A)", visible=False)
    mode_selector.change(fn=toggle_question, inputs=mode_selector, outputs=question_input)

    submit_btn = gr.Button("🔍 Run")
    output_box = gr.Textbox(label="📤 Output", lines=20)

    # Clicking the button passes (file, mode, question) to process_pdf and
    # shows its return value in the output box.
    submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, mode_selector, question_input],
        outputs=output_box
    )

# share=True makes the app reachable through a public URL.
# NOTE(review): anyone with the link can use the app — confirm this is intended.
demo.launch(share=True)  # Use share=True if you want a public link

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ca505d8f9aa5219dd4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


