Install all Libraries

In [1]:
!pip install streamlit PyPDF2 python-docx sentence-transformers transformers nltk pyngrok

Collecting streamlit
  Downloading streamlit-1.47.1-py3-none-any.whl.metadata (9.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cud

In [1]:
!pip install pinecone



In [2]:
!ngrok config add-authtoken 30Gyuu4xEdo5IleJmkBXc3ITPFf_59GCGnyngcyJ5AjGm8ABi

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [3]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=27a47b50ad4fb015263615c35216676bc1e9779a5c458ed232d40bdd7ea9d7be
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [24]:
# ==========================================
# AI-Powered Document Search and Summarization System
# ==========================================

# STEP 1: Import Libraries
# ------------------------
# These libraries handle: UI (streamlit), file reading (PyPDF2, docx),
# text preprocessing (re, nltk), embeddings (sentence-transformers),
# summarization (transformers), vector database (pinecone), and scoring (rouge_score).


%%writefile app.py
import streamlit as st
import PyPDF2
from docx import Document
import re
import nltk
nltk.download("punkt") # Download for sentence tokenization
nltk.download("punkt_tab")
from sentence_transformers import SentenceTransformer
import numpy as np
from transformers import pipeline
import pinecone
from pinecone import ServerlessSpec
import os
from transformers import pipeline, AutoTokenizer
from rouge_score import rouge_scorer


# STEP 2: Pinecone Vector Database Setup
# --------------------------------------
# Pinecone is a managed vector database for similarity search.
# Here, we connect to Pinecone using the API key and create (or connect to) an index.
# The index will store our document embeddings for fast search.

# Read the API key from the environment; credentials must never be written into source.
# "PINECONE_API_KEY" is the preferred variable name; the "pinecone.api_key" name this
# script used before is still honoured so existing setups keep working.
# NOTE(review): the key that used to be hardcoded here has been exposed and should be rotated.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or os.environ.get("pinecone.api_key")
if not pinecone_api_key:
    # Fail with a readable message instead of an opaque client error further down.
    st.error(
        "Pinecone API key not found. Set the PINECONE_API_KEY environment variable "
        "before starting the app."
    )
    st.stop()

# Initialize Pinecone client
pc = pinecone.Pinecone(api_key=pinecone_api_key)

# Define index parameters
index_name = "doc-search-demo"
dim = 384  # Must match your embedding model (all-MiniLM-L6-v2 is 384)

# Create index if not exists
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        vector_type="dense",
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Connect to the index
index = pc.Index(index_name)
print("Pinecone index ready!")

# STEP 3: Text Extraction Functions
# ---------------------------------
# These functions read and extract raw text from PDFs, Word docs, or plain text files.

def extract_pdf_text(file):
    """Return the text of every page of a PDF, concatenated in page order.

    A page whose ``extract_text()`` result is falsy (``None`` or empty)
    contributes an empty string.
    """
    pdf = PyPDF2.PdfReader(file)
    return "".join(page.extract_text() or "" for page in pdf.pages)

def extract_docx_text(file):
    """Return the text of all paragraphs of a Word document, one paragraph per line."""
    paragraphs = Document(file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_txt_text(file):
    """Read a plain-text upload and return its content as ``str``.

    Args:
        file: A file-like object whose ``read()`` returns the file content.

    Returns:
        The decoded text. Bytes are decoded as UTF-8; a leading byte-order mark
        is dropped and undecodable bytes become U+FFFD instead of raising, so a
        file saved in another encoding degrades gracefully rather than crashing the app.
    """
    raw = file.read()
    if isinstance(raw, str):
        # Already text (e.g. a file opened in text mode) — nothing to decode.
        return raw
    # "utf-8-sig" behaves like UTF-8 but strips a BOM if one is present.
    return raw.decode("utf-8-sig", errors="replace")

# STEP 4: Text Cleaning and Chunking
# ----------------------------------
# Cleans extracted text (removing weird/control characters).
# Splits text into overlapping "chunks" of a few sentences — helps with retrieval.

def clean_text(text):
    """Normalize extracted text to single-spaced form.

    Every run of control characters (U+0000–U+001F, U+007F–U+009F) and/or
    whitespace is replaced by one space, and leading/trailing whitespace is removed.
    """
    # One pass: a control character is treated exactly like whitespace, so a mixed
    # run of the two collapses to a single space.
    collapsed = re.sub(r'[\x00-\x1f\x7f-\x9f\s]+', ' ', text)
    return collapsed.strip()

def chunk_by_sentences(text, sentences_per_chunk=3, overlap=1):
    """Split text into overlapping chunks of whole sentences.

    Args:
        text: The text to split; sentences are found with ``nltk.sent_tokenize``.
        sentences_per_chunk: Maximum number of sentences in one chunk (>= 1).
        overlap: Number of sentences shared by consecutive chunks
            (0 <= overlap < sentences_per_chunk).

    Returns:
        A list of chunk strings, each the chunk's sentences joined by a space.
        Empty when the text contains no sentences.

    Raises:
        ValueError: If the window parameters would make the window stand still
            or move backwards (previously this looped forever).
    """
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be at least 1")
    if not 0 <= overlap < sentences_per_chunk:
        raise ValueError("overlap must satisfy 0 <= overlap < sentences_per_chunk")

    sentences = nltk.sent_tokenize(text)
    step = sentences_per_chunk - overlap  # Overlapping chunks improve coverage for search
    chunks = []
    for start in range(0, len(sentences), step):
        chunks.append(" ".join(sentences[start:start + sentences_per_chunk]))
        # Once a window has reached the last sentence, any further window would only
        # repeat sentences already contained in this chunk — stop here.
        if start + sentences_per_chunk >= len(sentences):
            break
    return chunks

# STEP 5: Prepare Chunks for Pinecone Storage
# -------------------------------------------
# Converts embeddings and chunk texts into objects Pinecone can store and search.

def convert_embeddings_to_objects(
    embeddings,
    chunk_texts,
    document_id="document1",
    document_title="Untitled",
    document_url=None,
    created_at=None,
    document_type="pdf"
):
    """
    Convert embeddings and chunk texts to Pinecone upsert-ready objects.

    Args:
        embeddings: Iterable of vectors, one per chunk (NumPy rows or plain sequences).
        chunk_texts: Chunk strings, indexable in the same order as ``embeddings``.
        document_id: Prefix of every vector id (``"<document_id>#chunk<N>"``).
        document_title, document_url, created_at, document_type: Document-level
            attributes copied into each vector's metadata when they are not ``None``.

    Returns:
        A list of dicts with ``id``, ``values`` and ``metadata`` keys. The metadata
        holds ``chunk_number`` (1-based), ``chunk_index`` (0-based), ``chunk_text``
        and the document-level attributes.
    """
    # Document-level attributes are identical for every chunk; build them once.
    # None values are left out rather than stored as nulls.
    document_metadata = {
        key: value
        for key, value in (
            ("document_id", document_id),
            ("document_title", document_title),
            ("document_url", document_url),
            ("created_at", created_at),
            ("document_type", document_type),
        )
        if value is not None
    }

    result = []
    for i, vector in enumerate(embeddings):
        chunk_metadata = {
            "chunk_number": i + 1,  # 1-based, for human reference
            "chunk_index": i,  # 0-based position in the chunk list, used to score retrieval
            "chunk_text": chunk_texts[i],  # The actual text of the chunk
            **document_metadata,
        }
        # NumPy rows expose tolist(); plain sequences are copied into a list.
        values = vector.tolist() if hasattr(vector, "tolist") else list(vector)
        result.append({
            "id": f"{document_id}#chunk{i+1}",
            "values": values,
            "metadata": chunk_metadata,
        })
    return result

# STEP 6: Summarization Function
# ------------------------------
# Uses a small T5 model to summarize the top retrieved document chunks.
# Supports three modes: ratio-based, fixed-length, and chunkwise summarization.

def _summarize_text(summarizer, text, max_length, min_length):
    """Run one summarization call with sampling disabled and return the summary string."""
    output = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Passed explicitly as None on every call so that max_length is the only
        # length limit this code supplies.
        max_new_tokens=None
    )
    return output[0]['summary_text']


def summarize_chunks(
    top_chunks,         # List of strings, top-k chunks from vector search
    mode="ratio",       # "ratio", "fixed", or "chunkwise"
    ratio=0.4,          # For "ratio" mode: summary will be ~40% of input tokens
    max_length_cap=60,  # For "ratio" mode: never exceed this max_length
    fixed_max=35,       # For "fixed" mode: max_length
    fixed_min=15,       # For "fixed" mode: min_length
    chunkwise_max=35,   # For "chunkwise" mode: per-chunk max_length
    chunkwise_min=15,   # For "chunkwise" mode: per-chunk min_length
    super_summary=False # For "chunkwise": do a second summary pass?
):
    """Summarize retrieved chunks with the ``t5-small`` summarization pipeline.

    Modes:
        ratio:     summarize the joined chunks; ``max_length`` is ``ratio`` times the
                   input token count, capped at ``max_length_cap`` and never below
                   ``fixed_min`` (which is also used as ``min_length``).
        fixed:     summarize the joined chunks with ``fixed_max`` / ``fixed_min``.
        chunkwise: summarize each chunk separately with ``chunkwise_max`` /
                   ``chunkwise_min``; chunks shorter than 10 characters are skipped.
                   With ``super_summary=True`` the joined mini-summaries are
                   summarized once more.

    Returns:
        The summary string (mini-summaries joined by newlines in chunkwise mode
        without a super-summary), or ``None`` if the input is unusable, the mode is
        unknown, the model cannot be loaded, or summarization fails. Diagnostics are
        printed rather than raised.
    """
    # Cheap validation first, so unusable input never pays for a model load.
    if not isinstance(top_chunks, list) or not top_chunks:
        print("⚠️ No top_chunks provided for summarization.")
        return None

    text_to_summarize = " ".join(top_chunks).strip()
    if len(text_to_summarize) < 15:
        print("⚠️ Input too short for summarization. Try a different query or document.")
        return None

    if mode not in ("ratio", "fixed", "chunkwise"):
        print("⚠️ Unknown mode:", mode)
        return None

    # NOTE(review): the pipeline is still built on every call; caching it
    # (e.g. with st.cache_resource) would avoid repeated loads — left as a follow-up.
    try:
        summarizer = pipeline("summarization", model="t5-small")
    except Exception as e:
        print("⚠️ Failed to load summarizer model:", e)
        return None

    try:
        if mode == "ratio":
            # The pipeline carries its own tokenizer; no separate tokenizer load is needed.
            num_tokens = len(summarizer.tokenizer.encode(text_to_summarize))
            max_len = max(fixed_min, min(int(num_tokens * ratio), max_length_cap))
            print(f"Using ratio mode: num_tokens={num_tokens}, max_len={max_len}")
            return _summarize_text(summarizer, text_to_summarize, max_len, fixed_min)

        if mode == "fixed":
            print(f"Using fixed mode: max_length={fixed_max}, min_length={fixed_min}")
            return _summarize_text(summarizer, text_to_summarize, fixed_max, fixed_min)

        # mode == "chunkwise"
        print(f"Using chunkwise mode: per-chunk max_length={chunkwise_max}")
        mini_summaries = []
        for i, chunk in enumerate(top_chunks):
            if len(chunk.strip()) < 10:
                print(f"Chunk {i+1} is too short, skipping.")
                continue
            try:
                mini_summaries.append(
                    _summarize_text(summarizer, chunk, chunkwise_max, chunkwise_min)
                )
            except Exception as mini_e:
                # One failing chunk must not discard the summaries of the others.
                print(f"Failed to summarize chunk {i+1}: {mini_e}")

        if not mini_summaries:
            print("⚠️ All chunks too short or summarization failed.")
            return None
        if not super_summary:
            return "\n".join(mini_summaries)

        try:
            return _summarize_text(
                summarizer, " ".join(mini_summaries), chunkwise_max, chunkwise_min
            )
        except Exception as e:
            # Fall back to the per-chunk summaries if the second pass fails.
            print("Failed on super-summary:", e)
            return "\n".join(mini_summaries)

    except Exception as e:
        print(f"⚠️ Summarization failed: {e}")
        return None

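# STEP 6b: Evaluation Functions
# -----------------------------
# Retrieval metrics (precision@k, recall@k), ROUGE-L scoring, and keyword-based ground-truth lookup.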
def precision_at_k(pred_indices, ground_truth_indices, k):
    """Return the fraction of the top-``k`` predictions that are relevant.

    Args:
        pred_indices: Ranked sequence of predicted chunk indices.
        ground_truth_indices: Iterable of relevant chunk indices.
        k: Cut-off rank; the result is always divided by ``k``, even when fewer
            than ``k`` predictions are available.

    Returns:
        Number of distinct relevant indices among the first ``k`` predictions,
        divided by ``k``; ``0.0`` when ``k`` is not positive (no division by zero).
    """
    if k <= 0:
        return 0.0
    pred_top_k = set(pred_indices[:k])
    gt = set(ground_truth_indices)
    return len(pred_top_k & gt) / k

def recall_at_k(pred_indices, ground_truth_indices, k):
    """Return the share of relevant indices found among the top-``k`` predictions.

    Yields ``0`` when there are no relevant indices at all.
    """
    retrieved = set(pred_indices[:k])
    relevant = set(ground_truth_indices)
    if not relevant:
        return 0
    hits = retrieved.intersection(relevant)
    return len(hits) / len(relevant)

def compute_rouge(pred_summary, ref_summary):
    """Return the ROUGE-L F-measure of a predicted summary against a reference.

    Args:
        pred_summary: The generated summary; may be ``None`` or empty, which is
            what ``summarize_chunks`` produces when summarization fails.
        ref_summary: The reference summary.

    Returns:
        The ROUGE-L F-measure (stemming enabled), or ``0.0`` when either text is
        missing — a failed summary scores zero instead of crashing the evaluation.
    """
    if not pred_summary or not ref_summary:
        return 0.0
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(ref_summary, pred_summary)
    return scores['rougeL'].fmeasure

def find_ground_truth_chunks(chunks, keywords):
    """Return the indices of the chunks that contain at least one keyword.

    Matching is a case-insensitive substring test; indices are returned in
    ascending order.
    """
    return [
        position
        for position, chunk in enumerate(chunks)
        if any(keyword.lower() in chunk.lower() for keyword in keywords)
    ]


# STEP 7: Streamlit App UI Logic
# ------------------------------
# The rest of the code is the app's workflow:
# 1. User uploads a file.
# 2. Text is extracted, cleaned, chunked, embedded, and stored in Pinecone.
# 3. User types a question. The app searches for the most relevant chunks and summarizes the answer.
# 4. The app can also evaluate retrieval performance.

# Main page UI header
st.title("AI-Powered Document Search and Summarization System")
st.write("Upload a document (PDF, DOCX, or TXT):")

# File Uploader (Streamlit)
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "docx", "txt"])
top_chunks = []  # Filled by the query section further down


# File extraction, cleaning, chunking, embedding, upsert
if uploaded_file:
    # Pick the extractor from the file extension.
    extractors = {
        "pdf": extract_pdf_text,
        "docx": extract_docx_text,
        "txt": extract_txt_text,
    }
    filetype = uploaded_file.name.split('.')[-1].lower()
    extractor = extractors.get(filetype)
    if extractor is None:
        st.warning("Unsupported file type!")
        text = ""
    else:
        text = extractor(uploaded_file)

    cleaned_text = clean_text(text) if text else ""
    chunks = (
        chunk_by_sentences(cleaned_text, sentences_per_chunk=3, overlap=1)
        if cleaned_text
        else []
    )

    # Checking the chunk list (not just the raw text) also covers files that contain
    # only whitespace/control characters, which used to fail on chunks[0] below.
    if chunks:
        st.success("Text extracted!")
        st.write(f"First 500 characters:\n\n{cleaned_text[:500]}")

        # Limit the number of chunks
        max_chunks = 100
        chunks = chunks[:max_chunks]

        st.write(f"Total chunks after limiting: {len(chunks)}")
        st.write("First chunk:", chunks[0])

        # Streamlit re-runs this script on every interaction. Keep the embedding model
        # in session state so it is loaded once per session, not once per rerun.
        if "model" in st.session_state:
            model = st.session_state["model"]
        else:
            model = SentenceTransformer("all-MiniLM-L6-v2")
            st.session_state["model"] = model

        # Identify the upload so the same file is embedded and upserted only once.
        file_signature = (
            getattr(uploaded_file, "file_id", None),
            uploaded_file.name,
            uploaded_file.size,
        )
        if st.session_state.get("indexed_file") != file_signature:
            # Embed chunks using a MiniLM model
            embeddings = model.encode(chunks, batch_size=4, show_progress_bar=True)
            st.write("Embedding shape:", embeddings.shape)

            # Pinecone upsert
            # Convert to Pinecone Vector Objects
            vectors = convert_embeddings_to_objects(
                embeddings,        # shape: (num_chunks, embedding_dim)
                chunks,            # list of chunk texts, length=num_chunks
                document_id="document1",             # Use a unique ID per doc!
                document_title="Introduction to Vector Databases",
                document_url="https://example.com/docs/document1",
                created_at="2024-01-15",
                document_type="tutorial"
            )

            # Upsert all vectors at once (Pinecone expects a list)
            # NOTE(review): ids are always "document1#chunkN", so vectors from an earlier,
            # longer document are not removed when a shorter one is uploaded.
            index.upsert(
                namespace="example-namespace",
                vectors=vectors
            )
            st.session_state["indexed_file"] = file_signature

        # Save for later use
        st.session_state["chunks"] = chunks

    else:
        st.error("No text could be extracted from this file.")

# ------ Query & SUMMARIZATION------
# Offered only once a document has been embedded and stored in this session.
if "model" in st.session_state and "chunks" in st.session_state:
    query = st.text_input("Enter your question:")

    if query:
        model = st.session_state["model"]
        chunks = st.session_state["chunks"]

        query_embedding = model.encode([query])[0].tolist()
        filtered_results = index.query(
            namespace="example-namespace",   # Match your upsert namespace!
            vector=query_embedding,
            top_k=3,
            include_metadata=True,
            include_values=False   # Only the metadata is read below; skip the vectors
        )

        for result in filtered_results.matches[:3]:  # Take top 3 results
            metadata = result.metadata or {}

            # Always prefer 'chunk_text' in metadata.
            chunk_text = metadata.get("chunk_text")
            if chunk_text:
                top_chunks.append(chunk_text)
                continue

            # Fallback for missing text: look the chunk up by position. The upsert code
            # stores a 1-based "chunk_number"; a 0-based "chunk_index" is honoured too.
            if metadata.get("chunk_index") is not None:
                chunk_index = int(metadata["chunk_index"])
            elif metadata.get("chunk_number") is not None:
                chunk_index = int(metadata["chunk_number"]) - 1
            else:
                chunk_index = None

            if chunk_index is None:
                top_chunks.append("[No text available]")
            elif 0 <= chunk_index < len(chunks):
                top_chunks.append(chunks[chunk_index])
            else:
                top_chunks.append("[Warning: chunk_index out of range]")

        # Display for debugging or pass to your summarizer:
        st.subheader("Top results:")
        for i, chunk in enumerate(top_chunks, 1):
            st.write(f"Chunk {i}:")
            st.write(chunk)

        # Summarize retrieved chunks if button is pressed
        if top_chunks:
            st.subheader("Summary of Top Results")

            mode = st.selectbox("Summarization mode", ["ratio", "fixed", "chunkwise"], key="summary_mode")
            if st.button("Summarize Top Results"):
                summary = summarize_chunks(top_chunks, mode=mode)
                if summary:
                    st.success("Summary generated!")
                    st.write(summary)
                else:
                    st.error("Failed to generate summary. Try different settings or input.")
        else:
            st.info("No top results available for summarization.")

# ==========================================
# STEP 8: EVALUATION SECTION
# ==========================================

# This section allows you to test the retrieval and summarization performance
# of your document search system on "standard" or custom test queries.
# It dynamically finds the ground truth answer chunks in your uploaded document
# by keyword search (instead of hardcoded indices).
# It then evaluates how well your system retrieves and summarizes answers.

# 8.1 Define sample evaluation queries and related answer keywords.
#    These represent the questions a user might ask and what "keywords"
#    should appear in a correct answer for scoring.
# Define test queries and keywords for dynamic ground truth extraction.
query_keywords_map = [
    {
        "query": "Who are the authors of this paper?",
        "keywords": ["Ashish Vaswani", "Noam Shazeer"]
    },
    {
        "query": "What is the main contribution?",
        "keywords": ["Transformer", "main contribution"]
    }
]

retrieval_k = 3  # number of chunks to retrieve for each query


def _retrieved_chunk_index(match):
    """Return the 0-based chunk index stored in a match's metadata, or -1 if none is stored."""
    metadata = match.metadata or {}
    if metadata.get("chunk_index") is not None:
        return int(metadata["chunk_index"])
    if metadata.get("chunk_number") is not None:
        return int(metadata["chunk_number"]) - 1  # chunk_number is stored 1-based
    # -1 can never equal a ground-truth index, so an unknown position is never
    # counted as a hit (a default of 0 would wrongly match chunk 0).
    return -1


test_cases = []

# Evaluation needs an indexed document; before the first upload there is nothing in
# session state, and reading it unconditionally would crash the page.
if "chunks" in st.session_state and "model" in st.session_state:
    # 8.2 Dynamically create test cases for the current uploaded document.
    #    For each query, search through all the document chunks and
    #    collect the indices of any chunks containing the relevant keywords.
    #    This makes evaluation adaptive to the actual document loaded.
    chunks = st.session_state["chunks"]   # Use current document's chunk list
    model = st.session_state["model"]     # Same embedding model that indexed the chunks
    for item in query_keywords_map:
        gt_chunks = find_ground_truth_chunks(chunks, item["keywords"])
        # For simplicity, we use a placeholder reference summary.
        # For serious use, reference_summary should be a "gold" summary.
        ref_summary = "Reference summary for: " + item["query"]
        test_cases.append({
            "query": item["query"],
            "ground_truth_chunks": gt_chunks,
            "reference_summary": ref_summary
        })

    # 8.3 If the user presses the "Run Evaluation" button, compute scores
    if st.button("Run Evaluation"):
        rouge_scores = []
        precisions = []
        recalls = []
        for test in test_cases:
            # Encode the test query to get its embedding
            query_embedding = model.encode([test["query"]])[0].tolist()
            # Search for the top-k most relevant chunks using Pinecone
            search_results = index.query(
                namespace="example-namespace",
                vector=query_embedding,
                top_k=retrieval_k,
                include_metadata=True,
                include_values=False  # Only metadata is needed for scoring
            )

            # 0-based indices of the retrieved chunks, for scoring
            pred_indices = [_retrieved_chunk_index(r) for r in search_results.matches]
            # Retrieve the actual chunk texts to summarize
            retrieved_texts = [
                (r.metadata or {}).get("chunk_text", "[No text found]")
                for r in search_results.matches
            ]
            # Run the summarization pipeline; it returns None on failure, which is
            # scored as an empty summary instead of crashing the ROUGE computation.
            pred_summary = summarize_chunks(retrieved_texts, mode="ratio") or ""
            # Calculate Precision, Recall, and ROUGE-L metrics
            p = precision_at_k(pred_indices, test["ground_truth_chunks"], retrieval_k)
            r_ = recall_at_k(pred_indices, test["ground_truth_chunks"], retrieval_k)
            rouge = compute_rouge(pred_summary, test["reference_summary"])
            precisions.append(p)
            recalls.append(r_)
            rouge_scores.append(rouge)
            # Show results for this test query
            st.write(f"**Query:** {test['query']}")
            st.write(f"Precision@{retrieval_k}: {p:.2f}, Recall@{retrieval_k}: {r_:.2f}, ROUGE-L: {rouge:.2f}")
            st.write(f"Predicted Summary: {pred_summary}")
            st.write(f"Reference Summary: {test['reference_summary']}")
            st.write("---")
        # Show overall average metrics
        st.write("\n### Overall Results")
        st.write("Avg Precision@k:", sum(precisions) / len(precisions))
        st.write("Avg Recall@k:", sum(recalls) / len(recalls))
        st.write("Avg ROUGE-L:", sum(rouge_scores) / len(rouge_scores))
else:
    st.info("Upload a document to enable evaluation.")

Overwriting app.py


In [26]:
from pyngrok import ngrok

# One constant for the port, so the Streamlit server and the tunnel cannot disagree.
STREAMLIT_PORT = 8501

# Kill any previous tunnels
ngrok.kill()

# Start Streamlit app in the background, pinned to STREAMLIT_PORT. Without an explicit
# port the server may come up on a different one than the tunnel below points at.
get_ipython().system_raw(f'streamlit run app.py --server.port {STREAMLIT_PORT} &')

# Open an ngrok tunnel to the Streamlit port
public_url = ngrok.connect(STREAMLIT_PORT)
print(f"Streamlit app is live at {public_url}")

Streamlit app is live at NgrokTunnel: "https://557fed6edf26.ngrok-free.app" -> "http://localhost:8501"
