RAG for Semantic kernel

In [1]:
import pandas as pd
import numpy as np
#For extraction
import fitz  # PyMuPDF
from typing import List
import pickle
#For Embedding 
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
#Vector Indexing
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_pdf_text(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    doc.close()
    return full_text

# Load your Semantic Kernel PDF
pdf_path = "semantic-kernel.pdf"  # update if different
raw_text = extract_pdf_text(pdf_path)

# Quick preview
print(raw_text[:100000])  # Show first 1000 characters

Tell us about your PDF experience.
Introduction to Semantic Kernel
Article • 06/24/2024
Semantic Kernel is a lightweight, open-source development kit that lets you easily build
AI agents and integrate the latest AI models into your C#, Python, or Java codebase. It
serves as an efficient middleware that enables rapid delivery of enterprise-grade
solutions.
Microsoft and other Fortune 500 companies are already leveraging Semantic Kernel
because it’s flexible, modular, and observable. Backed with security enhancing
capabilities like telemetry support, and hooks and filters so you’ll feel confident you’re
delivering responsible AI solutions at scale.
Version 1.0+ support across C#, Python, and Java means it’s reliable, committed to non
breaking changes. Any existing chat-based APIs are easily expanded to support
additional modalities like voice and video.
Semantic Kernel was designed to be future proof, easily connecting your code to the
latest AI models evolving with the technology as it 

In [3]:
# Load and chunk PDF page-wise
def extract_pdf_pagewise(pdf_path: str):
    doc = fitz.open(pdf_path)
    chunks = []

    for page_num, page in enumerate(doc, start=1):
        text = page.get_text().strip()
        if text:  # Avoid blank pages
            chunks.append({
                "page": page_num,
                "text": text
            })

    doc.close()
    return chunks

# Run the function
page_chunks = extract_pdf_pagewise("semantic-kernel.pdf")

# Example output
print(f"Total non-empty pages: {len(page_chunks)}")
print(f"Page 1 preview:\n{page_chunks[0]['text']}")


Total non-empty pages: 740
Page 1 preview:
Tell us about your PDF experience.
Introduction to Semantic Kernel
Article • 06/24/2024
Semantic Kernel is a lightweight, open-source development kit that lets you easily build
AI agents and integrate the latest AI models into your C#, Python, or Java codebase. It
serves as an efficient middleware that enables rapid delivery of enterprise-grade
solutions.
Microsoft and other Fortune 500 companies are already leveraging Semantic Kernel
because it’s flexible, modular, and observable. Backed with security enhancing
capabilities like telemetry support, and hooks and filters so you’ll feel confident you’re
delivering responsible AI solutions at scale.
Version 1.0+ support across C#, Python, and Java means it’s reliable, committed to non
breaking changes. Any existing chat-based APIs are easily expanded to support
additional modalities like voice and video.
Semantic Kernel was designed to be future proof, easily connecting your code to the
latest AI

In [4]:
# Load a free, local embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
# Extract just the text for embedding
texts = [chunk["text"] for chunk in page_chunks]

# Generate embeddings
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Sanity check
print(f"Total embeddings: {len(embeddings)}")
print(f"Shape of one embedding: {embeddings[0].shape}")

Batches: 100%|██████████| 24/24 [00:32<00:00,  1.35s/it]

Total embeddings: 740
Shape of one embedding: (384,)





In [6]:
#Link Embedding to Chunk
for i in range(len(page_chunks)):
    page_chunks[i]["embedding"] = embeddings[i]

In [7]:
# Embeddings must be a 2D float32 numpy array
embedding_matrix = np.array(embeddings).astype('float32')

# Create FAISS index
dimension = embedding_matrix.shape[1]  # typically 384 for MiniLM
index = faiss.IndexFlatL2(dimension)  # L2 = Euclidean Distance

# Add vectors
index.add(embedding_matrix)

print(f"FAISS index has {index.ntotal} vectors.")

FAISS index has 740 vectors.


In [8]:
#Save the Index for Later
faiss.write_index(index, "semantic_kernel.index")

## To load it later
# index = faiss.read_index("semantic_kernel.index")

In [13]:
from sentence_transformers import SentenceTransformer

# Example query
query = "Hi"

# Embed the query
query_embedding = model.encode([query]).astype('float32')

# Search top-k most similar pages
k = 3
distances, indices = index.search(query_embedding, k)

# Display results
for i, idx in enumerate(indices[0]):
    page_info = page_chunks[idx]
    print(f"\n🔹 Match {i+1} — Page {page_info['page']}")
    print(f"Text Preview:\n{page_info['text'][:200]}")

AttributeError: 'GenerativeModel' object has no attribute 'encode'

In [14]:
import google.generativeai as genai

# Replace with your actual key
GEMINI_API_KEY = "AIzaSyDDCT4dPbOM9chf_Uaveg-BDYS_82nfaWs"
genai.configure(api_key=GEMINI_API_KEY)

In [15]:
# Load Gemini model (text-only model)
model = genai.GenerativeModel(model_name="models/gemini-1.5-flash")

In [16]:
# Prepare prompt from top-k results
retrieved_chunks = [page_chunks[idx]["text"] for idx in indices[0]]

# You can trim or merge them
context = "\n\n---\n\n".join(retrieved_chunks)

# Final prompt sent to Gemini
prompt = f"""
Hello! I have a task for you.
 
You're a helpful and honest assistant. Please follow these rules carefully:
1. Stick to the facts provided in the text.
2. If something is not in the text, say "Not mentioned" or "Cannot be determined."
3. Be concise and clear.
4. Do not make up or assume anything.
5. **Do NOT generate or include any mature, violent, or harmful content.**
6. **Avoid any language that could be interpreted as threatening or unsafe.**
7. Be clear, concise, and respectful in your tone
 
Now, based on the following content, please answer questions:
 
Context:
{context}
 
Question: {query}
 
Answer:"""
# Get answer
response = model.generate_content(prompt)
print(response.text)

Not mentioned.  The provided text describes an AI system's setup and a prompt template, but doesn't contain a response to "Hi".



In [5]:
Data = "I am Lakshya Bansal"

word = Data.split()
print(word)
extra = []

for i in word:
    if i != "I":
        extra.append(i)
print(extra)

result = " ".join(word)
print(result)

['I', 'am', 'Lakshya', 'Bansal']
['am', 'Lakshya', 'Bansal']
I am Lakshya Bansal


In [9]:
dic = {
    (1, 2, 3): "Number"
}

print(dic[(1, 2, 3)])

Number


In [None]:
import re

def extract_emails(text):
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(pattern, text)
    return emails

# Example
text = "Contact us at support@test.com or sales@company.org"
print(extract_emails(text)) 
print("hello") 
# ✅ Output: ['support@test.com', 'sales@company.org']


['support@test.com', 'sales@company.org']
hello


: 

In [13]:
def reverse_alternate_words(sentence):
    words = sentence.split()
    for i in range(len(words)):
        if i % 2 == 1:  # reverse alternate words (2nd, 4th, etc.)
            words[i] = words[i][::-1]
    return " ".join(words)

# Example
sentence = "Machine learning is really interesting"
print(reverse_alternate_words(sentence))  
# ✅ Output: "Machine gninrael is yllaer interesting"


Machine gninrael is yllaer interesting
