In [1]:
# Standard library
import os
import pickle
import re
from typing import List

# Vector indexing
import faiss
# Extraction
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
# LLM client
from groq import Groq
# Embedding
from sentence_transformers import SentenceTransformer
from tqdm import tqdm




In [2]:
# Load the question bank from a plain-text file.
txt_path = "GEN_AI 3.txt"  # update if different
PREVIEW_CHARS = 1000  # how much of the file to echo as a sanity check

# Decode explicitly: the text contains typographic quotes, and the platform
# default encoding (e.g. cp1252 on Windows) would otherwise decide how they are
# read. Assumes the file is UTF-8 -- change the encoding here if it is not.
with open(txt_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Quick preview: show only the first PREVIEW_CHARS characters
print(raw_text[:PREVIEW_CHARS])

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Assessment Details
Type: Proctored certification

Total number of questions: 45

Time limit: 90 minutes

Registration fee: $200

Question types: Multiple choice

Languages: English

Delivery method: Online proctored

Prerequisites: None, but related training highly recommended

Recommended experience: 6+ months of hands-on experience performing the generative AI solutions tasks outlined in the exam guide

Validity period: 2 years

Recertification: Recertification is required every two years to maintain your certified status. To recertify, you must take the current version of the exam. Please review the “Getting Ready for the Exam” section below to prepare for your recertification exam.

Unscored content: Exams may include unscored items to gather statistical information for future use. These items are not identified on the form and do not impact your score. Additio

In [5]:
# Split the raw text into one chunk per question
import re

def extract_txt_queswise(raw_text: str):
    """Split ``raw_text`` into chunks at every separator line.

    A separator is a run of one or more ``+`` characters immediately followed
    by a newline. The separators themselves are dropped.

    Args:
        raw_text: Full document text.

    Returns:
        list[str]: The pieces between separators, in document order. Empty
        pieces (e.g. before a leading separator) are kept.
    """
    separator = re.compile(r"\++\n")
    return separator.split(raw_text)

# Split the document on its "+++" separator lines
all_sections = extract_txt_queswise(raw_text)

# Skip the first three sections and wrap every remaining one in a dict, so that
# more fields can be attached to each chunk later on.
# NOTE(review): the preview output suggests these three sections precede
# question 1 -- confirm if the source file layout changes.
page_chunks = [dict(text=section) for section in all_sections[3:]]

# Report how many chunks were kept and show the first one
print(f"Total non-empty pages: {len(page_chunks)}")
print(f"Page 1 preview:\n{page_chunks[0]}")

Total non-empty pages: 130
Page 1 preview:
{'text': "Number: \n1\n\nSkill Section: \n\n \nQuestion:  \nA Generative Al Engineer has created a RAG application to look up answers to questions about a series of fantasy novels that are being asked on the author’s web forum. The fantasy novel texts are chunked and embedded into a vector store with metadata (page number, chapter number, book title), retrieved with the user’s query, and provided to an LLM for response generation. The Generative AI Engineer used their intuition to pick the chunking strategy and associated configurations but now wants to more methodically choose the best values.\nWhich TWO strategies should the Generative AI Engineer take to optimize their chunking strategy and parameters? (Choose two.)\n\nAnswer: \n3 and 5\n\n\nDifficulty Level ( Easy, Medium, Intense ):\nM\n\nGuideline Time in Seconds (30, 45, 60):\n45\n\nOption 1:\nChange embedding models and compare performance.\n\nOption 2:\nAdd a classifier for user queri

In [6]:
# Sentence-embedding model shared by the chunk-embedding and query-embedding cells
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [7]:
# Extract just the text of every chunk for embedding.
# The encoder must receive plain strings: passing the chunk dicts themselves
# only worked because the library falls back to the first value of a dict.
texts = [chunk["text"] for chunk in page_chunks]

# Generate one embedding vector per chunk, in the same order as page_chunks
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Sanity check: every chunk must have exactly one embedding
assert len(embeddings) == len(texts), "embedding count does not match chunk count"
print(f"Total embeddings: {len(embeddings)}")
print(f"Shape of one embedding: {embeddings[0].shape}")

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Total embeddings: 130
Shape of one embedding: (384,)


In [8]:
# Store each chunk's vector next to its text (row i of embeddings belongs to chunk i)
for position, chunk in enumerate(page_chunks):
    chunk["embedding"] = embeddings[position]

In [9]:
# FAISS works on a 2-D float32 matrix with one row per vector
embedding_matrix = np.array(embeddings, dtype='float32')

# Flat (exhaustive) index using L2 / Euclidean distance; its dimensionality is
# the width of the embedding matrix (384 for the vectors printed above)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)

# Insert all chunk vectors; row order defines the ids returned by search
index.add(embedding_matrix)

print(f"FAISS index has {index.ntotal} vectors.")

FAISS index has 130 vectors.


### Save / load the FAISS index

In [11]:
# Persist the FAISS index so later sessions can reuse it.
INDEX_PATH = "GenAI.index"

# Save the index built above only when no saved copy exists yet. This keeps a
# fresh "Restart & Run All" working on a clean checkout without overwriting an
# index that was saved earlier.
if not os.path.exists(INDEX_PATH):
    faiss.write_index(index, INDEX_PATH)

# Load the saved index
index = faiss.read_index(INDEX_PATH)

# Search results are positions into page_chunks, so a saved index built from a
# different chunking would map hits to the wrong text. Fail loudly instead.
assert index.ntotal == len(page_chunks), (
    f"{INDEX_PATH} holds {index.ntotal} vectors but there are {len(page_chunks)} chunks; "
    "delete the file and re-run to rebuild it."
)

In [32]:
# Example query (pasted as-is from the question source)
query = """A Generative Al Engineer has been asked to design an LLM-based application that accomplishes the following business objective: answer employee HR questions using HR PDF documentation.
Which set of high level tasks should the Generative Al Engineer's system perform?
B
c
Calculate averaged embeddings for each HR document, compare embeddings to user query to find the best document. Pass the best document with the user query into an LLM with a large context window to generate a
response to the employee.
Use an LLM to summarize HR documentation. Provide summaries of documentation and user query into an LLM with a large context window to generate a response to the user.
Split HR documentation into chunks and embed into a vector store. Use the employee question to retrieve best matched chunks of documentation, and use the LLM to generate a response to the employee based upon the
documentation retrieved.
Create an interaction matrix of historical employee questions and HR documentation. use ALS to factorize the matrix and create embeddings. Calculate the embeddings of new queries and use them to find the best HR
documentation. Use an LLM to generate a response to the employee question based upon the documentation retrieved."""

# Embed the query with the same model used for the chunks.
# FAISS expects a 2-D float32 array, hence the one-element list and the cast.
query_embedding = model.encode([query]).astype('float32')

# Search the top-k most similar chunks
k = 3
distances, indices = index.search(query_embedding, k)

# Display a short summary of every match.
# FAISS pads the result with -1 when fewer than k neighbours exist; a negative
# position would silently wrap around to the last chunk, so skip those.
PREVIEW_LEN = 200
for rank, (idx, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    if idx < 0:
        continue
    page_info = page_chunks[idx]
    preview = page_info["text"][:PREVIEW_LEN].replace("\n", " ")
    print(f"Match {rank} | chunk {idx} | distance {distance:.4f}\n{preview}\n")

In [33]:
# Collect the text of the top-k search hits for the prompt.
# FAISS pads the result with -1 when fewer than k neighbours exist; indexing
# page_chunks with -1 would silently pull in the last chunk, so drop those.
retrieved_chunks = [page_chunks[idx]["text"] for idx in indices[0] if idx >= 0]

# Merge the chunks into one context string, separated by a visible rule
context = "\n\n---\n\n".join(retrieved_chunks)
# Prompt template sent to the LLM as the *system* message.
# It consists of static instructions (role, rules, few-shot examples, SOP)
# followed by the retrieved chunks, interpolated through {context} at the end.
# The user's question is not embedded here; it is sent separately as the user
# message when the chat completion is requested.
prompt = f"""
# Overview
You are an AI-powered Q&A assistant designed to answer questions related to Generative AI engineering, RAG (Retrieval-Augmented Generation), Databricks features, and related technical topics.  
Your primary knowledge source is a text-based dataset containing question-answer pairs, explanations, and summaries related to AI engineering concepts.  
You are integrated within a Semantic Kernel framework that retrieves relevant chunks of data using semantic search and passes them to you for reasoning and response generation.

---

## Context
- You operate as part of a Retrieval-Augmented Generation (RAG) system.
- Your retrieved context will come from pre-processed text documents containing multiple-choice questions, answers, explanations, and summaries.
- If the relevant information is not found in the knowledge base, you may use your general knowledge of Databricks, LLMs, and RAG systems to infer an accurate response.
- Your answers must be concise and formatted in an exam-style tone—clear, direct, and to the point.
- You should not cite metadata such as question numbers, difficulty levels, or skill sections in responses.
- When external information is used, it should align with Databricks documentation and modern AI engineering practices.

---

## Instructions
1. Read the user’s question carefully.
2. Review the retrieved text chunks from the knowledge base.
3. Extract the most relevant facts, reasoning steps, and conclusions from the retrieved context.
4. Generate a concise, exam-style answer that directly addresses the user’s query.
5. If the answer is not explicitly found in the provided context:
   - Use general knowledge and reasoning.
   - Reference conceptual guidance consistent with Databricks official documentation and standard RAG methodologies.
6. Avoid unnecessary elaboration or background information unless it directly clarifies the answer.
7. Do not mention metadata fields (e.g., “Skill Section,” “Difficulty,” “Guideline Time”).
8. Never state that the answer was “not found.” Instead, provide the best possible inference based on relevant principles.
9. Maintain factual accuracy and professional tone throughout.

---

## Tools
- Semantic Kernel framework for retrieval and orchestration.
- Vector store for semantic document search.
- Large Language Model (LLM) for reasoning and answer generation.
- Databricks documentation as an external reference for general knowledge fallback.

---

## Examples

### Example 1
**User Input:**  
“What are two methods to optimize the chunking strategy in a RAG pipeline?”

**Expected Output:**  
Use evaluation metrics like recall or NDCG to compare chunking strategies, and apply an LLM-based evaluation to judge response relevance. These approaches provide data-driven and qualitative optimization of chunk parameters.

---

### Example 2
**User Input:**  
“How should a Generative AI Engineer design a RAG application for answering technical regulation queries?”

**Expected Output:**  
Ingest and index documents into vector search, allow the LLM to retrieve relevant content, generate responses, evaluate performance, and deploy using model serving.

---

### Example 3
**User Input:**  
“What Databricks feature can log requests and responses in model serving?”

**Expected Output:**  
Use Inference Tables to automatically capture and log incoming requests and outgoing responses for model monitoring and evaluation.

---

## SOP (Standard Operating Procedure)
1. Receive the user’s query.
2. Retrieve the most relevant context chunks using semantic search.
3. Summarize and synthesize the retrieved information.
4. Generate a precise, exam-style answer using:
   - Retrieved content if relevant.
   - Databricks and LLM general knowledge if missing.
5. Return the answer without metadata or references.
6. Maintain consistency, factual accuracy, and brevity.

---

## Final Notes
- The RAG assistant should emulate a subject-matter expert providing quick, correct, and concise answers.
- Always prioritize correctness, clarity, and conciseness.
- Default to authoritative Databricks or Generative AI best practices when context is insufficient.
- Avoid speculation or verbose explanations unless necessary for clarity.
---


Context:
{context}
  
Answer:
"""

In [34]:
# Read the Groq API key from the environment instead of hard-coding it.
# SECURITY: a real key was previously committed in this cell; treat it as
# leaked and revoke it in the Groq console.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment before running this cell."
    )

# Initialize the Groq client
client = Groq(api_key=GROQ_API_KEY)

# Ask the model: the instructions plus retrieved context go in as the system
# message, the raw question as the user message.
response = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": query},
    ],
)

# Print the model's response
print(response.choices[0].message.content)


Split the HR PDFs into manageable chunks, embed the chunks into a vector store, retrieve the most semantically relevant chunks in response to an employee’s query, and then pass those retrieved chunks along with the query to an LLM for answer generation.


## Pickle the text chunks for the Streamlit app

In [None]:
# Save the chunk texts for use outside the notebook.
# Reuses page_chunks built earlier instead of re-reading the file and
# re-defining the chunking function. The saved list therefore stays in the
# same order as the vectors in the FAISS index, and page_chunks keeps the
# embeddings that were attached to it.
TEXT_CHUNKS_PATH = "text_chunks.pkl"

# Only the text is needed, not the embeddings
text_chunks = [chunk["text"] for chunk in page_chunks]

# Save the text chunks to a file
with open(TEXT_CHUNKS_PATH, "wb") as f:
    pickle.dump(text_chunks, f)

print("Saved text_chunks.pkl successfully.")

Saved text_chunks.pkl successfully.
