In [141]:
import os
import openai
from getpass import getpass

# Prompt once for the OpenAI key and register the same value both on the
# `openai` module and in the process environment.
os.environ["OPENAI_API_KEY"] = openai.api_key = getpass("Please provide your OpenAI Key: ")

In [142]:
from langchain_community.document_loaders import PyMuPDFLoader

# Parse the scholars' handbook PDF into a list of LangChain documents.
loader = PyMuPDFLoader("data/2025-DOST-SEI-ST-Scholars-Handbook-Final.pdf")
all_documents = loader.load()

In [143]:
# No page limit is applied: `documents` is just another name for the full list.
documents = all_documents

print('Total pages', len(all_documents))
print('# of pages - All documents', len(documents))
print('First page', documents[0].page_content)

Total pages 52
# of pages - All documents 52
First page 


In [144]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the pages into overlapping chunks. Separators are listed from
# coarsest (blank line) to finest (empty string), as the splitter tries them
# in that order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=600,
    chunk_overlap=150,
)
chunks = text_splitter.split_documents(documents)

The handbook pages are now split into overlapping chunks (`chunks`). Next, we set up the embedding model used to vectorize them.

In [145]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorize the chunks for the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

#### Creating a Qdrant VectorStore

Now that we have our document chunks, we need a place to store them alongside their embeddings.

In [146]:
from langchain_community.vectorstores import Qdrant

# Embed every chunk and index it in the "DOST_Handbook" collection of a
# Qdrant instance created with location=":memory:".
qdrant_vector_store = Qdrant.from_documents(
    chunks, embeddings, location=":memory:", collection_name="DOST_Handbook"
)

In [147]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# --- Primary LLM ---
# This cell previously relied on `primary_qa_llm` (and `MultiQueryRetriever`)
# leaking in from kernel state, so it failed with NameError on a fresh kernel.
# NOTE(review): the original model configuration is not visible anywhere in
# this notebook; the model name and temperature below are assumptions —
# confirm them against the run that produced the recorded RAGAS scores.
primary_qa_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# --- Base Retriever ---
# Plain similarity search over the Qdrant store, returning the top 7 chunks.
retriever = qdrant_vector_store.as_retriever(search_kwargs={"k": 7})

# --- Contextual compression ---
# The LLM-based extractor post-processes each retrieved chunk before it is
# passed on to the answer chain.
compressor = LLMChainExtractor.from_llm(primary_qa_llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)

# --- Multi-query expansion ---
# Wrap the compression retriever so the LLM can generate query variations,
# each of which is run through the compression retriever.
advanced_retriever = MultiQueryRetriever.from_llm(
    retriever=compression_retriever,
    llm=primary_qa_llm,
)

In [148]:
# --- Custom Prompt (encourages detailed answers) ---
from langchain.prompts import ChatPromptTemplate

# System-style instructions for the answer chain. `{context}` receives the
# retrieved handbook excerpts and `{input}` the user's question.
# The fallback sentences in instruction 3 are meant to be repeated verbatim by
# the model, so they must be free of typos.
template = """
You are an assistant for DOST-SEI scholars answering questions using the official handbook.

CRITICAL INSTRUCTIONS:
1. Answer ONLY based on the provided context below
2. Do NOT add information not present in the context
3. If you're unsure, say "I need more specific information from the handbook" or "Can you give me more context to your question?"
4. Quote specific sections when possible
Context:
{context}

Question:
{input}
"""
# Build the chat prompt object from the template string above
# (placeholders: {context} and {input}).
prompt = ChatPromptTemplate.from_template(template)



In [149]:
# --- Document Chain ---
from langchain.chains.combine_documents import create_stuff_documents_chain

# Chain that feeds the retrieved documents into `prompt` and asks the LLM
# for the answer.
document_chain = create_stuff_documents_chain(
    llm=primary_qa_llm,
    prompt=prompt,
)

In [150]:
# --- Retrieval Chain ---
from langchain.chains import create_retrieval_chain

# End-to-end RAG chain: retrieve with `advanced_retriever`, then answer with
# `document_chain`. Later cells read the "answer" and "context" keys of its output.
retrieval_chain = create_retrieval_chain(
    retriever=advanced_retriever,
    combine_docs_chain=document_chain,
)

In [151]:
# --- Guardrails Placeholder ---
def guardrails_pre_query(user_input: str) -> str:
    """Screen a user question against a small blocklist before retrieval.

    Args:
        user_input: The raw question typed by the user.

    Returns:
        A fixed refusal message if any blocked term occurs as a substring of
        the lower-cased input; otherwise ``user_input`` unchanged.
    """
    lowered = user_input.lower()
    for term in ("violence", "politics", "religion"):  # Example guardrails
        if term in lowered:
            return "Sorry, I cannot answer that type of question."
    return user_input

def guardrails_post_response(response: str) -> str:
    """Append a verification warning to answers that look uncertain or too short.

    This is a heuristic, not a hallucination detector. An answer is flagged
    when it contains one of the known fallback phrases (matched
    case-insensitively) or when its stripped length is under 20 characters.

    Args:
        response: The answer text produced by the chain.

    Returns:
        ``response`` unchanged, or ``response`` with a warning paragraph
        appended when it was flagged.
    """
    # Besides the generic "I don't know", cover the fallback sentences the
    # prompt instructs the model to use. The second prompt phrase is matched
    # by its tail so minor wording differences at its start still match.
    uncertainty_markers = (
        "i don't know",
        "i don’t know",  # typographic apostrophe variant
        "i need more specific information",
        "more context to your question",
    )
    normalized = response.strip().lower()
    looks_uncertain = any(marker in normalized for marker in uncertainty_markers)
    if looks_uncertain or len(normalized) < 20:
        response += "\n\n⚠️ This answer may be incomplete. Please verify in the official handbook."
    return response


In [152]:
# --- Single Question ---
question = "I have a failed grade in my semester, am I terminated already? This is my first failed grade, I am still in my second year in college"

# Apply pre-query guardrail
safe_query = guardrails_pre_query(question)

# The guardrail hands the question back unchanged when it is allowed and
# returns a refusal message otherwise, so any difference means "blocked".
# Checking for a "Sorry" prefix instead would misclassify legitimate questions
# that happen to start with "Sorry".
if safe_query != question:
    print(safe_query)
else:
    # Invoke the retrieval-augmented chain
    result = retrieval_chain.invoke({"input": safe_query})

    # Apply post-response guardrail
    safe_response = guardrails_post_response(result["answer"])

    # Print formatted output
    print(f"\nQ: {question}\nA: {safe_response}\n{'-'*50}")



Q: I have a failed grade in my semester, am I terminated already? This is my first failed grade, I am still in my second year in college
A: Based on the context provided, if you have one grade of 5.0/F without any previous deficiency, you would be "Continued under Probation." Therefore, you are not terminated yet. However, if you have any previous deficiencies, the situation may differ. Since you mentioned this is your first failed grade, you should be under probation.
--------------------------------------------------


In [153]:
# Load the evaluation test set (data/dost_testset.csv)
import pandas as pd
import nest_asyncio

# Read the evaluation questions and reference answers, then apply the
# nest_asyncio patch for the notebook's event loop.
test_df = pd.read_csv('data/dost_testset.csv')
nest_asyncio.apply()

In [154]:
# Show the loaded evaluation set (a bare last expression is rendered by the notebook).
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What are the three main DOST-SEI undergraduate...,['The S&T Scholarship Programs aim to: a. Prov...,The three main DOST-SEI undergraduate scholars...,simple,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
1,What is the monthly living allowance for DOST-...,['Scholarship Privileges Regular Academic Year...,The monthly living allowance for DOST-SEI scho...,simple,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
2,What happens if a first-year scholar gets 2 gr...,['Academic Deficiency Scholarship Status First...,If a first-year scholar gets 2 grades of 5.0/F...,simple,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
3,How long must scholars render service in the P...,"['After Graduation The scholar, immediately up...",Scholars must render service in the Philippine...,simple,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
4,What are the eligibility criteria for RA 7687 ...,['Eligibility Criteria for the Undergraduate S...,"For RA 7687 scholarship, graduating Grade 12 s...",reasoning,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
5,What is the maximum tuition fee subsidy and wh...,['Tuition and Other School Fees Subsidy Tuitio...,"The maximum tuition fee subsidy is PHP 40,000....",reasoning,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
6,What are the grade requirements for second-yea...,['Second Year ? The scholar must have a semest...,Second-year scholars must have a semestral wei...,multi_context,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
7,What is the difference between deferring schol...,['Deferring the DOST-SEI Undergraduate Scholar...,Deferring the DOST-SEI Undergraduate Scholarsh...,reasoning,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
8,What are the requirements for practical traini...,['The scholar shall undergo 240-hour practical...,Scholars must undergo 240-hour practical train...,multi_context,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True
9,What are the conditions for shifting program o...,['The scholar may be allowed to shift to anoth...,Scholars may shift programs or transfer school...,multi_context,[{'source': '2025-DOST-SEI-ST-Scholars-Handboo...,True


In [155]:
# Plain Python lists of the evaluation questions and their reference answers,
# in row order.
test_questions = test_df["question"].tolist()
test_groundtruths = test_df["ground_truth"].tolist()

In [157]:
# Run every evaluation question through the RAG chain and collect, in order,
# the generated answer and the text of each retrieved document.
answers = []
contexts = []

# A dedicated loop variable is used so the notebook-level `question` defined in
# the single-question cell is not silently overwritten by this loop.
for test_question in test_questions:
    response = retrieval_chain.invoke({"input": test_question})
    answers.append(response["answer"])  # retrieval_chain outputs "answer"
    contexts.append([doc.page_content for doc in response["context"]])

In [158]:
from datasets import Dataset

# Column-oriented view of the run: the i-th entry of every list belongs to the
# i-th evaluation question.
evaluation_columns = {
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_groundtruths,
}
response_dataset = Dataset.from_dict(evaluation_columns)

In [159]:
# Inspect the first evaluation record (question, answer, contexts, ground_truth).
response_dataset[0]

{'question': 'What are the three main DOST-SEI undergraduate scholarship programs?',
 'answer': 'The three main DOST-SEI undergraduate scholarship programs are:\n\na. Merit Scholarship Program (RA 2067)  \nb. RA 7687 Scholarship Program  \nc. RA 10612 (under the Junior Level Science Scholarships)',
 'contexts': ['The following are programs covered by the DOST-SEI Undergraduate Scholarship:\n•\t\nBachelor in Mathematics Education\n•\t\nBachelor in Technology and \nLivelihood Education with \nspecialization in Information and \nCommunications\n•\t\nBachelor of Library and Information \nScience\n•\t\nBS Aeronautical Engineering\n•\t\nBS Aerospace Engineering\n•\t\nBS Agribusiness\n•\t\nBS Agribusiness Economics\n•\t\nBS Agribusiness Management and \nEntrepreneurship',
  'The DOST-SEI Undergraduate Scholarship Programs......................................................3  \na.  Merit Scholarship Program (RA 2067)  \nb.  RA 7687 Scholarship Program  \nc.  RA 10612 (under the Junior Level 

In [160]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# RAGAS metrics to compute, in the order they are passed to `evaluate`.
metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

In [161]:
import nest_asyncio

# Apply the nest_asyncio patch, then score the collected responses with every
# metric in `metrics`.
nest_asyncio.apply()
results = evaluate(dataset=response_dataset, metrics=metrics)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

In [162]:
# Summarise the RAGAS run: mean score per metric, then the per-question table.
import pandas as pd

# Per-question scores as a DataFrame (one column per metric).
results_dict = results.to_pandas()

# Display label -> score column, in the order the summary lists them.
metric_columns = {
    'Faithfulness': 'faithfulness',
    'Answer Relevancy': 'answer_relevancy',
    'Context Recall': 'context_recall',
    'Context Precision': 'context_precision',
    'Answer Correctness': 'answer_correctness',
}
mean_scores = [results_dict[column].mean() for column in metric_columns.values()]

summary_df = pd.DataFrame({'Metric': list(metric_columns), 'Score': mean_scores})
# Round scores to 3 decimal places
summary_df['Score'] = summary_df['Score'].round(3)

divider = "=" * 40

# Summary table first ...
print("RAGAS Evaluation Summary:")
print(divider)
print(summary_df.to_string(index=False))

# ... followed by the full per-question results.
print("\n\nDetailed Results:")
print(divider)
print(results_dict)


RAGAS Evaluation Summary:
            Metric  Score
      Faithfulness  0.754
  Answer Relevancy  0.881
    Context Recall  0.667
 Context Precision  0.721
Answer Correctness  0.656


Detailed Results:
                                          user_input  \
0  What are the three main DOST-SEI undergraduate...   
1  What is the monthly living allowance for DOST-...   
2  What happens if a first-year scholar gets 2 gr...   
3  How long must scholars render service in the P...   
4  What are the eligibility criteria for RA 7687 ...   
5  What is the maximum tuition fee subsidy and wh...   
6  What are the grade requirements for second-yea...   
7  What is the difference between deferring schol...   
8  What are the requirements for practical traini...   
9  What are the conditions for shifting program o...   

                                  retrieved_contexts  \
0  [The following are programs covered by the DOS...   
1  [A monthly stipend will be provided for the du...   
2  [2 Grades 

In [163]:
# Prompt for the Tavily API key and expose it through the process environment.
os.environ["TAVILY_API_KEY"] = getpass("Please provide your Tavily Key: ")

In [164]:
from tavily import TavilyClient

ModuleNotFoundError: No module named 'tavily'