In [2]:
import torch
from deepeval.evaluate import EvaluationResult

print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)


2.6.0+cu124
True
12.4


In [None]:
from typing import Dict, List, Optional

class GemmaQuestionTemplates:
    def __init__(self):
        self.templates = self._initialize_templates()
    
    def _initialize_templates(self) -> Dict[str, str]:
        return {
            "mcq": self._create_mcq_template(),
            "true_false": self._create_tf_template(),
            "fill_in_blank": self._create_fill_in_blank_template(),
            "open_ended": self._create_open_ended_template(),
            "short_answer": self._create_short_answer_template()
        }
    
    def _create_mcq_template(self) -> str:
        return """You are a teacher at a university creating a Multiple Choice Question for your students learning about {course}.

The question difficulty should be {difficulty}/5, where:
1 - Beginner (no prior knowledge)
2 - Basic (some familiarity)
3 - Intermediate (general understanding)
4 - Advanced (good understanding)
5 - Expert (deep understanding)

Based on this context:
{context}

Create exactly ONE multiple-choice question following these steps:
1. Read and understand the context
2. Extract the key concept based on the difficulty level
3. Create a clear question
4. Generate one correct answer and three plausible wrong answers
5. Write a brief explanation (under 100 words)

Format your response exactly like this:
Question: [question text]
A. [option A]
B. [option B]
C. [option C]
D. [option D]
Answer: [correct letter]
Explanation: [brief explanation]

End your response after the explanation.
"""
    
    def _create_tf_template(self) -> str:
        return """You are a teacher creating a True/False question for your students in {course}.

The question difficulty should be {difficulty}/5, where:
1 - Beginner (no prior knowledge)
2 - Basic (some familiarity)
3 - Intermediate (general understanding)
4 - Advanced (good understanding)
5 - Expert (deep understanding)

Based on this context:
{context}

Create exactly ONE True/False question following these steps:
1. Read and understand the context
2. Identify a key fact that can be presented as true OR create a plausible but false statement
3. Create a clear statement to evaluate as true or false
4. Provide the correct answer
5. Write a brief explanation (under 100 words)

Format your response exactly like this:
Question: [statement to evaluate]
Answer: [True or False]
Explanation: [brief explanation]

End your response after the explanation.
"""
    
    def _create_fill_in_blank_template(self) -> str:
        return """You are a teacher creating a Fill-in-the-blank question for your students in {course}.

The question difficulty should be {difficulty}/5, where:
1 - Beginner (no prior knowledge)
2 - Basic (some familiarity)
3 - Intermediate (general understanding)
4 - Advanced (good understanding)
5 - Expert (deep understanding)

Based on this context:
{context}

Create exactly ONE fill-in-the-blank question following these steps:
1. Read and understand the context
2. Identify an important sentence with a key term that can be blanked out
3. Create the question with _____ for the blank
4. Provide the correct answer
5. Optionally provide 2-3 plausible wrong answers
6. Write a brief explanation (under 100 words)

Format your response exactly like this:
Question: [sentence with _____ for the blank]
Answer: [correct answer]
Possible wrong answers: [wrong answer 1], [wrong answer 2], [wrong answer 3] (optional)
Explanation: [brief explanation]

End your response after the explanation.
"""
    
    def _create_open_ended_template(self) -> str:
        return """You are creating an open-ended question about {course}.
    
    Based on this context:
    {context}
    
    Create ONE open-ended question that:
    1. Promotes critical thinking
    2. Cannot be answered with a simple yes/no
    3. Relates to key concepts in the context
    
    Format your response exactly like this:
    Question: [open-ended question]
    Guidelines: [3-5 brief bullet points about what to include]
    Sample Answer: [brief outline of key points, maximum 150 words]
    
    Do not use any markdown formatting like bold or italics.
    """
    def _create_short_answer_template(self) -> str:
        return """You are creating a short-answer question about {course}.
    
    Based on this context:
    {context}
    
    Create ONE question that can be answered with a single word or very short phrase (1-3 words).
    The question should test specific knowledge from the context.
    
    Format your response exactly like this:
    Question: [question text requiring a short/one-word answer]
    Answer: [the correct short answer]
    Explanation: [brief explanation why this is correct]
    
    Example format:
    Question: Who invented the World Wide Web?
    Answer: Tim Berners-Lee
    Explanation: Tim Berners-Lee invented the World Wide Web in 1989 while working at CERN.
    """
    
    def get_template(self, question_type: str) -> str:
        if question_type not in self.templates:
            raise ValueError(f"Unknown question type: {question_type}. Available types: {list(self.templates.keys())}")
        return self.templates[question_type]
    
    def format_template(self, question_type: str, **kwargs) -> str:
        template = self.get_template(question_type)
        try:
            return template.format(**kwargs)
        except KeyError as e:
            missing_key = str(e).strip("'")
            raise ValueError(f"Missing required parameter: {missing_key} for {question_type} template")
        
    def get_all_template_types(self) -> List[str]:
        return list(self.templates.keys())
    
    def add_custom_template(self, template_type: str, template_text: str) -> None:
        self.templates[template_type] = template_text
        
    def modify_template(self, template_type: str, template_text: str) -> None:
        if template_type not in self.templates:
            raise ValueError(f"Cannot modify unknown template type: {template_type}")
        self.templates[template_type] = template_text

In [3]:
class GemmaPromptManager:
    def __init__(self):
        self.templates = GemmaQuestionTemplates()
        self.max_context_length = 1500  # Default max context length
    
    def set_max_context_length(self, length: int) -> None:
        self.max_context_length = length
    
    def prepare_prompt(self,
                     question_type: str,
                     context: str,
                     course: str,
                     difficulty: int = 3,
                     distractors: Optional[List[str]] = None) -> str:
        # Truncate context if needed
        truncated_context = context[:self.max_context_length]
        
        # Format distractors if provided
        distractor_str = ""
        if distractors and question_type == "mcq":
            distractor_str = ", ".join(distractors[:10])
        
        # Prepare parameters for template
        params = {
            "context": truncated_context,
            "course": course,
            "difficulty": difficulty
        }
        
        # Add distractors if available
        if distractor_str:
            params["distractors"] = distractor_str
            
        # Format the template
        return self.templates.format_template(question_type, **params)
    
    def get_generation_params(self, creative: bool = False) -> Dict:
        if creative:
            return {
                "max_new_tokens": 350,
                "temperature": 0.8,  
                "top_p": 0.92,
                "repetition_penalty": 1.2,
                "do_sample": True
            }
        else:
            return {
                "max_new_tokens": 300,
                "temperature": 0.7, 
                "top_p": 0.9,
                "repetition_penalty": 1.3,
                "no_repeat_ngram_size": 3,
                "do_sample": True
            }

In [4]:
import json
from typing import List, Dict, Any, Optional
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import chromadb
from rouge_score import rouge_scorer
from CSMetadataExtractor import CSMetadataExtractor
import os
import torch
from chromadb.utils import embedding_functions

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
rag_wiki_files_path="rag/en"
files = os.listdir(rag_wiki_files_path)
metadata_extractor=CSMetadataExtractor()

In [6]:
embedding_model="sentence-transformers/LaBSE"
llm_model_name="google/gemma-2-2b-it"
vector_db_path="./rag/chroma_db"


In [7]:
# Initialize NLTK components
lemmatizer=WordNetLemmatizer()
stopwords_en=set(stopwords.words('english'))
stopwords_ro=set(stopwords.words('romanian'))

In [8]:
embedding_model=SentenceTransformer(embedding_model,device="cuda")

In [9]:
chroma_client=chromadb.PersistentClient(path=vector_db_path)
sentence_ef=embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/LaBSE",
  
    )

In [10]:
# create a collection in the chroma client
# set the embedding function to use the SentenceTransformer model
# set the metadata to use cosine distance
# i want to first clean the database and then create a new collection
# delete the collection if it exists
if "overall_database" in chroma_client.list_collections():
    chroma_client.delete_collection("overall_database")
# create a new collection
collection=chroma_client.get_or_create_collection(
    name="overall_database",
    embedding_function=sentence_ef,
    metadata={"hnsw:space": "cosine"})


Functions to initiate the LLM and setup the prompts for Quiz Generation

In [1]:
llm=None
def optimize_model_configuration(quality_preference="balanced", force_gpu=True):
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    global tokenizer, llm
    if torch.cuda.is_available():
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
        print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("CUDA not available, will use CPU")
        force_gpu = False
        
    if quality_preference == "speed":
        print("Optimizing for maximum speed (4-bit quantization)")
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_enable_fp32_cpu_offload=not force_gpu
        )
    elif quality_preference == "balanced":
        print("Using balanced settings (8-bit quantization)")
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=not force_gpu,
            llm_int8_skip_modules=["lm_head"]
        )
    elif quality_preference == "quality":
        print("Optimizing for quality (16-bit precision)")
        quantization_config = None
    else:
        raise ValueError("quality_preference must be 'speed', 'balanced', or 'quality'")
    
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
    
    if force_gpu and torch.cuda.is_available():
        device_map = {"": 0} 
        print("Forcing model to load on GPU")
    else:
        device_map = "auto"
    
    model_kwargs = {
        "device_map": device_map,
        "torch_dtype": torch.float16,
    }
    
    if quantization_config:
        model_kwargs["quantization_config"] = quantization_config
    
    try:
        from transformers.utils import is_flash_attn_available
        if is_flash_attn_available():
            model_kwargs["attn_implementation"] = "flash_attention_2"
            print("Using Flash Attention 2 for faster inference")
    except:
        print("Flash Attention not available, using standard attention")
    
    try:
        llm = AutoModelForCausalLM.from_pretrained(
            "google/gemma-2-2b-it",
            **model_kwargs
        )
        
        device_location = next(llm.parameters()).device
        print(f"Model loaded on: {device_location}")
        
        if 'cuda' not in str(device_location) and torch.cuda.is_available():
            print("Warning: Model loaded on CPU despite CUDA being available")
            
    except Exception as e:
        print(f"Error loading model: {e}")
        if force_gpu:
            print("Falling back to CPU loading")
            return optimize_model_configuration(quality_preference, False)
    
    return {
        "model_size": sum(p.numel() for p in llm.parameters()) / 1e6,
        "model_device": next(llm.parameters()).device,
        "quantization": quality_preference
    }

# hope it works
result = optimize_model_configuration("speed", force_gpu=True)
print(result)

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: NVIDIA GeForce RTX 3080 Laptop GPU
CUDA memory: 8.59 GB
Optimizing for maximum speed (4-bit quantization)
Loading model...
Forcing model to load on GPU
Flash Attention not available, using standard attention


Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.48s/it]


Model loaded on: cuda:0
{'model_size': 1602.203904, 'model_device': device(type='cuda', index=0), 'quantization': 'speed'}


In [12]:
template_manager=GemmaQuestionTemplates()
# get all available template types
template_types=template_manager.get_all_template_types()
print("Available template types:")
print(template_types)
tf_template=template_manager.get_template("true_false")
fill_in_the_blank_template=template_manager.get_template("fill_in_blank")
open_question_template=template_manager.get_template("open_ended")
short_answer=template_manager.get_template("short_answer")


eval_rouge=rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

Available template types:
['mcq', 'true_false', 'fill_in_blank', 'open_ended', 'short_answer']


In [None]:
def preprocess_text(text:str,language:str="en"):
    if language=="en":
        stopwords_set=stopwords_en
    elif language=="ro":
        stopwords_set=stopwords_ro
    else:
        raise ValueError("Language not supported")
    
    tokens=word_tokenize(text)
    tokens=[lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stopwords_set]
    return " ".join(tokens)

In [None]:

def semantic_chunk(text:str,chunk_size:int=512):
    sentences=sent_tokenize(text)
    chunks=[]
    current_chunk=""
    for sentence in sentences:
        if len(current_chunk)+len(sentence)<chunk_size:
            current_chunk+=sentence+" "
        else:
            chunks.append(current_chunk.strip())
            current_chunk=sentence+" "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

In [None]:
def embed_text(text:str,embedding_model:SentenceTransformer):
    return embedding_model.encode(text,show_progress_bar=False)

In [None]:
def add_document(document: Dict[str, Any], chunks: List[str], collection: chromadb.Collection, lang: str = "en"):
    title = document.get("title", "Untitled")
    section = document.get("section", "")
    # create basic metadata
    metadata_base = {
        "title": title,
        "section": section,
        "language": lang
    }
    
    try:
        if "text" in document and metadata_extractor:
            additional_metadata = metadata_extractor.extract_metadata(
                text=document["text"],
                title=title,
                lang=lang 
            )
            
            if isinstance(additional_metadata, dict):
                if "content_type" in additional_metadata:
                    metadata_base["content_type"] = additional_metadata["content_type"]
                if "difficulty_level" in additional_metadata:
                    if isinstance(additional_metadata["difficulty_level"], str):
                        metadata_base["difficulty"] = additional_metadata["difficulty_level"]
                    elif isinstance(additional_metadata["difficulty_level"], dict) and "level" in additional_metadata["difficulty_level"]:
                        metadata_base["difficulty"] = additional_metadata["difficulty_level"]["level"]
    except Exception as e:
        print(f"Error extracting metadata: {e}")
    
    for i, chunk in enumerate(chunks):
        chunk_id = f"{title}_{section}_{i}"
        
        metadata = metadata_base.copy()
        metadata["chunk_id"] = chunk_id
        metadata["chunk_index"] = i
        metadata["total_chunks"] = len(chunks)
        
        # Add to collection
        collection.add(
            documents=[chunk],
            metadatas=[metadata],
            ids=[chunk_id]
        )
        
        print(f"Added chunk {i} of document {title} to collection")
        
 

In [14]:
def load_documents_from_directory(directory: str, language: str = "en"):
    if not os.path.exists(directory):
        raise ValueError(f"Directory {directory} does not exist")
    
    for file_name in os.listdir(directory):
        if file_name.endswith(".json"):
            try:
                with open(os.path.join(directory, file_name), "r", encoding="utf-8", errors="ignore") as f:
                    documents = json.load(f)
                
                for document in documents:
                    if "text" in document and "section" in document and "title" in document:
                        text = document["text"]
                        chunks = semantic_chunk(text)
                        add_document(document, chunks, collection, lang=language)
                    else:
                        print(f"Document in {file_name} does not have the required fields")
            except Exception as e:
                print(f"Error processing file {file_name}: {e}")
        else:
            print(f"File {file_name} is not a JSON file")

In [None]:
 def query_vector_store(query: str, k: int = 5, 
                         filter_metadata: Optional[Dict] = None) -> List[Dict]:
        # Query the collection
        results = collection.query(
            query_texts=[query],
            n_results=k,
            where=filter_metadata
        )
        
        # Format the results
        formatted_results = []
        for i in range(len(results["documents"][0])):
            formatted_results.append({
                "text": results["documents"][0][i],
                "metadata": results["metadatas"][0][i],
                "distance": results["distances"][0][i] if "distances" in results else None,
                "id": results["ids"][0][i]
            })
            
        return formatted_results        
 

In [16]:
def cleanup_memory():
    # the llm leaves memory after each generation -> cleanup, we don't need to keep it
    import gc
    torch.cuda.empty_cache()
    gc.collect()    

In [None]:
def generate_quiz_question(
                         question_type: str, 
                         context: str, 
                         course: str, 
                         difficulty: int = 3,
                         distractors: List[str] = None) -> str:
    import torch
    
    prompt_manager = GemmaPromptManager()
    
    prompt_manager.set_max_context_length(1500)
    
    prompt = prompt_manager.prepare_prompt(
        question_type=question_type,
        context=context,
        course=course,
        difficulty=difficulty,
        distractors=distractors
    )
    
    print(f"Prompt prepared for {question_type} question (length: {len(prompt)} chars)")
    
    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
    print(f"Token count: {input_ids.input_ids.shape[1]} tokens")
    
    # Get generation parameters optimized for Gemma
    generation_params = prompt_manager.get_generation_params(creative=(question_type == "open_ended"))
    
    # Generate with Gemma
    with torch.no_grad():
        outputs = llm.generate(
            **input_ids,
            **generation_params
        )
    
    # Decode only the new tokens
    generated_text = tokenizer.decode(
        outputs[0][input_ids.input_ids.shape[1]:], 
        skip_special_tokens=True
    )
    
    # Clean up the output
    cleaned_text = clean_generated_text(generated_text)
    
    # Validate the output
    if len(cleaned_text.strip()) < 10:
        print("Warning: Generated text is very short or empty!")
        if len(generated_text.strip()) > 10:
            print("Using raw generated text instead.")
            return generated_text.strip()
        else:
            return f"Failed to generate a question for this content. The context might be insufficient."
    
    return cleaned_text

In [None]:
def clean_generated_text(text):
    """Clean up generated text for consistent MCQ formatting"""
    import re
    
    # remove any dashes, quotes, etc.
    text = re.sub(r'^[\s\-"\']+', '', text)
    text = re.sub(r'[\s\-"\']+$', '', text)
    
    # fix common formatting issues
    text = text.replace("A.", "A. ")
    text = text.replace("B.", "B. ")
    text = text.replace("C.", "C. ")
    text = text.replace("D.", "D. ")
    
    text = re.sub(r'([A-D]\.)([\w])', r'\1 \2', text)
    
    # if answer is missing, try to extract it from the options and explanation
    if "Answer:" not in text and "Explanation:" in text:
        explanation = text.split("Explanation:")[1].strip()
        
        # Look for indicators of the correct answer in the explanation
        possible_answers = ["A", "B", "C", "D"]
        for ans in possible_answers:
            if f"option {ans}" in explanation.lower() or f"{ans} is correct" in explanation.lower():
                # Insert the answer before the explanation
                parts = text.split("Explanation:")
                text = parts[0] + f"Answer: {ans}\nExplanation:" + parts[1]
                break
    
    #  add a placeholder
    if "Answer:" not in text:
        if "Explanation:" in text:
            parts = text.split("Explanation:")
            text = parts[0] + "Answer: [Missing - please determine from explanation]\nExplanation:" + parts[1]
        else:
            text = text + "\nAnswer: [Missing - please determine from context]\nExplanation: Answer cannot be determined from the generated content."
    
    return text

In [None]:
def generate_quiz(
                   topic: str, 
                   course: str, 
                   num_questions: int = 5,
                   question_types: List[str] = None,
                   difficulty: int = 3,
                   filter_metadata: Optional[Dict] = None) -> List[Dict]:
    """
    Generate a complete quiz using Gemma with optimized prompts.
    """
    import time
    
    # Initialize default question types if none provided
    if question_types is None:
        prompt_manager = GemmaPromptManager()
        question_types = prompt_manager.templates.get_all_template_types()
    
    start_time = time.time()
    
    # Query for relevant chunks
    print(f"Querying for chunks related to: {topic}")
    relevant_chunks = query_vector_store(
        query=topic,
        k=num_questions * 2,  
        filter_metadata=filter_metadata
    )
    
    valid_chunks = [chunk for chunk in relevant_chunks 
                   if len(chunk["text"].strip()) >= 200]
    
    if not valid_chunks:
        print("No chunks with sufficient content were found.")
        return []
    
    print(f"Found {len(valid_chunks)} usable chunks out of {len(relevant_chunks)} total chunks")
    
    cleanup_memory()
    
    quiz_questions = []
    for i in range(min(num_questions, len(valid_chunks))):
        try:
            q_type = question_types[i % len(question_types)]
            
            context = valid_chunks[i]["text"]
            
            print(f"Generating {q_type} question {i+1}/{min(num_questions, len(valid_chunks))}")
            
            distractors = None
            if q_type == "mcq":
                distractors = extract_distractors(valid_chunks, i)
            
            start = time.time()
            generated_question = generate_quiz_question(
                question_type=q_type,
                context=context,
                course=course,
                difficulty=difficulty,
                distractors=distractors
            )
            
            if len(generated_question.strip()) > 10:
                # Add to quiz
                quiz_questions.append({
                    "question_type": q_type,
                    "content": generated_question,
                    "source_chunk": {
                        "text": context[:200] + "..." if len(context) > 200 else context,
                        "metadata": valid_chunks[i]["metadata"]
                    }
                })
                
                end = time.time()
                print(f"Question {i+1} generated in {end-start:.2f} seconds")
            else:
                print(f" Failed to generate valid content for question {i+1}")
                try_backup_chunk(
                    quiz_questions, valid_chunks, i, num_questions, 
                    q_type, course, difficulty, distractors
                )
            
            # Clean up memory between questions
            if i < min(num_questions, len(valid_chunks)) - 1:
                cleanup_memory()
                
        except Exception as e:
            print(f"Error generating question {i+1}: {e}")
            import traceback
            traceback.print_exc()
            continue
    
    total_time = time.time() - start_time
    print(f"\nQuiz generation completed:")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"Successfully generated {len(quiz_questions)}/{num_questions} questions")
    
    return quiz_questions

In [None]:
def extract_distractors(chunks, current_index, max_distractors=10):
    """Extract potential distractors from other chunks"""
    from nltk.tokenize import word_tokenize
    
    distractors = []
    for j, chunk in enumerate(chunks):
        if j != current_index:
            words = word_tokenize(chunk["text"])
            potential_distractors = [w for w in words if len(w) > 5 and w.isalpha()]
            distractors.extend(potential_distractors[:5]) 
    
    # Remove duplicates and limit
    return list(set(distractors))[:max_distractors]

In [None]:
def try_backup_chunk(quiz_questions, valid_chunks, current_index, num_questions, 
                    question_type, course, difficulty, distractors):
    """Try generating with a backup chunk if the primary fails"""
    if current_index + num_questions < len(valid_chunks):
        backup_index = current_index + num_questions
        backup_chunk = valid_chunks[backup_index]
        print(f"Trying again with a different chunk...")
        
        generated_question = generate_quiz_question(
            question_type=question_type,
            context=backup_chunk["text"],
            course=course,
            difficulty=difficulty,
            distractors=distractors
        )
        
        if len(generated_question.strip()) > 10:
            quiz_questions.append({
                "question_type": question_type,
                "content": generated_question,
                "source_chunk": {
                    "text": backup_chunk["text"][:200] + "..." if len(backup_chunk["text"]) > 200 else backup_chunk["text"],
                    "metadata": backup_chunk["metadata"]
                }
            })
            print(f"Question generated with backup chunk")
            return True
    
    print(f"Failed to generate question with backup chunk")
    return False


In [18]:
# load_documents_from_directory(rag_wiki_files_path,language="en")
# load_documents_from_directory("rag/ro",language="ro")
# # quiz = generate_quiz(
#         topic="Artificial Intelligence",
#         course="Artificial Intelligence",
#         num_questions=10,
#         question_types=["mcq", "true_false", "fill_in_blank", "open_ended","short_answer"],
#         difficulty=3
#     )
# print("Generated Quiz:")
# print(quiz)
# for i, question in enumerate(quiz, 1):
#     print(f"\nQuestion {i} ({question['question_type']}):")
#     print(question['content'])
#     print("-" * 50)


In [19]:
# # load_documents_from_directory(rag_wiki_files_path,language="en")
# # load_documents_from_directory("rag/ro",language="ro")
# quiz = generate_quiz(
#         topic="Artificial Intelligence",
#         course="Artificial Intelligence",
#         num_questions=1,
#         question_types=["short_answer"],
#         difficulty=5
#     )
# print("Generated Quiz:")
# print(quiz)
# for i, question in enumerate(quiz, 1):
#     print(f"\nQuestion {i} ({question['question_type']}):")
#     print(question['content'])
#     print("-" * 50)
 

In [20]:
def evaluate_with_rouge(generated_questions, reference_questions):
    from rouge_score import rouge_scorer
    import re

    def extract_core_question(text):
        if isinstance(text, dict) and "content" in text:
            text = text["content"]
            
        text = re.sub(r'Question:|Answer:|Explanation:|A\.|B\.|C\.|D\.', '', text)
        
        text = ' '.join(text.split())
        return text.lower()

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    results = {
        'rouge1': [],
        'rouge2': [],
        'rougeL': []
    }

    for gen_q, ref_q in zip(generated_questions, reference_questions):
        gen_text = extract_core_question(gen_q)
        ref_text = extract_core_question(ref_q)

        if not gen_text or not ref_text:
            continue

        scores = scorer.score(ref_text, gen_text)
        for metric in results.keys():
            results[metric].append(scores[metric].fmeasure)

    # Average results
    avg_results = {
        metric: sum(scores) / len(scores) if scores else 0
        for metric, scores in results.items()
    }

    return avg_results

In [2]:
def evaluate_with_deepeval(generated_questions, context_chunks):
    try:
        from deepeval import evaluate
        from deepeval.test_case import LLMTestCase
        from deepeval.metrics import AnswerRelevancyMetric
        from deepeval.metrics import ContextualRelevancyMetric

        available_metrics = []
        
        available_metrics.append(("answer_relevancy", AnswerRelevancyMetric(threshold=0.7)))
        print("Using AnswerRelevancyMetric")
    
        available_metrics.append(("contextual_relevancy", ContextualRelevancyMetric(threshold=0.7)))
        print("Using ContextualRelevancyMetric")
    except ImportError:
        print("Could not import something. Check metrics ")
        return {"error": "DeepEval not installed or not available"}

    # Store results for each metric
    results = {metric_name: [] for metric_name, _ in available_metrics}
    evaluations = []
    
    # Extract metrics for evaluation
    metric_objects = [metric for _, metric in available_metrics]
    
    for i, question in enumerate(generated_questions):
        # Get question text
        if isinstance(question, dict) and "content" in question:
            q_text = question["content"]
        else:
            q_text = str(question)

        # Try to extract just the question part
        if "Question:" in q_text:
            q_part = q_text.split("Question:")[1]
            # Further split if there's an Answer section
            if "Answer:" in q_part:
                q_part = q_part.split("Answer:")[0]
            q_part = q_part.strip()
        else:
            q_part = q_text

        # Get context text
        context = context_chunks[i].get("text", "") if i < len(context_chunks) else ""

        try:
            # Create a single test case
            test_case = LLMTestCase(
                input=context,
                actual_output=q_part
            )
            
            # Run evaluation with available metrics
            evaluation_result = evaluate(
                test_cases=[test_case],
                metrics=metric_objects
            )

            # Extract scores from results
            if evaluation_result and evaluation_result.test_results:
                test_result = evaluation_result.test_results[0]
                
                # Parse each metric result
                for j, (metric_name, _) in enumerate(available_metrics):
                    if j < len(test_result.metrics_data):
                        metric_data = test_result.metrics_data[j]
                        results[metric_name].append(metric_data.score)
                        
                        # Store the full evaluation
                        evaluations.append({
                            "metric": metric_name,
                            "score": metric_data.score,
                            "reason": getattr(metric_data, 'reason', None),
                            "question_index": i
                        })

        except Exception as e:
            import traceback
            print(f"DeepEval failed on question {i+1}: {str(e)}")
            traceback.print_exc()
            continue

    # If no successful evaluations, return error
    if not evaluations:
        return {"error": "no evaluations completed"}

    # Compute aggregate scores for each metric
    aggregated = {}
    for metric_name, scores in results.items():
        if scores:
            aggregated[metric_name] = sum(scores) / len(scores)
    
    # Add metadata
    aggregated["num_evaluated"] = len(evaluations) // len(available_metrics) if available_metrics else 0
    aggregated["evaluations"] = evaluations
    
    return aggregated

In [None]:
# !deepeval set-local-model --model-name="meta-llama-3.1-8b-instruct" --base-url="http://localhost:4500/v1/" --api-key="test"

In [None]:
from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues

print(KEY_FILE_HANDLER.fetch_data(KeyValues.LOCAL_MODEL_NAME))
print(KEY_FILE_HANDLER.fetch_data(KeyValues.LOCAL_MODEL_BASE_URL))
print(KEY_FILE_HANDLER.fetch_data(KeyValues.LOCAL_MODEL_API_KEY))
import os
os.environ["DEEPEVAL_USE_LOCAL_MODEL"] = "YES"


In [25]:
def evaluate_with_lm_studio(generated_questions, api_url="http://localhost:4500/v1/completions"):
    import requests
    import json
    
    evaluation_prompt = """
    You are an expert evaluator for educational quiz questions. Rate the following question on a scale of 1-10 for:
    1. Clarity: How clear and unambiguous is the question?
    2. Relevance: How relevant is the question to the subject area?
    3. Educational Value: How valuable is this question for learning?
    4. Technical Accuracy: How factually correct is the content?
    
    Question to evaluate:
    {question}
    
    Provide your ratings and a brief explanation for each criterion.
    """
    
    results = []
    
    for question in generated_questions:
        q_content = question['content'] if isinstance(question, dict) and 'content' in question else str(question)
        
        request_data = {
            "model": "meta-llama-3.1-8b-instruct",  
            "prompt": evaluation_prompt.format(question=q_content),
            "max_tokens": 500,
            "temperature": 0.3 
        }
        
        try:
            response = requests.post(
                api_url,
                headers={"Content-Type": "application/json"},
                data=json.dumps(request_data)
            )
            
            if response.status_code == 200:
                evaluation = response.json()["choices"][0]["text"]
                
                import re
                ratings = {}
                for criterion in ["Clarity", "Relevance", "Educational Value", "Technical Accuracy"]:
                    match = re.search(f"{criterion}:\\s*(\\d+)", evaluation, re.IGNORECASE)
                    if match:
                        ratings[criterion.lower().replace(" ", "_")] = int(match.group(1))
                
                results.append({
                    "question": q_content[:100] + "...", 
                    "full_evaluation": evaluation,
                    "ratings": ratings
                })
            else:
                print(f"API request failed with status code: {response.status_code}")
                results.append({
                    "question": q_content[:100] + "...",
                    "error": f"API request failed: {response.status_code}"
                })
        except Exception as e:
            print(f"Error during LM Studio evaluation: {e}")
            results.append({
                "question": q_content[:100] + "...",
                "error": str(e)
            })
    
    return results


In [26]:
import requests

help_model="meta-llama-3.1-8b-instruct"
def generate_generic_quiz(
    topic: str,
    course: str,
    num_questions: int = 5,
    question_types: List[str] = None,
    difficulty: int = 3
) -> List[Dict]:
    # call for lm studio 
    api_url="http://localhost:4500/v1/completions"
    evaluation_prompt = """ Generate a quiz for the following topic and course.
    Topic: {topic}
    Course: {course}
    Number of questions: {num_questions}
    Question types: {question_types}
    Difficulty: {difficulty}
    """
    request_data = {
        "model": help_model,
        "prompt": evaluation_prompt.format(
            topic=topic,
            course=course,
            num_questions=num_questions,
            question_types=", ".join(question_types) if question_types else "mcq, true_false",
            difficulty=difficulty
        ),
        "max_tokens": 500,
        "temperature": 0.3
    }
    try:
        # Make the API call
        response = requests.post(
            api_url,
            headers={"Content-Type": "application/json"},
            data=json.dumps(request_data)
        )
        
        if response.status_code == 200:
            generated_questions = response.json()["choices"][0]["text"]
            print("Generated Questions:")   
            print(generated_questions)
            questions_list = generated_questions.split("\n")
            questions = []
            for q in questions_list:
                if q.strip():
                    question_dict = {
                        "question_type": "generic",
                        "content": q.strip(),
                        "source_chunk": {
                            "text": topic,
                            "metadata": {}
                        }
                    }
                    questions.append(question_dict)
            return questions
        
        else:
            print(f"API request failed with status code: {response.status_code}")
            return []
    except Exception as e:
        print(f"Error during API request: {e}")
        return []
    


In [27]:
def evaluate_quiz_quality(generated_questions, reference_questions=None):
    
    results = {
        "format_compliance": 0,
        "content_relevance": 0,
        "question_clarity": 0,
        "overall_quality": 0
    }
    
    # Format compliance check
    format_scores = []
    for q in generated_questions:
        q_type = q["question_type"]
        content = q["content"]
        
        # Check required elements based on question type
        if q_type == "mcq":
            required = ["Question:", "A.", "B.", "C.", "D.", "Answer:", "Explanation:"]
        elif q_type == "true_false":
            required = ["Question:", "Answer:", "Explanation:"]
        elif q_type == "fill_in_blank":
            required = ["Question:", "Answer:", "Explanation:"]
        elif q_type == "short_answer":
            required = ["Question:", "Answer:", "Explanation:"]
        elif q_type == "open_ended":
            required = ["Question:", "Guidelines:", "Sample Answer:"]
        else:
            required = ["Question:"]
        
        # Calculate format compliance
        present = sum(1 for r in required if r in content)
        format_score = present / len(required)
        format_scores.append(format_score)
    
    results["format_compliance"] = sum(format_scores) / len(format_scores) if format_scores else 0
    
    
    return results

In [28]:
def benchmark_performance(topic, course):
    """Benchmark the performance of the RAG system"""
    import time
    
    metrics = {
        "retrieval_time": 0,
        "generation_time": 0,
        "total_time": 0,
        "tokens_per_second": 0,
        "questions_per_minute": 0
    }
    
    # Measure overall time
    start_time = time.time()
    
    # Measure retrieval time
    retrieval_start = time.time()
    chunks = query_vector_store(
        query=topic,
        k=5
    )
    retrieval_time = time.time() - retrieval_start
    metrics["retrieval_time"] = retrieval_time
    
    if chunks:
        generation_start = time.time()
        question = generate_quiz_question(
            question_type="mcq",
            context=chunks[0]["text"],
            course=course,
            difficulty=3
        )
        generation_time = time.time() - generation_start
        metrics["generation_time"] = generation_time
    
    # Calculate overall metrics
    metrics["total_time"] = time.time() - start_time
    metrics["questions_per_minute"] = 60 / metrics["generation_time"] if metrics["generation_time"] > 0 else 0
    
    return metrics

In [None]:
from LSTMTextSimilarity import predict_similarity


def evaluate_with_lstm_similarity(generated_questions, reference_questions=None, context_chunks=None, lstm_model=None):
    results = {
        "question_to_reference_similarity": [],
        "question_to_context_similarity": [],
        "question_to_question_similarity_matrix": []
    }

    # Extract just question text
    def extract_question(text):
        if isinstance(text, dict) and "content" in text:
            text = text["content"]
        if "Question:" in text:
            return text.split("Question:")[1].split("Answer:")[0].strip()
        return text.strip()

    # Prepare questions
    gen_questions = [extract_question(q) for q in generated_questions]

    # Calculate similarity to reference questions (if provided)
    if reference_questions:
        ref_questions = [extract_question(q) for q in reference_questions]

        for i, gen_q in enumerate(gen_questions):
            if i < len(ref_questions):
                sim_score = predict_similarity(gen_q, ref_questions[i])
                results["question_to_reference_similarity"].append(sim_score)

    # Calculate similarity to context chunks (if provided)
    if context_chunks:
        context_texts = [c.get("text", "") if isinstance(c, dict) else str(c) for c in context_chunks]

        for i, gen_q in enumerate(gen_questions):
            if i < len(context_texts):
                sim_score = predict_similarity(gen_q, context_texts[i])
                results["question_to_context_similarity"].append(sim_score)

    # Calculate pairwise similarity between generated questions (for diversity measurement)
    similarity_matrix = []
    for i, q1 in enumerate(gen_questions):
        row = []
        for j, q2 in enumerate(gen_questions):
            if i == j:  # Same question
                row.append(100.0)
            else:
                sim_score = predict_similarity(q1, q2)
                row.append(sim_score)
        similarity_matrix.append(row)

    results["question_to_question_similarity_matrix"] = similarity_matrix

    # Calculate average diversity (100 - average non-diagonal similarity)
    if len(similarity_matrix) > 1:
        non_diagonal_similarities = []
        for i in range(len(similarity_matrix)):
            for j in range(len(similarity_matrix[i])):
                if i != j:
                    non_diagonal_similarities.append(similarity_matrix[i][j])

        avg_similarity = sum(non_diagonal_similarities) / len(non_diagonal_similarities) if non_diagonal_similarities else 0
        results["diversity_score"] = 100 - avg_similarity  # Higher value means more diverse questions

    # Calculate averages for other metrics
    for key in ["question_to_reference_similarity", "question_to_context_similarity"]:
        if results[key]:
            results[f"avg_{key}"] = sum(results[key]) / len(results[key])

    return results

In [None]:
# Integration with the comprehensive evaluation framework
def run_lstm_enhanced_evaluation(topic, course, num_questions=5, lstm_model=None):
    # Step 1: Generate questions with RAG system
    rag_questions = generate_quiz(
        topic=topic,
        course=course,
        num_questions=num_questions,
        question_types=["mcq", "true_false", "short_answer"]
    )

    # Step 2: Generate reference questions
    reference_questions = generate_generic_quiz(
        topic=topic,
        course=course,
        num_questions=num_questions,
        question_types=["mcq", "true_false", "short_answer"]
    )

    # Step 3: Get the source contexts used
    source_contexts = [q.get("source_chunk", {"text": ""}) for q in rag_questions]
    # Step 5: Run evaluation metrics
    results = {
        "basic_metrics": evaluate_quiz_quality(rag_questions),
        "rouge_metrics": evaluate_with_rouge(rag_questions, reference_questions),
        "lstm_similarity": evaluate_with_lstm_similarity(
            rag_questions,
            reference_questions,
            source_contexts,
            lstm_model
        ),
        "performance_metrics": benchmark_performance(topic, course)
    }

    # Step 6: Try to run DeepEval if available
    try:
        results["deepeval_metrics"] = evaluate_with_deepeval(rag_questions, source_contexts)
    except Exception as e:
        print(f"DeepEval evaluation failed: {e}")
        results["deepeval_metrics"] = {"error": str(e)}

    # Step 7: Display results
    print(f"\n=== LSTM-Enhanced Evaluation Results for {topic} ===")

    # Display basic metrics
    print(f"Format Compliance: {results['basic_metrics']['format_compliance']:.2f}")

    # Display ROUGE metrics
    print(f"ROUGE-L F1 Score: {results['rouge_metrics']['rougeL']:.4f}")

    # Display LSTM similarity metrics
    lstm_sim = results.get("lstm_similarity", {})
    if "avg_question_to_context_similarity" in lstm_sim:
        print(f"Avg. Context Similarity: {lstm_sim['avg_question_to_context_similarity']:.2f}%")
    if "avg_question_to_reference_similarity" in lstm_sim:
        print(f"Avg. Reference Similarity: {lstm_sim['avg_question_to_reference_similarity']:.2f}%")
    if "diversity_score" in lstm_sim:
        print(f"Question Diversity Score: {lstm_sim['diversity_score']:.2f}%")

    # Performance metrics
    print(f"Generation Speed: {results['performance_metrics']['questions_per_minute']:.2f} questions/minute")

    return results

# Example usage:
# Load your LSTM model once
# lstm_model = LSTMTextSimilarity(path="models/lstm_similarity_model.h5")
# results = run_lstm_enhanced_evaluation("Artificial Intelligence", "Computer Science", lstm_model=lstm_model)

In [30]:
def analyze_question_complexity(generated_questions):
    """
    Analyze complexity metrics of generated questions
    
    Args:
        generated_questions: List of generated questions
    
    Returns:
        Dictionary of complexity metrics
    """
    import nltk
    from nltk.tokenize import word_tokenize, sent_tokenize
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    
    metrics = {
        "avg_word_count": 0,
        "avg_sentence_count": 0,
        "avg_word_length": 0,
        "readability_scores": {
            "flesch_reading_ease": 0,
            "flesch_kincaid_grade": 0
        },
        "question_types": {
            "factual": 0,
            "conceptual": 0,
            "analytical": 0
        }
    }
    
    total_words = 0
    total_sentences = 0
    total_chars = 0
    processed = 0
    
    # Keywords suggesting different question types
    factual_keywords = ["what", "when", "where", "who", "list", "name", "identify"]
    conceptual_keywords = ["why", "how", "explain", "describe", "compare", "contrast", "define"]
    analytical_keywords = ["analyze", "evaluate", "examine", "interpret", "assess", "critique", "justify"]
    
    for question in generated_questions:
        # Extract question text
        if isinstance(question, dict) and "content" in question:
            q_text = question["content"]
        else:
            q_text = str(question)
            
        # Extract just the question part if possible
        if "Question:" in q_text:
            q_text = q_text.split("Question:")[1].split("Answer:")[0].strip()
        
        # Tokenize
        words = word_tokenize(q_text.lower())
        sentences = sent_tokenize(q_text)
        
        # Update counters
        total_words += len(words)
        total_sentences += len(sentences)
        total_chars += sum(len(word) for word in words if word.isalnum())
        processed += 1
        
        # Determine question type based on keywords
        for word in words[:4]:  # Check first few words for question type indicators
            if word in factual_keywords:
                metrics["question_types"]["factual"] += 1
                break
            elif word in conceptual_keywords:
                metrics["question_types"]["conceptual"] += 1
                break
            elif word in analytical_keywords:
                metrics["question_types"]["analytical"] += 1
                break
    
    # Calculate averages
    if processed > 0:
        metrics["avg_word_count"] = total_words / processed
        metrics["avg_sentence_count"] = total_sentences / processed
        if total_words > 0:
            metrics["avg_word_length"] = total_chars / total_words
    
        # Calculate Flesch Reading Ease score (simplified)
        if total_sentences > 0:
            words_per_sentence = total_words / total_sentences
            syllables_per_word = total_chars / total_words / 3  # Rough approximation
            
            metrics["readability_scores"]["flesch_reading_ease"] = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
            metrics["readability_scores"]["flesch_kincaid_grade"] = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
    
    # Convert question type counts to percentages
    total_classified = sum(metrics["question_types"].values())
    if total_classified > 0:
        for qtype in metrics["question_types"]:
            metrics["question_types"][qtype] = (metrics["question_types"][qtype] / total_classified) * 100
    
    return metrics

In [31]:
def calculate_interpretation(quality_score):
    if quality_score >= 0.85:
        return "Excellent: Questions are of high quality across all dimensions"
    elif quality_score >= 0.75:
        return "Very Good: Questions are effective with minor areas for improvement"
    elif quality_score >= 0.65:
        return "Good: Questions meet basic requirements but would benefit from refinement"
    elif quality_score >= 0.55:
        return "Acceptable: Questions are usable but have significant room for improvement"
    elif quality_score >= 0.45:
        return "Needs Improvement: Questions have issues that should be addressed"
    else:
        return "Poor: Questions require major revision across multiple dimensions"



In [32]:
def run_comprehensive_evaluation(topic, course, use_lstm=True):
    # Step 1: Generate questions with your RAG system
    rag_questions = generate_quiz(
        topic=topic,
        course=course,
        num_questions=5,
        question_types=["mcq", "true_false", "short_answer"]
    )
    
    # Step 2: Generate reference questions
    reference_questions = generate_generic_quiz(
        topic=topic,
        course=course,
        num_questions=5,
        question_types=["mcq", "true_false", "short_answer"]
    )
    
    # Step 3: Get the source contexts used
    source_contexts = [q.get("source_chunk", {"text": ""}) for q in rag_questions]
    
    # Step 4: Run evaluation metrics
    results = {
        "basic_metrics": evaluate_quiz_quality(rag_questions),
        "rouge_metrics": evaluate_with_rouge(rag_questions, reference_questions),
        "performance_metrics": benchmark_performance(topic, course)
    }
    
    # Step 5: Add complexity analysis
    try:
        results["complexity_metrics"] = analyze_question_complexity(rag_questions)
    except Exception as e:
        print(f"Complexity analysis failed: {e}")
        results["complexity_metrics"] = {"error": str(e)}
    
    # Step 6: Add LSTM similarity metrics if requested
    if use_lstm:
        try:
            results["lstm_similarity"] = evaluate_with_lstm_similarity(
                rag_questions,
                reference_questions,
                source_contexts
            )
            print("LSTM similarity metrics successfully calculated")
        except Exception as e:
            print(f"LSTM similarity evaluation failed: {e}")
            import traceback
            traceback.print_exc()
            results["lstm_similarity"] = {"error": str(e)}
    
    # Step 7: DeepEval metrics
    try:
        results["deepeval_metrics"] = evaluate_with_deepeval(rag_questions, source_contexts)
    except Exception as e:
        print(f"DeepEval evaluation failed: {e}")
        results["deepeval_metrics"] = {"error": str(e)}
    
    # Step 8: LM Studio evaluation if available
    try:
        results["lm_studio_evaluation"] = evaluate_with_lm_studio(
            rag_questions, 
            api_url="http://localhost:4500/v1/completions"
        )
    except Exception as e:
        print(f"LM Studio evaluation failed: {e}")
        results["lm_studio_evaluation"] = {"error": str(e)}
    
    # Calculate an aggregate quality score (weighted average of various metrics)
    weights = {
        "format_compliance": 0.15,
        "rouge": 0.05,
        "context_similarity": 0.2,
        "diversity": 0.15,
        "answer_relevancy": 0.3,
        "complexity": 0.15
    }
    
    quality_score = 0
    quality_components = {}
    
    # Basic format compliance
    if "basic_metrics" in results:
        format_score = results["basic_metrics"].get("format_compliance", 0)
        quality_components["format_compliance"] = format_score
        quality_score += format_score * weights["format_compliance"]
    
    # ROUGE-L score (lexical similarity)
    if "rouge_metrics" in results:
        rouge_score = results["rouge_metrics"].get("rougeL", 0)
        quality_components["rouge"] = rouge_score
        quality_score += rouge_score * weights["rouge"]
    
    # LSTM similarity to context
    if "lstm_similarity" in results and isinstance(results["lstm_similarity"], dict):
        lstm = results["lstm_similarity"]
        if "avg_question_to_context_similarity" in lstm:
            context_sim = lstm["avg_question_to_context_similarity"] / 100  # Convert to 0-1 scale
            quality_components["context_similarity"] = context_sim
            quality_score += context_sim * weights["context_similarity"]
        if "diversity_score" in lstm:
            diversity = lstm["diversity_score"] / 100  # Convert to 0-1 scale
            quality_components["diversity"] = diversity
            quality_score += diversity * weights["diversity"]
    
    # Answer relevancy from DeepEval
    if "deepeval_metrics" in results:
        de = results["deepeval_metrics"]
        if "answer_relevancy" in de:
            relevancy = de["answer_relevancy"]
            quality_components["answer_relevancy"] = relevancy
            quality_score += relevancy * weights["answer_relevancy"]
    
    if "complexity_metrics" in results:
        complexity = results["complexity_metrics"]
        if "question_types" in complexity:
            import math
            types = complexity["question_types"]
            if any(types.values()):
                entropy = 0
                for v in types.values():
                    if v > 0:
                        p = v / 100  
                        entropy -= p * math.log2(p)
                normalized_entropy = min(entropy / 1.585, 1.0)
                quality_components["complexity"] = normalized_entropy
                quality_score += normalized_entropy * weights["complexity"]
    
    results["quality_score"] = quality_score
    results["quality_components"] = quality_components
    
    # Step 9: Save and display results
    print(f"\n=== Comprehensive Evaluation Results for {topic} ===")
    
    # Basic metrics
    print(f"Format Compliance: {results['basic_metrics'].get('format_compliance', 0):.2f}")
    
    # ROUGE metrics
    print(f"ROUGE-L F1 Score: {results['rouge_metrics'].get('rougeL', 0):.4f}")
    
    # Complexity metrics
    if "complexity_metrics" in results and isinstance(results["complexity_metrics"], dict):
        complexity = results["complexity_metrics"]
        if "question_types" in complexity:
            print("\nQuestion Type Distribution:")
            for qtype, percentage in complexity["question_types"].items():
                print(f"- {qtype}: {percentage:.1f}%")
        if "readability_scores" in complexity:
            print(f"Readability: {complexity['readability_scores'].get('flesch_reading_ease', 'N/A'):.1f}")
            print(f"Avg. Word Count: {complexity.get('avg_word_count', 'N/A'):.1f}")
    
    if "lstm_similarity" in results and isinstance(results["lstm_similarity"], dict):
        lstm = results["lstm_similarity"]
        if "avg_question_to_context_similarity" in lstm:
            print(f"\nLSTM Context Similarity: {lstm['avg_question_to_context_similarity']:.2f}%")
        if "avg_question_to_reference_similarity" in lstm:
            print(f"LSTM Reference Similarity: {lstm['avg_question_to_reference_similarity']:.2f}%")
        if "diversity_score" in lstm:
            print(f"LSTM Question Diversity: {lstm['diversity_score']:.2f}%")
    
    # DeepEval metrics
    if "deepeval_metrics" in results:
        de = results["deepeval_metrics"]
        if "answer_relevancy" in de:
            print(f"\nDeepEval Answer Relevancy: {de['answer_relevancy']:.2f}")
            print(f"Questions Evaluated: {de.get('num_evaluated', 0)}")
    
    # Performance metrics
    print(f"\nGeneration Speed: {results['performance_metrics'].get('questions_per_minute', 0):.2f} questions/minute")
    
    # Overall quality score
    print(f"\nOverall Quality Score: {quality_score:.2f} / 1.0")
    if quality_components:
        print("Quality Component Scores:")
        for component, score in quality_components.items():
            print(f"- {component}: {score:.2f}")
    
    return results

In [34]:
run_comprehensive_evaluation(
    topic="Artificial Intelligence",
    course="Artificial Intelligence"
)

Querying for chunks related to: Artificial Intelligence
Found 7 usable chunks out of 10 total chunks
Generating mcq question 1/5
Prompt prepared for mcq question (length: 1161 chars)
Token count: 306 tokens
Question 1 generated in 14.56 seconds
Generating true_false question 2/5
Prompt prepared for true_false question (length: 1288 chars)
Token count: 275 tokens
Question 2 generated in 6.97 seconds
Generating short_answer question 3/5
Prompt prepared for short_answer question (length: 916 chars)
Token count: 201 tokens
Question 3 generated in 1.68 seconds
Generating mcq question 4/5
Prompt prepared for mcq question (length: 1273 chars)
Token count: 301 tokens
Question 4 generated in 17.54 seconds
Generating true_false question 5/5
Prompt prepared for true_false question (length: 1341 chars)
Token count: 280 tokens
Question 5 generated in 2.77 seconds

Quiz generation completed:
Total time: 45.64 seconds
Successfully generated 5/5 questions
Generated Questions:
 """
    # Define the qui

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:21, 21.65s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 0.8, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 0.80 because the statement explicitly stating that Predictive Analytics is NOT an area commonly associated with artificial intelligence is irrelevant to the topic of Introduction to Artificial Intelligence, preventing the score from being higher., error: None)

For test case:

  - input: Introduction to Artificial Intelligence (2nd ed.). Springer. ISBN 978-3-3195-8486-7. Ciaramella, Alberto; Ciaramella, Marco (2024). Introduction to Artificial Intelligence: from data analysis to gener...
  - actual output: ** Which of the following is NOT an area commonly associated with artificial intelligence?

A.  Natural Language Processing
B. ** Predictive Analytics**
C. ** Image Recognition**
D. ** Robotics & Automation **
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Rel




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:10, 10.70s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 1.00 because the response is highly relevant to the input, addressing the concept of superintelligence directly and providing a clear connection to the topic of artificial general intelligence., error: None)

For test case:

  - input: A superintelligence is a hypothetical agent that would possess intelligence far surpassing that of the brightest and most gifted human mind. If research into artificial general intelligence produced s...
  - actual output: ** A superintelligent AI could potentially self-improve indefinitely without any external input from humans.
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate







Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:16, 16.10s/test case]



Metrics Summary

  - ❌ Answer Relevancy (score: 0.5, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 0.50 because it's dragged down by an irrelevant statement about a prompt or instruction, which doesn't contribute to discussing AI or related topics., error: None)

For test case:

  - input: Artificial Intelligence and Ex Machina, as well as the novel Do Androids Dream of Electric Sheep?, by Philip K. Dick. Dick considers the idea that our understanding of human subjectivity is altered by...
  - actual output: Let me know when you're ready for the AI Question!  😊
Answer: [Missing - please determine from context]
Explanation: Answer cannot be determined from the generated content.
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 0.00% pass rate







Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:16, 16.59s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 0.8, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 0.80 because the actual output mentions precision farming techniques, which are not relevant to high-profile applications of AI discussed in the input., error: None)

For test case:

  - input: High-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Si...
  - actual output: ** Which high-tech application represents an example of how Artificial Intelligence has impacted daily life today?

A.  Automated car manufacturing lines
B. ** Generative content creation platforms such as Midjourney or Dall-E 2**,
C. ** Improved agricultural practices through precision farming techniques,**
D. ** Social media algorithms used for targeted advertising
  - expected output: None
  - context: None
  




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:14, 14.26s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 1.00 because the actual output perfectly addresses and responds to all aspects of the input, making it highly relevant., error: None)

For test case:

  - input: Artificial intelligence (AI) refers to the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, ...
  - actual output: **  Machine Learning algorithms are an integral part of artificial intelligence (IA).
Answer : [True/False]
Explaination: [Briefly explain why.]
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate








=== Comprehensive Evaluation Results for Artificial Intelligence ===
Format Compliance: 0.81
ROUGE-L F1 Score: 0.0270

Question Type Distribution:
- factual: 100.0%
- conceptual: 0.0%
- analytical: 0.0%
Readability: 78.6
Avg. Word Count: 37.2

LSTM Context Similarity: 3.11%
LSTM Reference Similarity: 1.19%
LSTM Question Diversity: 94.92%

DeepEval Answer Relevancy: 0.82
Questions Evaluated: 5

Generation Speed: 49.31 questions/minute

Overall Quality Score: 0.52 / 1.0
Quality Component Scores:
- format_compliance: 0.81
- rouge: 0.03
- context_similarity: 0.03
- diversity: 0.95
- answer_relevancy: 0.82
- complexity: 0.00


{'basic_metrics': {'format_compliance': 0.8095238095238095,
  'content_relevance': 0,
  'question_clarity': 0,
  'overall_quality': 0},
 'rouge_metrics': {'rouge1': 0.027022780832678717,
  'rouge2': 0.01414141414141414,
  'rougeL': 0.027022780832678717},
 'performance_metrics': {'retrieval_time': 0.0464625358581543,
  'generation_time': 1.216700792312622,
  'total_time': 1.2631633281707764,
  'tokens_per_second': 0,
  'questions_per_minute': 49.31368531942523},
 'complexity_metrics': {'avg_word_count': 37.2,
  'avg_sentence_count': 2.6,
  'avg_word_length': 4.032258064516129,
  'readability_scores': {'flesch_reading_ease': 78.6030148883375,
   'flesch_kincaid_grade': 5.8502150537634385},
  'question_types': {'factual': 100.0, 'conceptual': 0.0, 'analytical': 0.0}},
 'lstm_similarity': {'question_to_reference_similarity': [0.0,
   0.0,
   0.0,
   1.9607843137254901,
   4.0],
  'question_to_context_similarity': [2.2222222222222223,
   2.564102564102564,
   1.8518518518518516,
   1.408450

In [35]:
run_comprehensive_evaluation(
    topic="Baze de date",
    course="Baze de date",
)

Querying for chunks related to: Baze de date
Found 8 usable chunks out of 10 total chunks
Generating mcq question 1/5
Prompt prepared for mcq question (length: 1357 chars)
Token count: 338 tokens
Question 1 generated in 10.40 seconds
Generating true_false question 2/5
Prompt prepared for true_false question (length: 1337 chars)
Token count: 322 tokens
Question 2 generated in 8.43 seconds
Generating short_answer question 3/5
Prompt prepared for short_answer question (length: 1061 chars)
Token count: 266 tokens
Question 3 generated in 2.95 seconds
Generating mcq question 4/5
Prompt prepared for mcq question (length: 1330 chars)
Token count: 334 tokens
Question 4 generated in 8.51 seconds
Generating true_false question 5/5
Prompt prepared for true_false question (length: 1256 chars)
Token count: 311 tokens
Question 5 generated in 0.16 seconds

Quiz generation completed:
Total time: 33.98 seconds
Successfully generated 5/5 questions
Generated Questions:
 """
    # Define the quiz
    quiz 

Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]INFO:backoff:Backing off send_request(...) for 0.1s (requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:24, 24.47s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 0.8571428571428571, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 0.86 because the model provided some relevant information about relational databases, but also included an irrelevant statement about hierarchical databases, which detracted from its overall relevance to the input question about different types of databases., error: None)

For test case:

  - input: Există multe tipuri de baze de date:

Baze de date relaționale: elementele dintr-o bază de date relațională sunt organizate ca un set de tabele, cu rânduri și coloane. Tehnologia bazelor de date relaț...
  - actual output: **  What is the primary characteristic that differentiates "baze de data" in general? What type of database system can offer a high degree of efficiency when accessing information structured according to specific rules or relationships? 

A.  Distributed databases 
B. ** Relational Databases**




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:20, 20.74s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 0.8, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 0.80 because the model provided relevant information about multi-model databases, but included irrelevant details that seem to be part of an assessment or quiz, which detracted from its overall relevance., error: None)

For test case:

  - input: Baze de date multi-model: combină diferite tipuri de modele de baze de date într-un singur sistem back-end integrat cu care se pot acomoda diverse tipuri de date. Baze de date pentru documente: concep...
  - actual output: A database is used to store information about dates. 
Answer : False
Explanation : While databases may contain data related to dates, they primarily focus on storing various types of information beyond just dates.


## Your turn!
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass r




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:16, 16.87s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 1.00 because the actual output is highly relevant to the input, with no irrelevant statements present., error: None)

For test case:

  - input: Datele sunt tranzacționate și înregistrate prin interfața bazei de date susținută de mai multe straturi de blockchain. Baza de date în sine este partajată sub forma unui registru criptat/imuabil, care...
  - actual output: What type of database uses tables for data storage?
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate







Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:17, 17.24s/test case]



Metrics Summary

  - ❌ Answer Relevancy (score: 0.5, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 0.50 because the actual output includes irrelevant statements about cloud database systems not depending on personal hardware and software configurations, as well as mentioning data types handled by cloud databases, which do not address the open source aspect of the input., error: None)

For test case:

  - input: Baze de date open source: au codul sursă de tip open source, cele mai cunoscute fiind cele SQL și NoSQL
Baze de date în cloud: colectie de date, structurate sau nestructurate aflate în platformă cloud...
  - actual output: ** What is an important characteristic of databases in the cloud?
A .  They can only handle structured data.
B . They must always utilize Open Source software.
C .  Their operations depend entirely on personal hardware and software configurations.
D .  Cloud database systems offer diverse mode




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:15, 15.90s/test case]



Metrics Summary

  - ❌ Answer Relevancy (score: 0.6666666666666666, threshold: 0.7, strict: False, evaluation model: meta-llama-3.1-8b-instruct (Local Model), reason: The score is 0.67 because the statement about not being able to determine the answer from the generated content is irrelevant to the input, which asks about database storage capabilities., error: None)

For test case:

  - input: Baza de date poate fi stocată pe mai multe computere aflate în aceeași locație fizică sau aflate în diferite rețele. Baze de date NoSQL:  denumite si baze de date nerelationale, permit stocarea și ges...
  - actual output: 
Answer: [Missing - please determine from context]
Explanation: Answer cannot be determined from the generated content.
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 0.00% pass rate








=== Comprehensive Evaluation Results for Baze de date ===
Format Compliance: 0.70
ROUGE-L F1 Score: 0.0000

Question Type Distribution:
- factual: 100.0%
- conceptual: 0.0%
- analytical: 0.0%
Readability: 72.5
Avg. Word Count: 40.6

LSTM Context Similarity: 0.55%
LSTM Reference Similarity: 0.00%
LSTM Question Diversity: 93.91%

DeepEval Answer Relevancy: 0.76
Questions Evaluated: 5

Generation Speed: 4.28 questions/minute

Overall Quality Score: 0.48 / 1.0
Quality Component Scores:
- format_compliance: 0.70
- rouge: 0.00
- context_similarity: 0.01
- diversity: 0.94
- answer_relevancy: 0.76
- complexity: 0.00


{'basic_metrics': {'format_compliance': 0.6952380952380952,
  'content_relevance': 0,
  'question_clarity': 0,
  'overall_quality': 0},
 'rouge_metrics': {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0},
 'performance_metrics': {'retrieval_time': 0.5100522041320801,
  'generation_time': 14.020061016082764,
  'total_time': 14.530113220214844,
  'tokens_per_second': 0,
  'questions_per_minute': 4.27958194555448},
 'complexity_metrics': {'avg_word_count': 40.6,
  'avg_sentence_count': 3.8,
  'avg_word_length': 4.379310344827586,
  'readability_scores': {'flesch_reading_ease': 72.49397459165155,
   'flesch_kincaid_grade': 5.802129461585},
  'question_types': {'factual': 100.0, 'conceptual': 0.0, 'analytical': 0.0}},
 'lstm_similarity': {'question_to_reference_similarity': [0.0,
   0.0,
   0.0,
   0.0,
   0.0],
  'question_to_context_similarity': [1.4492753623188406,
   0.0,
   0.0,
   1.2987012987012987,
   0.0],
  'question_to_question_similarity_matrix': [[100.0,
    9.859154929577464,
    