<a href="https://colab.research.google.com/github/cipB14/Questify/blob/main/Review3_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries and Their Use in Questify Project

| Library/Tool        | Use Case in Questify                                                                       |
|---------------------|----------------------------------------------------------------------|
| pymupdf4llm       | Extracts structured content (text, images, tables) from PDFs         |
| transformers      | Loads and runs LLMs (e.g., Mistral for question generation, BERT for classification) |
| accelerate        | Speeds up model inference across GPU/CPU environments                |
| bitsandbytes      | Enables low-bit quantization for memory-efficient LLMs               |
| lancedb           | Stores SBERT embeddings for hybrid search of study content           |
| tantivy           | Provides fast keyword-based full-text indexing and search            |


| Model Name                     | Type                          | Layers | Max Seq Length | Use Case                              | Labels / Output      |
|-------------------------------|-------------------------------|--------|----------------|----------------------------------------|-----------------------|
| Mistral-7B-Instruct-v0.3      | MistralForCausalLM            | 32     | 32,768         | Question Generation                    | Text (Generated Qs)   |
| all-MiniLM-L6-v2              | BertModel                     | 6      | 512            | Sentence Embeddings for Retrieval      | Embeddings            |
| ms-marco-TinyBERT-L6          | BertForSequenceClassification | 6      | 512            | Passage Reranking                      | Relevance Score       |
| cip29/blooms_bert             | BertForSequenceClassification | 12     | 512            | Bloom’s Taxonomy Classification        | 6 Bloom’s Levels      |


In [None]:
!pip install -qU  pymupdf4llm transformers accelerate bitsandbytes tantivy gradio lancedb==0.20.0

In [None]:
!huggingface-cli login
# NOTE: never paste an access token into the notebook; enter it at the login prompt. Any token previously committed here must be revoked.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig

# 4-bit quantization settings passed to the Mistral model below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load weights in 4-bit precision
    bnb_4bit_compute_dtype="float16",  # Run computation in float16
    bnb_4bit_use_double_quant=True,  # Enable double quantization
    bnb_4bit_quant_type="nf4"  # Use the NF4 quantization type
)

# Tokenizer and quantized causal LM used for question generation and for the
# difficulty/marks classification prompts later in the notebook.
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    quantization_config=bnb_config,
    device_map="cuda"  # Place the model on the CUDA device (this is not the "auto" strategy)
)

# Bloom's Taxonomy BERT classifier; num_labels=6 matches the six entries of
# `bloom_labels` defined later in the notebook.
blooms_model_name = "cip29/blooms_bert"
blooms_tokenizer = BertTokenizer.from_pretrained(blooms_model_name)
blooms_model = BertForSequenceClassification.from_pretrained(blooms_model_name, num_labels=6).to("cuda")

### sentence-transformers/all-MiniLM-L6-v2
{
  "max_seq_length": 256,
  "do_lower_case": false
}
### Tokenizer Config
{
  "do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512
  }

  This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.

######  pymupdf4llm.to_markdown(..., page_chunks=True)
outputs a list of dictionaries (one per page), each containing

- metadata:
  - format: "Image"
  - title: ""
  - author: ""
  - subject: ""
  - keywords: ""
  - creator: ""
  - producer: ""
  - creationDate: ""
  - modDate: ""
  - trapped: ""
  - encryption: ""
  - file_path: "1-2.png"
  - page_count: 1
  - page: 1

- toc_items: []

- tables: []

- images:
  - number: 0
  - bbox: Rect(0.0, 50.0, 648.0, 310.0)
  - transform: (648.0, 0.0, 0.0, 360.0, 0.0, 0.0)
  - width: 2700
  - height: 1500
  - colorspace: 3
  - cs-name: "DeviceRGB"
  - xres: 300
  - yres: 300
  - bpc: 8
  - size: 88631
  - has-mask: False

- graphics: []

- text: "-----"

- words: []


In [None]:
!rm -rf /content/*.pdf

import pymupdf4llm
from google.colab import files
import numpy as np
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from transformers import AutoTokenizer

# Connect to LanceDB, using /content as the database location.
db = lancedb.connect("/content")

# Embedding function from LanceDB's registry ("huggingface" provider) backed by
# all-MiniLM-L6-v2 on the GPU; PDFSchema below uses it for its source and
# vector fields.
embedder = get_registry().get("huggingface").create(
    name='sentence-transformers/all-MiniLM-L6-v2',
    device="cuda"
)

# Tokenizer of the same model; used only to measure and split page text into
# token windows (see split_text_into_chunks).
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


# Row schema of the "pdf_data" table: one row per text chunk. `text` is declared
# as the embedder's source field and `vector` as its vector field.
class PDFSchema(LanceModel):
    text: str = embedder.SourceField()              # chunk text (embedding input)
    vector: Vector(embedder.ndims()) = embedder.VectorField()  # embedding; dimension taken from the embedder
    page_name: str                                  # "<file_path>_chunk_<n>" identifier built in split_text_into_chunks
    full_text: str                                   # full page text for reference
    page: int                                        # page number, used later to de-duplicate search hits



# Upload PDFs via Colab's upload helper. The keys of `uploaded` are used as the
# PDF file names by the processing cell further down.
uploaded = files.upload()
print(list(uploaded.keys()))


- Overlapping consecutive chunks (a sliding-window approach) preserves context across chunk boundaries, which is especially useful when the text is split in the middle of a sentence or paragraph.



In [None]:
# Define chunking parameters
CHUNK_SIZE = 448  # maximum number of tokens per chunk
OVERLAP = 32      # tokens shared by two consecutive chunks


def split_text_into_chunks(text, page_path, full_text, page_number,
                           chunk_size=None, overlap=None, min_tokens=10,
                           encoder=None):
    """Split ``text`` into overlapping token windows (sliding-window chunking).

    Args:
        text: Text to chunk.
        page_path: Prefix used to build each chunk's ``page_name``.
        full_text: Full page text stored alongside every chunk.
        page_number: Page number stored alongside every chunk.
        chunk_size: Window size in tokens; defaults to ``CHUNK_SIZE``.
        overlap: Tokens shared by consecutive windows; defaults to ``OVERLAP``.
        min_tokens: Windows shorter than this are skipped.
        encoder: Object with ``encode``/``decode`` methods; defaults to the
            module-level ``tokenizer``.

    Returns:
        A list of dicts with keys ``text``, ``page_name``, ``full_text`` and
        ``page``, one per emitted chunk.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than ``chunk_size``.
    """
    # Resolve defaults at call time so later edits to the constants are honoured.
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    shared = OVERLAP if overlap is None else overlap
    if not 0 <= shared < size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    tok = tokenizer if encoder is None else encoder

    input_ids = tok.encode(text, truncation=False, add_special_tokens=False)
    stride = size - shared
    chunks = []

    for start in range(0, len(input_ids), stride):
        window = input_ids[start:start + size]

        if len(window) >= min_tokens:  # Skip very small chunks
            chunks.append({
                "text": tok.decode(window, skip_special_tokens=True),
                "page_name": f"{page_path}_chunk_{start // stride + 1}",
                "full_text": full_text,
                "page": page_number
            })

        # Once a window reaches the end of the text, any further window would
        # lie entirely inside this one and only add a duplicate chunk.
        if start + size >= len(input_ids):
            break

    return chunks

# Collect all entries
entries = []

# Process each uploaded file
uploaded_files = list(uploaded.keys())[:2]  # Limiting to 2 files for processing
for pdf_filename in uploaded_files:
    print(f"\nProcessing: {pdf_filename}")

    # Ask for page numbers or ranges
    page_input = input(f"Enter pages or ranges for {pdf_filename} (e.g., 1,3-5,7): ")

    # Parse user input into zero-based page indices. Blank parts (e.g. a
    # trailing comma) and malformed parts are skipped instead of raising.
    selected_pages = []
    for part in page_input.split(","):
        part = part.strip()
        if not part:
            continue
        try:
            if "-" in part:
                start, end = (int(piece) for piece in part.split("-", 1))
            else:
                start = end = int(part)
        except ValueError:
            print(f"Skipping invalid page specification: {part!r}")
            continue
        if start < 1 or end < start:
            print(f"Skipping invalid page specification: {part!r}")
            continue
        # Convert to zero-based indices, keeping first-seen order without duplicates
        for page_index in range(start - 1, end):
            if page_index not in selected_pages:
                selected_pages.append(page_index)

    # Extract specified pages; with no valid selection fall back to all pages
    # (pages=None is the library default).
    selected_page_data = pymupdf4llm.to_markdown(
        pdf_filename, page_chunks=True, pages=selected_pages or None
    )

    # Process each page
    for page_data in selected_page_data:
        full_text = page_data["text"]
        page_path = page_data["metadata"]["file_path"]
        page_number = page_data["metadata"]["page"]

        if not full_text.strip():  # Skip empty pages
            continue

        # Split text into overlapping chunks with full page context
        entries.extend(split_text_into_chunks(full_text, page_path, full_text, page_number))

# Store all entries in LanceDB
tbl = db.create_table("pdf_data", schema=PDFSchema, mode="overwrite")
if entries:
    tbl.add(entries)
    print("\nAll selected pages have been chunked and stored in LanceDB with full page context!")
else:
    # Nothing to embed: avoid calling add() with an empty batch.
    print("\nNo text could be extracted from the selected pages; the table is empty.")


In [None]:
from lancedb.rerankers import CrossEncoderReranker

# Initialize reranker
reranker = CrossEncoderReranker()

# User query
query = input("\nEnter your query: ")

# Create full-text search index on the 'text' field (needed by the hybrid query)
tbl.create_fts_index("text", replace=True)

# Search and rerank
results = tbl.search(query, query_type="hybrid").rerank(reranker=reranker).limit(5).to_list()

# De-duplicate hits per (source file, page). Keying on the page number alone
# would merge page N of one PDF with page N of another uploaded PDF.
unique_pages = {}
for res in results:
    # page_name is "<file_path>_chunk_<n>"; strip the chunk suffix to get the file.
    source = res["page_name"].rsplit("_chunk_", 1)[0]
    key = (source, res.get("page"))
    if key not in unique_pages:
        unique_pages[key] = res["full_text"]

# Final list of unique full_text values with their page number and source file
final_full_texts = [
    {"page": page, "source": source, "text": text}
    for (source, page), text in unique_pages.items()
]

# Optional: Display them
print("\nUnique full_text entries by page:\n")
for i, entry in enumerate(final_full_texts, 1):
    print(f"[{i}] {entry['source']} - Page {entry['page']}:\n{entry['text'][:500]}...\n{'-'*80}")


### QUESTION DIVERSITY EVALUATION


In [None]:
from sentence_transformers import SentenceTransformer, util
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Sentence-embedding model used below to compare generated question sets.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate multiple question sets
def generate_multiple_question_sets(pages_text, user_query, num_sets=5, num_questions=15):
    """Sample several independent question sets for the same context and query.

    Args:
        pages_text: Context interpolated into the prompt.
        user_query: Focus topic interpolated into the prompt.
        num_sets: Number of independent generations to sample.
        num_questions: Minimum number of questions requested in the prompt.

    Returns:
        A list with ``num_sets`` entries; each entry is the list of non-empty,
        cleaned lines the model produced for that sample.
    """
    prompt = f"""
You are an AI assistant specialized in question generation.
Your goal is to generate insightful questions based on the given context and user query.

Context:
{pages_text}

User Query (Focus Topic): {user_query}

### Reasoning:
- Step 1: Identify key points and concepts from the context relevant to the query
- Step 2: Consider what types of questions best explore the topic of interest
- Step 3: Formulate meaningful and topic-specific questions

Generate atleast **{num_questions} questions** from the context

### Questions:
"""
    # The prompt does not change between samples, so tokenize it once.
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    prompt_len = inputs["input_ids"].shape[1]

    question_sets = []
    for _ in range(num_sets):
        output = mistral_model.generate(
            **inputs,
            max_new_tokens=2048,
            temperature=0.8,
            top_p=0.8,
            do_sample=True,
            pad_token_id=mistral_tokenizer.eos_token_id
        )
        # Decode only the newly generated tokens. Parsing the echoed prompt breaks
        # when truncation cuts off the "### Questions:" marker: the whole context
        # would then be returned as "questions".
        generated_text = mistral_tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
        # Keep the marker split as a safeguard in case the model repeats the header.
        raw_lines = generated_text.split("### Questions:")[-1].strip().split("\n")
        print(raw_lines)
        cleaned = [line.strip("- ").strip() for line in raw_lines]
        question_sets.append([line for line in cleaned if line])

    return question_sets

# Compute pairwise similarities
def compute_similarity_between_sets(question_sets):
    """Score how similar each pair of question sets is.

    For a pair (A, B) the score is the mean of two directed scores: the average,
    over questions in A, of the best cosine similarity to any question in B, and
    the same with A and B swapped.

    Args:
        question_sets: List of lists of question strings.

    Returns:
        A list of ``((i, j), score)`` tuples for every ``i < j``. The score is
        ``nan`` when either set is empty, since no similarity can be computed.
    """
    # Encode every set exactly once instead of once per pair.
    embeddings = [
        sentence_model.encode(questions, convert_to_tensor=True) if questions else None
        for questions in question_sets
    ]

    similarities = []
    for i, emb_i in enumerate(embeddings):
        for j, emb_j in enumerate(embeddings[i + 1:], start=i + 1):
            if emb_i is None or emb_j is None:
                similarities.append(((i, j), float("nan")))
                continue

            sim_matrix = util.pytorch_cos_sim(emb_i, emb_j)
            avg_sim1 = sim_matrix.max(dim=1).values.mean().item()  # set i -> set j
            avg_sim2 = sim_matrix.max(dim=0).values.mean().item()  # set j -> set i
            similarities.append(((i, j), (avg_sim1 + avg_sim2) / 2))

    return similarities

# Plot heatmap
def plot_similarity_heatmap(similarities, num_sets):
    """Draw a symmetric heatmap of pairwise question-set similarity scores.

    Args:
        similarities: Iterable of ``((i, j), score)`` tuples.
        num_sets: Number of sets, i.e. the side length of the matrix.
    """
    # Start from the identity matrix: every set is fully similar to itself.
    grid = np.eye(num_sets)
    for (row, col), value in similarities:
        grid[row][col] = value
        grid[col][row] = value

    tick_labels = range(num_sets)
    sns.heatmap(
        grid,
        annot=True,
        cmap="coolwarm",
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title("Question Set Similarity Heatmap")
    plt.xlabel("Set Index")
    plt.ylabel("Set Index")
    plt.show()


# Give the model the plain page text. Interpolating the list of dicts directly
# would put a Python repr (with escaped newlines) into the prompt.
diversity_context = "\n\n".join(entry["text"] for entry in final_full_texts)
sets = generate_multiple_question_sets(diversity_context, query, num_sets=5)
similarities = compute_similarity_between_sets(sets)
plot_similarity_heatmap(similarities, len(sets))


### Mistral-7B-Instruct-v0.3/config.json

{
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  `"max_position_embeddings": 32768,`
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.0.dev0",
  "use_cache": true,
  "vocab_size": 32768
}

In [23]:
import re
import csv
import torch
import torch.nn.functional as F
from google.colab import files


# Bloom's Taxonomy labels, indexed by the classifier's predicted class index
# (see classify_blooms_taxonomy).
bloom_labels = {
    0: "BT1 (Remembering)",
    1: "BT2 (Understanding)",
    2: "BT3 (Applying)",
    3: "BT4 (Analyzing)",
    4: "BT5 (Evaluating)",
    5: "BT6 (Creating)"
}

# Difficulty labels by class index.
# NOTE(review): only referenced by the commented-out classifier code further down.
difficulty_labels = {
    0: "Easy",
    1: "Medium",
    2: "Hard"
}

# Mark-allocation labels by class index.
# NOTE(review): only referenced by the commented-out classifier code further down.
mark_labels = {
    0: "2 Marks",
    1: "4 Marks",
    2: "8 Marks"
}

def classify_blooms_taxonomy(question):
    """Classify a question into one of the six Bloom's Taxonomy levels.

    Args:
        question: Question text to classify.

    Returns:
        A ``(label, probabilities)`` tuple: the label of the highest-scoring
        class and a dict mapping each of the six labels to its softmax
        probability rounded to 4 decimals.
    """
    encoded = blooms_tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        logits = blooms_model(**encoded).logits

    best_idx = torch.argmax(logits, dim=1).item()
    class_probs = F.softmax(logits, dim=1).squeeze().tolist()

    prob_dict = {}
    for class_idx in range(6):
        prob_dict[bloom_labels[class_idx]] = round(class_probs[class_idx], 4)

    return bloom_labels[best_idx], prob_dict

def extract_numbered_list(text):
    """Split text into items, starting a new item at each line that begins with 'N. '.

    Returns the stripped, non-empty items in order. Text before the first
    numbered line is returned as its own item.
    """
    pieces = []
    for raw in re.split(r'\n(?=\d+\.\s)', text.strip()):
        cleaned = raw.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces

def split_number_and_text(item):
    """Split '1. Some text' into ``(1, 'Some text')``.

    Returns ``(None, item.strip())`` when the item does not start with a
    number followed by a dot and whitespace.
    """
    parsed = re.match(r'^(\d+)\.\s+(.*)', item, re.DOTALL)
    if parsed is None:
        return None, item.strip()
    number, body = parsed.groups()
    return int(number), body.strip()

In [25]:
def generate_questions_with_mistral_bulk(pages_text, user_query,no=12):
    """Generate a block of questions about ``user_query`` from ``pages_text``.

    Args:
        pages_text: Context interpolated into the prompt.
        user_query: Focus topic interpolated into the prompt.
        no: Minimum number of questions requested in the prompt.

    Returns:
        The text the model generated after the prompt, stripped.
    """
    prompt = f"""
You are an AI assistant specialized in question generation.
Your goal is to generate insightful questions based on the given context and user query.

Context:
{pages_text}

User Query (Focus Topic): {user_query}

### Reasoning:
- Step 1: Identify key points and concepts from the context relevant to the query
- Step 2: Consider what types of questions best explore the topic of interest
- Step 3: Formulate meaningful and topic-specific questions

Generate atleast **{no} questions** from the context

### Questions:
"""
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=2048,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens. Parsing the echoed prompt breaks
    # when truncation cuts off the "### Questions:" marker: the whole context
    # would then be returned as the question list.
    prompt_len = inputs["input_ids"].shape[1]
    generated_text = mistral_tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    # Keep the marker split as a safeguard in case the model repeats the header.
    return generated_text.split("### Questions:")[-1].strip()

def _parse_numbered_items(text):
    """Return ``(number, body)`` pairs for items like '1. a', '2) b', '3: c' or '4- d'."""
    pattern = r"(?:^|\n)(\d+)[\.\:\)\-]\s*(.+?)(?=\n\d+[\.\:\)\-]|\Z)"
    matches = re.findall(pattern, text.strip(), flags=re.DOTALL)
    return [(int(num), body.strip()) for num, body in matches]


def _labels_by_number(response, label_pattern, flags=0):
    """Map each item number of a numbered response to the first label found in its body."""
    labels = {}
    for num, body in _parse_numbered_items(response):
        # Search the body only: the item number itself must never be read as a label.
        found = re.search(label_pattern, body, flags)
        if found and num not in labels:
            labels[num] = found.group(1)
    return labels


def _ask_mistral_after_marker(prompt, marker, max_new_tokens=512):
    """Greedy-decode a completion for ``prompt`` and return the text after the last ``marker``."""
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    # Decode only the new tokens so a truncated prompt cannot leak into the answer.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    completion = mistral_tokenizer.decode(new_tokens, skip_special_tokens=True)
    return completion.split(marker)[-1].strip()


def write_full_batch_classified_output_to_csv_2(
    questions_raw,
    context_text,
    output_csv="final_questions_classified.csv"
):
    """Classify generated questions and write them to a CSV file.

    Each question gets a Bloom's Taxonomy level (BERT classifier), a difficulty
    and a mark allocation (both from batched Mistral prompts). Labels are matched
    to questions by item number, so a missing or extra line in a model response
    yields "Unknown" for that question instead of an IndexError or a shifted
    label.

    Args:
        questions_raw: Numbered question list as raw model output.
        context_text: Unused; kept for interface compatibility.
        output_csv: Path of the CSV file to write.

    Returns:
        ``output_csv``. The file is also offered for download through Colab.
    """
    # Step 1: Extract numbered question list
    parsed = _parse_numbered_items(questions_raw)
    numbers = [num for num, _ in parsed]
    questions = [body for _, body in parsed]

    # Both prompts list the questions renumbered 1..n.
    numbered_questions = "\n".join(
        [f"{i+1}. {q}" for i, q in enumerate(questions)]
    )

    difficulty_by_number = {}
    marks_by_number = {}
    if questions:
        # Step 2: Batch classify difficulty
        bulk_difficulty_prompt = numbered_questions
        difficulty_prompt = f"""
You are a question analysis assistant.

You must classify each question into one of the following difficulty levels based on its complexity, required depth of understanding, and reasoning effort.

Guidelines:
- **Easy**: Requires simple recall or basic understanding. Usually direct questions that can be answered in a sentence or less.
- **Medium**: Requires moderate understanding, application of concepts, or comparison. Involves 2–3 steps of reasoning or synthesis.
- **Hard**: Requires deep understanding, critical thinking, and multi-step reasoning. May involve evaluation, derivation, or synthesis of concepts.

Now analyze the following:

Question:
{bulk_difficulty_prompt}

### Difficulty levels:

"""
        difficulty_response = _ask_mistral_after_marker(difficulty_prompt, "### Difficulty levels:")
        print(difficulty_response)
        difficulty_by_number = _labels_by_number(
            difficulty_response, r"(Easy|Medium|Hard)", re.IGNORECASE
        )

        # Step 3: Batch classify marks
        bulk_marks_prompt = numbered_questions
        marks_prompt = f"""
You are a question evaluation assistant.

You must classify each question into **2, 4, or 8 marks** based on its depth, length of answer required, and reasoning effort.

Guidelines:
- **2 Marks**: Simple questions requiring one-word or one-line answers. Factual recall.
- **4 Marks**: Medium complexity. Conceptual explanation or comparisons in 3-4 sentences.
- **8 Marks**: High complexity. Analytical, application-based, or multi-point descriptive answers.

Now analyze the following:

Question:
{bulk_marks_prompt}

### Marks Classified:

"""
        marks_response = _ask_mistral_after_marker(marks_prompt, "### Marks Classified:")
        print(marks_response)
        # Searching the whole "2. 8 Marks" line would match the item number "2"
        # first; _labels_by_number only looks at the text after the number.
        marks_by_number = _labels_by_number(marks_response, r"\b(2|4|8)\b")
    else:
        print("⚠️ No numbered questions found in the generated text.")

    # Step 4: Classify Bloom’s taxonomy (one classifier call per question)
    blooms_output = [classify_blooms_taxonomy(q) for q in questions]

    # Step 5: Compile final rows
    all_rows = []
    for idx, question in enumerate(questions):
        bloom_label, bloom_probs = blooms_output[idx]
        prompt_number = idx + 1  # numbering used inside the two prompts

        difficulty = difficulty_by_number.get(prompt_number)
        difficulty = difficulty.capitalize() if difficulty else "Unknown"

        marks = marks_by_number.get(prompt_number)
        marks = f"{marks} marks" if marks else "Unknown"

        all_rows.append([
            numbers[idx], question, bloom_label, difficulty, marks,
            bloom_probs.get("BT1 (Remembering)", 0), bloom_probs.get("BT2 (Understanding)", 0),
            bloom_probs.get("BT3 (Applying)", 0), bloom_probs.get("BT4 (Analyzing)", 0),
            bloom_probs.get("BT5 (Evaluating)", 0), bloom_probs.get("BT6 (Creating)", 0)
        ])

    # Step 6: Save to CSV
    with open(output_csv, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            "No", "Question", "Bloom's Taxonomy Level", "Difficulty", "Marks",
            "BT1 (Remembering)", "BT2 (Understanding)", "BT3 (Applying)",
            "BT4 (Analyzing)", "BT5 (Evaluating)", "BT6 (Creating)"
        ])
        writer.writerows(all_rows)

    print(f"\n✅ Saved {len(all_rows)} questions with classification to '{output_csv}'")
    files.download(output_csv)
    return output_csv


In [None]:
no = int(input("Enter No of Questions: "))
# Give the model the plain page text. Interpolating the list of dicts directly
# would put a Python repr (with escaped newlines) into the prompt.
questions_context = "\n\n".join(entry["text"] for entry in final_full_texts)
questions_raw = generate_questions_with_mistral_bulk(questions_context, query, no)
print(questions_raw)
write_full_batch_classified_output_to_csv_2(questions_raw, questions_context)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load CSV (update path if necessary)
df = pd.read_csv("/content/final_questions_classified.csv")


# Clean column names
df.columns = df.columns.str.strip()

# Normalize "Marks": "4 marks" -> 4. The CSV writer stores "Unknown" when no
# mark could be parsed; coerce such values to <NA> rather than crashing in
# astype(int). The nullable Int64 dtype keeps the remaining values integral.
marks_text = df['Marks'].astype(str).str.replace(' marks', '', regex=False).str.strip()
df['Marks'] = pd.to_numeric(marks_text, errors='coerce').astype('Int64')

# ------------------ PART 1: CATEGORY COUNTS ------------------
print("🔹 Count by Difficulty:\n", df['Difficulty'].value_counts(), "\n")
print("🔹 Count by Marks:\n", df['Marks'].value_counts().sort_index(), "\n")
print("🔹 Count by Bloom's Taxonomy Level:\n", df["Bloom's Taxonomy Level"].value_counts(), "\n")

# ------------------ PART 2: DIFFICULTY vs MARKS ------------------
matrix_diff_marks = pd.crosstab(df['Difficulty'], df['Marks'])
print("\n📊 Matrix: Difficulty vs Marks\n", matrix_diff_marks)

plt.figure(figsize=(8, 5))
sns.heatmap(matrix_diff_marks, annot=True, fmt='d', cmap='Blues')
plt.title("Difficulty vs Marks")
plt.xlabel("Marks")
plt.ylabel("Difficulty")
plt.tight_layout()
plt.show()

# ------------------ PART 3: MARKS vs BLOOM ------------------
matrix_marks_bloom = pd.crosstab(df['Marks'], df["Bloom's Taxonomy Level"])
print("\n📊 Matrix: Marks vs Bloom's Taxonomy\n", matrix_marks_bloom)

plt.figure(figsize=(10, 6))
sns.heatmap(matrix_marks_bloom, annot=True, fmt='d', cmap='Greens')
plt.title("Marks vs Bloom's Taxonomy Level")
plt.xlabel("Bloom's Taxonomy Level")
plt.ylabel("Marks")
plt.tight_layout()
plt.show()



# Extract unique sorted filter values (plain Python ints for the marks dropdown)
bloom_options = ['Any'] + sorted(df["Bloom's Taxonomy Level"].unique())
difficulty_options = ['Any'] + sorted(df['Difficulty'].unique())
marks_options = ['Any'] + [int(m) for m in sorted(df['Marks'].dropna().unique())]

# Filter function
def filter_questions(bloom, difficulty, marks):
    """Return a Markdown listing of the questions in ``df`` that match the filters.

    Each argument is either ``'Any'`` (no filtering on that column) or the value
    to match; ``marks`` is converted with ``int`` before comparison. A fixed
    message is returned when nothing matches.
    """
    selection = df.copy()

    if bloom != 'Any':
        selection = selection[selection["Bloom's Taxonomy Level"] == bloom]
    if difficulty != 'Any':
        selection = selection[selection['Difficulty'] == difficulty]
    if marks != 'Any':
        selection = selection[selection['Marks'] == int(marks)]

    if selection.empty:
        return "❌ No questions match the selected filters."

    # One Markdown entry per remaining row, concatenated in row order
    return "".join(
        f"🔸 **Q{row['No']}** ({row['Difficulty']}, {row['Marks']} marks, "
        f"{row[chr(66) + 'loom' + chr(39) + 's Taxonomy Level']}):\n{row['Question']}\n\n"
        for _, row in selection.iterrows()
    )

# Gradio UI
# gradio is installed by the pip cell at the top, but `gr` is not imported in
# any visible cell; import it here so this cell does not fail with NameError.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("## 🔍 Filter and View Questions")

    # One dropdown per filter; 'Any' disables that filter.
    with gr.Row():
        bloom_dd = gr.Dropdown(choices=bloom_options, label="Bloom's Taxonomy Level", value='Any')
        diff_dd = gr.Dropdown(choices=difficulty_options, label="Difficulty", value='Any')
        marks_dd = gr.Dropdown(choices=marks_options, label="Marks", value='Any')

    output_box = gr.Markdown()
    btn = gr.Button("🔎 Show Questions")

    # Render the filtered question list into the Markdown box on click.
    btn.click(fn=filter_questions,
              inputs=[bloom_dd, diff_dd, marks_dd],
              outputs=output_box)

# Launch the app
demo.launch(share=True)


In [None]:
'''def generate_answer_key_with_mistral(questions_output, context):
    prompt = f"""
You are an AI assistant specialized in answering technical questions.

Context:
{context}

Questions:
{questions_output}

Generate answers for each questions listed above:

### Answer Key:
"""
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text)
    return generated_text.split("### Answer Key:")[-1].strip()

answers_raw = generate_answer_key_with_mistral(questions_raw, final_full_texts)

print("Answers:\n", answers_raw)
#query = input("\nEnter your query: ")
bulk_process_pages(final_full_texts, query, batch_size=5)
'''

In [None]:
'''import re
import csv
import torch
import torch.nn.functional as F
from google.colab import files

# Bloom's Taxonomy Labels
bloom_labels = {
    0: "BT1 (Remembering)",
    1: "BT2 (Understanding)",
    2: "BT3 (Applying)",
    3: "BT4 (Analyzing)",
    4: "BT5 (Evaluating)",
    5: "BT6 (Creating)"
}

difficulty_labels = {
    0: "Easy",
    1: "Medium",
    2: "Hard"
}

mark_labels = {
    0: "2 Marks",
    1: "4 Marks",
    2: "8 Marks"
}

# Utility functions
def extract_numbered_list(text):
    items = re.split(r'\n(?=\d+\.\s)', text.strip())
    return [item.strip() for item in items if item.strip()]

def split_number_and_text(item):
    match = re.match(r'^(\d+)\.\s+(.*)', item, re.DOTALL)
    if match:
        return int(match.group(1)), match.group(2).strip()
    return None, item.strip()

# Mistral Question Generation
def generate_questions_with_mistral_bulk(pages_text, user_query):
    prompt = f"""
You are an AI assistant specialized in question generation.
Your goal is to generate insightful questions based on the given context and user query.

Context:
{pages_text}

User Query (Focus Topic): {user_query}

### Reasoning:
- Step 1: Identify key points and concepts from the context relevant to the query
- Step 2: Consider what types of questions best explore the topic of interest
- Step 3: Formulate meaningful and topic-specific questions

### Questions:
"""
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=1024,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text.split("### Questions:")[-1].strip())
    return generated_text.split("### Questions:")[-1].strip()


# Mistral Answer Generation
def generate_answer_key_with_mistral(questions_output, context):
    prompt = f"""
You are an AI assistant specialized in answering technical questions.

Context:
{context}

Questions:
{questions_output}

### Answer Key:
"""
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=2048,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text.split("### Answer Key:")[-1].strip())
    return generated_text.split("### Answer Key:")[-1].strip()

# Bloom’s Classifier
def classify_blooms_taxonomy(question):
    inputs = blooms_tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        outputs = blooms_model(**inputs)
    probs = F.softmax(outputs.logits, dim=1).squeeze().tolist()
    predicted_idx = torch.argmax(outputs.logits, dim=1).item()
    predicted_label = bloom_labels[predicted_idx]
    prob_dict = {bloom_labels[i]: round(probs[i], 4) for i in range(6)}
    return predicted_label, prob_dict

def classify_difficulty(question):
    inputs = difficulty_tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        outputs = difficulty_model(**inputs)
    probs = F.softmax(outputs.logits, dim=1).squeeze().tolist()
    predicted_idx = torch.argmax(outputs.logits, dim=1).item()
    predicted_label = difficulty_labels[predicted_idx]
    prob_dict = {difficulty_labels[i]: round(probs[i], 4) for i in range(3)}
    return predicted_label, prob_dict

def classify_marks(question):
    inputs = marks_tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        outputs = marks_model(**inputs)
    probs = F.softmax(outputs.logits, dim=1).squeeze().tolist()
    predicted_idx = torch.argmax(outputs.logits, dim=1).item()
    predicted_label = mark_labels[predicted_idx]
    prob_dict = {mark_labels[i]: round(probs[i], 4) for i in range(3)}
    return predicted_label, prob_dict

def bulk_process_pages(pages_array, query, batch_size, output_csv="final_questions_classified.csv"):
    all_qa_pairs = []

    for i in range(0, len(pages_array), batch_size):
        batch_pages = pages_array[i:i+batch_size]
        combined_text = "\n".join(page["text"] for page in batch_pages)
        print(f"\n📄 Processing Pages {i+1} to {min(i+batch_size, len(pages_array))}...")

        if len(mistral_tokenizer.tokenize(combined_text)) > 4096:
            print("⚠️ Combined text exceeds token limit. Consider reducing batch size.")
            continue

        questions = generate_questions_with_mistral_bulk(combined_text, query)
        answers = generate_answer_key_with_mistral(questions_output=questions, context=combined_text)

        question_items = extract_numbered_list(questions)
        answer_items = extract_numbered_list(answers)

        for q_item, a_item in zip(question_items, answer_items):
            q_num, q_text = split_number_and_text(q_item)
            a_num, a_text = split_number_and_text(a_item)

            if q_num == a_num and q_text and a_text:
                print(f"\nQ{q_num}: {q_text}\nA{a_num}: {a_text}")
                all_qa_pairs.append((q_num, q_text, a_text))
            else:
                print(f"⚠️ Mismatch: Question {q_num} doesn't match Answer {a_num}")

    with open(output_csv, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            "No", "Question", "Bloom's Taxonomy Level", "Difficulty Level", "Marks",
            "Answer",
            "BT1 (Remembering)", "BT2 (Understanding)", "BT3 (Applying)",
            "BT4 (Analyzing)", "BT5 (Evaluating)", "BT6 (Creating)",
            "Easy", "Medium", "Hard",
            "2 Marks", "4 Marks", "8 Marks"
        ])

        for idx, (num, question, answer) in enumerate(all_qa_pairs, 1):
            bloom_level, bloom_probs = classify_blooms_taxonomy(question)
            difficulty_level, diff_probs = classify_difficulty(question)
            marks_level, marks_probs = classify_marks(question)

            writer.writerow([
                idx, question, bloom_level, difficulty_level, marks_level,
                answer,
                bloom_probs["BT1 (Remembering)"], bloom_probs["BT2 (Understanding)"],
                bloom_probs["BT3 (Applying)"], bloom_probs["BT4 (Analyzing)"],
                bloom_probs["BT5 (Evaluating)"], bloom_probs["BT6 (Creating)"],
                diff_probs["Easy"], diff_probs["Medium"], diff_probs["Hard"],
                marks_probs["2 Marks"], marks_probs["4 Marks"], marks_probs["8 Marks"]
            ])

    print(f"\n✅ Saved {len(all_qa_pairs)} classified Q&A pairs to '{output_csv}'")
    files.download(output_csv)
'''

In [None]:
'''
def extract_numbered_list(text):
    """Extracts numbered items (e.g., 1. ... 2. ...) from a single string."""
    items = re.split(r'\n(?=\d+\.\s)', text.strip())
    return [item.strip() for item in items if item.strip()]

def split_number_and_text(item):
    """Splits '1. Some text here' into (1, 'Some text here')"""
    match = re.match(r'^(\d+)\.\s+(.*)', item, re.DOTALL)
    if match:
        return int(match.group(1)), match.group(2).strip()
    return None, item.strip()

def generate_questions_with_mistral_bulk(pages_text, user_query):
    """Generate questions about `user_query` from `pages_text` with the Mistral model.

    Builds a step-by-step question-generation prompt, samples a continuation
    from `mistral_model` and returns only the newly generated text.

    Args:
        pages_text: Context text interpolated into the prompt.
        user_query: Focus topic interpolated into the prompt.

    Returns:
        The generated question text, stripped. If the model repeats the
        "### Questions:" header, only the text after its last occurrence
        is returned.
    """
    prompt = f"""
You are an AI assistant specialized in question generation.
Your goal is to generate insightful questions based on the given context and user query.

Please follow these steps:
1. Analyze the provided context to identify key points related to the user query.
2. Focus on the topic specified by the user while framing the questions.
3. Generate questions first.

Context:
{pages_text}

User Query (Focus Topic): {user_query}

### Reasoning:
- Step 1: Identify key points and concepts from the context relevant to the query
- Step 2: Consider what types of questions best explore the topic of interest
- Step 3: Formulate meaningful and topic-specific questions

### Questions:
"""

    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")

    output = mistral_model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation: when a long context is truncated the
    # "### Questions:" marker at the end of the prompt is cut off, and
    # splitting the full decoded text on it would return the whole prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = mistral_tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Still drop the header in case the model echoes it in its answer.
    return generated_text.split("### Questions:")[-1].strip()



def generate_answer_key_with_mistral(questions_output, context):
    """Generate an answer key for `questions_output` from `context` with the Mistral model.

    Builds an answer-key prompt, samples a continuation from `mistral_model`
    and returns only the newly generated text.

    Args:
        questions_output: Question text interpolated into the prompt.
        context: Context text interpolated into the prompt.

    Returns:
        The generated answer-key text, stripped. If the model repeats the
        "### Answer Key:" header, only the text after its last occurrence
        is returned.
    """
    prompt = f"""
You are an AI assistant specialized in answering technical questions.

Please use the following context to generate a precise answer key for each question.

Context:
{context}

Questions:
{questions_output}


### Answer Key:
"""

    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")

    output = mistral_model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation: when a long context is truncated the
    # "### Answer Key:" marker at the end of the prompt is cut off, and
    # splitting the full decoded text on it would return the whole prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = mistral_tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Still drop the header in case the model echoes it in its answer.
    return generated_text.split("### Answer Key:")[-1].strip()



def generate_questions_and_answers(pages_text, user_query, csv_filename="generated_questions_answers.csv"):
    """Generate questions and an answer key, pair them up and save them as CSV.

    Calls generate_questions_with_mistral_bulk and
    generate_answer_key_with_mistral, prints both raw outputs, parses each
    into numbered items and writes the matched pairs to `csv_filename` with
    the columns No, Question, Answer. The file is then offered for download
    through google.colab.files.

    Answers are matched to questions by their list number rather than by
    position, so one missing or extra answer does not misalign every pair
    after it. Items without a leading number are never paired.

    Args:
        pages_text: Context text handed to both generation calls.
        user_query: Focus topic for question generation.
        csv_filename: Path of the CSV file to (over)write.
    """
    from google.colab import files  # local import, as in classify_questions_and_save

    questions = generate_questions_with_mistral_bulk(pages_text, user_query)
    answers = generate_answer_key_with_mistral(questions_output=questions, context=pages_text)
    print(questions)
    print(answers)

    # Index answers by number; the first answer seen for a number wins.
    answers_by_number = {}
    for a_item in extract_numbered_list(answers):
        a_num, a_text = split_number_and_text(a_item)
        if a_num is not None:
            answers_by_number.setdefault(a_num, a_text)

    # Look up each question's answer by its number.
    qa_pairs = []
    for q_item in extract_numbered_list(questions):
        q_num, q_text = split_number_and_text(q_item)
        if q_num is not None and q_num in answers_by_number:
            qa_pairs.append((q_num, q_text, answers_by_number[q_num]))
        else:
            print(f"⚠️ Mismatch: no answer found for Question {q_num}")

    # Write the matched pairs to CSV.
    with open(csv_filename, mode='w', newline="", encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['No', 'Question', 'Answer'])
        writer.writerows(qa_pairs)

    print(f"\n✅ Saved {len(qa_pairs)} Q&A pairs to '{csv_filename}'")

    files.download(csv_filename)



import torch.nn.functional as F
# Index -> label lookup for the six Bloom's taxonomy levels (keys 0 through 5).
bloom_labels = dict(enumerate((
    "BT1 (Remembering)",
    "BT2 (Understanding)",
    "BT3 (Applying)",
    "BT4 (Analyzing)",
    "BT5 (Evaluating)",
    "BT6 (Creating)",
)))

def classify_blooms_taxonomy(question):
    """Classify a question into a Bloom's taxonomy level with `blooms_model`.

    Args:
        question: Question text to classify (the indexing below assumes a
            single question, i.e. one row of logits -- TODO confirm).

    Returns:
        (predicted_label, prob_dict), where prob_dict maps each of the six
        labels in `bloom_labels` to its softmax probability rounded to
        4 decimal places.
    """
    encoded = blooms_tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        logits = blooms_model(**encoded).logits

    # Softmax across the class dimension, flattened to a plain list.
    level_probs = F.softmax(logits, dim=1).squeeze().tolist()
    top_index = torch.argmax(logits, dim=1).item()
    predicted_label = bloom_labels[top_index]

    prob_dict = {}
    for level in range(6):
        prob_dict[bloom_labels[level]] = round(level_probs[level], 4)
    return predicted_label, prob_dict

def classify_questions_and_save(input_csv="generated_questions_answers.csv", output_csv="final_questions_classified.csv"):
    """Add Bloom's taxonomy classifications to a question/answer CSV.

    Reads `input_csv` (a header row, then rows whose first three columns are
    number, question, answer), classifies every question with
    classify_blooms_taxonomy and writes `output_csv` containing the original
    three columns, the predicted level and one probability column per level.
    The output file is then offered for download through google.colab.files.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the CSV to (over)write.
    """
    probability_columns = [
        "BT1 (Remembering)", "BT2 (Understanding)", "BT3 (Applying)",
        "BT4 (Analyzing)", "BT5 (Evaluating)", "BT6 (Creating)",
    ]

    with open(input_csv, mode='r', encoding='utf-8') as infile, open(output_csv, mode='w', newline="", encoding='utf-8') as outfile:
        rows = csv.reader(infile)
        out = csv.writer(outfile)

        out.writerow(["No", "Question", "Answer", "Bloom's Taxonomy Level"] + probability_columns)

        # The first input row is the header; discard it.
        next(rows)

        for record in rows:
            no, question, answer = record[0], record[1], record[2]
            bloom_level, probs = classify_blooms_taxonomy(question)
            level_probs = [probs[column] for column in probability_columns]
            out.writerow([no, question, answer, bloom_level] + level_probs)

    from google.colab import files
    files.download(output_csv)



def bulk_process_pages(pages_array,query,batch_size):
    """Run question generation and classification over pages in fixed-size batches.

    For each consecutive slice of `batch_size` pages, the pages' "text"
    fields are joined with newlines and passed to
    generate_questions_and_answers, after which classify_questions_and_save
    is run with its default file names.

    Args:
        pages_array: Sequence of page mappings, each with a "text" entry.
        query: Focus topic forwarded to question generation.
        batch_size: Number of pages combined per batch.
    """
    total_pages = len(pages_array)
    for start in range(0, total_pages, batch_size):
        stop = start + batch_size
        chunk = pages_array[start:stop]
        combined_text = "\n".join(page["text"] for page in chunk)
        print(f"📄 Processing Pages {start+1} to {min(stop, total_pages)}...")

        # Generate Q&A for this batch, then classify the CSV it just wrote.
        generate_questions_and_answers(combined_text,query)
        classify_questions_and_save()



# Cell entry point: read the focus topic from the user, then process the pages
# in batches of five.
# NOTE(review): final_full_texts is defined outside this cell; presumably a
# list of page dicts with a "text" key, since bulk_process_pages reads
# page["text"] -- confirm.
query = input("\nEnter your query: ")
bulk_process_pages(final_full_texts,query,batch_size=5)
'''
