<a href="https://colab.research.google.com/github/cipB14/Questify/blob/main/fewshot_cot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries and Their Use in Questify Project

| Library/Tool        | Use Case in Questify                                                                       |
|---------------------|----------------------------------------------------------------------|
| pdf4llm           | Extracts structured content (text, images, tables) from PDFs         |
| transformers      | Loads and runs LLMs (e.g., Mistral for question generation, BERT for classification) |
| accelerate        | Speeds up model inference across GPU/CPU environments                |
| bitsandbytes      | Enables low-bit quantization for memory-efficient LLMs               |
| lancedb           | Stores SBERT embeddings for hybrid search of study content           |
| tantivy           | Provides fast keyword-based full-text indexing and search            |


| Model Name                     | Type                          | Layers | Max Seq Length | Use Case                              | Labels / Output      |
|-------------------------------|-------------------------------|--------|----------------|----------------------------------------|-----------------------|
| Mistral-7B-Instruct-v0.3      | MistralForCausalLM            | 32     | 32,768         | Question Generation                    | Text (Generated Qs)   |
| all-MiniLM-L6-v2              | BertModel                     | 6      | 512            | Sentence Embeddings for Retrieval      | Embeddings            |
| ms-marco-TinyBERT-L6          | BertForSequenceClassification | 6      | 512            | Passage Reranking                      | Relevance Score       |
| cip29/blooms_bert             | BertForSequenceClassification | 12     | 512            | Bloom’s Taxonomy Classification        | 6 Bloom’s Levels      |


In [1]:
!pip install -qU pdf4llm pymupdf transformers accelerate bitsandbytes tantivy gradio lancedb==0.20.0

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.6/32.6 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!huggingface-cli login
#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


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `cipb14` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authe

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig

#Enable 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit precision
    bnb_4bit_compute_dtype="float16",  # Use float16 for faster computation
    bnb_4bit_use_double_quant=True,  # Improves efficiency
    bnb_4bit_quant_type="nf4"  # NF4 quantization for better accuracy
)

#Load Tokenizer & Model with Quantization
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    quantization_config=bnb_config,
    device_map="cuda"  # Automatically assigns model to GPU
)

#Load Bloom’s Taxonomy BERT Model
blooms_model_name = "cip29/blooms_bert"
blooms_tokenizer = BertTokenizer.from_pretrained(blooms_model_name)
blooms_model = BertForSequenceClassification.from_pretrained(blooms_model_name, num_labels=6).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

### sentence-transformers/all-MiniLM-L6-v2
{
  "max_seq_length": 256,
  "do_lower_case": false
}
### Tokenizer Config
{
  "do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512
  }

  This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.

######  pdf4llm.to_markdown()
outputs a list of dictinoaries containing

- metadata:
  - format: "Image"
  - title: ""
  - author: ""
  - subject: ""
  - keywords: ""
  - creator: ""
  - producer: ""
  - creationDate: ""
  - modDate: ""
  - trapped: ""
  - encryption: ""
  - file_path: "1-2.png"
  - page_count: 1
  - page: 1

- toc_items: []

- tables: []

- images:
  - number: 0
  - bbox: Rect(0.0, 50.0, 648.0, 310.0)
  - transform: (648.0, 0.0, 0.0, 360.0, 0.0, 0.0)
  - width: 2700
  - height: 1500
  - colorspace: 3
  - cs-name: "DeviceRGB"
  - xres: 300
  - yres: 300
  - bpc: 8
  - size: 88631
  - has-mask: False

- graphics: []

- text: "-----"

- words: []


In [4]:
!rm -rf /content/*.pdf

import pdf4llm
from google.colab import files
import numpy as np
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from transformers import AutoTokenizer

# Connect to LanceDB
db = lancedb.connect("/content")

# Initialize SBERT Embedder
embedder = get_registry().get("huggingface").create(
    name='sentence-transformers/all-MiniLM-L6-v2',
    device="cuda"
)

# Load tokenizer to chunk text
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


class PDFSchema(LanceModel):
    text: str = embedder.SourceField()              # chunk text (embedding input)
    vector: Vector(embedder.ndims()) = embedder.VectorField()
    page_name: str                                  # image/visual ID
    full_text: str                                   # full page text for reference
    page: int                                        # page number to detect duplicates



# Upload PDFs
uploaded = files.upload()
print(list(uploaded.keys()))


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Saving Textbook-AjayDKshemkalyani_MukeshSinghal.pdf to Textbook-AjayDKshemkalyani_MukeshSinghal.pdf
['Textbook-AjayDKshemkalyani_MukeshSinghal.pdf']


- Including overlap between chunks helps preserve context across chunk boundaries — especially useful when text is split in the middle of a sentence or paragraph(sliding window approach)



In [5]:
# Define chunking parameters
CHUNK_SIZE = 480
OVERLAP = 64

# Function to process and chunk text with a sliding window
def split_text_into_chunks(text, page_path, full_text, page_number):
    input_ids = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    chunks = []

    for i in range(0, len(input_ids), CHUNK_SIZE - OVERLAP):
        chunk_ids = input_ids[i:i + CHUNK_SIZE]

        if len(chunk_ids) < 10:  # Skip very small chunks
            continue

        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
        chunk_name = f"{page_path}_chunk_{i // (CHUNK_SIZE - OVERLAP) + 1}"

        chunks.append({
            "text": chunk_text,
            "page_name": chunk_name,
            "full_text": full_text,
            "page": page_number
        })

    return chunks

# Collect all entries
entries = []

# Process each uploaded file
uploaded_files = list(uploaded.keys())[:2]  # Limiting to 2 files for processing
for pdf_filename in uploaded_files:
    print(f"\nProcessing: {pdf_filename}")

    # Ask for page numbers or ranges
    page_input = input(f"Enter pages or ranges for {pdf_filename} (e.g., 1,3-5,7): ")

    # Parse user input into zero-based page indices
    selected_pages = []
    for part in page_input.split(","):
        part = part.strip()
        if "-" in part:
            start, end = map(int, part.split("-"))
            selected_pages.extend(range(start - 1, end))  # Convert to zero-based index
        else:
            selected_pages.append(int(part) - 1)

    # Extract specified pages
    selected_page_data = pdf4llm.to_markdown(pdf_filename, page_chunks=True, pages=selected_pages)

    # Process each page
    for page_data in selected_page_data:
        full_text = page_data["text"]
        page_path = page_data["metadata"]["file_path"]
        page_number = page_data["metadata"]["page"]

        if not full_text.strip():  # Skip empty pages
            continue

        # Split text into overlapping chunks with full page context
        chunks = split_text_into_chunks(full_text, page_path, full_text, page_number)

        # Add chunks to entries
        entries.extend(chunks)

# Store all entries in LanceDB
tbl = db.create_table("pdf_data", schema=PDFSchema, mode="overwrite")
tbl.add(entries)

print("\nAll selected pages have been chunked and stored in LanceDB with full page context!")



Processing: Textbook-AjayDKshemkalyani_MukeshSinghal.pdf
Enter pages or ranges for Textbook-AjayDKshemkalyani_MukeshSinghal.pdf (e.g., 1,3-5,7): 1-756


Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors



All selected pages have been chunked and stored in LanceDB with full page context!


In [9]:
from lancedb.rerankers import CrossEncoderReranker

# Initialize reranker
reranker = CrossEncoderReranker()

# User query
query = input("\nEnter your query: ")

# Create full-text search index on the 'text' field
tbl.create_fts_index("text", replace=True)

# Search and rerank
results = tbl.search(query, query_type="hybrid").rerank(reranker=reranker).limit(5).to_list()

# Dictionary to hold unique pages
unique_pages = {}

# Filter out duplicates using the 'page' key
for res in results:
    page_number = res.get("page")
    if page_number not in unique_pages:
        unique_pages[page_number] = res["full_text"]

# Final list of unique full_text values with page number
final_full_texts = [{"page": page, "text": text} for page, text in unique_pages.items()]

# Optional: Display them
print("\nUnique full_text entries by page:\n")
for i, entry in enumerate(final_full_texts, 1):
    print(f"{i} Page {entry['page']}:\n{entry['text'][:]}")



Enter your query: Chandy Lamport

Unique full_text entries by page:

1 Page 117:
**97** **4.4 Variations of the Chandy–Lamport algorithm**

Thus, the recorded global state is a valid state in an equivalent execution
and if a stable property (i.e., a property that persists such as termination or
deadlock) holds in the system before the snapshot algorithm begins, it holds
in the recorded global snapshot. Therefore, a recorded global state is useful
in detecting stable properties.
A physical interpretation of the collected global state is as follows: consider
the two instants of recording of the local states in the banking example.
If the cut formed by these instants is viewed as being an elastic band and if
the elastic band is stretched so that it is vertical, then recorded states of all

processes occur simultaneously at one physical instant, and the recorded
global state occurs in the execution that is depicted in this modified space–
time diagram. This is called the rubber-band crite

### Mistral-7B-Instruct-v0.3/config.json

{
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  `"max_position_embeddings": 32768,`
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.0.dev0",
  "use_cache": true,
  "vocab_size": 32768
}

In [10]:
import re
import csv
import torch
import torch.nn.functional as F
from google.colab import files


# Mistral Question Generation
def generate_questions_with_mistral_bulk(pages_text, user_query):
    prompt = f"""

You are an AI assistant specialized in educational question generation.

Your task is to generate insightful, diverse questions from the provided **context**, focusing on the given **user query topic**.

Each question must be categorized based on:
- **Difficulty**: Easy / Medium / Hard
- **Bloom’s Taxonomy Level**:
  - L1: Remember
  - L2: Understand
  - L3: Apply
  - L4: Analyze
  - L5: Evaluate
  - L6: Create
- **Marks**: 2 / 4 / 8

Use the following few-shot examples to guide your reasoning on how to assign **difficulty**, **Bloom’s level**, and **marks**.

---

### Few-Shot Examples

**Example 1:**
Q: What is overfitting in machine learning models?
→ Difficulty: Easy
→ Bloom’s Level: L1 (Remember)
→ Marks: 2
**Reasoning:** This question asks for the definition of a basic, frequently taught concept. It does not require reasoning or explanation—only memory recall.

---

**Example 2:**
Q: Explain how regularization techniques help in avoiding overfitting.
→ Difficulty: Medium
→ Bloom’s Level: L2 (Understand)
→ Marks: 4
**Reasoning:** Requires understanding two concepts—overfitting and regularization—and the relationship between them. It involves comprehension, not just recall.


Now, apply this structure to the given user query and context below:

---

### 📌 User Query (Focus Topic):
{user_query}

### 📚 Context:
{pages_text}
"""
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=2048,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text)



def bulk_process_pages(pages_array, query, batch_size):
    all_qa_pairs = []

    for i in range(0, len(pages_array), batch_size):
        batch_pages = pages_array[i:i+batch_size]
        combined_text = "\n".join(page["text"] for page in batch_pages)
        print(f"\n📄 Processing Pages {i+1} to {min(i+batch_size, len(pages_array))}...")

        # Generate questions and answers
        questions = generate_questions_with_mistral_bulk(combined_text, query)






In [11]:

#query = input("\nEnter your query: ")
bulk_process_pages(final_full_texts, query, batch_size=5)


📄 Processing Pages 1 to 5...


You are an AI assistant specialized in educational question generation.

Your task is to generate insightful, diverse questions from the provided **context**, focusing on the given **user query topic**.

Each question must be categorized based on:
- **Difficulty**: Easy / Medium / Hard
- **Bloom’s Taxonomy Level**:
  - L1: Remember
  - L2: Understand
  - L3: Apply
  - L4: Analyze
  - L5: Evaluate
  - L6: Create
- **Marks**: 2 / 4 / 8

Use the following few-shot examples to guide your reasoning on how to assign **difficulty**, **Bloom’s level**, and **marks**.

---

### Few-Shot Examples 

**Example 1:**
Q: What is overfitting in machine learning models?
→ Difficulty: Easy
→ Bloom’s Level: L1 (Remember)
→ Marks: 2
**Reasoning:** This question asks for the definition of a basic, frequently taught concept. It does not require reasoning or explanation—only memory recall.

---

**Example 2:**
Q: Explain how regularization techniques help in avoiding overfitti

In [None]:
'''
def extract_numbered_list(text):
    """Extracts numbered items (e.g., 1. ... 2. ...) from a single string."""
    items = re.split(r'\n(?=\d+\.\s)', text.strip())
    return [item.strip() for item in items if item.strip()]

def split_number_and_text(item):
    """Splits '1. Some text here' into (1, 'Some text here')"""
    match = re.match(r'^(\d+)\.\s+(.*)', item, re.DOTALL)
    if match:
        return int(match.group(1)), match.group(2).strip()
    return None, item.strip()

def generate_questions_with_mistral_bulk(pages_text, user_query):
    prompt = f"""
You are an AI assistant specialized in question generation.
Your goal is to generate insightful questions based on the given context and user query.

Please follow these steps:
1. Analyze the provided context to identify key points related to the user query.
2. Focus on the topic specified by the user while framing the questions.
3. Generate questions first.

Context:
{pages_text}

User Query (Focus Topic): {user_query}

### Reasoning:
- Step 1: Identify key points and concepts from the context relevant to the query
- Step 2: Consider what types of questions best explore the topic of interest
- Step 3: Formulate meaningful and topic-specific questions

### Questions:
"""

    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")

    output = mistral_model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)
    #print(generated_text)
    return generated_text.split("### Questions:")[-1].strip()



def generate_answer_key_with_mistral(questions_output, context):
    prompt = f"""
You are an AI assistant specialized in answering technical questions.

Please use the following context to generate a precise answer key for each question.

Context:
{context}

Questions:
{questions_output}


### Answer Key:
"""

    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")

    output = mistral_model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)
    #print(generated_text)
    return generated_text.split("### Answer Key:")[-1].strip()



def generate_questions_and_answers(pages_text, user_query,csv_filename="generated_questions_answers.csv"):
    questions = generate_questions_with_mistral_bulk(pages_text, user_query)
    answers = generate_answer_key_with_mistral(questions_output=questions, context=pages_text)
    print(questions)
    print(answers)
    question_items = extract_numbered_list(questions)
    answer_items = extract_numbered_list(answers)

    # Step 3: Map question and answer numbers
    qa_pairs = []
    for q_item, a_item in zip(question_items, answer_items):
        q_num, q_text = split_number_and_text(q_item)
        a_num, a_text = split_number_and_text(a_item)

        if q_num == a_num:
            qa_pairs.append((q_num, q_text, a_text))
        else:
            print(f"⚠️ Mismatch: Question {q_num} doesn't match Answer {a_num}")

    # Step 4: Write to CSV
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['No', 'Question', 'Answer'])

        for num, question, answer in qa_pairs:
            writer.writerow([num, question, answer])

    print(f"\n✅ Saved {len(qa_pairs)} Q&A pairs to '{csv_filename}'")

    files.download(csv_filename)



import torch.nn.functional as F
bloom_labels = {
    0: "BT1 (Remembering)",
    1: "BT2 (Understanding)",
    2: "BT3 (Applying)",
    3: "BT4 (Analyzing)",
    4: "BT5 (Evaluating)",
    5: "BT6 (Creating)"
}

def classify_blooms_taxonomy(question):
    inputs = blooms_tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():
        outputs = blooms_model(**inputs)
    probs = F.softmax(outputs.logits, dim=1).squeeze().tolist()

    predicted_idx = torch.argmax(outputs.logits, dim=1).item()
    predicted_label = bloom_labels[predicted_idx]

    prob_dict = {bloom_labels[i]: round(probs[i], 4) for i in range(6)}
    return predicted_label, prob_dict

def classify_questions_and_save(input_csv="generated_questions_answers.csv", output_csv="final_questions_classified.csv"):
    with open(input_csv, mode='r', encoding='utf-8') as infile, open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Write header
        writer.writerow([
            "No","Question", "Answer", "Bloom's Taxonomy Level",
            "BT1 (Remembering)", "BT2 (Understanding)", "BT3 (Applying)",
            "BT4 (Analyzing)", "BT5 (Evaluating)", "BT6 (Creating)"
        ])

        next(reader)  # Skip header

        for row in reader:
            no,question,answer = row[0],row[1],row[2]
            bloom_level, probs = classify_blooms_taxonomy(question)

            writer.writerow([
                no,question, answer, bloom_level,
                probs["BT1 (Remembering)"], probs["BT2 (Understanding)"],
                probs["BT3 (Applying)"], probs["BT4 (Analyzing)"],
                probs["BT5 (Evaluating)"], probs["BT6 (Creating)"]
            ])

    from google.colab import files
    files.download(output_csv)



def bulk_process_pages(pages_array,query,batch_size):



      for i in range(0, len(pages_array), batch_size):
        batch_pages = pages_array[i:i+batch_size]  # Get a batch of pages
        combined_text = "\n".join(page["text"] for page in batch_pages)
        print(f"📄 Processing Pages {i+1} to {min(i+batch_size, len(pages_array))}...")

        # Generate questions with CoT

        generate_questions_and_answers(combined_text,query)
        classify_questions_and_save()



query = input("\nEnter your query: ")
bulk_process_pages(final_full_texts,query,batch_size=5)
'''


'\ndef extract_numbered_list(text):\n    """Extracts numbered items (e.g., 1. ... 2. ...) from a single string."""\n    items = re.split(r\'\n(?=\\d+\\.\\s)\', text.strip())\n    return [item.strip() for item in items if item.strip()]\n\ndef split_number_and_text(item):\n    """Splits \'1. Some text here\' into (1, \'Some text here\')"""\n    match = re.match(r\'^(\\d+)\\.\\s+(.*)\', item, re.DOTALL)\n    if match:\n        return int(match.group(1)), match.group(2).strip()\n    return None, item.strip()\n\ndef generate_questions_with_mistral_bulk(pages_text, user_query):\n    prompt = f"""\nYou are an AI assistant specialized in question generation.\nYour goal is to generate insightful questions based on the given context and user query.\n\nPlease follow these steps:\n1. Analyze the provided context to identify key points related to the user query.\n2. Focus on the topic specified by the user while framing the questions.\n3. Generate questions first.\n\nContext:\n{pages_text}\n\nUser 