<a href="https://colab.research.google.com/github/cipB14/Questify/blob/patch1/Final_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries and Their Use in Questify Project

| Library/Tool        | Use Case in Questify                                                                       |
|---------------------|----------------------------------------------------------------------|
| pdf4llm           | Extracts structured content (text, images, tables) from PDFs         |
| transformers      | Loads and runs LLMs (e.g., Mistral for question generation, BERT for classification) |
| accelerate        | Speeds up model inference across GPU/CPU environments                |
| bitsandbytes      | Enables low-bit quantization for memory-efficient LLMs               |
| lancedb           | Stores SBERT embeddings for hybrid search of study content           |
| tantivy           | Provides fast keyword-based full-text indexing and search            |




In [1]:
!pip install -qU pdf4llm pymupdf transformers accelerate bitsandbytes lancedb tantivy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m437.0 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.1/342.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.6/32.6 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!huggingface-cli login
#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


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `cipb14` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authe

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig

#Enable 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit precision
    bnb_4bit_compute_dtype="float16",  # Use float16 for faster computation
    bnb_4bit_use_double_quant=True,  # Improves efficiency
    bnb_4bit_quant_type="nf4"  # NF4 quantization for better accuracy
)

#Load Tokenizer & Model with Quantization
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    quantization_config=bnb_config,
    device_map="cuda"  # Automatically assigns model to GPU
)

#Load Bloom’s Taxonomy BERT Model
blooms_model_name = "cip29/blooms_bert"
blooms_tokenizer = BertTokenizer.from_pretrained(blooms_model_name)
blooms_model = BertForSequenceClassification.from_pretrained(blooms_model_name, num_labels=6).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

### sentence-transformers/all-MiniLM-L6-v2
{
  "max_seq_length": 256,
  "do_lower_case": false
}
### Tokenizer Config
{
  "do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512
  }

  This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.

######  pdf4llm.to_markdown()
outputs a list of dictinoaries containing

- metadata:
  - format: "Image"
  - title: ""
  - author: ""
  - subject: ""
  - keywords: ""
  - creator: ""
  - producer: ""
  - creationDate: ""
  - modDate: ""
  - trapped: ""
  - encryption: ""
  - file_path: "1-2.png"
  - page_count: 1
  - page: 1

- toc_items: []

- tables: []

- images:
  - number: 0
  - bbox: Rect(0.0, 50.0, 648.0, 310.0)
  - transform: (648.0, 0.0, 0.0, 360.0, 0.0, 0.0)
  - width: 2700
  - height: 1500
  - colorspace: 3
  - cs-name: "DeviceRGB"
  - xres: 300
  - yres: 300
  - bpc: 8
  - size: 88631
  - has-mask: False

- graphics: []

- text: "-----"

- words: []


In [4]:
!rm -rf /content/*
import pdf4llm
from google.colab import files
import numpy as np
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from transformers import AutoTokenizer

# Connect to LanceDB
db = lancedb.connect("/content")

# Initialize SBERT Embedder
embedder = get_registry().get("huggingface").create(
    name='sentence-transformers/all-MiniLM-L6-v2',
    device="cuda"
)

# Load tokenizer to chunk text
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Define LanceDB Schema
class PDFSchema(LanceModel):
    text: str = embedder.SourceField()
    vector: Vector(embedder.ndims()) = embedder.VectorField()
    page_name: str

# Upload PDFs
uploaded = files.upload()
print(list(uploaded.keys()))


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Saving 12th_General English_Text_www.tntextbooks.in.pdf to 12th_General English_Text_www.tntextbooks.in.pdf
['12th_General English_Text_www.tntextbooks.in.pdf']


- Including overlap between chunks helps preserve context across chunk boundaries — especially useful when text is split in the middle of a sentence or paragraph(sliding window approach)



In [5]:
entries = []

# Token chunk config
chunk_size = 256
overlap = 64


# Collect all entries
entries = []

# Process each uploaded file separately
for pdf_filename in list(uploaded.keys())[:2]:

    print(f"\n Processing: {pdf_filename}")

    # Ask for page numbers or ranges
    page_input = input(f"Enter pages or ranges for {pdf_filename} (e.g., 1,3-5,7): ")

    # Parse user input into zero-based page indices
    selected_pages = []
    for part in page_input.split(","):
        part = part.strip()
        if "-" in part:
            start, end = map(int, part.split("-"))
            selected_pages.extend(range(start - 1, end))  # Zero-based
        else:
            selected_pages.append(int(part) - 1)

    # Extract specified pages
    selected_page_data = pdf4llm.to_markdown(pdf_filename, page_chunks=True, pages=selected_pages)

    # Prepare entries for LanceDB
    for page_data in selected_page_data:
        text = page_data["text"]
        page_path = page_data["metadata"]["file_path"]

        entries.append({
            "text": text,
            "page_name": page_path
        })

# Store all entries in LanceDB
tbl = db.create_table("pdf_data", schema=PDFSchema, mode="overwrite")
tbl.add(entries)

print("\n All selected pages embedded and stored in LanceDB!")


 Processing: 12th_General English_Text_www.tntextbooks.in.pdf
Enter pages or ranges for 12th_General English_Text_www.tntextbooks.in.pdf (e.g., 1,3-5,7): 30-180
Processing 12th_General English_Text_www.tntextbooks.in.pdf...

 All selected pages embedded and stored in LanceDB!


In [16]:
from lancedb.rerankers import CrossEncoderReranker

reranker = CrossEncoderReranker()

query = input("\n Enter your query: ")

tbl.create_fts_index("text", replace=True)

result = tbl.search(query, query_type="hybrid").rerank(reranker=reranker).limit(2).to_list()
# Display search results
print("\n Search Results:\n")
for res in result[:3]:  # Show top 5 results
    print(f" Page {res['page_name']}\n{res['text'][:]}\n")


 Enter your query:  Rama-Lakshmana

 Search Results:

 Page 12th_General English_Text_www.tntextbooks.in.pdf
###### neem tree in our cattle-pasture and laying it out to dry. Pedanna said, “Making it out of poovarasu wood would be really good. It’s a fine-grained wood, without knots. And glossy and strong too.”

 Our elder sister said, “All these are light-coloured woods. Ugly to look at! After a while, we’ll begin to even detest them. I’m saying it will be best to make it the colour of ripe sugarcane or dark like sesame oilcake. But it’s your wish…” The vision of a highly comfortable chair—in black wood with a mirror-like gleam, with perfectly shaped front legs, and curved back legs, yawning languorously, to match the recline of the chair—flashed before our eyes.

 Everyone felt that she was right. So it was arranged for two chairs to be made immediately, one for us and the other for Maamanaar.

 When the two chairs arrived, we didn’t know which one to keep and which to send to Maaman

### Mistral-7B-Instruct-v0.3/config.json

{
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  `"max_position_embeddings": 32768,`
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.0.dev0",
  "use_cache": true,
  "vocab_size": 32768
}

In [25]:
def generate_questions_with_mistral_bulk(pages_text, query):
    prompt = f"""<s>[INST]
You are an expert educational assistant that generates high-quality questions from academic text using Bloom's Taxonomy.

Follow these steps:
1. Read and understand the context provided.
2. Identify key ideas, concepts, or learning objectives.
3. Generate questions based on the user's request.

Context:
\"\"\"
{pages_text}
\"\"\"

User Request:
{query}

Make sure:
- The number of questions matches the request.
- The Bloom's Taxonomy levels (e.g., Analyze, Evaluate) are followed.
- The questions reflect the specified difficulty (e.g., easy, medium).
- Include brief reasoning before listing the final questions.


[/INST]
### Questions:
</s>

"""

    # Tokenize input
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")

    # Generate output
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=500,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )

    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract only the generated questions
    if "### Questions:" in generated_text:
        generated_questions = generated_text.split("### Questions:")[1].strip()
    else:
        generated_questions = generated_text  # Fallback if marker is missing

    return generated_questions.split("\n")


def bulk_process_pages(pages_array,query,batch_size=2,filename="generated_questions.csv"):

  with open(filename, mode='w', newline='', encoding='utf-8') as file:
      writer = csv.writer(file)
      writer.writerow(["Question"])


      for i in range(0, len(pages_array), batch_size):
        batch_pages = pages_array[i:i+batch_size]  # Get a batch of pages
        combined_text = "\n".join(page["text"] for page in batch_pages)
        print(f"📄 Processing Pages {i+1} to {min(i+batch_size, len(pages_array))}...")

        # Generate questions with CoT
        questions = generate_questions_with_mistral_bulk(combined_text,query)

        for question in questions:
                writer.writerow([question])


#  Function 4: Classify Questions & Save with Bloom’s Taxonomy
def classify_questions_and_save(input_csv="generated_questions.csv", output_csv="classified_questions.csv"):
    with open(input_csv, mode='r', encoding='utf-8') as infile, open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Write header row
        writer.writerow(["Question", "Bloom's Taxonomy Level",
                         "BT1 (Remembering)", "BT2 (Understanding)", "BT3 (Applying)",
                         "BT4 (Analyzing)", "BT5 (Evaluating)", "BT6 (Creating)"])

        next(reader)  # Skip header

        for row in reader:
            question = row
            bloom_level, probs = classify_blooms_taxonomy(question)

            # Write to output CSV
            writer.writerow([question, bloom_level,
                             probs["BT1 (Remembering)"], probs["BT2 (Understanding)"],
                             probs["BT3 (Applying)"], probs["BT4 (Analyzing)"],
                             probs["BT5 (Evaluating)"], probs["BT6 (Creating)"]])

    #  Download CSV file
    files.download(output_csv)



query = "Generate 5 analysis-level and evaluation-level questions."
bulk_process_pages(result,query)
classify_questions_and_save()



📄 Processing Pages 1 to 2...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
import csv


def generate_questions_with_mistral_bulk(pages_text, query, num_questions, difficulty, question_pattern):
    blooms_mapping = {
        "easy": "BT1 (Remembering) and BT2 (Understanding)",
        "medium": "BT3 (Applying) and BT4 (Analyzing)",
        "hard": "BT5 (Evaluating) and BT6 (Creating)"
    }
    bloom_levels = blooms_mapping.get(difficulty, "BT1 (Remembering) and BT2 (Understanding)")

    prompt = f'''
You are an AI assistant specialized in question generation.
Generate {num_questions} well-formed {question_pattern} questions on "{query}".
Match the {difficulty} difficulty level, corresponding to **{bloom_levels} in Bloom’s Taxonomy.

### Question Format:
- MCQ: Provide a question with four options and indicate the correct answer.
- Descriptive: Ask open-ended questions.

Context:
{pages_text}

### Generated {question_pattern.capitalize()} Questions:
'''

    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=500,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )

    generated_text = mistral_tokenizer.decode(output[0], skip_special_tokens=True)

    if f"### Generated {question_pattern.capitalize()} Questions:" in generated_text:
        generated_questions = generated_text.split(f"### Generated {question_pattern.capitalize()} Questions:")[1].strip()
    else:
        generated_questions = generated_text

    lines = generated_questions.split("\n")
    questions = []
    current_question = ""

    for line in lines:
        if line.strip():
            if line[0].isdigit():
                if current_question:
                    questions.append(current_question.strip())
                current_question = line
            else:
                current_question += "\n" + line

    if current_question:
        questions.append(current_question.strip())

    print(f"Extracted {question_pattern.capitalize()} Questions:\n", "\n\n".join(questions))
    return questions[:]

def bulk_process_pages(pages_array, query, num_questions, difficulty, question_pattern, batch_size=5, filename="generated_questions.csv"):
    local_path = f"/content/{filename}"

    with open(local_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if question_pattern == "mcq":
            writer.writerow(["Question", "Options", "Correct Answer", "Difficulty Level"])
        else:  # Descriptive
            writer.writerow(["Question", "Difficulty Level"])

        for i in range(0, len(pages_array), batch_size):
            batch_pages = pages_array[i:i+batch_size]
            combined_text = "\n".join(page["text"] for page in batch_pages)
            print(f"📄 Processing Pages {i+1} to {min(i+batch_size, len(pages_array))}...")

            questions = generate_questions_with_mistral_bulk(combined_text, query, num_questions, difficulty, question_pattern)

            for question in questions:
                if question_pattern == "mcq":
                    lines = question.split("\n")
                    question_text = lines[0].strip()
                    options = " | ".join(lines[1:-1]).replace("a) ", "").replace("b) ", "").replace("c) ", "").replace("d) ", "")
                    correct_answer = lines[-1].replace("Correct Answer:", "").strip()
                    writer.writerow([question_text, options, correct_answer, difficulty])
                else:  # Descriptive
                    writer.writerow([question, difficulty])

    print(f"✅ Questions saved in {local_path}")
    files.download(local_path)

query = input("\n🔎 Enter your query: ")
num_questions = int(input("📌 Enter the number of questions to generate: "))
difficulty = input("⚡ Choose difficulty level (easy/medium/hard): ").strip().lower()
if difficulty not in ["easy", "medium", "hard"]:
    print("❌ Invalid difficulty level.")
    exit()
question_pattern = input("✏ Choose question pattern (MCQ/Descriptive): ").strip().lower()
if question_pattern not in ["mcq", "descriptive"]:
    print("❌ Invalid question pattern.")
    exit()

bulk_process_pages(result, query, num_questions, difficulty, question_pattern)



🔎 Enter your query: generate 
📌 Enter the number of questions to generate: 7
⚡ Choose difficulty level (easy/medium/hard): easy
✏ Choose question pattern (MCQ/Descriptive): Descriptive
📄 Processing Pages 1 to 2...
Extracted Descriptive Questions:
 1. Based on the story, what was the purpose of making the chair?

2. What was the color of the chair that was made for the narrator's family?

3. Why did everyone feel that the elder sister was right about the chair's color?

4. Describe the reaction of the family members when they first saw the finished chairs.

5. What was the reason for sending one of the chairs to Maamanaar?

6. How did the villagers react to the new chair? Provide an example of their reaction.

7. What was the significance of the chair at the funeral?
✅ Questions saved in /content/generated_questions.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>