<a href="https://colab.research.google.com/github/gunasekaran81m/melanies_smoothies/blob/main/Mcq_lc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install langchain langchain-community faiss-cpu pypdf python-dotenv sentence-transformers



In [2]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from typing import List, Dict
import json

In [3]:
class MCQGenerator:
    def __init__(self, model_name="microsoft/DialoGPT-medium", embedding_model="sentence-transformers/all-mpnet-base-v2"):
        """
        Initialize the MCQ Generator

        Args:
            model_name: HuggingFace model for question generation
            embedding_model: Model for text embeddings
        """
        self.model_name = model_name
        self.embedding_model = embedding_model
        self.vector_store = None
        self.llm = None
        self.setup_llm()

    def setup_llm(self):
        """Initialize the language model"""
        try:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True
            )

            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=1024,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.15
            )

            self.llm = HuggingFacePipeline(pipeline=pipe)
            print(f"LLM {self.model_name} loaded successfully!")

        except Exception as e:
            print(f"Error loading LLM: {e}")
            # Fallback to a smaller model
            self.setup_fallback_llm()

    def setup_fallback_llm(self):
        """Fallback to a smaller model if primary fails"""
        try:
            self.llm = HuggingFacePipeline.from_model_id(
                model_id="gpt2",
                task="text-generation",
                pipeline_kwargs={"max_length": 512}
            )
            print("Fallback LLM (GPT-2) loaded successfully!")
        except Exception as e:
            print(f"Error loading fallback LLM: {e}")

    def load_pdf(self, pdf_path: str):
        """Load and process PDF file"""
        try:
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()

            # Split documents into chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len
            )

            chunks = text_splitter.split_documents(documents)
            print(f"Loaded {len(chunks)} chunks from PDF")

            return chunks

        except Exception as e:
            print(f"Error loading PDF: {e}")
            return []

    def create_vector_store(self, documents):
        """Create FAISS vector store from documents"""
        try:
            embeddings = HuggingFaceEmbeddings(
                model_name=self.embedding_model,
                model_kwargs={'device': 'cpu'}
            )

            self.vector_store = FAISS.from_documents(documents, embeddings)
            print("Vector store created successfully!")

        except Exception as e:
            print(f"Error creating vector store: {e}")

    def search_relevant_content(self, query: str, k: int = 3):
        """Search for relevant content in the vector store"""
        if self.vector_store is None:
            return "No vector store available. Please load a PDF first."

        try:
            docs = self.vector_store.similarity_search(query, k=k)
            return "\n\n".join([doc.page_content for doc in docs])
        except Exception as e:
            print(f"Error searching content: {e}")
            return ""

    def generate_mcqs(self, topic: str, num_questions: int = 5, difficulty: str = "medium"):
        """Generate MCQs based on the topic"""

        # Search for relevant content
        relevant_content = self.search_relevant_content(topic)

        if not relevant_content:
            return {"error": "No relevant content found for the topic"}

        # Prompt template for MCQ generation
        prompt_template = PromptTemplate(
            input_variables=["content", "topic", "num_questions", "difficulty"],
            template="""
            Based on the following content, generate {num_questions} multiple-choice questions about {topic} with {difficulty} difficulty.

            Content:
            {content}

            Generate the questions in JSON format with the following structure:
            {{
                "mcqs": [
                    {{
                        "question": "question text",
                        "options": {{
                            "A": "option A",
                            "B": "option B",
                            "C": "option C",
                            "D": "option D"
                        }},
                        "correct_answer": "A",
                        "explanation": "brief explanation"
                    }}
                ]
            }}

            Ensure questions are clear, options are plausible, and only one correct answer.
            Return only the JSON, no additional text.
            """
        )

        # Create LLM chain
        mcq_chain = LLMChain(llm=self.llm, prompt=prompt_template)

        try:
            response = mcq_chain.run({
                "content": relevant_content,
                "topic": topic,
                "num_questions": num_questions,
                "difficulty": difficulty
            })

            # Clean the response and parse JSON
            response = response.strip()
            if response.startswith("```json"):
                response = response[7:]
            if response.endswith("```"):
                response = response[:-3]

            mcq_data = json.loads(response)
            return mcq_data

        except Exception as e:
            print(f"Error generating MCQs: {e}")
            return {"error": f"Failed to generate MCQs: {str(e)}"}

    def process_pdf_and_generate_mcqs(self, pdf_path: str, topic: str, num_questions: int = 5):
        """Complete pipeline: Load PDF and generate MCQs"""
        print("Loading PDF...")
        documents = self.load_pdf(pdf_path)

        if not documents:
            return {"error": "Failed to load PDF"}

        print("Creating vector store...")
        self.create_vector_store(documents)

        print(f"Generating {num_questions} MCQs about {topic}...")
        return self.generate_mcqs(topic, num_questions)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
cd /content/drive/MyDrive/pdf

/content/drive/MyDrive/pdf


In [6]:
ls

cbse.pdf


In [7]:
def main():
    # Initialize the MCQ generator
    mcq_gen = MCQGenerator()

    # Path to your PDF file
    pdf_path = "cbse.pdf"  # Replace with your PDF path

    # Topic for which you want to generate MCQs
    topic = "machine learning fundamentals"

    # Generate MCQs
    result = mcq_gen.process_pdf_and_generate_mcqs(
        pdf_path=pdf_path,
        topic=topic,
        num_questions=3
    )

    # Display results
    if "mcqs" in result:
        print(f"\nGenerated {len(result['mcqs'])} MCQs about '{topic}':")
        print("=" * 50)

        for i, mcq in enumerate(result["mcqs"], 1):
            print(f"\n{i}. {mcq['question']}")
            for option, text in mcq['options'].items():
                print(f"   {option}) {text}")
            print(f"   Correct: {mcq['correct_answer']}")
            print(f"   Explanation: {mcq['explanation']}")
    else:
        print(f"Error: {result.get('error', 'Unknown error')}")

# Simple function for quick testing
def quick_test():
    """Quick test with a sample PDF"""
    generator = MCQGenerator()

    # You can use any PDF file path
    try:
        result = generator.process_pdf_and_generate_mcqs(
            pdf_path="your_document.pdf",
            topic="artificial intelligence",
            num_questions=2
        )
        print(json.dumps(result, indent=2))
    except Exception as e:
        print(f"Test failed: {e}")

if __name__ == "__main__":
    main()

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cpu
  self.llm = HuggingFacePipeline(pipeline=pipe)


LLM microsoft/DialoGPT-medium loaded successfully!
Loading PDF...




Loaded 6 chunks from PDF
Creating vector store...


  embeddings = HuggingFaceEmbeddings(
  mcq_chain = LLMChain(llm=self.llm, prompt=prompt_template)
  response = mcq_chain.run({
Token indices sequence length is longer than the specified maximum sequence length for this model (1253 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Vector store created successfully!
Generating 3 MCQs about machine learning fundamentals...
Error generating MCQs: Input length of input_ids is 1253, but `max_length` is set to 1024. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error: Failed to generate MCQs: Input length of input_ids is 1253, but `max_length` is set to 1024. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
