In [None]:
!pip show keras tf-keras

In [1]:
import os
import re
from pdf2image import convert_from_path
from google.cloud import vision
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer, util
import spacy
import pytesseract
from PIL import Image
import numpy as np
from textblob import TextBlob
import time

2025-01-29 13:43:05.351121: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738138385.534304    4409 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738138385.589286    4409 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-29 13:43:05.997155: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/dhruv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_da

True

In [2]:
def pdf_to_images(pdf_path, output_folder):
    """Convert a PDF to images, one image per page."""
    images = convert_from_path(pdf_path)
    image_paths = []

    for i, image in enumerate(images):
        image_path = f"{output_folder}/page_{i + 1}.jpg"
        image.save(image_path, "JPEG")
        image_paths.append(image_path)

    return image_paths

In [3]:
def extract_text_from_images(image_paths):
    """Extract text from a list of image paths using Google Cloud Vision."""
    client = vision.ImageAnnotatorClient()
    all_text = ""

    for image_path in image_paths:
        with open(image_path, "rb") as image_file:
            content = image_file.read()
        
        image = vision.Image(content=content)
        response = client.document_text_detection(image=image)

        if response.error.message:
            raise Exception(f"Error processing {image_path}: {response.error.message}")

        all_text += response.full_text_annotation.text + "\n"

    return all_text

In [4]:
def process_pdf_without_buckets(pdf_path, output_folder):
    """Process a PDF file without using Google Cloud Storage."""
    os.makedirs(output_folder, exist_ok=True)

    print("Converting PDF to images...")
    image_paths = pdf_to_images(pdf_path, output_folder)

    print("Extracting text from images...")
    extracted_text = extract_text_from_images(image_paths)

    return extracted_text

In [5]:
def preprocess_text(text):
    """Preprocess text by removing stopwords, punctuation, and lowercasing."""
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

In [6]:
def get_similarity_score(text1, text2):
    """Calculate the similarity score between two texts using SBERT."""
    sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings1 = sbert_model.encode(text1, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(text2, convert_to_tensor=True)
    cosine_similarity = util.pytorch_cos_sim(embeddings1, embeddings2)
    return cosine_similarity.item()

In [7]:
def main(student_pdf_path, teacher_answer_path, output_folder):
    # Extract text from student's PDF
    print("Processing student's PDF...")
    student_text = process_pdf_without_buckets(student_pdf_path, output_folder)
    
    # If teacher's answer is a PDF, extract text; otherwise, read as text
    if teacher_answer_path.endswith('.pdf'):
        print("Processing teacher's PDF...")
        teacher_text = process_pdf_without_buckets(teacher_answer_path, output_folder)
    else:
        with open(teacher_answer_path, 'r') as file:
            teacher_text = file.read()
    
    # Preprocess both texts
    print("Preprocessing texts...")
    student_text_processed = preprocess_text(student_text)
    teacher_text_processed = preprocess_text(teacher_text)
    
    # Calculate similarity score
    print("Calculating similarity score...")
    similarity_score = get_similarity_score(student_text_processed, teacher_text_processed)
    
    # Assign marks based on similarity score (example: out of 10)
    marks = round(similarity_score * 10, 2)
    
    print(f"Similarity Score: {similarity_score}")
    print(f"Marks Awarded: {marks}")

In [8]:
student_pdf_path = "/home/dhruv/Desktop/CloudOCR/myAnswer.pdf"
teacher_answer_path = "/home/dhruv/Desktop/CloudOCR/teacher_answer.txt"
output_folder = "output_images"

main(student_pdf_path, teacher_answer_path, output_folder)

Processing student's PDF...
Converting PDF to images...
Extracting text from images...
Preprocessing texts...
Calculating similarity score...




Similarity Score: 0.7783663272857666
Marks Awarded: 7.78


## Entire Paper stuff


In [1]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
import json
import os
from tqdm import tqdm
from pdf2image import convert_from_path
from google.cloud import vision
import re

In [7]:
def extract_questions_from_pdf(pdf_path):
    """Extract questions and their details from PDF with improved parsing."""
    
    def pdf_to_images(pdf_path, output_folder='temp_images'):
        """Convert PDF to images."""
        os.makedirs(output_folder, exist_ok=True)
        images = convert_from_path(pdf_path)
        image_paths = []
        
        for i, image in enumerate(images):
            image_path = f"{output_folder}/page_{i + 1}.jpg"
            image.save(image_path, "JPEG")
            image_paths.append(image_path)
            
        return image_paths

    def extract_text_from_images(image_paths):
        """Extract text using Google Cloud Vision."""
        client = vision.ImageAnnotatorClient()
        all_text = ""
        
        for image_path in image_paths:
            with open(image_path, "rb") as image_file:
                content = image_file.read()
            
            image = vision.Image(content=content)
            response = client.document_text_detection(image=image)
            
            if response.error.message:
                raise Exception(f"Error processing {image_path}: {response.error.message}")
                
            all_text += response.full_text_annotation.text + "\n"
            
        return all_text

    def parse_questions(text):
        """Parse questions with multiple pattern matching attempts."""
        # Print the extracted text for debugging
        print("\nExtracted text from PDF:")
        print("-" * 50)
        print(text)
        print("-" * 50)

        # Try different question patterns
        patterns = [
            r'Q\.?\s*(\d+)\.(.*?)(?=Q\.?\s*\d+\.|$)',  # Q. 1. or Q1.
            r'Question\s*(\d+)[.:]?(.*?)(?=Question\s*\d+|$)',  # Question 1:
            r'(\d+)\.(.*?)(?=\d+\.|$)',  # 1.
            r'$$(\d+)$$(.*?)(?=$$\d+$$|$)'  # (1)
        ]
        
        marks_patterns = [
            r'$$(\d+)\s*marks?$$',  # (5 marks) or (5 mark)
            r'(\d+)\s*marks?',      # 5 marks
            r'$$(\d+)\s*marks?$$'   # [5 marks]
        ]
        
        questions = {}
        
        # Try each pattern until we find questions
        for pattern in patterns:
            matches = list(re.finditer(pattern, text, re.DOTALL | re.IGNORECASE))
            if matches:
                print(f"\nFound questions using pattern: {pattern}")
                for match in matches:
                    q_num = match.group(1)
                    q_text = match.group(2).strip()
                    
                    # Try to extract marks
                    marks = 0
                    for marks_pattern in marks_patterns:
                        marks_match = re.search(marks_pattern, q_text, re.IGNORECASE)
                        if marks_match:
                            marks = int(marks_match.group(1))
                            # Remove marks pattern from question text
                            q_text = re.sub(marks_pattern, '', q_text, flags=re.IGNORECASE).strip()
                            break
                    
                    questions[q_num] = {
                        'question_text': q_text,
                        'marks': marks
                    }
                    print(f"\nFound Question {q_num}:")
                    print(f"Text: {q_text[:100]}...")  # Print first 100 chars
                    print(f"Marks: {marks}")
                
                if questions:  # If we found any questions, stop trying patterns
                    break
        
        return questions

    try:
        # Process PDF
        print("Converting PDF to images...")
        image_paths = pdf_to_images(pdf_path)
        
        print("Extracting text from images...")
        extracted_text = extract_text_from_images(image_paths)
        
        print("Parsing questions...")
        questions = parse_questions(extracted_text)
        
        # Cleanup temporary images
        for image_path in image_paths:
            os.remove(image_path)
        os.rmdir('temp_images')
        
        if not questions:
            print("No questions were found in the text.")
            return None
            
        return questions
        
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return None

In [24]:
class QuestionPaper:
    def __init__(self, paper_id):
        self.paper_id = paper_id
        self.questions = {}
        self.total_marks = 0

    @staticmethod
    def extract_text_from_pdf(pdf_path):
        """Extract text from PDF using Google Cloud Vision."""
        try:
            # Convert PDF to images
            print("Converting PDF to images...")
            images = convert_from_path(pdf_path)
            
            # Setup Google Cloud Vision
            client = vision.ImageAnnotatorClient()
            extracted_text = ""
            
            print("Extracting text from images...")
            for i, image in enumerate(images):
                # Save image temporarily
                image_path = f"temp_page_{i}.jpg"
                image.save(image_path, "JPEG")
                
                # Extract text from image
                with open(image_path, "rb") as image_file:
                    content = image_file.read()
                image = vision.Image(content=content)
                response = client.document_text_detection(image=image)
                extracted_text += response.full_text_annotation.text + "\n"
                
                # Clean up temp image
                os.remove(image_path)
            
            return extracted_text
            
        except Exception as e:
            print(f"Error in text extraction: {str(e)}")
            return None

    @staticmethod
    def parse_questions(text):
        """Parse questions from extracted text."""
        # Print the extracted text for debugging
        print("\nExtracted text:")
        print("-" * 50)
        print(text)
        print("-" * 50)

        questions = {}
        
        # Pattern to match questions with parts
        pattern = r'Q\.?\s*(\d+)([AB]?)\.?\s*([^Q]*)(?=Q\.|$)'
        matches = re.finditer(pattern, text, re.DOTALL | re.IGNORECASE)
        
        for match in matches:
            q_num = match.group(1)
            q_part = match.group(2) if match.group(2) else ''
            q_text = match.group(3).strip()
            
            # Extract marks
            marks_match = re.search(r'(?:Max\.\s*Marks\s*(\d+))|(\d+)\s*marks?', q_text, re.IGNORECASE)
            marks = int(marks_match.group(1) or marks_match.group(2)) if marks_match else 5
            
            # Clean question text
            q_text = re.sub(r'(?:Max\.\s*Marks\s*\d+)|(?:\d+\s*marks?)', '', q_text, re.IGNORECASE)
            q_text = re.sub(r'OR', '', q_text)
            q_text = ' '.join(q_text.split())  # Clean up whitespace
            
            question_id = f"{q_num}{q_part}"
            questions[question_id] = {
                'text': q_text,
                'marks': marks
            }
            
            # Print parsed question for debugging
            print(f"\nParsed Q{question_id}:")
            print(f"Text: {q_text}")
            print(f"Marks: {marks}")
        
        return questions

    @classmethod
    def from_pdf(cls, pdf_path, paper_id):
        """Create QuestionPaper instance from PDF file."""
        paper = cls(paper_id)
        
        # Extract text from PDF
        extracted_text = cls.extract_text_from_pdf(pdf_path)
        if not extracted_text:
            return None
        
        # Parse questions
        paper.questions = cls.parse_questions(extracted_text)
        paper.total_marks = sum(q['marks'] for q in paper.questions.values())
        
        if paper.questions:
            print(f"\nSuccessfully extracted {len(paper.questions)} questions")
            print(f"Total marks: {paper.total_marks}")
        else:
            print("No questions were extracted from the PDF")
        
        return paper

    def display_questions(self):
        """Display formatted questions."""
        print(f"\nQuestion Paper ID: {self.paper_id}")
        print(f"Total Marks: {self.total_marks}")
        print("\nQuestions:")
        print("-" * 50)
        
        for q_num, q_data in sorted(self.questions.items()):
            print(f"\nQuestion {q_num}:")
            print(f"Text: {q_data['text']}")  # Changed from 'question_text' to 'text'
            print(f"Marks: {q_data['marks']}")
            if 'model_answer' in q_data:
                print(f"Model Answer: {q_data['model_answer']}")

    def to_dataframe(self):
        """Convert questions to pandas DataFrame."""
        questions_list = []
        
        for q_num, q_data in sorted(self.questions.items()):
            row = {
                'Question No.': f'Q{q_num}',
                'Question': q_data['text'],  # Changed from 'question_text' to 'text'
                'Marks': q_data['marks']
            }
            if 'model_answer' in q_data:
                row['Model Answer'] = q_data['model_answer']
            questions_list.append(row)
        
        df = pd.DataFrame(questions_list)
        return df

    def get_raw_text(self):
        """Get raw text of all questions."""
        raw_text = []
        for q_num, q_data in sorted(self.questions.items()):
            question_text = f"Q{q_num} {q_data['text']} Max. Marks {q_data['marks']}"
            raw_text.append(question_text)
        return '\n'.join(raw_text)


In [25]:
# Create from PDF

pdf_path = "QPTEST-Crop.pdf"

paper = QuestionPaper.from_pdf(pdf_path, "EXAM2")


# Display questions

paper.display_questions()


# Get raw text

raw_text = paper.get_raw_text()

print(raw_text)


# Get as DataFrame

df = paper.to_dataframe()

display(df)




Converting PDF to images...
Error in text extraction: Unable to get page count.
I/O Error: Couldn't open file 'QPTEST-Crop.pdf': No such file or directory.



AttributeError: 'NoneType' object has no attribute 'display_questions'

In [23]:
# Create question paper from PDF with debugging
pdf_path = "qpconvert.pdf"
paper = QuestionPaper.from_pdf(pdf_path, "EXAM2023001")

# If questions were found, display them
if paper.questions:
    print("\nExtracted Questions:")
    paper.display_questions()
else:
    print("\nNo questions were extracted. Please check the PDF format.")

Converting PDF to images...
Extracting text from images...
Parsing questions...

Successfully extracted 1 questions.
Extracted 1 questions from PDF

Extracted Questions:

Question 2:


KeyError: 'question_text'

In [12]:
# Create question paper from PDF with debugging
pdf_path = "QPTest-Nocrop.pdf"
paper = QuestionPaper.from_pdf(pdf_path, "TEST_NoC")

# If questions were found, display them
if paper.questions:
    print("\nExtracted Questions:")
    paper.display_questions()
else:
    print("\nNo questions were extracted. Please check the PDF format.")

Converting PDF to images...
Extracting text from images...
Parsing questions...

Successfully extracted 1 questions.

Extracted Questions:

Question Paper ID: TEST_NoC
Total Marks: 0

Questions:
--------------------------------------------------

Question 1 (0 marks)
a Describe how the Canny Edge Detector algorithm can be applied to an image to detect edges. Include the steps involved and discuss how this approach contributes to better image analysis. 05 OR Q1 b Explain the Hough Transform and its fundamental concepts. Consider the edge pixels detected at coordinates 1,1 and 3,3. 05 Q2 a Analyze the impact of different morphological operators used in image processing, such as dilation, erosion, opening, and closing. How do these operations impact binary images? 05 OR Q2 b Describe how different noise can be handled in image restoration. How can restoration algorithms be applied to reduce or eliminate these types of noise from an image. 05 Q3 Analyze the differences and similarities bet

In [18]:
# Create question paper from PDF with debugging
pdf_path = "QPTest-Crop.pdf"
paper = QuestionPaper.from_pdf(pdf_path, "TEST_C")

# If questions were found, display them
if paper.questions:
    print("\nExtracted Questions:")
    paper.display_questions()
else:
    print("\nNo questions were extracted. Please check the PDF format.")

Converting PDF to images...
Extracting text from images...
Parsing questions...

Successfully extracted 1 questions.
Extracted 1 questions from PDF

Extracted Questions:

Question 1:


KeyError: 'question_text'

### Question organisation

In [14]:
def preprocess_questions(text):
    """Preprocess and structure questions from raw text."""
    
    # Pattern to match questions with parts
    pattern = r'Q(\d+)\s*([ab])?\s*(?:Question)?\s*([^Q]*)(?=Q\d+|$)'
    
    # Clean and structure questions
    questions = []
    matches = re.finditer(pattern, text, re.DOTALL | re.IGNORECASE)
    
    for match in matches:
        q_num = match.group(1)
        q_part = match.group(2) if match.group(2) else ''
        q_text = match.group(3).strip()
        
        # Extract marks
        marks_match = re.search(r'Max\.\s*Marks\s*(\d+)', q_text)
        marks = int(marks_match.group(1)) if marks_match else 5
        
        # Clean question text
        q_text = re.sub(r'Max\.\s*Marks\s*\d+', '', q_text)
        q_text = re.sub(r'OR', '', q_text)
        q_text = q_text.strip()
        
        questions.append({
            'question_number': f'Q{q_num}{q_part}',
            'question_text': q_text,
            'marks': marks
        })
    
    return questions

In [15]:
def create_structured_df(questions):
    """Create a structured DataFrame from processed questions."""
    df = pd.DataFrame(questions)
    
    # Rename columns for clarity
    df.columns = ['Question No.', 'Question', 'Marks']
    
    # Calculate total marks
    total_marks = df['Marks'].sum()
    
    print(f"Total Questions: {len(df)}")
    print(f"Total Marks: {total_marks}")
    
    return df

In [16]:
# Your raw text
raw_text =paper.display_questions()

# Process questions and create DataFrame
questions = preprocess_questions(raw_text)
df = create_structured_df(questions)

# Display structured questions
print("\nStructured Questions:")
print("-" * 50)
display(df)


Question Paper ID: TEST_C
Total Marks: 0

Questions:
--------------------------------------------------

Question 1 (0 marks)
a Question Describe how the Canny Edge Detector algorithm can be applied to an image to detect edges. Include the steps involved and discuss how this approach contributes to better image analysis. Max. Marks 05 OR Q1 b Explain the Hough Transform and its fundamental concepts. Consider the edge pixels detected at coordinates 1,1 and 3,3. 05 Q2 a Analyze the impact of different morphological operators used in image processing, such as dilation, erosion, opening, and closing. How do these operations impact binary images? 05 OR Q2 b Describe how different noise can be handled in image restoration. How can restoration algorithms be applied to reduce or eliminate these types of noise from an image. 05 Q3 Analyze the differences and similarities between optical flow and the motion field. 05


TypeError: expected string or bytes-like object, got 'NoneType'