In [1]:
import os
import re
from pdf2image import convert_from_path
from google.cloud import vision
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer, util
import spacy
import pytesseract
from PIL import Image
import numpy as np
from textblob import TextBlob
import time

2025-04-05 10:24:35.828443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743828876.188841    4986 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743828876.308745    4986 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-05 10:24:37.275049: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def pdf_to_images(pdf_path, output_folder):
    """Convert a PDF to images, one image per page."""
    images = convert_from_path(pdf_path)
    image_paths = []

    for i, image in enumerate(images):
        image_path = f"{output_folder}/page_{i + 1}.jpg"
        image.save(image_path, "JPEG")
        image_paths.append(image_path)

    return image_paths

In [3]:
def extract_text_from_images(image_paths):
    """Extract text from a list of image paths using Google Cloud Vision."""
    client = vision.ImageAnnotatorClient()
    all_text = ""

    for image_path in image_paths:
        with open(image_path, "rb") as image_file:
            content = image_file.read()
        
        image = vision.Image(content=content)
        response = client.document_text_detection(image=image)

        if response.error.message:
            raise Exception(f"Error processing {image_path}: {response.error.message}")

        all_text += response.full_text_annotation.text + "\n"

    return all_text

In [4]:
def process_pdf_without_buckets(pdf_path, output_folder):
    """Process a PDF file without using Google Cloud Storage."""
    os.makedirs(output_folder, exist_ok=True)

    print("Converting PDF to images...")
    image_paths = pdf_to_images(pdf_path, output_folder)

    print("Extracting text from images...")
    extracted_text = extract_text_from_images(image_paths)

    return extracted_text

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\n+', ' ', text)  
    text = re.sub(r'\s+', ' ', text)  
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)  
    
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    
    return ' '.join(filtered_text), ' '.join(filtered_text)  

In [6]:
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import word_tokenize, sent_tokenize
import re

def evaluate_answer(student_text, reference_text, total_marks):
    """Comprehensive answer evaluation with scaling based on total marks."""
    sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def get_semantic_similarity():
        """Calculate semantic similarity using SBERT."""
        embeddings1 = sbert_model.encode(student_text, convert_to_tensor=True)
        embeddings2 = sbert_model.encode(reference_text, convert_to_tensor=True)
        return util.pytorch_cos_sim(embeddings1, embeddings2).item()

    def check_length_ratio():
        """Check if answer length is appropriate."""
        student_length = len(word_tokenize(student_text))
        reference_length = len(word_tokenize(reference_text))
        ratio = student_length / reference_length if reference_length > 0 else 0
        return min(1.0, ratio if ratio <= 1.5 else 1.5 / ratio)

    def check_key_phrases():
        """Check for presence of key phrases and concepts."""
        def get_phrases(text):
            words = word_tokenize(text.lower())
            return {f"{words[i]} {words[i+1]}" for i in range(len(words)-1)} | \
                   {f"{words[i]} {words[i+1]} {words[i+2]}" for i in range(len(words)-2)}

        ref_phrases = get_phrases(reference_text)
        student_phrases = get_phrases(student_text)
        return len(student_phrases & ref_phrases) / len(ref_phrases) if ref_phrases else 0

    def check_sequence_alignment():
        """Check if ideas are presented in a similar sequence."""
        student_sentences = sent_tokenize(student_text)
        reference_sentences = sent_tokenize(reference_text)
        
        student_emb = sbert_model.encode(student_sentences)
        reference_emb = sbert_model.encode(reference_sentences)
        
        alignment_scores = [
            util.pytorch_cos_sim(student_emb[i], reference_emb[i]).item()
            for i in range(min(len(student_emb), len(reference_emb)))
        ]
        
        return sum(alignment_scores) / len(alignment_scores) if alignment_scores else 0

    def check_factual_accuracy():
        """Check for presence of numerical values and specific facts."""
        extract_numbers = lambda text: set(re.findall(r'\d+(?:\.\d+)?', text))
        
        student_numbers = extract_numbers(student_text)
        reference_numbers = extract_numbers(reference_text)
        
        return len(student_numbers & reference_numbers) / len(reference_numbers) if reference_numbers else 1.0

    def check_coherence():
        """Check text coherence using sentence transitions."""
        sentences = sent_tokenize(student_text)
        if len(sentences) < 2:
            return 1.0
            
        coherence_scores = [
            util.pytorch_cos_sim(
                sbert_model.encode(sentences[i]), 
                sbert_model.encode(sentences[i+1])
            ).item()
            for i in range(len(sentences)-1)
        ]
            
        return sum(coherence_scores) / len(coherence_scores)

    scores = {
        'semantic_similarity': get_semantic_similarity(),
        'length_appropriateness': check_length_ratio(),
        'key_phrases': check_key_phrases(),
        'sequence_alignment': check_sequence_alignment(),
        'factual_accuracy': check_factual_accuracy(),
        'coherence': check_coherence()
    }

    weights = {
        'semantic_similarity': 0.30,
        'length_appropriateness': 0.15,
        'key_phrases': 0.20,
        'sequence_alignment': 0.15,
        'factual_accuracy': 0.10,
        'coherence': 0.10
    }

    raw_score = sum(scores[metric] * weights[metric] for metric in scores)
    scaled_score = raw_score * total_marks  # Scale the score based on total marks

    return {
        'final_score': scaled_score,
        'details': scores
    }


In [7]:
import time

def main(student_pdf_path, teacher_answer_path, output_folder, total_marks=10):
    """Main function with comprehensive evaluation and scaled scoring."""
    try:
        start_time = time.time()
        
        print("Processing student's submission...")
        student_text = process_pdf_without_buckets(student_pdf_path, output_folder)
        
        print("Processing teacher's answer...")
        if teacher_answer_path.endswith('.pdf'):
            teacher_text = process_pdf_without_buckets(teacher_answer_path, output_folder)
        else:
            with open(teacher_answer_path, 'r') as file:
                teacher_text = file.read()
        
        print("Preprocessing texts...")
        student_processed, student_lemmatized = preprocess_text(student_text)
        teacher_processed, teacher_lemmatized = preprocess_text(teacher_text)
        
        print("Evaluating answer...")
        evaluation_result = evaluate_answer(student_processed, teacher_processed, total_marks)
        
        marks = round(evaluation_result['final_score'], 2)  # Already scaled in evaluate_answer
        
        print("\n=== Grading Report ===")
        print(f"Final Marks: {marks}/{total_marks}")
        print("\nDetailed Scores:")
        for metric, score in evaluation_result['details'].items():
            print(f"{metric.replace('_', ' ').title()}: {score:.2f}")
        
        print(f"\nProcessing Time: {time.time() - start_time:.2f} seconds")
        
        return evaluation_result

    except Exception as e:
        print(f"Error in main processing: {str(e)}")
        raise


# Jeet

In [8]:
student_pdf = "Jeet/Jeet-D038-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.06/3

Detailed Scores:
Semantic Similarity: 0.55
Length Appropriateness: 0.30
Key Phrases: 0.01
Sequence Alignment: 0.28
Factual Accuracy: 0.60
Coherence: 0.40

Processing Time: 8.45 seconds


In [10]:
student_pdf = "Jeet/Jeet-D038-Q2.pdf"
teacher_answer = "Q2-i-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.15/7

Detailed Scores:
Semantic Similarity: 0.54
Length Appropriateness: 1.00
Key Phrases: 0.00
Sequence Alignment: 0.24
Factual Accuracy: 0.67
Coherence: 0.37

Processing Time: 7.36 seconds


In [11]:
student_pdf = "Jeet/Jeet-D038-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.02/5

Detailed Scores:
Semantic Similarity: 0.31
Length Appropriateness: 0.37
Key Phrases: 0.00
Sequence Alignment: 0.13
Factual Accuracy: 0.27
Coherence: 0.10

Processing Time: 5.59 seconds


# Joel


In [12]:
student_pdf = "Joel/Joel-D041-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 0.86/3

Detailed Scores:
Semantic Similarity: 0.59
Length Appropriateness: 0.13
Key Phrases: 0.00
Sequence Alignment: 0.18
Factual Accuracy: 0.40
Coherence: 0.23

Processing Time: 7.03 seconds


In [13]:
student_pdf = "Joel/Joel-D041-Q2.pdf"
teacher_answer = "Q2-ii-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.09/7

Detailed Scores:
Semantic Similarity: 0.62
Length Appropriateness: 0.90
Key Phrases: 0.04
Sequence Alignment: 0.28
Factual Accuracy: 0.36
Coherence: 0.37

Processing Time: 7.80 seconds


In [14]:
student_pdf = "Joel/Joel-D041-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.51/5

Detailed Scores:
Semantic Similarity: 0.46
Length Appropriateness: 0.63
Key Phrases: 0.00
Sequence Alignment: 0.18
Factual Accuracy: 0.27
Coherence: 0.17

Processing Time: 6.72 seconds


# Kalp

In [15]:
student_pdf = "Kalp/Kalp-D043-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 0.74/3

Detailed Scores:
Semantic Similarity: 0.50
Length Appropriateness: 0.19
Key Phrases: 0.01
Sequence Alignment: 0.16
Factual Accuracy: 0.20
Coherence: 0.21

Processing Time: 6.34 seconds


In [16]:
student_pdf = "Kalp/Kalp-D043-Q2.pdf"
teacher_answer = "Q2-ii-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.87/7

Detailed Scores:
Semantic Similarity: 0.72
Length Appropriateness: 1.00
Key Phrases: 0.09
Sequence Alignment: 0.41
Factual Accuracy: 0.82
Coherence: 0.26

Processing Time: 10.11 seconds


In [17]:
student_pdf = "Kalp/Kalp-D043-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.45/5

Detailed Scores:
Semantic Similarity: 0.39
Length Appropriateness: 0.51
Key Phrases: 0.00
Sequence Alignment: 0.21
Factual Accuracy: 0.33
Coherence: 0.30

Processing Time: 6.51 seconds


# Krisha

In [19]:
student_pdf = "Krisha/Krisha-D053-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.09/3

Detailed Scores:
Semantic Similarity: 0.64
Length Appropriateness: 0.36
Key Phrases: 0.01
Sequence Alignment: 0.20
Factual Accuracy: 0.60
Coherence: 0.27

Processing Time: 7.56 seconds


In [21]:
student_pdf = "Krisha/Krisha-D053-Q2.pdf"
teacher_answer = "Q2-i-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 2.69/7

Detailed Scores:
Semantic Similarity: 0.47
Length Appropriateness: 0.74
Key Phrases: 0.00
Sequence Alignment: 0.13
Factual Accuracy: 0.89
Coherence: 0.24

Processing Time: 7.41 seconds


In [22]:
student_pdf = "Krisha/Krisha-D053-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.25/5

Detailed Scores:
Semantic Similarity: 0.29
Length Appropriateness: 0.49
Key Phrases: 0.00
Sequence Alignment: 0.19
Factual Accuracy: 0.27
Coherence: 0.36

Processing Time: 5.56 seconds


# Mehika


In [23]:
student_pdf = "Mehika/Mehika-D062-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.1/3

Detailed Scores:
Semantic Similarity: 0.52
Length Appropriateness: 0.49
Key Phrases: 0.03
Sequence Alignment: 0.24
Factual Accuracy: 0.60
Coherence: 0.37

Processing Time: 7.00 seconds


In [25]:
student_pdf = "Mehika/Mehika-D062-Q2.pdf"
teacher_answer = "Q2-ii-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.14/7

Detailed Scores:
Semantic Similarity: 0.55
Length Appropriateness: 0.80
Key Phrases: 0.08
Sequence Alignment: 0.20
Factual Accuracy: 0.73
Coherence: 0.44

Processing Time: 14.53 seconds


In [26]:
student_pdf = "Mehika/Mehika-D062-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.88/5

Detailed Scores:
Semantic Similarity: 0.42
Length Appropriateness: 0.78
Key Phrases: 0.00
Sequence Alignment: 0.28
Factual Accuracy: 0.40
Coherence: 0.48

Processing Time: 8.17 seconds


# MULTIMODAL

In [1]:
import cv2
import numpy as np
import pytesseract
from PIL import Image
import re
from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification
import torch
from typing import Dict, List, Tuple
import json

class MultimodalOCRSystem:
    def __init__(self):
        # Initialize OCR and LLM components
        self.processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
        self.model = LayoutLMv2ForSequenceClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
        
    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Preprocess the input image for better OCR results"""
        image = cv2.imread(image_path)
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Apply threshold to get image with only black and white
        _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
        return binary

    def extract_text(self, image: np.ndarray) -> str:
        """Extract text from image using OCR"""
        text = pytesseract.image_to_string(image)
        return text

    def detect_question_numbers(self, text: str) -> List[str]:
        """Detect question numbers from text"""
        pattern = r'Q\.?\s*(\d+)|Question\s*(\d+)'
        matches = re.finditer(pattern, text)
        question_numbers = [match.group(1) or match.group(2) for match in matches]
        return question_numbers

    def extract_answers(self, text: str, question_numbers: List[str]) -> Dict[str, str]:
        """Extract answers based on question numbers"""
        answers = {}
        text_lines = text.split('\n')
        current_question = None
        current_answer = []

        for line in text_lines:
            # Check if line contains question number
            for q_num in question_numbers:
                if f"Q{q_num}" in line or f"Question {q_num}" in line:
                    if current_question:
                        answers[current_question] = ' '.join(current_answer)
                    current_question = q_num
                    current_answer = []
                    break
            else:
                if current_question:
                    current_answer.append(line)

        # Add the last answer
        if current_question:
            answers[current_question] = ' '.join(current_answer)

        return answers

    def extract_numericals(self, text: str, question_numbers: List[str]) -> Dict[str, List[float]]:
        """Extract numerical values based on question numbers"""
        numericals = {}
        for q_num in question_numbers:
            # Find the section of text corresponding to this question
            pattern = f"Q{q_num}.*?(?=Q{int(q_num)+1}|$)"
            question_text = re.search(pattern, text, re.DOTALL)
            if question_text:
                # Extract all numbers from the question text
                numbers = re.findall(r'-?\d*\.?\d+', question_text.group())
                numericals[q_num] = [float(num) for num in numbers]
        return numericals

    def detect_diagrams(self, image: np.ndarray, question_numbers: List[str]) -> Dict[str, Dict]:
        """Detect and classify DSA diagrams"""
        diagrams = {}
        
        # Convert image to binary
        _, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
        
        # Find contours
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        for q_num in question_numbers:
            diagrams[q_num] = {
                'type': None,
                'components': []
            }
            
            # Analyze contours to identify diagram types and components
            for contour in contours:
                # Get bounding box
                x, y, w, h = cv2.boundingRect(contour)
                
                # Basic shape analysis for diagram classification
                aspect_ratio = float(w)/h
                area = cv2.contourArea(contour)
                
                # Simple classification logic (can be enhanced with ML)
                if aspect_ratio > 0.8 and aspect_ratio < 1.2:
                    # Might be a node
                    diagrams[q_num]['components'].append({
                        'type': 'node',
                        'position': (x, y)
                    })
                elif aspect_ratio > 2:
                    # Might be an edge/link
                    diagrams[q_num]['components'].append({
                        'type': 'edge',
                        'position': (x, y)
                    })
                
            # Determine diagram type based on components
            if len(diagrams[q_num]['components']) > 0:
                diagrams[q_num]['type'] = self.classify_diagram_type(diagrams[q_num]['components'])
                
        return diagrams

    def classify_diagram_type(self, components: List[Dict]) -> str:
        """Classify diagram type based on components"""
        nodes = sum(1 for c in components if c['type'] == 'node')
        edges = sum(1 for c in components if c['type'] == 'edge')
        
        if nodes == 0:
            return None
        elif edges == 0:
            return 'single_node'
        elif edges == nodes - 1:
            return 'tree'
        elif edges >= nodes:
            return 'graph'
        else:
            return 'linked_list'

    def process_document(self, image_path: str) -> Dict:
        """Process document and extract all required information"""
        # Preprocess image
        processed_image = self.preprocess_image(image_path)
        
        # Extract text
        text = self.extract_text(processed_image)
        
        # Detect question numbers
        question_numbers = self.detect_question_numbers(text)
        
        # Extract answers
        answers = self.extract_answers(text, question_numbers)
        
        # Extract numericals
        numericals = self.extract_numericals(text, question_numbers)
        
        # Detect diagrams
        diagrams = self.detect_diagrams(processed_image, question_numbers)
        
        return {
            'answers': answers,
            'numericals': numericals,
            'diagrams': diagrams
        }

# Usage example
def main():
    ocr_system = MultimodalOCRSystem()
    
    # Process a document
    result = ocr_system.process_document('ProperTesting/Jeet/Jeet-D038.pdf')
    
    # Save results to JSON
    with open('results.json', 'w') as f:
        json.dump(result, f, indent=4)
    
    print("Processing complete. Results saved to results.json")

if __name__ == "__main__":
    main()

2025-04-05 13:57:53.456130: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743841673.574655    6062 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743841673.605107    6062 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-05 13:57:53.937321: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


preprocessor_config.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/802M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/microsoft/layoutlmv2-base-uncased/8cffd5ed065ff81e1e5c9a38968372c8541ecb8499999c89a8d9e10d65de3406?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1743844729&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0Mzg0NDcyOX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9taWNyb3NvZnQvbGF5b3V0bG12Mi1iYXNlLXVuY2FzZWQvOGNmZmQ1ZWQwNjVmZjgxZTFlNWM5YTM4OTY4MzcyYzg1NDFlY2I4NDk5OTk5Yzg5YThkOWUxMGQ2NWRlMzQwNj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=Hl3tu1Xtjp6nZZl7EJNSdJF6Q5J9RmRXsk%7EnHHh6UymcZTImICqgBSWfpkQGf454L4AuH0weSLbq05Wt4tzqmQQ92VLbvV3s9jakfXA%7EVpNG-y4veZWkMMY-Rv3k9B%7EKGDbHylPMHp3o3x%7E4s0ZzLKekbN6qLiRFCG2PSV-fyaf17zeGaMU13yKSBisteVeMRGInMNqrvnYM9JxXCjBm5FZfFiGkYtRKPF0iHDznlu3dtllbL55%7EgXiMvQjUW0CL14fWMG4ChDVi8FQXz71XOwxh7

ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'cdn-lfs.hf.co\', port=443): Max retries exceeded with url: /microsoft/layoutlmv2-base-uncased/8cffd5ed065ff81e1e5c9a38968372c8541ecb8499999c89a8d9e10d65de3406?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1743844729&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0Mzg0NDcyOX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9taWNyb3NvZnQvbGF5b3V0bG12Mi1iYXNlLXVuY2FzZWQvOGNmZmQ1ZWQwNjVmZjgxZTFlNWM5YTM4OTY4MzcyYzg1NDFlY2I4NDk5OTk5Yzg5YThkOWUxMGQ2NWRlMzQwNj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=Hl3tu1Xtjp6nZZl7EJNSdJF6Q5J9RmRXsk~nHHh6UymcZTImICqgBSWfpkQGf454L4AuH0weSLbq05Wt4tzqmQQ92VLbvV3s9jakfXA~VpNG-y4veZWkMMY-Rv3k9B~KGDbHylPMHp3o3x~4s0ZzLKekbN6qLiRFCG2PSV-fyaf17zeGaMU13yKSBisteVeMRGInMNqrvnYM9JxXCjBm5FZfFiGkYtRKPF0iHDznlu3dtllbL55~gXiMvQjUW0CL14fWMG4ChDVi8FQXz71XOwxh7nSZwhkrT61syYFno2zhB1mX9Swk8ZX~wk24H0~MIBqXdn16TLuQKV-6dZjMdA__&Key-Pair-Id=K3RPWS32NSSJCE (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7e3310f3d700>: Failed to resolve \'cdn-lfs.hf.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 023eb10e-36a8-4038-a842-e025222a7bf1)')

## Attempt 2

In [1]:
import os
import re
import base64
import json
from pdf2image import convert_from_path
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer, util
import spacy
import pytesseract
from PIL import Image
import numpy as np
from textblob import TextBlob
import time
from openai import OpenAI
import nltk

# Download required NLP resources
nltk.download('punkt')
nltk.download('wordnet')


2025-04-16 19:05:36.077215: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744810536.090076   23984 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744810536.093921   23984 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-16 19:05:36.108046: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  return torch._C._cuda_getDeviceCount() > 0
[nltk_data] Downloading package punkt to /home/dhruv/nltk_data...
[nltk_data]   

True

In [2]:
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="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",
)


In [3]:
def pdf_to_images(pdf_path, output_folder="images"):
    """Convert PDF pages to image files."""
    os.makedirs(output_folder, exist_ok=True)
    images = convert_from_path(pdf_path)
    image_paths = []

    for i, image in enumerate(images):
        path = os.path.join(output_folder, f"page_{i+1}.jpg")
        image.save(path, "JPEG")
        image_paths.append(path)

    return image_paths

In [4]:
def extract_text_from_images(image_paths):
    """Use Kimi VL model to extract only the visible text from each image, with no extra content."""
    all_text = ""

    for image_path in image_paths:
        with open(image_path, "rb") as img_file:
            base64_image = base64.b64encode(img_file.read()).decode("utf-8")

        completion = client.chat.completions.create(
            model="moonshotai/kimi-vl-a3b-thinking:free",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": (
                                "Transcribe all the visible text from this image exactly as it is. "
                                "Do not explain, summarize, or interpret anything. "
                                "Do not include any thought process or internal reasoning like ◁think▷. "
                                "Only return the raw visible text, exactly as it appears in the image."
                            )
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        )

        all_text += completion.choices[0].message.content + "\n"

    return all_text


In [5]:
def process_pdf_to_text(pdf_path):
    print("[1/2] Converting PDF to images...")
    image_paths = pdf_to_images(pdf_path)

    print("[2/2] Extracting text using Kimi VL...")
    extracted_text = extract_text_from_images(image_paths)

    return extracted_text

In [6]:
pdf_file_path = "Jeet/Jeet-D038-Q1.pdf"  # <- Replace this
extracted_text = process_pdf_to_text(pdf_file_path)

# You now have your result in `extracted_text`
print(extracted_text)  

[1/2] Converting PDF to images...
[2/2] Extracting text using Kimi VL...
◁think▷I need to transcribe all the visible text from the provided image exactly as it is. I should not include any explanations, summaries, interpretations, or thought processes. I should only return the raw visible text.

The image appears to be a handwritten document with several points about asymptotic notation and algorithm complexities. Let me transcribe each visible line carefully:

1. The first line seems to be "01" with some markings next to it.
2. The "Ans" label followed by numbered points:
   - Point 1: "Asymptotic Defined that a function is tending to one axis you can say like it is a limit Example of Asymptotic Function could be e²"
   - Point 2: "Asymptotic notation in Data Structure are Used to Represent time Complexities of your Algorithm"
   - Point 3: "It tells about the worst, average and Best Case time Complexity i.e time taken to execute a particular algorithm or Code"
   - Point 4: "The deno

In [7]:
def strip_thoughts(text):
    return re.sub(r"◁think▷.*?◁/think▷", "", text, flags=re.DOTALL).strip()


In [8]:
clean_output = strip_thoughts(extracted_text)
clean_output

'# Transcription of Visible Text:\n\n```\nQ.1\nAns\n1) Asymptotic Defined that a function is tending to one axis you can say like it is a limit Example of Asymptotic Function could be e²\n2) Asymptotic notation in Data Structure are Used to Represent time Complexities of your Algorithm\n3) It tells about the worst, average and Best Case time Complexity i.e time taken to execute a particular algorithm or Code\n4) The denotions or Best, Worst and average case are : O(Big O) → worst case Ω (Omega) → Best case Θ (theta) → Average case\n```\n◁ dhe◁\n\nQuestion Nos. Marks Awarded\n3\n5) But In data structure We\nAlways (consider that what Is\nthe cost Case of An Algorithm\nor Code that define that how\nmuch Computational power Computer\nto Execute.\n6) Ex :\nint count = O\nfor (int i = 0 ; i < n ; i++)\n{\ncount ++;\n}\nIn this Code the loop has\ntime complexity of O (n) and\nthe count variable updation\nhas O (1)\nTime Complexity = O(n)\n7) O(1) > O (log n) < O (Jin) < O (n)\n< O(n²) < O (n

In [10]:
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [12]:

# Load model
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# --- Helper Functions ---

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))  
    return synonyms

def get_semantic_factual_similarity(student_text, reference_text):
    embeddings1 = sbert_model.encode(student_text, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(reference_text, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings1, embeddings2).item()

    def extract_numbers(text):
        return set(re.findall(r'\d+(?:\.\d+)?', text))

    student_numbers = extract_numbers(student_text)
    reference_numbers = extract_numbers(reference_text)
    factual_accuracy = len(student_numbers.intersection(reference_numbers)) / len(reference_numbers) if reference_numbers else 1.0

    return (similarity + factual_accuracy) / 2  

def check_length_ratio(student_text, reference_text):
    student_length = len(word_tokenize(student_text))
    reference_length = len(word_tokenize(reference_text))
    ratio = student_length / reference_length if reference_length > 0 else 0
    return min(1.0, ratio if ratio <= 1.5 else 1.5 / ratio)

def check_sequence_alignment(student_text, reference_text):
    student_sentences = sent_tokenize(student_text)
    reference_sentences = sent_tokenize(reference_text)
    
    sent_ratio = min(len(student_sentences), len(reference_sentences)) / max(len(student_sentences), len(reference_sentences))
    
    student_paragraphs = student_text.split('\n\n')
    reference_paragraphs = reference_text.split('\n\n')
    para_ratio = min(len(student_paragraphs), len(reference_paragraphs)) / max(len(student_paragraphs), len(reference_paragraphs))
    
    return (sent_ratio + para_ratio) / 2

def check_key_phrases(student_text, reference_text):
    def get_phrases(text):
        words = word_tokenize(text.lower())
        phrases = set()
        for i in range(len(words)-1):
            phrases.add(f"{words[i]} {words[i+1]}")
            if i < len(words)-2:
                phrases.add(f"{words[i]} {words[i+1]} {words[i+2]}")
        return phrases
    
    ref_phrases = get_phrases(reference_text)
    student_phrases = get_phrases(student_text)
    
    expanded_ref_phrases = set()
    for phrase in ref_phrases:
        words = phrase.split()
        for word in words:
            expanded_ref_phrases.update(get_synonyms(word))
        expanded_ref_phrases.add(phrase)

    student_embedding = sbert_model.encode(student_text, convert_to_tensor=True)
    ref_embedding = sbert_model.encode(list(expanded_ref_phrases), convert_to_tensor=True)
    scores = util.pytorch_cos_sim(student_embedding, ref_embedding).tolist()[0]
    
    phrase_match_score = len(student_phrases.intersection(expanded_ref_phrases)) / len(expanded_ref_phrases) if expanded_ref_phrases else 0
    bert_match_score = max(scores) if scores else 0

    return max(phrase_match_score, bert_match_score)

def check_coherence(text):
    sentences = sent_tokenize(text)
    if len(sentences) < 2:
        return 1.0
            
    coherence_scores = []
    for i in range(len(sentences)-1):
        emb1 = sbert_model.encode(sentences[i])
        emb2 = sbert_model.encode(sentences[i+1])
        similarity = util.pytorch_cos_sim(emb1, emb2)
        coherence_scores.append(similarity.item())
            
    return sum(coherence_scores) / len(coherence_scores) 

# --- Master Evaluation Function ---

def evaluate_student_answer(student_text: str, reference_path: str, total_marks: float = 10.0):
    with open(reference_path, 'r', encoding='utf-8') as f:
        reference_text = f.read()

    scores = {
        'semantic_factual_similarity': get_semantic_factual_similarity(student_text, reference_text),
        'length_appropriateness': check_length_ratio(student_text, reference_text),
        'sequence_alignment': check_sequence_alignment(student_text, reference_text),
        'key_phrases': check_key_phrases(student_text, reference_text),
        'coherence': check_coherence(student_text)
    }

    weights = {
        'semantic_factual_similarity': 0.68,
        'length_appropriateness': 0.22,
        'sequence_alignment': 0.02,
        'key_phrases': 0.04,
        'coherence': 0.04
    }

    weighted_score = sum(scores[metric] * weights[metric] for metric in scores)
    scaled_score = weighted_score * total_marks  # Scale to total marks

    return {
        'final_score': round(scaled_score, 2),
        'details': {k: round(v, 3) for k, v in scores.items()}
    }


In [14]:
result = evaluate_student_answer(extracted_text, reference_path='Q1Answer.txt', total_marks=3)

print("Final Scaled Score:", result['final_score'])
print("Component Breakdown:", result['details'])


Final Scaled Score: 1.47
Component Breakdown: {'semantic_factual_similarity': 0.431, 'length_appropriateness': 0.73, 'sequence_alignment': 0.273, 'key_phrases': 0.493, 'coherence': 0.264}
