In [1]:
import os
import re
from pdf2image import convert_from_path
from google.cloud import vision
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer, util
import spacy
import pytesseract
from PIL import Image
import numpy as np
from textblob import TextBlob
import time

2025-04-05 10:24:35.828443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743828876.188841    4986 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743828876.308745    4986 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-05 10:24:37.275049: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def pdf_to_images(pdf_path, output_folder):
    """Render every page of a PDF to a JPEG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives one ``page_<n>.jpg`` per page
            (``n`` starts at 1). It is created when it does not exist yet.

    Returns:
        List of the written image paths, in page order.
    """
    # Saving into a missing directory raises FileNotFoundError, so create it
    # here rather than depending on every caller to have done so.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    for page_number, page_image in enumerate(convert_from_path(pdf_path), start=1):
        # os.path.join keeps an empty folder relative to the working directory
        # instead of producing a path rooted at "/".
        image_path = os.path.join(output_folder, f"page_{page_number}.jpg")
        page_image.save(image_path, "JPEG")
        image_paths.append(image_path)

    return image_paths

In [3]:
def extract_text_from_images(image_paths):
    """OCR a sequence of image files with Google Cloud Vision.

    Every image is sent to ``document_text_detection``; the recognised text of
    each image is followed by a newline and the pieces are concatenated in the
    order the paths were given.

    Raises:
        Exception: if the response for an image carries an error message.
    """
    ocr_client = vision.ImageAnnotatorClient()
    recognised_chunks = []

    for image_path in image_paths:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()

        response = ocr_client.document_text_detection(
            image=vision.Image(content=raw_bytes)
        )

        # Stop at the first failing image rather than returning partial text.
        if response.error.message:
            raise Exception(f"Error processing {image_path}: {response.error.message}")

        recognised_chunks.append(response.full_text_annotation.text + "\n")

    return "".join(recognised_chunks)

In [4]:
def process_pdf_without_buckets(pdf_path, output_folder):
    """Turn a local PDF into text: render its pages, then OCR the page images.

    The page images are written into ``output_folder`` (created if missing)
    and the OCR text of all pages is returned as one string.
    """
    os.makedirs(output_folder, exist_ok=True)

    print("Converting PDF to images...")
    page_image_paths = pdf_to_images(pdf_path, output_folder)

    print("Extracting text from images...")
    return extract_text_from_images(page_image_paths)

In [5]:
def preprocess_text(text):
    """Lower-case, clean and stop-word-filter a text.

    Returns:
        A 2-tuple whose two elements are the same cleaned string. No
        lemmatization is performed; the second slot simply repeats the first.
    """
    cleaned = text.lower()

    # Applied in this order: newline runs -> space, whitespace runs -> single
    # space, then drop everything except letters, digits, whitespace and . , ! ?
    substitutions = (
        (r'\n+', ' '),
        (r'\s+', ' '),
        (r'[^a-zA-Z0-9\s.,!?]', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(cleaned):
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)

    joined = ' '.join(kept_tokens)
    return joined, joined

In [6]:
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import word_tokenize, sent_tokenize
import re

_SBERT_MODEL_CACHE = {}


def _get_sbert_model(model_name='paraphrase-MiniLM-L6-v2'):
    """Return a SentenceTransformer for ``model_name``, constructing it only once.

    The instance is kept in a module-level dict so that grading many answers
    in a row does not rebuild the model for every call.
    """
    if model_name not in _SBERT_MODEL_CACHE:
        _SBERT_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SBERT_MODEL_CACHE[model_name]


def evaluate_answer(student_text, reference_text, total_marks):
    """Score a student answer against a reference answer.

    Six metrics are computed, each clamped to the range [0, 1], combined with
    fixed weights that sum to 1.0, and the weighted sum is multiplied by
    ``total_marks``.

    Args:
        student_text: The student's answer as a string.
        reference_text: The reference (teacher) answer as a string.
        total_marks: Maximum marks for the question; the weighted score is
            scaled by this value.

    Returns:
        dict with:
            'final_score': weighted score multiplied by ``total_marks``.
            'details': mapping of metric name to its score in [0, 1].
        A blank ``student_text`` yields 0.0 for the final score and for every
        metric.
    """
    weights = {
        'semantic_similarity': 0.30,
        'length_appropriateness': 0.15,
        'key_phrases': 0.20,
        'sequence_alignment': 0.15,
        'factual_accuracy': 0.10,
        'coherence': 0.10
    }

    # A blank submission must not earn marks. Without this guard the coherence
    # metric (1.0 for fewer than two sentences) and the factual metric (1.0
    # when the reference has no numbers) would award credit for an empty answer.
    if not student_text or not student_text.strip():
        return {
            'final_score': 0.0,
            'details': {metric: 0.0 for metric in weights}
        }

    sbert_model = _get_sbert_model()

    # Sentence splits and their embeddings are computed once and shared by the
    # sequence-alignment and coherence metrics.
    student_sentences = sent_tokenize(student_text)
    reference_sentences = sent_tokenize(reference_text)
    student_sentence_emb = sbert_model.encode(student_sentences) if student_sentences else []
    reference_sentence_emb = sbert_model.encode(reference_sentences) if reference_sentences else []

    def clamp_unit(value):
        """Clamp a metric to [0, 1]; cosine similarity can be negative."""
        return max(0.0, min(1.0, float(value)))

    def cosine(first, second):
        """Cosine similarity of two embeddings as a Python float."""
        return util.pytorch_cos_sim(first, second).item()

    def get_semantic_similarity():
        """Cosine similarity between the whole-text SBERT embeddings."""
        student_emb = sbert_model.encode(student_text, convert_to_tensor=True)
        reference_emb = sbert_model.encode(reference_text, convert_to_tensor=True)
        return cosine(student_emb, reference_emb)

    def check_length_ratio():
        """Score the student/reference token-count ratio.

        Ratios up to 1.5 score min(1, ratio); longer answers are scored
        1.5 / ratio. An empty reference scores 0.
        """
        student_length = len(word_tokenize(student_text))
        reference_length = len(word_tokenize(reference_text))
        if reference_length == 0:
            return 0.0
        ratio = student_length / reference_length
        return min(1.0, ratio if ratio <= 1.5 else 1.5 / ratio)

    def check_key_phrases():
        """Fraction of the reference's word bigrams and trigrams found in the student text."""
        def get_phrases(text):
            words = word_tokenize(text.lower())
            bigrams = {f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)}
            trigrams = {f"{words[i]} {words[i+1]} {words[i+2]}" for i in range(len(words) - 2)}
            return bigrams | trigrams

        ref_phrases = get_phrases(reference_text)
        if not ref_phrases:
            return 0.0
        student_phrases = get_phrases(student_text)
        return len(student_phrases & ref_phrases) / len(ref_phrases)

    def check_sequence_alignment():
        """Mean cosine similarity of the i-th student and i-th reference sentences.

        Only positions present in both texts are compared; with no comparable
        pair the score is 0.
        """
        alignment_scores = [
            cosine(student_vec, reference_vec)
            for student_vec, reference_vec in zip(student_sentence_emb, reference_sentence_emb)
        ]
        return sum(alignment_scores) / len(alignment_scores) if alignment_scores else 0.0

    def check_factual_accuracy():
        """Fraction of the reference's distinct numbers that also occur in the student text.

        Scores 1.0 when the reference contains no numbers.
        """
        def extract_numbers(text):
            return set(re.findall(r'\d+(?:\.\d+)?', text))

        reference_numbers = extract_numbers(reference_text)
        if not reference_numbers:
            return 1.0
        student_numbers = extract_numbers(student_text)
        return len(student_numbers & reference_numbers) / len(reference_numbers)

    def check_coherence():
        """Mean cosine similarity between consecutive student sentences.

        Scores 1.0 when there are fewer than two sentences.
        """
        if len(student_sentences) < 2:
            return 1.0
        transition_scores = [
            cosine(student_sentence_emb[i], student_sentence_emb[i + 1])
            for i in range(len(student_sentences) - 1)
        ]
        return sum(transition_scores) / len(transition_scores)

    scores = {
        'semantic_similarity': clamp_unit(get_semantic_similarity()),
        'length_appropriateness': clamp_unit(check_length_ratio()),
        'key_phrases': clamp_unit(check_key_phrases()),
        'sequence_alignment': clamp_unit(check_sequence_alignment()),
        'factual_accuracy': clamp_unit(check_factual_accuracy()),
        'coherence': clamp_unit(check_coherence())
    }

    raw_score = sum(scores[metric] * weights[metric] for metric in scores)
    scaled_score = raw_score * total_marks  # Scale the score based on total marks

    return {
        'final_score': scaled_score,
        'details': scores
    }


In [7]:
import time

def main(student_pdf_path, teacher_answer_path, output_folder, total_marks=10):
    """Grade one student PDF against a reference answer and print a report.

    Args:
        student_pdf_path: Path of the student's answer PDF (OCR'd page by page).
        teacher_answer_path: Path of the reference answer. A path ending in
            ``.pdf`` (any letter case) is OCR'd like the student PDF; any other
            file is read as text.
        output_folder: Directory that receives the rendered page images.
        total_marks: Maximum marks for the question (default 10).

    Returns:
        The dict produced by ``evaluate_answer`` ('final_score' and 'details').

    Raises:
        Exception: any error from the processing steps is printed and re-raised.
    """
    try:
        start_time = time.perf_counter()

        print("Processing student's submission...")
        student_text = process_pdf_without_buckets(student_pdf_path, output_folder)

        print("Processing teacher's answer...")
        # Compare case-insensitively so "Answer.PDF" is not read as a text file.
        if teacher_answer_path.lower().endswith('.pdf'):
            teacher_text = process_pdf_without_buckets(teacher_answer_path, output_folder)
        else:
            # Decode explicitly instead of relying on the platform default;
            # undecodable bytes are replaced rather than aborting the run.
            with open(teacher_answer_path, 'r', encoding='utf-8', errors='replace') as file:
                teacher_text = file.read()

        print("Preprocessing texts...")
        # preprocess_text returns a 2-tuple; only the first element is used here.
        student_processed, _ = preprocess_text(student_text)
        teacher_processed, _ = preprocess_text(teacher_text)

        print("Evaluating answer...")
        evaluation_result = evaluate_answer(student_processed, teacher_processed, total_marks)

        marks = round(evaluation_result['final_score'], 2)  # Already scaled in evaluate_answer

        print("\n=== Grading Report ===")
        print(f"Final Marks: {marks}/{total_marks}")
        print("\nDetailed Scores:")
        for metric, score in evaluation_result['details'].items():
            print(f"{metric.replace('_', ' ').title()}: {score:.2f}")

        print(f"\nProcessing Time: {time.perf_counter() - start_time:.2f} seconds")

        return evaluation_result

    except Exception as e:
        print(f"Error in main processing: {str(e)}")
        raise


# Jeet

In [8]:
# Grade Jeet's Q1 PDF against Q1Answer.txt, marked out of 3.
student_pdf = "Jeet/Jeet-D038-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.06/3

Detailed Scores:
Semantic Similarity: 0.55
Length Appropriateness: 0.30
Key Phrases: 0.01
Sequence Alignment: 0.28
Factual Accuracy: 0.60
Coherence: 0.40

Processing Time: 8.45 seconds


In [10]:
# Grade Jeet's Q2 PDF against Q2-i-answer.txt, marked out of 7.
# NOTE(review): other Q2 cells use Q2-ii-answer.txt; presumably each student
# answered one option of Q2 -- confirm the reference matches this student.
student_pdf = "Jeet/Jeet-D038-Q2.pdf"
teacher_answer = "Q2-i-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.15/7

Detailed Scores:
Semantic Similarity: 0.54
Length Appropriateness: 1.00
Key Phrases: 0.00
Sequence Alignment: 0.24
Factual Accuracy: 0.67
Coherence: 0.37

Processing Time: 7.36 seconds


In [11]:
# Grade Jeet's Q3 PDF against Q3Answer.txt, marked out of 5.
student_pdf = "Jeet/Jeet-D038-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.02/5

Detailed Scores:
Semantic Similarity: 0.31
Length Appropriateness: 0.37
Key Phrases: 0.00
Sequence Alignment: 0.13
Factual Accuracy: 0.27
Coherence: 0.10

Processing Time: 5.59 seconds


# Joel


In [12]:
# Grade Joel's Q1 PDF against Q1Answer.txt, marked out of 3.
student_pdf = "Joel/Joel-D041-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 0.86/3

Detailed Scores:
Semantic Similarity: 0.59
Length Appropriateness: 0.13
Key Phrases: 0.00
Sequence Alignment: 0.18
Factual Accuracy: 0.40
Coherence: 0.23

Processing Time: 7.03 seconds


In [13]:
# Grade Joel's Q2 PDF against Q2-ii-answer.txt, marked out of 7.
# NOTE(review): other Q2 cells use Q2-i-answer.txt; presumably each student
# answered one option of Q2 -- confirm the reference matches this student.
student_pdf = "Joel/Joel-D041-Q2.pdf"
teacher_answer = "Q2-ii-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.09/7

Detailed Scores:
Semantic Similarity: 0.62
Length Appropriateness: 0.90
Key Phrases: 0.04
Sequence Alignment: 0.28
Factual Accuracy: 0.36
Coherence: 0.37

Processing Time: 7.80 seconds


In [14]:
# Grade Joel's Q3 PDF against Q3Answer.txt, marked out of 5.
student_pdf = "Joel/Joel-D041-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.51/5

Detailed Scores:
Semantic Similarity: 0.46
Length Appropriateness: 0.63
Key Phrases: 0.00
Sequence Alignment: 0.18
Factual Accuracy: 0.27
Coherence: 0.17

Processing Time: 6.72 seconds


# Kalp

In [15]:
# Grade Kalp's Q1 PDF against Q1Answer.txt, marked out of 3.
student_pdf = "Kalp/Kalp-D043-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 0.74/3

Detailed Scores:
Semantic Similarity: 0.50
Length Appropriateness: 0.19
Key Phrases: 0.01
Sequence Alignment: 0.16
Factual Accuracy: 0.20
Coherence: 0.21

Processing Time: 6.34 seconds


In [16]:
# Grade Kalp's Q2 PDF against Q2-ii-answer.txt, marked out of 7.
# NOTE(review): other Q2 cells use Q2-i-answer.txt; presumably each student
# answered one option of Q2 -- confirm the reference matches this student.
student_pdf = "Kalp/Kalp-D043-Q2.pdf"
teacher_answer = "Q2-ii-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.87/7

Detailed Scores:
Semantic Similarity: 0.72
Length Appropriateness: 1.00
Key Phrases: 0.09
Sequence Alignment: 0.41
Factual Accuracy: 0.82
Coherence: 0.26

Processing Time: 10.11 seconds


In [17]:
# Grade Kalp's Q3 PDF against Q3Answer.txt, marked out of 5.
student_pdf = "Kalp/Kalp-D043-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.45/5

Detailed Scores:
Semantic Similarity: 0.39
Length Appropriateness: 0.51
Key Phrases: 0.00
Sequence Alignment: 0.21
Factual Accuracy: 0.33
Coherence: 0.30

Processing Time: 6.51 seconds


# Krisha

In [19]:
# Grade Krisha's Q1 PDF against Q1Answer.txt, marked out of 3.
student_pdf = "Krisha/Krisha-D053-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.09/3

Detailed Scores:
Semantic Similarity: 0.64
Length Appropriateness: 0.36
Key Phrases: 0.01
Sequence Alignment: 0.20
Factual Accuracy: 0.60
Coherence: 0.27

Processing Time: 7.56 seconds


In [21]:
# Grade Krisha's Q2 PDF against Q2-i-answer.txt, marked out of 7.
# NOTE(review): other Q2 cells use Q2-ii-answer.txt; presumably each student
# answered one option of Q2 -- confirm the reference matches this student.
student_pdf = "Krisha/Krisha-D053-Q2.pdf"
teacher_answer = "Q2-i-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 2.69/7

Detailed Scores:
Semantic Similarity: 0.47
Length Appropriateness: 0.74
Key Phrases: 0.00
Sequence Alignment: 0.13
Factual Accuracy: 0.89
Coherence: 0.24

Processing Time: 7.41 seconds


In [22]:
# Grade Krisha's Q3 PDF against Q3Answer.txt, marked out of 5.
student_pdf = "Krisha/Krisha-D053-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.25/5

Detailed Scores:
Semantic Similarity: 0.29
Length Appropriateness: 0.49
Key Phrases: 0.00
Sequence Alignment: 0.19
Factual Accuracy: 0.27
Coherence: 0.36

Processing Time: 5.56 seconds


# Mehika


In [23]:
# Grade Mehika's Q1 PDF against Q1Answer.txt, marked out of 3.
student_pdf = "Mehika/Mehika-D062-Q1.pdf"
teacher_answer = "Q1Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=3)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.1/3

Detailed Scores:
Semantic Similarity: 0.52
Length Appropriateness: 0.49
Key Phrases: 0.03
Sequence Alignment: 0.24
Factual Accuracy: 0.60
Coherence: 0.37

Processing Time: 7.00 seconds


In [25]:
# Grade Mehika's Q2 PDF against Q2-ii-answer.txt, marked out of 7.
# NOTE(review): other Q2 cells use Q2-i-answer.txt; presumably each student
# answered one option of Q2 -- confirm the reference matches this student.
student_pdf = "Mehika/Mehika-D062-Q2.pdf"
teacher_answer = "Q2-ii-answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=7)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.14/7

Detailed Scores:
Semantic Similarity: 0.55
Length Appropriateness: 0.80
Key Phrases: 0.08
Sequence Alignment: 0.20
Factual Accuracy: 0.73
Coherence: 0.44

Processing Time: 14.53 seconds


In [26]:
# Grade Mehika's Q3 PDF against Q3Answer.txt, marked out of 5.
student_pdf = "Mehika/Mehika-D062-Q3.pdf"
teacher_answer = "Q3Answer.txt"
output_dir = "output_images"

result = main(student_pdf, teacher_answer, output_dir, total_marks=5)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 1.88/5

Detailed Scores:
Semantic Similarity: 0.42
Length Appropriateness: 0.78
Key Phrases: 0.00
Sequence Alignment: 0.28
Factual Accuracy: 0.40
Coherence: 0.48

Processing Time: 8.17 seconds
