In [None]:
!pip show keras tf-keras

In [1]:
import os
import re
import io
from pdf2image import convert_from_path
from google.cloud import vision
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data used by preprocess_text (tokenizer models + stopwords).
# Recent NLTK releases load word_tokenize's data from the 'punkt_tab'
# resource while older ones use 'punkt'; request both so the notebook runs
# on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

2025-01-27 23:28:04.933410: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738000684.947063    9009 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738000684.951229    9009 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-27 23:28:04.964294: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/dhruv/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] D

True

In [2]:
def pdf_to_images(pdf_path, output_folder):
    """Convert a PDF to JPEG images, one image per page.

    Args:
        pdf_path: Path of the PDF file to render.
        output_folder: Directory that receives the page images. It is
            created (including parents) when it does not exist yet. An empty
            string means the current working directory.

    Returns:
        List of the written image paths in page order; the files are named
        ``page_1.jpg``, ``page_2.jpg``, ... inside ``output_folder``.
    """
    # Create the target folder here so the function also works when called
    # directly, not only through a caller that prepared the folder.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    for page_number, image in enumerate(convert_from_path(pdf_path), start=1):
        # os.path.join instead of a hard-coded "/" so that an empty or
        # slash-terminated folder does not produce "/page_1.jpg" or "//".
        image_path = os.path.join(output_folder, f"page_{page_number}.jpg")
        image.save(image_path, "JPEG")
        image_paths.append(image_path)

    return image_paths

In [3]:
def extract_text_from_images(image_paths):
    """Extract text from a list of image paths using Google Cloud Vision.

    Each image file is read from disk and sent to the Vision client's
    ``document_text_detection`` call; the ``full_text_annotation.text`` of
    every response is collected in input order.

    Args:
        image_paths: Paths of the image files to process.

    Returns:
        The text of all images concatenated, each image's text followed by
        a newline. An empty string when there are no images.

    Raises:
        RuntimeError: If a response carries an error message. The message
            names the offending image path.
    """
    # Nothing to do: skip creating a client (which needs credentials).
    if not image_paths:
        return ""

    client = vision.ImageAnnotatorClient()
    page_texts = []

    for image_path in image_paths:
        with open(image_path, "rb") as image_file:
            content = image_file.read()

        image = vision.Image(content=content)
        response = client.document_text_detection(image=image)

        if response.error.message:
            # RuntimeError is still an Exception, so existing handlers keep working.
            raise RuntimeError(f"Error processing {image_path}: {response.error.message}")

        page_texts.append(response.full_text_annotation.text)

    # Join once at the end instead of growing a string inside the loop.
    return "".join(f"{page_text}\n" for page_text in page_texts)

In [4]:
def process_pdf_without_buckets(pdf_path, output_folder):
    """Run the local PDF-to-text pipeline (no Google Cloud Storage).

    Ensures ``output_folder`` exists, renders the PDF's pages into it via
    ``pdf_to_images``, then hands the resulting image paths to
    ``extract_text_from_images`` and returns whatever text that yields.
    """
    os.makedirs(output_folder, exist_ok=True)

    print("Converting PDF to images...")
    page_image_paths = pdf_to_images(pdf_path, output_folder)

    print("Extracting text from images...")
    return extract_text_from_images(page_image_paths)

In [5]:
def preprocess_text(text):
    """Normalise text before comparison.

    The input is lowercased, every non-word character is replaced by a
    space, runs of whitespace are collapsed to one space, and the tokenised
    result is stripped of English stopwords. The remaining tokens are
    returned joined by single spaces.
    """
    lowered = text.lower()
    # Inner substitution blanks out non-word characters; the outer one
    # collapses the whitespace runs that leaves behind.
    cleaned = re.sub(r'\s+', ' ', re.sub(r'\W', ' ', lowered))

    english_stopwords = set(stopwords.words('english'))
    kept_tokens = filter(
        lambda token: token not in english_stopwords,
        word_tokenize(cleaned),
    )
    return ' '.join(kept_tokens)

In [6]:
# Loaded SBERT models keyed by model name, so repeated scoring calls reuse
# one instance instead of re-loading the weights on every call.
_SBERT_MODEL_CACHE = {}


def _get_sbert_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it only once."""
    if model_name not in _SBERT_MODEL_CACHE:
        _SBERT_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SBERT_MODEL_CACHE[model_name]


def get_similarity_score(text1, text2, model_name='paraphrase-MiniLM-L6-v2'):
    """Calculate the similarity score between two texts using SBERT.

    Args:
        text1: First text to embed.
        text2: Second text to embed.
        model_name: Name passed to ``SentenceTransformer``. Defaults to the
            model this notebook has always used.

    Returns:
        The cosine similarity of the two embeddings, taken from the
        similarity tensor with ``.item()``. Cosine similarity ranges over
        [-1, 1], so the result may be negative.
    """
    sbert_model = _get_sbert_model(model_name)
    embeddings1 = sbert_model.encode(text1, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(text2, convert_to_tensor=True)
    cosine_similarity = util.pytorch_cos_sim(embeddings1, embeddings2)
    return cosine_similarity.item()

In [7]:
def main(student_pdf_path, teacher_answer_path, output_folder, max_marks=10):
    """Grade a student's PDF answer against the teacher's reference answer.

    The student's PDF is converted to text, the teacher's answer is read
    (from a PDF or a plain-text file), both texts are preprocessed, and
    their similarity score is scaled to a mark. The score and the mark are
    printed; nothing is returned.

    Args:
        student_pdf_path: Path of the student's PDF.
        teacher_answer_path: Path of the reference answer. A path ending in
            ``.pdf`` (any letter case) is processed as a PDF; anything else
            is read as a UTF-8 text file.
        output_folder: Folder handed to ``process_pdf_without_buckets`` for
            the rendered page images.
        max_marks: Mark awarded for a similarity of 1.0. Defaults to 10.
    """
    # Extract text from student's PDF
    print("Processing student's PDF...")
    student_text = process_pdf_without_buckets(student_pdf_path, output_folder)

    # If teacher's answer is a PDF, extract text; otherwise, read as text.
    # The check is case-insensitive so "ANSWER.PDF" is not read as raw text.
    if teacher_answer_path.lower().endswith('.pdf'):
        print("Processing teacher's PDF...")
        # NOTE: the same output_folder is reused, so the teacher's page images
        # replace the student's page_N.jpg files; the student's text has
        # already been extracted at this point.
        teacher_text = process_pdf_without_buckets(teacher_answer_path, output_folder)
    else:
        # Explicit encoding: do not depend on the platform's default codec.
        with open(teacher_answer_path, 'r', encoding='utf-8') as file:
            teacher_text = file.read()

    # Preprocess both texts
    print("Preprocessing texts...")
    student_text_processed = preprocess_text(student_text)
    teacher_text_processed = preprocess_text(teacher_text)

    # Calculate similarity score
    print("Calculating similarity score...")
    similarity_score = get_similarity_score(student_text_processed, teacher_text_processed)

    # Cosine similarity can be negative (or overshoot 1.0 by rounding error);
    # clamp to [0, 1] so the mark always lies between 0 and max_marks.
    bounded_score = max(0.0, min(1.0, similarity_score))
    marks = round(bounded_score * max_marks, 2)

    print(f"Similarity Score: {similarity_score}")
    print(f"Marks Awarded: {marks}")