# Model

In [1]:
import os
import re
from pdf2image import convert_from_path
from google.cloud import vision
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer, util
import spacy
import pytesseract
from PIL import Image
import numpy as np
from textblob import TextBlob
import time

2025-04-03 17:34:19.238065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743681859.251674   17619 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743681859.255704   17619 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-03 17:34:19.269003: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
def pdf_to_images(pdf_path, output_folder):
    """Render every page of a PDF to a JPEG file and return the file paths.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.
        output_folder: Directory the page images are written into. It is not
            created by this function.

    Returns:
        list[str]: Paths of the saved images in page order, named
        ``page_1.jpg``, ``page_2.jpg``, ... inside ``output_folder``.
    """
    saved_paths = []
    for page_number, page in enumerate(convert_from_path(pdf_path), start=1):
        target = f"{output_folder}/page_{page_number}.jpg"
        page.save(target, "JPEG")
        saved_paths.append(target)
    return saved_paths

In [3]:
def extract_text_from_images(image_paths):
    """Run Google Cloud Vision document OCR over images and concatenate the text.

    Args:
        image_paths: Iterable of image file paths, processed in the order given.

    Returns:
        str: The ``full_text_annotation.text`` of every image, each followed by
        a newline character.

    Raises:
        Exception: If a response carries a non-empty ``error.message``.
    """
    ocr_client = vision.ImageAnnotatorClient()
    page_texts = []

    for path in image_paths:
        with open(path, "rb") as handle:
            payload = handle.read()

        result = ocr_client.document_text_detection(image=vision.Image(content=payload))
        if result.error.message:
            raise Exception(f"Error processing {path}: {result.error.message}")

        page_texts.append(result.full_text_annotation.text + "\n")

    return "".join(page_texts)

In [4]:
def process_pdf_without_buckets(pdf_path, output_folder):
    """OCR a local PDF by rendering its pages to images first.

    Creates ``output_folder`` if needed, renders the PDF pages into it with
    ``pdf_to_images`` and passes the resulting files to
    ``extract_text_from_images``. No Google Cloud Storage bucket is involved.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory that receives the page images.

    Returns:
        The text returned by ``extract_text_from_images`` for all pages.
    """
    os.makedirs(output_folder, exist_ok=True)

    print("Converting PDF to images...")
    page_images = pdf_to_images(pdf_path, output_folder)

    print("Extracting text from images...")
    return extract_text_from_images(page_images)

In [5]:
def preprocess_text(text):
    """Normalise raw text and drop English stop words.

    The text is lower-cased, newlines and whitespace runs are collapsed to a
    single space, and every character other than letters, digits, whitespace
    and ``. , ! ?`` is removed. The result is tokenised with ``word_tokenize``
    and tokens found in NLTK's English stop-word list are discarded.

    Args:
        text: The raw string to clean.

    Returns:
        tuple[str, str]: The space-joined remaining tokens, twice. Both
        elements are the same string; no lemmatisation is performed here.
    """
    cleaned = text.lower()
    for pattern, replacement in (
        (r'\n+', ' '),
        (r'\s+', ' '),
        (r'[^a-zA-Z0-9\s.,!?]', ''),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(cleaned):
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)

    joined = ' '.join(kept_tokens)
    return joined, joined

In [6]:
def _load_sbert_model(model_name='paraphrase-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    # Cache on the function object so repeated evaluate_answer() calls reuse
    # the already-loaded model instead of re-reading it every time.
    cache = _load_sbert_model.__dict__.setdefault('_models', {})
    if model_name not in cache:
        cache[model_name] = SentenceTransformer(model_name)
    return cache[model_name]


def _count_ratio(count_a, count_b):
    """Return min/max of two non-negative counts, or 0.0 when both are zero."""
    larger = max(count_a, count_b)
    return min(count_a, count_b) / larger if larger else 0.0


def evaluate_answer(student_text, reference_text):
    """Score a student answer against a reference answer with eight metrics.

    Each metric is computed by a nested helper; the final score is the
    weighted sum of the metric values using the fixed weights below (they add
    up to 1.0).

    Args:
        student_text: The student's answer as a string.
        reference_text: The reference (teacher) answer as a string.

    Returns:
        dict: ``{'final_score': float, 'details': dict}`` where ``details``
        maps each metric name to its unweighted value.
    """
    sbert_model = _load_sbert_model()

    def get_semantic_similarity(text1, text2):
        """Cosine similarity of the SBERT embeddings of the two whole texts."""
        embeddings1 = sbert_model.encode(text1, convert_to_tensor=True)
        embeddings2 = sbert_model.encode(text2, convert_to_tensor=True)
        return util.pytorch_cos_sim(embeddings1, embeddings2).item()

    def get_keyword_coverage(student_text, reference_text):
        """Fraction of distinct reference tokens that also occur in the student text."""
        student_words = set(word_tokenize(student_text.lower()))
        reference_words = set(word_tokenize(reference_text.lower()))
        return len(student_words.intersection(reference_words)) / len(reference_words) if reference_words else 0

    def check_length_ratio(student_text, reference_text):
        """Score answer length relative to the reference (token counts).

        The ratio itself is used up to 1.5x the reference length (capped at
        1.0); beyond that the score decays as ``1.5 / ratio``.
        """
        student_length = len(word_tokenize(student_text))
        reference_length = len(word_tokenize(reference_text))
        ratio = student_length / reference_length if reference_length > 0 else 0
        return min(1.0, ratio if ratio <= 1.5 else 1.5 / ratio)

    def check_structure_similarity(student_text, reference_text):
        """Average of the sentence-count ratio and the paragraph-count ratio."""
        # _count_ratio guards the case where neither text yields a sentence,
        # which previously raised ZeroDivisionError.
        sent_ratio = _count_ratio(
            len(sent_tokenize(student_text)),
            len(sent_tokenize(reference_text)),
        )

        # Paragraphs are delimited by blank lines. str.split always returns at
        # least one element, so for texts without any '\n\n' this ratio is 1.0.
        para_ratio = _count_ratio(
            len(student_text.split('\n\n')),
            len(reference_text.split('\n\n')),
        )

        return (sent_ratio + para_ratio) / 2

    def check_key_phrases(student_text, reference_text):
        """Fraction of reference bigrams/trigrams that also occur in the student text."""
        def get_phrases(text):
            words = word_tokenize(text.lower())
            phrases = set()
            for i in range(len(words)-1):
                phrases.add(f"{words[i]} {words[i+1]}")
                if i < len(words)-2:
                    phrases.add(f"{words[i]} {words[i+1]} {words[i+2]}")
            return phrases

        ref_phrases = get_phrases(reference_text)
        student_phrases = get_phrases(student_text)
        return len(student_phrases.intersection(ref_phrases)) / len(ref_phrases) if ref_phrases else 0

    def check_sequence_alignment(student_text, reference_text):
        """Mean cosine similarity of the i-th student and i-th reference sentence."""
        student_sentences = sent_tokenize(student_text)
        reference_sentences = sent_tokenize(reference_text)

        # Only the first min(len) sentences are ever compared, so encode just
        # those and return early when there is nothing to pair up.
        paired = min(len(student_sentences), len(reference_sentences))
        if paired == 0:
            return 0

        student_emb = sbert_model.encode(student_sentences[:paired])
        reference_emb = sbert_model.encode(reference_sentences[:paired])

        alignment_scores = [
            util.pytorch_cos_sim(student_emb[i], reference_emb[i]).item()
            for i in range(paired)
        ]
        return sum(alignment_scores) / len(alignment_scores)

    def check_factual_accuracy(student_text, reference_text):
        """Fraction of distinct numbers in the reference that appear in the student text.

        Returns 1.0 when the reference contains no numbers.
        """
        def extract_numbers(text):
            return set(re.findall(r'\d+(?:\.\d+)?', text))

        student_numbers = extract_numbers(student_text)
        reference_numbers = extract_numbers(reference_text)

        number_accuracy = len(student_numbers.intersection(reference_numbers)) / len(reference_numbers) if reference_numbers else 1.0
        return number_accuracy

    def check_coherence(text):
        """Mean cosine similarity between consecutive sentences of ``text``.

        Returns 1.0 for texts with fewer than two sentences.
        """
        sentences = sent_tokenize(text)
        if len(sentences) < 2:
            return 1.0

        # Encode every sentence once; the original re-encoded each interior
        # sentence twice (as sentence i+1 and again as sentence i).
        sentence_emb = sbert_model.encode(sentences)
        coherence_scores = [
            util.pytorch_cos_sim(sentence_emb[i], sentence_emb[i + 1]).item()
            for i in range(len(sentences) - 1)
        ]

        return sum(coherence_scores) / len(coherence_scores)

    scores = {
        'semantic_similarity': get_semantic_similarity(student_text, reference_text),
        'keyword_coverage': get_keyword_coverage(student_text, reference_text),
        'length_appropriateness': check_length_ratio(student_text, reference_text),
        'structure_similarity': check_structure_similarity(student_text, reference_text),
        'key_phrases': check_key_phrases(student_text, reference_text),
        'sequence_alignment': check_sequence_alignment(student_text, reference_text),
        'factual_accuracy': check_factual_accuracy(student_text, reference_text),
        'coherence': check_coherence(student_text)
    }

    # Relative importance of each metric; the values sum to 1.0.
    weights = {
        'semantic_similarity': 0.25,
        'keyword_coverage': 0.15,
        'length_appropriateness': 0.10,
        'structure_similarity': 0.10,
        'key_phrases': 0.15,
        'sequence_alignment': 0.10,
        'factual_accuracy': 0.10,
        'coherence': 0.05
    }

    final_score = sum(scores[metric] * weights[metric] for metric in scores)

    return {
        'final_score': final_score,
        'details': scores
    }

In [7]:
def main(student_pdf_path, teacher_answer_path, output_folder):
    """Grade one student PDF against a teacher answer and print a report.

    The student PDF is OCR'd, the teacher answer is either OCR'd (PDF) or read
    as text, both are passed through ``preprocess_text`` and the results are
    scored with ``evaluate_answer``. Marks are the final score scaled to 10.

    Args:
        student_pdf_path: Path of the student's PDF submission.
        teacher_answer_path: Path of the answer key; a ``.pdf`` extension (any
            letter case) is OCR'd, anything else is read as UTF-8 text.
        output_folder: Directory for the intermediate page images. Both PDFs
            render into this same folder, so the teacher's pages overwrite
            same-numbered student pages after the student text was extracted.

    Returns:
        dict: The result of ``evaluate_answer`` (``final_score`` and ``details``).

    Raises:
        Exception: Any error from the steps above is printed and re-raised.
    """
    try:
        start_time = time.time()

        print("Processing student's submission...")
        student_text = process_pdf_without_buckets(student_pdf_path, output_folder)

        print("Processing teacher's answer...")
        # Compare the extension case-insensitively so e.g. "Key.PDF" is OCR'd
        # instead of being opened as a text file.
        if teacher_answer_path.lower().endswith('.pdf'):
            teacher_text = process_pdf_without_buckets(teacher_answer_path, output_folder)
        else:
            # Explicit encoding keeps the result independent of the platform default.
            with open(teacher_answer_path, 'r', encoding='utf-8') as file:
                teacher_text = file.read()

        print("Preprocessing texts...")
        # preprocess_text returns the same string twice; only one copy is needed.
        student_processed, _ = preprocess_text(student_text)
        teacher_processed, _ = preprocess_text(teacher_text)

        print("Evaluating answer...")
        evaluation_result = evaluate_answer(student_processed, teacher_processed)

        marks = round(evaluation_result['final_score'] * 10, 2)

        print("\n=== Grading Report ===")
        print(f"Final Marks: {marks}/10")
        print("\nDetailed Scores:")
        for metric, score in evaluation_result['details'].items():
            print(f"{metric.replace('_', ' ').title()}: {score:.2f}")

        print(f"\nProcessing Time: {time.time() - start_time:.2f} seconds")

        return evaluation_result

    except Exception as e:
        print(f"Error in main processing: {str(e)}")
        raise

In [8]:
# Inputs for a single grading run.
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running this notebook anywhere else.
student_pdf = "/home/dhruv/Desktop/CloudOCR/ProperTesting/Jeet-D038.pdf"
teacher_answer = "/home/dhruv/Desktop/CloudOCR/ProperTesting/AnswerKey.txt"
output_dir = "output_images"  # directory that receives the rendered page JPEGs

# Runs OCR, preprocessing and scoring, and prints the grading report.
result = main(student_pdf, teacher_answer, output_dir)

Processing student's submission...
Converting PDF to images...
Extracting text from images...
Processing teacher's answer...
Preprocessing texts...
Evaluating answer...





=== Grading Report ===
Final Marks: 3.89/10

Detailed Scores:
Semantic Similarity: 0.58
Keyword Coverage: 0.21
Length Appropriateness: 0.31
Structure Similarity: 0.63
Key Phrases: 0.01
Sequence Alignment: 0.24
Factual Accuracy: 0.75
Coherence: 0.39

Processing Time: 8.98 seconds


In [12]:
# OCR another submission and show its raw (unpreprocessed) text as the cell
# output. Relies on `output_dir` already being defined by an earlier cell.
student_pdf = "/home/dhruv/Desktop/CloudOCR/ProperTesting/Krisha-D053.pdf"
student_text = process_pdf_without_buckets(student_pdf, output_dir)
student_text

Converting PDF to images...
Extracting text from images...


'TIC\nmsi\nare\nate\nOur\nate.\nate:\ntes\ntion\ndi\nest\ner ar\nans\nto\nwer\nQuestion\nNos.\nAsymptotic notions are used to\ncheck the time st& complexity of\nalgorithms and to find out which\nalgorithm is the most time efficient.\nwhen input size is large enough.\n2) Big\nBig O → This is used to\n0\nrepresent the worst case of the time\ncomplexity. This suggests the max time that an algorithm\nf(n)\ncan take for\ncompletion\ne an\nthis\noug\nsho\not b\nare f\ne in.\nnk, v\non w\nwriti\nsho\nПо\nIn the above graph (g(n) and fen)\nare functions and\ncgcn) = 0 (f(n))\ncg (n) = f(n)\nif and\nwe can\nonly if\nand n>no.\nsay that\nQuestion\nNos.\n3\nthe\n2) Big. W + This is used to represent\nbest of time complexity of an\nalgorithm. This suggests the minimum\ntime algorithm can can for take for its.\ncompletion.\nMarks\nAwarded\nf(n)\n((9cm)\n(gem = ar from the\nabove graph we can say that f(n) w(eg(n)\negen) = w(fcns) if and only if\nfin) & c (g(n))\nand >no.\n3) Big 0 - This is used to

# Comparison of different metric weights

In [65]:
import pandas as pd
import numpy as np
from itertools import combinations_with_replacement

In [72]:
def generate_weight_combinations():
    """Return the hand-picked metric weightings to compare.

    Returns:
        list[dict[str, float]]: One dict per weighting, mapping each of the
        eight metric names (in the order listed below) to its weight. Every
        weighting sums to 1.
    """
    metric_names = (
        'semantic_similarity',
        'keyword_coverage',
        'length_appropriateness',
        'structure_similarity',
        'key_phrases',
        'sequence_alignment',
        'factual_accuracy',
        'coherence',
    )

    # Each row lists the weights in `metric_names` order.
    # NOTE: "Dual high" and "Even split" contain identical values, so that
    # weighting appears twice in the result.
    weight_rows = (
        (0.20, 0.15, 0.15, 0.10, 0.10, 0.10, 0.10, 0.10),  # Balanced
        (0.30, 0.20, 0.10, 0.10, 0.10, 0.10, 0.05, 0.05),  # Emphasis on semantic
        (0.25, 0.25, 0.10, 0.10, 0.10, 0.10, 0.05, 0.05),  # Dual emphasis
        (0.20, 0.20, 0.20, 0.10, 0.10, 0.10, 0.05, 0.05),  # Triple emphasis
        (0.15, 0.15, 0.15, 0.15, 0.10, 0.10, 0.10, 0.10),  # More balanced
        (0.25, 0.15, 0.15, 0.15, 0.10, 0.10, 0.05, 0.05),  # Semantic focus
        (0.20, 0.20, 0.15, 0.15, 0.10, 0.10, 0.05, 0.05),  # Dual high
        (0.30, 0.15, 0.15, 0.10, 0.10, 0.10, 0.05, 0.05),  # High semantic
        (0.25, 0.20, 0.15, 0.10, 0.10, 0.10, 0.05, 0.05),  # Balanced high
        (0.20, 0.20, 0.15, 0.15, 0.10, 0.10, 0.05, 0.05),  # Even split
        (0.25, 0.25, 0.15, 0.10, 0.10, 0.05, 0.05, 0.05),  # Top heavy
        (0.20, 0.15, 0.15, 0.15, 0.15, 0.10, 0.05, 0.05),  # Mid spread
        (0.30, 0.25, 0.15, 0.10, 0.05, 0.05, 0.05, 0.05),  # Very top heavy
        (0.25, 0.20, 0.20, 0.15, 0.05, 0.05, 0.05, 0.05),  # Upper focus
        (0.20, 0.20, 0.20, 0.20, 0.05, 0.05, 0.05, 0.05),  # Even top
        (0.35, 0.20, 0.15, 0.10, 0.05, 0.05, 0.05, 0.05),  # Semantic priority
        (0.30, 0.30, 0.10, 0.10, 0.05, 0.05, 0.05, 0.05),  # Dual priority
        (0.25, 0.25, 0.20, 0.10, 0.05, 0.05, 0.05, 0.05),  # Triple top
        (0.20, 0.20, 0.20, 0.15, 0.10, 0.05, 0.05, 0.05),  # Balanced top
        (0.25, 0.20, 0.15, 0.15, 0.10, 0.05, 0.05, 0.05),  # Graduated
        (0.30, 0.20, 0.20, 0.10, 0.05, 0.05, 0.05, 0.05),  # High priority
        (0.25, 0.25, 0.15, 0.15, 0.05, 0.05, 0.05, 0.05),  # Dual high priority
        (0.20, 0.20, 0.15, 0.15, 0.15, 0.05, 0.05, 0.05),  # Triple mid
        (0.30, 0.25, 0.20, 0.10, 0.05, 0.05, 0.03, 0.02),  # Strong top
        (0.25, 0.25, 0.25, 0.10, 0.05, 0.05, 0.03, 0.02),  # Triple strong
    )

    return [dict(zip(metric_names, row)) for row in weight_rows]

In [73]:
def analyze_two_answers(student_text1, reference_text1, student_text2, reference_text2, max_marks=100):
    """Score two answers under every candidate weighting and rank the weightings.

    Each answer is evaluated once with ``evaluate_answer``; its per-metric
    values are then re-weighted with every dict returned by
    ``generate_weight_combinations``.

    Args:
        student_text1: First student answer.
        reference_text1: Reference answer for the first student answer.
        student_text2: Second student answer.
        reference_text2: Reference answer for the second student answer.
        max_marks: Scale factor applied to the weighted sums (default 100).

    Returns:
        pandas.DataFrame: One row per weighting, sorted by ``average_score``
        in descending order. Columns are the two score columns,
        ``average_score`` and then the eight weights.
    """
    metric_names = (
        'semantic_similarity',
        'keyword_coverage',
        'length_appropriateness',
        'structure_similarity',
        'key_phrases',
        'sequence_alignment',
        'factual_accuracy',
        'coherence',
    )
    col_answer1 = f'score_answer1 (out of {max_marks})'
    col_answer2 = f'score_answer2 (out of {max_marks})'

    weight_combinations = generate_weight_combinations()

    details1 = evaluate_answer(student_text1, reference_text1)['details']
    details2 = evaluate_answer(student_text2, reference_text2)['details']

    def weighted_total(details, weights):
        # Weighted sum of the metric values, scaled to max_marks.
        return sum(details[metric] * weights[metric] for metric in weights) * max_marks

    rows = []
    for weights in weight_combinations:
        row = {
            col_answer1: round(weighted_total(details1, weights), 2),
            col_answer2: round(weighted_total(details2, weights), 2),
        }
        for metric in metric_names:
            row[metric] = round(weights[metric], 3)
        rows.append(row)

    table = pd.DataFrame(rows)

    # The average is taken over the already-rounded score columns.
    table['average_score'] = (table[col_answer1] + table[col_answer2]) / 2
    table['average_score'] = table['average_score'].round(2)

    table = table.sort_values('average_score', ascending=False)

    return table[[col_answer1, col_answer2, 'average_score', *metric_names]]

In [74]:
def export_analysis(student_text1, reference_text1,
                   student_text2, reference_text2,
                   max_marks=100,
                   output_path='weight_analysis.csv'):
    """Run the weight comparison, display a summary and write it to CSV.

    Args:
        student_text1: First student answer.
        reference_text1: Reference answer for the first student answer.
        student_text2: Second student answer.
        reference_text2: Reference answer for the second student answer.
        max_marks: Scale passed through to ``analyze_two_answers``.
        output_path: CSV file the full table is written to (without index).

    Returns:
        dict: The weights of the first row of the ranked table, i.e. the
        weighting with the highest average score, keyed by metric name.
    """
    score_columns = [
        f'score_answer1 (out of {max_marks})',
        f'score_answer2 (out of {max_marks})',
        'average_score',
    ]

    ranking = analyze_two_answers(
        student_text1, reference_text1,
        student_text2, reference_text2,
        max_marks=max_marks,
    )

    print("\nWeight Analysis Results:")
    print("\nFirst 5 combinations:")
    display(ranking.head())
    print("\nLast 5 combinations:")
    display(ranking.tail())

    print("\nSummary Statistics:")
    display(ranking[score_columns].describe())

    ranking.to_csv(output_path, index=False)
    print(f"\nFull analysis exported to {output_path}")

    # Drop the score columns so only the metric weights remain.
    top_row = ranking.iloc[0]
    return top_row.drop(score_columns).to_dict()

In [81]:
# Inputs for the weight comparison.
# NOTE(review): absolute paths on the author's machine; adjust before running elsewhere.
student_pdf_path1 = "/home/dhruv/Desktop/CloudOCR/myAnswer.pdf"
student_pdf_path2 = "/home/dhruv/Desktop/CloudOCR/student_answer.pdf"
teacher_answer_path = "/home/dhruv/Desktop/CloudOCR/teacher_answer.txt"
output_folder = "output"

# OCR both submissions. Both calls render pages into the same `output_folder`,
# so the second call overwrites same-numbered page images of the first.
student_answer1 = process_pdf_without_buckets(student_pdf_path1, output_folder)

student_answer2 = process_pdf_without_buckets(student_pdf_path2, output_folder)
# The reference answer is read directly from a text file (no OCR).
with open(teacher_answer_path, 'r') as file:
                reference_answer = file.read()

Converting PDF to images...
Extracting text from images...
Converting PDF to images...
Extracting text from images...


In [82]:
# Show the raw OCR text of the second submission.
student_answer2

'Gray-level slicing is a technique used in image\nprocessing to highlight certain intensity levels in an\nimage. There are two types of gray-level slicing:\n1. Gray-level slicing with background:\n-\nIn this method, the pixels within a specific intensity\nrange are highlighted, while the rest of the image\nremains unchanged.\nThis is useful when you want to focus on specific\nfeatures while keeping the background intact.\n2. Gray-level slicing without background:\nIn this method, only the pixels within the specified\nintensity range are highlighted, and the rest of the\nimage is set to a constant value like black or white.\nThis is useful when you want to isolate specific\nfeatures and remove all other details from the\nimage.\nBoth methods involve selecting a range of intensity\nvalues and applying a transformation to the image\nbased on those values.\n'

In [83]:
# Show the raw OCR text of the first submission.
student_answer1

"8.\nGray -Level Slicing w/out background.\nEnhance a specific range of intensify levels\nwhile sitting all other pixel values to 0.\nisclating specific features.\nGray - Level slicing w/ background:\nEnhances ce certain range of intensity levels bet\nretains the rest of Image's intensity levels as\nare preserving the background.\nthey\n"

In [86]:
# Scores are reported out of 10 in this run.
max_marks = 10

# Both submissions are compared against the same reference answer; the raw
# OCR text and raw reference text are passed in as-is.
best_weights = export_analysis(
    student_answer1, reference_answer,
    student_answer2, reference_answer,
    max_marks=max_marks,
    output_path='weight_analysis.csv'
)

# "Best" here means the weighting with the highest average score.
print("\nBest performing weights:")
for metric, weight in best_weights.items():
    print(f"{metric}: {round(weight, 3)}")




Weight Analysis Results:

First 5 combinations:


Unnamed: 0,score_answer1 (out of 10),score_answer2 (out of 10),average_score,semantic_similarity,keyword_coverage,length_appropriateness,structure_similarity,key_phrases,sequence_alignment,factual_accuracy,coherence
15,4.51,8.35,6.43,0.35,0.2,0.15,0.1,0.05,0.05,0.05,0.05
23,4.37,8.37,6.37,0.3,0.25,0.2,0.1,0.05,0.05,0.03,0.02
20,4.3,8.3,6.3,0.3,0.2,0.2,0.1,0.05,0.05,0.05,0.05
12,4.28,8.24,6.26,0.3,0.25,0.15,0.1,0.05,0.05,0.05,0.05
7,4.26,8.22,6.24,0.3,0.15,0.15,0.1,0.1,0.1,0.05,0.05



Last 5 combinations:


Unnamed: 0,score_answer1 (out of 10),score_answer2 (out of 10),average_score,semantic_similarity,keyword_coverage,length_appropriateness,structure_similarity,key_phrases,sequence_alignment,factual_accuracy,coherence
18,3.79,7.9,5.85,0.2,0.2,0.2,0.15,0.1,0.05,0.05,0.05
0,3.63,7.92,5.78,0.2,0.15,0.15,0.1,0.1,0.1,0.1,0.1
11,3.74,7.82,5.78,0.2,0.15,0.15,0.15,0.15,0.1,0.05,0.05
22,3.65,7.73,5.69,0.2,0.2,0.15,0.15,0.15,0.05,0.05,0.05
4,3.47,7.74,5.6,0.15,0.15,0.15,0.15,0.1,0.1,0.1,0.1



Summary Statistics:


Unnamed: 0,score_answer1 (out of 10),score_answer2 (out of 10),average_score
count,25.0,25.0,25.0
mean,4.022,8.066,6.0448
std,0.254378,0.181016,0.212938
min,3.47,7.73,5.6
25%,3.86,7.93,5.9
50%,4.02,8.06,6.06
75%,4.23,8.19,6.22
max,4.51,8.37,6.43



Full analysis exported to weight_analysis.csv

Best performing weights:
semantic_similarity: 0.35
keyword_coverage: 0.2
length_appropriateness: 0.15
structure_similarity: 0.1
key_phrases: 0.05
sequence_alignment: 0.05
factual_accuracy: 0.05
coherence: 0.05


In [85]:
# NOTE(review): `results_df` is only assigned further down, by the
# `analyze_answers(...)` call in the similarity-metrics section, so on a fresh
# top-to-bottom run this cell raises NameError. The table saved under this
# cell has weight columns that `analyze_answers` does not produce, so it
# presumably came from earlier kernel state -- confirm, then remove or
# relocate this cell.
results_df

Unnamed: 0,score_answer1,score_answer2,semantic_similarity,keyword_coverage,length_appropriateness,structure_similarity,key_phrases,sequence_alignment,factual_accuracy,coherence
0,2.38,2.38,0.125,0.125,0.125,0.125,0.125,0.125,0.125,0.125
1,2.33,2.33,0.117647,0.117647,0.117647,0.117647,0.117647,0.117647,0.117647,0.176471
2,2.28,2.28,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.222222
3,2.24,2.24,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263,0.263158
4,2.2,2.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.3
5,2.2,2.2,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.166667,0.166667
6,2.16,2.16,0.105263,0.105263,0.105263,0.105263,0.105263,0.105263,0.157895,0.210526
7,2.13,2.13,0.1,0.1,0.1,0.1,0.1,0.1,0.15,0.25
8,2.1,2.1,0.095238,0.095238,0.095238,0.095238,0.095238,0.095238,0.142857,0.285714
9,2.05,2.05,0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.2


# Comparison of different similarity metrics

In [95]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer  
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import nltk
# Fetch the NLTK 'punkt' and 'wordnet' resources.
# NOTE(review): earlier cells already call word_tokenize/sent_tokenize and
# stopwords.words('english'); on a fresh kernel those presumably need 'punkt'
# (and the 'stopwords' corpus, which is not downloaded anywhere in this
# notebook) to be available first -- confirm and consider moving the downloads
# to the top of the notebook.
nltk.download('punkt')
nltk.download('wordnet')



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_attentions": true,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.45.0",
  "vocab_size": 30522
}



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer_config.json
loading file tokenizer.json from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer.json
loading configuration file config.json from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "_name_or_path": "distil

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/model.safetensors
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of DistilBertModel were initialized from the model checkpoi

True

In [101]:
def calculate_bleu(reference, candidate):
    """Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both strings are tokenised with ``str.split()`` and the score is smoothed
    with ``SmoothingFunction().method1``.

    Args:
        reference: Reference text.
        candidate: Text being scored.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    return sentence_bleu(
        [reference.split()],
        candidate.split(),
        smoothing_function=SmoothingFunction().method1,
    )

def calculate_rouge(reference, candidate):
    """F-measures of ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.

    Args:
        reference: Reference text (first argument to ``scorer.score``).
        candidate: Text being scored.

    Returns:
        dict: ``{'rouge1': ..., 'rouge2': ..., 'rougeL': ...}`` holding the
        ``fmeasure`` of each variant.
    """
    variants = ['rouge1', 'rouge2', 'rougeL']
    result = rouge_scorer.RougeScorer(variants, use_stemmer=True).score(reference, candidate)
    return {variant: result[variant].fmeasure for variant in variants}

def calculate_meteor(reference, candidate):
    """METEOR score of ``candidate`` against a single ``reference``.

    Both strings are tokenised with ``str.split()`` before being passed to
    ``meteor_score``.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = candidate.split()
    return meteor_score([reference_tokens], hypothesis_tokens)

def calculate_bert_score(reference, candidate):
    """BERTScore precision, recall and F1 for one candidate/reference pair.

    Args:
        reference: Reference text.
        candidate: Text being scored.

    Returns:
        dict: ``bert_precision``, ``bert_recall`` and ``bert_f1`` as Python
        floats (the mean of each returned tensor).
    """
    precision, recall, f1 = bert_score([candidate], [reference], lang='en')
    tensors = {
        'bert_precision': precision,
        'bert_recall': recall,
        'bert_f1': f1,
    }
    return {key: tensor.mean().item() for key, tensor in tensors.items()}

def calculate_perplexity(text):
    """Perplexity of ``text`` under the pretrained ``gpt2`` language model.

    The text is tokenised, truncated to the model's context window and scored
    in a single forward pass; perplexity is ``exp`` of the returned loss.

    Args:
        text: The string to score.

    Returns:
        float: The perplexity, or ``nan`` when the text tokenises to fewer
        than two tokens (there is then no next-token prediction to score).
    """
    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    model = AutoModelForCausalLM.from_pretrained('gpt2')

    # GPT-2 has a fixed number of positions. Without truncation, longer text
    # (e.g. multi-page OCR output) makes the forward pass fail, so only the
    # leading window of a long text is scored.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.n_positions,
    )

    # The loss compares each token with the prediction from its predecessors,
    # so at least two tokens are required for a defined value.
    if inputs['input_ids'].shape[-1] < 2:
        return float('nan')

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs['input_ids'])
    return torch.exp(outputs.loss).item()

In [102]:
def compare_metrics(student_text1, student_text2, reference_text):
    """Compute BLEU, METEOR, ROUGE, BERTScore F1 and perplexity for two answers.

    Args:
        student_text1: First student answer.
        student_text2: Second student answer.
        reference_text: Reference answer both are compared against.

    Returns:
        dict: Keys ``answer1_<metric>`` and ``answer2_<metric>`` for the
        metrics ``bleu``, ``meteor``, ``rouge1``, ``rouge2``, ``rougeL``,
        ``bert_f1`` and ``perplexity``. Perplexity is computed on the student
        text alone.
    """
    metrics = {}

    for label, answer in (('answer1', student_text1), ('answer2', student_text2)):
        metrics[f'{label}_bleu'] = calculate_bleu(reference_text, answer)
        metrics[f'{label}_meteor'] = calculate_meteor(reference_text, answer)

        rouge = calculate_rouge(reference_text, answer)
        for variant in ('rouge1', 'rouge2', 'rougeL'):
            metrics[f'{label}_{variant}'] = rouge[variant]

        metrics[f'{label}_bert_f1'] = calculate_bert_score(reference_text, answer)['bert_f1']

        metrics[f'{label}_perplexity'] = calculate_perplexity(answer)

    return metrics

In [103]:
def create_comparison_df(metrics_dict):
    """Arrange the flat metrics dict into a side-by-side comparison table.

    Args:
        metrics_dict: Mapping with ``answer1_<key>`` and ``answer2_<key>``
            entries for the keys ``bleu``, ``meteor``, ``rouge1``, ``rouge2``,
            ``rougeL``, ``bert_f1`` and ``perplexity``.

    Returns:
        pandas.DataFrame: One row per metric with the columns ``Metric``,
        ``Answer 1``, ``Answer 2`` and ``Difference (A1 - A2)``; the numeric
        columns are rounded to 3 decimal places.
    """
    # (display label, key suffix in metrics_dict), in output row order.
    rows = (
        ('BLEU Score', 'bleu'),
        ('METEOR Score', 'meteor'),
        ('ROUGE-1', 'rouge1'),
        ('ROUGE-2', 'rouge2'),
        ('ROUGE-L', 'rougeL'),
        ('BERTScore (F1)', 'bert_f1'),
        ('Perplexity', 'perplexity'),
    )

    table = pd.DataFrame({
        'Metric': [label for label, _ in rows],
        'Answer 1': [metrics_dict[f'answer1_{key}'] for _, key in rows],
        'Answer 2': [metrics_dict[f'answer2_{key}'] for _, key in rows],
    })

    table['Difference (A1 - A2)'] = table['Answer 1'] - table['Answer 2']

    value_columns = ['Answer 1', 'Answer 2', 'Difference (A1 - A2)']
    table[value_columns] = table[value_columns].round(3)

    return table

In [104]:
def analyze_answers(student_text1, student_text2, reference_text, output_path='metric_comparison.csv'):
    """Compare two answers on all similarity metrics, display and export the table.

    Args:
        student_text1: First student answer.
        student_text2: Second student answer.
        reference_text: Reference answer both are compared against.
        output_path: CSV file the comparison table is written to (without index).

    Returns:
        pandas.DataFrame: The table built by ``create_comparison_df``.
    """
    print("Calculating metrics...")
    raw_metrics = compare_metrics(student_text1, student_text2, reference_text)

    print("Creating comparison DataFrame...")
    comparison = create_comparison_df(raw_metrics)

    print("\nMetric Comparison Results:")
    display(comparison)

    comparison.to_csv(output_path, index=False)
    print(f"\nResults exported to {output_path}")

    return comparison

In [105]:
# Compare the two OCR'd submissions against the reference answer; the table is
# displayed and written to the default CSV path of analyze_answers.
results_df = analyze_answers(student_answer1, student_answer2, reference_answer)

Calculating metrics...


loading configuration file config.json from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--roberta-large/snapshots/722cf37b1afa9454edce342e7895e588b6ff1d59/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.45.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/dhruv/snap/jupyterlab-desktop/common/.cache/huggingface/hub/models--roberta-la

Creating comparison DataFrame...

Metric Comparison Results:


Unnamed: 0,Metric,Answer 1,Answer 2,Difference (A1 - A2)
0,BLEU Score,0.006,0.529,-0.523
1,METEOR Score,0.111,0.745,-0.634
2,ROUGE-1,0.359,0.847,-0.488
3,ROUGE-2,0.108,0.683,-0.575
4,ROUGE-L,0.223,0.786,-0.563
5,BERTScore (F1),0.843,0.922,-0.079
6,Perplexity,238.497,13.205,225.292



Results exported to metric_comparison.csv
