In [None]:
# Set Tesseract path
# pytesseract is pointed at the Tesseract executable explicitly. The absolute
# path is hard-coded, so it has to exist on the machine running this cell.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, falling back to OCR when none is embedded.

    Every page is first read with pdfplumber; each non-empty page text is
    followed by a newline. If the combined result is empty or whitespace
    only, the pages are rendered at 300 dpi and passed through Tesseract
    (English), and that output is appended to what was collected so far.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a single string.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    text = "".join(chunk + "\n" for chunk in page_texts if chunk)

    # Digital PDF: the embedded text layer was enough.
    if text.strip():
        return text

    # Scanned PDF: no usable text layer, so OCR each rendered page.
    for page_image in convert_from_path(pdf_path, dpi=300):
        text += pytesseract.image_to_string(page_image, lang="eng") + "\n"
    return text

# Extract text and store it in a variable for the marking scheme model
# NOTE: the input path is hard-coded to a file under /content.
pdf_path = "/content/view_marking_scheme_removed.pdf"
teacher_answer = extract_text_from_pdf(pdf_path)
# Save extracted text to a .txt file
# The file is written relative to the current working directory.
output_txt_path = "teacher_answer.txt"
with open(output_txt_path, "w", encoding="utf-8") as f:
    f.write(teacher_answer)

# The extracted text can now be used as input for the marking scheme model
print(teacher_answer)  # Check output


: 

Student Answer OCR

In [None]:
# Set Google Cloud credentials
# Exposes the path of a JSON key file through GOOGLE_APPLICATION_CREDENTIALS.
# The path is hard-coded under /content, so it is specific to this session.
# NOTE(review): presumably this is what vision.ImageAnnotatorClient() below
# authenticates with — confirm, and keep the key file out of version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/prefab-builder-452209-f8-ca411802c1fb.json"

# Function to preprocess image
def preprocess_image(image):
    """Return an inverted binary version of a BGR image.

    The image is converted to grayscale and then thresholded at 150 with a
    maximum value of 255 using ``cv2.THRESH_BINARY_INV``.

    Args:
        image: Image array as accepted by ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``.

    Returns:
        The thresholded image produced by ``cv2.threshold``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # cv2.threshold returns (threshold_used, image); only the image matters.
    thresholded = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY_INV)[1]
    return thresholded

# Function to perform OCR using Google Cloud Vision
def ocr_with_google_vision(image_path):
    """Run Google Cloud Vision text detection on an image file.

    Args:
        image_path: Path of the image file; it is read in binary mode and
            its bytes are sent to the Vision client.

    Returns:
        The ``description`` of the first text annotation, or ``""`` when the
        response contains no annotations.
    """
    annotator = vision.ImageAnnotatorClient()
    with open(image_path, 'rb') as image_file:
        request_image = vision.Image(content=image_file.read())

    annotations = annotator.text_detection(image=request_image).text_annotations
    if not annotations:
        return ""
    return annotations[0].description

# Function to extract and structure text
def extract_and_structure_text(pdf_path):
    """OCR every page of a PDF and pull out numbered, mark-annotated items.

    Each page is rendered to ``page_<n>.png`` in the current working
    directory (the files are left in place), sent to
    ``ocr_with_google_vision`` and echoed to stdout.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A ``(structured_data, full_text)`` tuple. ``structured_data`` maps a
        question label such as ``"1"`` or ``"2a"`` to a list of
        ``{"text": str, "marks": int}`` dicts, one per match; ``full_text``
        is the list of raw OCR strings, one per page in page order.
    """
    # Matches "<label>. <text> (<marks>)", e.g. "2a. Define osmosis (3)".
    question_pattern = re.compile(r'(\d+[a-z]*)\.\s*(.*?)\s*\((\d+)\)')
    structured_data = {}
    full_text = []  # Stores extracted text for all pages

    for page_number, image in enumerate(convert_from_path(pdf_path), start=1):
        image_path = f"page_{page_number}.png"
        image.save(image_path, 'PNG')
        # OCR runs on the saved page as-is; Vision receives the PNG directly,
        # so no OpenCV preprocessing is performed here.
        text = ocr_with_google_vision(image_path)
        full_text.append(text)

        print(f"--- Extracted Text from Page {page_number} ---\n{text}\n")

        for q_num, q_text, marks in question_pattern.findall(text):
            structured_data.setdefault(q_num, []).append(
                {"text": q_text, "marks": int(marks)}
            )

    return structured_data, full_text

# Path to the PDF file
# NOTE: this rebinds pdf_path, which the marking-scheme cell also assigns.
pdf_path = "/content/view_ans_sheet9.pdf"

# Extract and structure text
# student_answer is the structured dict (label -> list of items), not plain
# text; full_text is the list of per-page OCR strings.
student_answer, full_text = extract_and_structure_text(pdf_path)

# Store text starting from page 3 onwards
student_answer_page3 = "\n".join(full_text[2:])  # Pages are 0-indexed, so page 3 is index 2

# Save extracted text from page 3 onwards to a .txt file
# NOTE: this rebinds output_txt_path, which the marking-scheme cell also assigns.
output_txt_path = "student_answer_page3.txt"
with open(output_txt_path, "w", encoding="utf-8") as f:
    f.write(student_answer_page3)

# Print output (optional)
print("Structured Data for Student Answer:", student_answer)
# Disabled: optional JSON export of the structured data.
# NOTE(review): the snippet refers to `structured_data`, but the dict is bound
# to `student_answer` at module level — rename before re-enabling.
# # Save structured data to JSON
# with open("structured_answers.json", "w") as json_file:
#     json.dump(structured_data, json_file, indent=4)

# print("Structured data saved to structured_answers.json")

Marking Scheme Model

In [None]:
# Load Sentence-BERT model
# Created once at module level; evaluate_answer below reads this global when
# it encodes answers for non-objective questions.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Preprocess text by removing stop words
def preprocess_text(text):
    """Strip stop words and non-alphanumeric tokens from *text*.

    The text is tokenised with ``word_tokenize``; a token is kept when it is
    alphanumeric and its lowercase form is not an English stop word. Kept
    tokens retain their original casing.

    Returns:
        The kept tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.isalnum() and token.lower() not in english_stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)


def sentiment_analysis(teacher_answer, student_answer):
    """Report the TextBlob sentiment polarity of both answers.

    Args:
        teacher_answer: Reference answer text.
        student_answer: Student answer text.

    Returns:
        A dict with the keys ``"teacher_answer"`` and ``"student_answer"``.
        Each maps to a dict holding ``"Polarity"`` (the signed TextBlob
        polarity), ``"Positive Score"`` (the polarity when it is above zero,
        else 0) and ``"Negative Score"`` (the magnitude of the polarity when
        it is below zero, else 0).
    """
    def get_sentiment_scores(polarity):
        # Split a signed polarity into its (positive, negative) magnitudes.
        return max(0, polarity), abs(min(0, polarity))

    def summarize(text):
        polarity = TextBlob(text).sentiment.polarity
        # The scores are derived from the numeric polarity, not from the
        # text itself: max()/min() cannot compare a number with a string.
        positive, negative = get_sentiment_scores(polarity)
        return {
            "Polarity": polarity,
            "Positive Score": positive,
            "Negative Score": negative,
        }

    return {
        "teacher_answer": summarize(teacher_answer),
        "student_answer": summarize(student_answer),
    }

# TF-IDF + Cosine Similarity for objective-type questions
def compute_tf_idf_similarity(answer, correct_answer):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    vocabulary and weights depend only on this pair.

    Args:
        answer: The answer being checked.
        correct_answer: The reference answer.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([answer, correct_answer])
    similarity_matrix = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return similarity_matrix[0][0]

# Function to evaluate a single answer
def evaluate_answer(student_answer, correct_answer, question_type):
    """Score a student's answer against the reference answer.

    Both texts are passed through ``preprocess_text`` first. When
    ``question_type`` is ``"objective"`` they are compared with
    ``compute_tf_idf_similarity``; for any other value they are encoded with
    the module-level Sentence-BERT model and compared by cosine similarity.

    Args:
        student_answer: The student's answer text.
        correct_answer: The reference answer text.
        question_type: ``"objective"`` selects TF-IDF; anything else selects
            Sentence-BERT.

    Returns:
        The similarity rounded to two decimals, or ``0.0`` when it is below
        0.5 or when either text is empty after preprocessing.
    """
    student_answer_processed = preprocess_text(student_answer)
    correct_answer_processed = preprocess_text(correct_answer)

    # A blank, missing or stop-word-only answer leaves nothing to compare:
    # award nothing rather than scoring an empty string. This also avoids
    # fitting TF-IDF on two empty documents.
    if not student_answer_processed or not correct_answer_processed:
        return 0.0

    if question_type == "objective":
        score = compute_tf_idf_similarity(student_answer_processed, correct_answer_processed)
    else:
        # Use Sentence-BERT for semantic similarity
        student_emb = sbert_model.encode(student_answer_processed, convert_to_tensor=True)
        correct_emb = sbert_model.encode(correct_answer_processed, convert_to_tensor=True)
        score = util.pytorch_cos_sim(student_emb, correct_emb).item()

    # Similarities below 0.5 are treated as no match at all.
    if score < 0.5:
        score = 0.0
    return round(score, 2)

# Function to clean and organize the text
def organize_text(text):
    """Normalise raw answer text into "<label> <answer>. <label> <answer>".

    Labels look like ``1(a)`` or ``1(a)(ii)``. All runs of whitespace are
    collapsed to single spaces, anything before the first label is
    discarded, and the text is regrouped so that every segment starts with
    a label. Segments are joined with ``". "``; sentence breaks inside one
    answer are replaced by a single space.

    Args:
        text: Raw text, e.g. straight from PDF extraction or OCR.

    Returns:
        The regrouped text, or ``""`` when the text contains no label.
    """
    identifier = r'\d+\([a-z]\)(?:\([i-v]+\))?'

    # Remove extra spaces, newlines, and tabs
    text = re.sub(r'\s+', ' ', text).strip()

    # Drop header text in front of the first label. Without this, a header
    # that does not end in ". " stays glued to the first label, the segment
    # no longer starts with a label, and the first answer is lost.
    first_label = re.search(identifier, text)
    if first_label is None:
        return ''
    text = text[first_label.start():]

    # Ensure consistent formatting for subparts (e.g., 1(a)(i))
    text = re.sub(rf'({identifier})\s*\.?\s*', r'\1. ', text)

    organized_lines = []
    for line in text.split('. '):
        if re.match(identifier, line):
            # A new labelled segment starts here.
            organized_lines.append(line)
        elif organized_lines:
            # Continuation of the current answer.
            organized_lines[-1] += ' ' + line

    return '. '.join(organized_lines)

def simplify_subpart_key(key):
    """Collapse a first roman-numeral subpart onto its parent label.

    A key containing ``<digits>(<letter>)(i)`` is reduced to
    ``<digits>(<letter>)``; any other key, such as ``1(a)(ii)`` or ``1(a)``,
    is returned untouched.
    """
    found = re.search(r'(\d+\([a-z]\))(\(i\))', key)
    return found.group(1) if found else key

# Function to parse the answer text into questions and subparts
def parse_answers(answer_text):
    """Split labelled answer text into a ``{label: answer}`` dict.

    A label is ``<digits>(<letter>)`` with an optional ``(<roman>)`` suffix,
    e.g. ``1(a)`` or ``1(a)(ii)``. The answer for a label is everything up
    to the next label or the end of the text. Labels are passed through
    ``simplify_subpart_key``; if two labels end up identical, the later
    answer replaces the earlier one.

    Returns:
        Dict mapping each (simplified) label to its stripped answer text;
        empty when the text contains no label.
    """
    finder = re.compile(r'(\d+\([a-z]\)(?:\([i-v]+\))?)\.?\s*(.*?)(?=\d+\([a-z]\)(?:\([i-v]+\))?|$)', re.DOTALL)

    parsed_answers = {}
    for raw_label, raw_content in finder.findall(answer_text):
        label = simplify_subpart_key(raw_label.strip())
        parsed_answers[label] = raw_content.strip()
    return parsed_answers

# Function to evaluate the entire answer sheet
def evaluate_answer_sheet(teacher_answer_text, student_answer_text):
    """Grade a student's answer sheet against the teacher's, part by part.

    Both texts are normalised with ``organize_text`` and split into
    ``{label: answer}`` dicts with ``parse_answers``; the parsed answers are
    printed. For every label in the teacher's answers the maximum marks are
    read interactively via ``input()``, the student's answer (``""`` when
    the label is missing) is scored with ``evaluate_answer`` as a
    ``"descriptive"`` question, and the marks awarded are
    ``similarity * maximum marks`` rounded to two decimals. The per-label
    results are then grouped under their main part (``1(a)`` for ``1(a)``,
    ``1(a)(ii)``, ...).

    Args:
        teacher_answer_text: Raw text of the marking scheme.
        student_answer_text: Raw text of the student's answers.

    Returns:
        ``{"total_score": float, "results": list}``. Each result is a dict
        with ``"question"`` (main-part label), ``"question_score"`` and
        ``"subparts"`` (one dict per label with ``"subpart"``,
        ``"student_answer"``, ``"teacher_answer"``, ``"similarity"`` and
        ``"marks_awarded"``).

    Raises:
        ValueError: If a value typed at a marks prompt is not a number.
    """
    # Organize the text before parsing
    teacher_answer_text = organize_text(teacher_answer_text)
    student_answer_text = organize_text(student_answer_text)

    teacher_answers = parse_answers(teacher_answer_text)
    student_answers = parse_answers(student_answer_text)

    # Debug: Print parsed answers
    print("Parsed Teacher Answers:")
    for key, value in teacher_answers.items():
        print(f"{key}: {value}")

    print("\nParsed Student Answers:")
    for key, value in student_answers.items():
        print(f"{key}: {value}")

    # First, grade every labelled answer on its own.
    subpart_results = {}
    for subpart_key, teacher_answer in teacher_answers.items():
        max_marks = float(input(f"Enter the maximum marks for {subpart_key}: "))
        student_answer = student_answers.get(subpart_key, "")
        similarity = evaluate_answer(student_answer, teacher_answer, "descriptive")
        subpart_results[subpart_key] = {
            "student_answer": student_answer,
            "teacher_answer": teacher_answer,
            "similarity": similarity,
            "marks_awarded": round(similarity * max_marks, 2)
        }

    # Main parts that have an answer of their own come first, in sheet order.
    main_parts = {
        key: [] for key in teacher_answers if re.match(r'\d+\([a-z]\)$', key)
    }
    # Attach every label to its main part. setdefault creates the main part
    # when only subparts such as 1(b)(ii) exist, so graded answers are never
    # left out of the results and the total.
    for key in teacher_answers:
        label = re.match(r'(\d+\([a-z]\))(?:\([i-v]+\))?$', key)
        if label:
            main_parts.setdefault(label.group(1), []).append(key)

    # Aggregate the results by main part
    total_score = 0.0
    results = []
    for main_part_key, subpart_keys in main_parts.items():
        question_results = [
            {"subpart": subpart_key, **subpart_results[subpart_key]}
            for subpart_key in subpart_keys
        ]
        question_score = 0.0
        for entry in question_results:
            question_score += entry["marks_awarded"]

        results.append({
            "question": main_part_key,
            "question_score": question_score,
            "subparts": question_results
        })
        total_score += question_score

    return {"total_score": total_score, "results": results}



# Example teacher's and student's answers
# NOTE(review): nothing is assigned here, and `teacher_answer_text` /
# `student_answer_text` are not defined anywhere in the visible cells. The
# extraction cells produce `teacher_answer` and `student_answer_page3`
# instead — confirm which variables should be passed below.


# Evaluate the answer sheet
# This call prompts on stdin for the maximum marks of every parsed label.
test_results = evaluate_answer_sheet(teacher_answer_text, student_answer_text)

# Print the results
for question in test_results["results"]:
    print(f"\nQuestion {question['question']}:")
    for subpart in question["subparts"]:
        print(f"  Subpart {subpart['subpart']}:")
        print(f"    Student Answer: {subpart['student_answer']}")
        print(f"    Teacher Answer: {subpart['teacher_answer']}")
        print(f"    Similarity: {subpart['similarity']}")
        print(f"    Marks Awarded: {subpart['marks_awarded']}")
    print(f"  Total Marks for Question {question['question']}: {question['question_score']}")

print(f"\nTotal Score: {test_results['total_score']}")





In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from textblob import TextBlob
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load Sentence-BERT model
# Module-level global read by fact_check and evaluate_answer in this cell.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load spaCy model
# Module-level global read by extract_keyphrases and fact_check in this cell.
nlp = spacy.load("en_core_web_sm")

# Preprocess text by removing stop words
def preprocess_text(text):
    """Return *text* without English stop words and non-alphanumeric tokens.

    Tokens come from ``word_tokenize``. Stop-word matching is done on the
    lowercase form, while the surviving tokens keep their original casing
    and are joined by single spaces.
    """
    ignored = set(stopwords.words('english'))

    def is_content_word(token):
        return token.lower() not in ignored and token.isalnum()

    return " ".join(filter(is_content_word, word_tokenize(text)))

# Extract keyphrases from the teacher's answer
def extract_keyphrases(text):
    """Return the noun chunks of *text* that are not plain stop words.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A noun
    chunk is kept unless its whole lowercase text is an English stop word.

    Returns:
        List of chunk texts in document order.
    """
    # Build the stop-word set once per call instead of reloading the list
    # and scanning it linearly for every chunk.
    stop_words = set(stopwords.words('english'))
    return [
        chunk.text
        for chunk in nlp(text).noun_chunks
        if chunk.text.lower() not in stop_words
    ]

# Fact-checking function
def fact_check(student_answer, teacher_answer):
    """Check that the student's answer covers the teacher's key phrases.

    Key phrases come from ``extract_keyphrases(teacher_answer)``. A phrase
    counts as covered when it occurs in the student's answer as a whole
    word or phrase (case-insensitive), or, failing that, when the cosine
    similarity between the Sentence-BERT embeddings of the full student
    answer and of the phrase is at least 0.6.

    Args:
        student_answer: The student's answer text.
        teacher_answer: The reference answer text.

    Returns:
        ``False`` as soon as one key phrase is not covered, ``True``
        otherwise (including when there are no key phrases).
    """
    student_lower = student_answer.lower()
    student_emb = None  # encoded lazily and at most once

    for phrase in extract_keyphrases(teacher_answer):
        # Literal containment on word boundaries. Comparing the phrase with
        # individual tokens could never match a multi-word phrase.
        literal = re.search(
            r'(?<!\w)' + re.escape(phrase.lower()) + r'(?!\w)', student_lower
        )
        if literal:
            continue

        # No literal match: fall back to semantic similarity.
        if student_emb is None:
            student_emb = sbert_model.encode(student_answer, convert_to_tensor=True)
        phrase_emb = sbert_model.encode(phrase, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(student_emb, phrase_emb).item()
        if similarity < 0.6:  # Threshold for semantic similarity
            return False  # Factually incorrect
    return True  # Factually correct

# Function to evaluate a single answer
def evaluate_answer(student_answer, teacher_answer, question_type):
    """Score an answer as 0, 0.5 or 1 after a fact check.

    NOTE: a second ``evaluate_answer`` is defined further down in this cell
    and replaces this one once the cell has run.

    Both texts are preprocessed with ``preprocess_text``. If ``fact_check``
    rejects the raw student answer the score is 0.0. Otherwise the
    preprocessed texts are compared with TF-IDF (``"objective"``) or
    Sentence-BERT (any other ``question_type``) and the similarity is
    banded: up to and including 0.5 gives 0.0, strictly between 0.5 and 0.8
    gives 0.5, anything else gives 1.0.
    """
    student_clean = preprocess_text(student_answer)
    teacher_clean = preprocess_text(teacher_answer)

    # Factually incorrect answers get no marks, whatever their similarity.
    if not fact_check(student_answer, teacher_answer):
        return 0.0

    if question_type == "objective":
        semantic_score = compute_tf_idf_similarity(student_clean, teacher_clean)
    else:
        student_vec = sbert_model.encode(student_clean, convert_to_tensor=True)
        teacher_vec = sbert_model.encode(teacher_clean, convert_to_tensor=True)
        semantic_score = util.pytorch_cos_sim(student_vec, teacher_vec).item()

    # Band the raw similarity into none / half / full marks.
    if semantic_score <= 0.5:
        band = 0.0
    elif semantic_score < 0.8:
        band = 0.5
    else:
        band = 1.0
    return round(band, 2)

# Function to compute TF-IDF similarity
def compute_tf_idf_similarity(answer, correct_answer):
    """Cosine similarity of two texts in a TF-IDF space fitted on the pair.

    Args:
        answer: The answer being checked.
        correct_answer: The reference answer.

    Returns:
        The top-left entry of the similarity matrix returned by
        ``cosine_similarity`` for the two document vectors.
    """
    documents = [answer, correct_answer]
    weights = TfidfVectorizer().fit_transform(documents)
    answer_row = weights[0]
    reference_row = weights[1]
    return cosine_similarity(answer_row, reference_row)[0][0]


# Function to evaluate a single answer
def evaluate_answer(student_answer, teacher_answer, question_type):
    """Score an answer as 0.0, 0.5 or 1.0 from its similarity to the reference.

    Both texts are preprocessed with ``preprocess_text``. ``"objective"``
    questions are compared with ``compute_tf_idf_similarity``; any other
    ``question_type`` uses cosine similarity of Sentence-BERT embeddings.

    Args:
        student_answer: The student's answer text.
        teacher_answer: The reference answer text.
        question_type: ``"objective"`` selects TF-IDF; anything else selects
            Sentence-BERT.

    Returns:
        ``1.0`` for a similarity of at least 0.8, ``0.5`` for a similarity
        from 0.5 up to (not including) 0.8, and ``0.0`` otherwise —
        including when either text is empty after preprocessing.
    """
    student_answer_processed = preprocess_text(student_answer)
    teacher_answer_processed = preprocess_text(teacher_answer)

    # A blank, missing or stop-word-only answer leaves nothing to compare:
    # award nothing rather than scoring an empty string. This also avoids
    # fitting TF-IDF on two empty documents.
    if not student_answer_processed or not teacher_answer_processed:
        return 0.0

    # Compute semantic similarity
    if question_type == "objective":
        semantic_score = compute_tf_idf_similarity(student_answer_processed, teacher_answer_processed)
    else:
        student_emb = sbert_model.encode(student_answer_processed, convert_to_tensor=True)
        teacher_emb = sbert_model.encode(teacher_answer_processed, convert_to_tensor=True)
        semantic_score = util.pytorch_cos_sim(student_emb, teacher_emb).item()

    # Apply the scoring logic
    if semantic_score >= 0.8:  # Full marks for highly similar answers
        score = 1.0
    elif 0.5 <= semantic_score < 0.8:  # Half marks for moderately similar answers
        score = 0.5
    else:  # 0 marks for low similarity
        score = 0.0

    return round(score, 1)  # Round to one decimal place

# Function to evaluate the entire answer sheet
def evaluate_answer_sheet(teacher_answer_text, student_answer_text):
    """Grade a student's answer sheet, reporting marks to one decimal place.

    NOTE: a later ``evaluate_answer_sheet`` in this cell replaces this one
    once the cell has run.

    Both texts are normalised with ``organize_text`` and split into
    ``{label: answer}`` dicts with ``parse_answers``; the parsed answers are
    printed. For every label in the teacher's answers the maximum marks are
    read interactively via ``input()``, the student's answer (``""`` when
    the label is missing) is scored with ``evaluate_answer`` as a
    ``"descriptive"`` question, and the marks awarded are
    ``similarity * maximum marks`` rounded to one decimal. The per-label
    results are then grouped under their main part (``1(a)`` for ``1(a)``,
    ``1(a)(ii)``, ...).

    Args:
        teacher_answer_text: Raw text of the marking scheme.
        student_answer_text: Raw text of the student's answers.

    Returns:
        ``{"total_score": float, "results": list}`` with the total and each
        ``"question_score"`` rounded to one decimal. Each result holds
        ``"question"``, ``"question_score"`` and ``"subparts"``.

    Raises:
        ValueError: If a value typed at a marks prompt is not a number.
    """
    teacher_answer_text = organize_text(teacher_answer_text)
    student_answer_text = organize_text(student_answer_text)

    teacher_answers = parse_answers(teacher_answer_text)
    student_answers = parse_answers(student_answer_text)

    print("Parsed Teacher Answers:")
    for key, value in teacher_answers.items():
        print(f"{key}: {value}")

    print("\nParsed Student Answers:")
    for key, value in student_answers.items():
        print(f"{key}: {value}")

    # Grade every labelled answer on its own.
    subpart_results = {}
    for subpart_key, teacher_answer in teacher_answers.items():
        max_marks = float(input(f"Enter the maximum marks for {subpart_key}: "))
        student_answer = student_answers.get(subpart_key, "")
        similarity = evaluate_answer(student_answer, teacher_answer, "descriptive")
        subpart_results[subpart_key] = {
            "student_answer": student_answer,
            "teacher_answer": teacher_answer,
            "similarity": similarity,
            "marks_awarded": round(similarity * max_marks, 1)  # Round to one decimal place
        }

    # Main parts that have an answer of their own come first, in sheet order.
    main_parts = {
        key: [] for key in teacher_answers if re.match(r'\d+\([a-z]\)$', key)
    }
    # Attach every label to its main part. setdefault creates the main part
    # when only subparts such as 1(b)(ii) exist, so graded answers are never
    # left out of the results and the total.
    for key in teacher_answers:
        label = re.match(r'(\d+\([a-z]\))(?:\([i-v]+\))?$', key)
        if label:
            main_parts.setdefault(label.group(1), []).append(key)

    total_score = 0.0
    results = []
    for main_part_key, subpart_keys in main_parts.items():
        question_results = [
            {"subpart": subpart_key, **subpart_results[subpart_key]}
            for subpart_key in subpart_keys
        ]
        question_score = 0.0
        for entry in question_results:
            question_score += entry["marks_awarded"]

        results.append({
            "question": main_part_key,
            "question_score": round(question_score, 1),  # Round to one decimal place
            "subparts": question_results
        })
        # The unrounded sum is accumulated; rounding happens once at the end.
        total_score += question_score

    return {"total_score": round(total_score, 1), "results": results}  # Round total score to one decimal place

# Function to clean and organize the text
import google.generativeai as genai
from IPython.display import Markdown

def organize_text_with_gemini(text, api_key):
    """
    Ask Google's Gemini to restructure raw answer-sheet text.

    The ``google.generativeai`` client is configured with ``api_key``, the
    text is embedded in a fixed instruction prompt and sent to the
    ``gemini-1.5-flash`` model, and the ``.text`` of every chunk of the
    response is concatenated.

    Args:
        text (str): The unorganized extracted text from answer sheets
        api_key (str): Your Google AI Studio API key

    Returns:
        str: The concatenated text of the model's response
    """
    genai.configure(api_key=api_key)

    prompt = f"""
    You are an expert in structuring unorganized answer sheets for grading. 
    The following text was extracted from an answer sheet but is poorly organized:
    
    {text}
    
    Please:
    1. Identify all questions and sub-questions (like 1(a), 1(a)(i), etc.)
    2. Structure them in a clear, readable format
    3. Maintain proper numbering and indentation
    4. Preserve all original answer content
    5. Output only the structured text with no additional commentary
    
    Structured version:
    """

    gemini = genai.GenerativeModel(model_name='gemini-1.5-flash')
    reply = gemini.generate_content(prompt)

    # Stitch the reply together chunk by chunk.
    return "".join(piece.text for piece in reply)

# Example usage:
# NOTE: this runs at module level, so executing the cell sends a request to
# Gemini with the placeholder key below unless it is replaced first.
API_KEY = "your_api_key_here"  # Replace with your actual API key
unorganized_text = "1a i AnswerA 1a ii AnswerB 1b AnswerC..."

structured_result = organize_text_with_gemini(unorganized_text, API_KEY)
# Render the structured text as Markdown in the notebook output.
display(Markdown(f"**Structured Answer Sheet:**\n\n{structured_result}"))

def simplify_subpart_key(key):
    """Map a ``<digits>(<letter>)(i)`` key to its ``<digits>(<letter>)`` parent.

    Keys without a literal ``(i)`` subpart, e.g. ``1(a)(ii)`` or ``1(a)``,
    are returned unchanged.
    """
    hit = re.compile(r'(\d+\([a-z]\))(\(i\))').search(key)
    if hit is None:
        return key
    return hit.group(1)

# Function to parse the answer text into questions and subparts
def parse_answers(answer_text):
    """Split labelled answer text into a ``{label: answer}`` dict.

    A label is ``<digits>(<letter>)`` with an optional ``(<roman>)`` suffix,
    e.g. ``1(a)`` or ``1(a)(ii)``, optionally followed by ``.`` or ``-``.
    An answer runs up to the next label that is preceded by whitespace, or
    to the end of the text. A preview of every parsed answer is printed.

    Args:
        answer_text: Text containing labelled answers, separated by
            newlines or by spaces.

    Returns:
        Dict mapping each label to its stripped answer text; a repeated
        label keeps its last answer.
    """
    # The lookahead accepts any whitespace before the next label, not only
    # a newline: organize_text collapses newlines to spaces, and with a
    # newline-only boundary the first label would swallow the whole sheet.
    pattern = re.compile(
        r'(\d+\([a-z]\)(?:\([i-v]+\))?)\s*[\.\-]?\s*(.+?)(?=\s\d+\([a-z]\)|$)',
        re.DOTALL,
    )

    parsed_answers = {
        key.strip(): content.strip()
        for key, content in pattern.findall(answer_text)
    }

    print("\n🔍 Parsed Answers:")
    for key, value in parsed_answers.items():
        print(f"{key}: {value[:50]}...")  # Print first 50 chars of each answer

    return parsed_answers


# Function to evaluate the entire answer sheet
def evaluate_answer_sheet(teacher_answer_text, student_answer_text):
    """Grade a student's answer sheet against the teacher's, part by part.

    Both texts are normalised with ``organize_text`` and split into
    ``{label: answer}`` dicts with ``parse_answers``; the parsed answers are
    printed. For every label in the teacher's answers the maximum marks are
    read interactively via ``input()``, the student's answer (``""`` when
    the label is missing) is scored with ``evaluate_answer`` as a
    ``"descriptive"`` question, and the marks awarded are
    ``similarity * maximum marks`` rounded to two decimals. The per-label
    results are then grouped under their main part (``1(a)`` for ``1(a)``,
    ``1(a)(i)``, ``1(a)(ii)``, ...).

    Args:
        teacher_answer_text: Raw text of the marking scheme.
        student_answer_text: Raw text of the student's answers.

    Returns:
        ``{"total_score": float, "results": list}``. Each result is a dict
        with ``"question"`` (main-part label), ``"question_score"`` and
        ``"subparts"`` (one dict per label with ``"subpart"``,
        ``"student_answer"``, ``"teacher_answer"``, ``"similarity"`` and
        ``"marks_awarded"``).

    Raises:
        ValueError: If a value typed at a marks prompt is not a number.
    """
    teacher_answer_text = organize_text(teacher_answer_text)
    student_answer_text = organize_text(student_answer_text)

    teacher_answers = parse_answers(teacher_answer_text)
    student_answers = parse_answers(student_answer_text)

    print("Parsed Teacher Answers:")
    for key, value in teacher_answers.items():
        print(f"{key}: {value}")

    print("\nParsed Student Answers:")
    for key, value in student_answers.items():
        print(f"{key}: {value}")

    # Grade every labelled answer on its own.
    subpart_results = {}
    for subpart_key, teacher_answer in teacher_answers.items():
        max_marks = float(input(f"Enter the maximum marks for {subpart_key}: "))
        student_answer = student_answers.get(subpart_key, "")
        similarity = evaluate_answer(student_answer, teacher_answer, "descriptive")
        subpart_results[subpart_key] = {
            "student_answer": student_answer,
            "teacher_answer": teacher_answer,
            "similarity": similarity,
            "marks_awarded": round(similarity * max_marks, 2)
        }

    # Main parts that have an answer of their own come first, in sheet order.
    main_parts = {
        key: [] for key in teacher_answers if re.match(r'\d+\([a-z]\)$', key)
    }
    # Attach every label to its main part. setdefault creates the main part
    # when only subparts exist (e.g. 1(a)(i) and 1(a)(ii) without a bare
    # 1(a)), so graded answers are never left out of the results and total.
    for key in teacher_answers:
        label = re.match(r'(\d+\([a-z]\))(?:\([i-v]+\))?$', key)
        if label:
            main_parts.setdefault(label.group(1), []).append(key)

    total_score = 0.0
    results = []
    for main_part_key, subpart_keys in main_parts.items():
        question_results = [
            {"subpart": subpart_key, **subpart_results[subpart_key]}
            for subpart_key in subpart_keys
        ]
        question_score = 0.0
        for entry in question_results:
            question_score += entry["marks_awarded"]

        results.append({
            "question": main_part_key,
            "question_score": question_score,
            "subparts": question_results
        })
        total_score += question_score

    return {"total_score": total_score, "results": results}

# Main execution
# Reads both answer texts from stdin, grades them and prints a per-question
# report followed by the overall total.
if __name__ == "__main__":
    # Each input() call reads a single line, so a whole answer sheet has to
    # be entered as one line of text.
    teacher_answer = input("Enter the teacher's paragraph: ")
    student_answer = input("Enter the student's paragraph: ")

    # Disabled: optional sentiment comparison of the two texts.
    # # Perform sentiment analysis
    # sentiment_result = sentiment_analysis(teacher_answer, student_answer)
    # print("Sentiment Analysis Result:")
    # print(sentiment_result)

    # Evaluate the answer sheet
    # This call prompts again on stdin for the maximum marks of every label.
    test_results = evaluate_answer_sheet(teacher_answer, student_answer)
    print("\nAnswer Sheet Evaluation Results:")
    for question in test_results["results"]:
        print(f"\nQuestion {question['question']}:")
        for subpart in question["subparts"]:
            print(f"  Subpart {subpart['subpart']}:")
            print(f"    Student Answer: {subpart['student_answer']}")
            print(f"    Teacher Answer: {subpart['teacher_answer']}")
            print(f"    Similarity: {subpart['similarity']}")
            print(f"    Marks Awarded: {subpart['marks_awarded']}")
        print(f"  Total Marks for Question {question['question']}: {question['question_score']}")

    print(f"\nTotal Score: {test_results['total_score']}")