# Marksheet Forgery Detection Backend

This notebook serves as the backend logic to detect tampering in marksheet documents. 
It implements 3 layers of forgery detection:
1. **Error Level Analysis (ELA)**: Detects digital modifications by analyzing compression artifacts.
2. **Metadata Analysis**: Checks for editing software signatures in file metadata.
3. **Logical Discrepancy Check (OCR)**: Extracts the marks via OCR and verifies that the reported total matches the sum of the individual subject marks.

Finally, it generates a comprehensive 1-page report.

In [None]:
import io
import os
import re
from datetime import datetime

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image, ImageChops, ImageEnhance, ExifTags

# Ensure dependencies are present
# !pip install opencv-python pytesseract pillow pandas matplotlib
# Note: Tesseract OCR engine must be installed on the system (e.g., sudo apt install tesseract-ocr)

In [None]:
class ForgeryDetector:
    """Run three tamper checks on a marksheet image and render a summary report.

    Each check stores its outcome in ``report_data["checks"]`` under its own
    key ("ELA", "Metadata", "Logic") as a dict with at least ``status`` and
    ``details``. ``generate_report`` renders whatever is stored there.
    """

    # Case-insensitive substrings that mark an EXIF value as editor-written.
    SUSPICIOUS_SOFTWARE = ('Photoshop', 'GIMP', 'Editor', 'Adobe')

    # ELA standard deviation above which the image is flagged (tunable).
    ELA_STD_THRESHOLD = 15

    def __init__(self, file_path):
        """Load the image at *file_path* and prepare an empty report.

        Args:
            file_path: Path of the marksheet image to analyse.
        """
        self.file_path = file_path

        # EXIF has to be read from the freshly opened file: convert() returns
        # a plain Image object that no longer carries the format plugin's
        # private _getexif() helper.
        with Image.open(file_path) as source:
            self.exif_fields = self._read_exif(source)
            self.image = source.convert('RGB')

        # cv2.imread returns None (instead of raising) when it cannot decode
        # the file; fall back to the pixels Pillow already loaded so the OCR
        # check does not crash later.
        self.cv_image = cv2.imread(file_path)
        if self.cv_image is None:
            self.cv_image = cv2.cvtColor(np.array(self.image), cv2.COLOR_RGB2BGR)

        # Filled by perform_ela() so the report can reuse the visualisation.
        self._ela_image = None

        self.report_data = {
            "filename": os.path.basename(file_path),
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "checks": {}
        }

    @staticmethod
    def _read_exif(source):
        """Return the EXIF fields of *source* keyed by tag name (raw id if unknown)."""
        exif = source.getexif()
        raw = dict(exif)
        # Older Pillow releases have no get_ifd(); the top-level tags
        # (including "Software") are still available in that case.
        get_ifd = getattr(exif, "get_ifd", None)
        if get_ifd is not None:
            raw.update(get_ifd(0x8769))  # 0x8769 = Exif sub-IFD pointer
        return {ExifTags.TAGS.get(tag, tag): value for tag, value in raw.items()}

    def perform_ela(self, quality=90):
        """
        Check 1: Error Level Analysis (ELA)
        Re-encodes the image as JPEG and measures how much it differs from
        the original; the brightness-scaled difference image is returned.

        Args:
            quality: JPEG quality used for the re-encode.

        Returns:
            The enhanced ELA image (also cached for generate_report()).

        Side effects:
            Writes the "ELA" entry of ``report_data["checks"]``.
        """
        # Round-trip through memory rather than a fixed file name on disk, so
        # parallel runs cannot overwrite each other and nothing is left behind
        # if an exception interrupts the check.
        buffer = io.BytesIO()
        self.image.save(buffer, 'JPEG', quality=quality)
        buffer.seek(0)
        with Image.open(buffer) as resaved:
            resaved_rgb = resaved.convert('RGB')

        # Calculate difference
        ela_image = ImageChops.difference(self.image, resaved_rgb)

        # Enhance brightness so the largest per-band difference maps to 255
        max_diff = max(high for _, high in ela_image.getextrema())
        scale = 255.0 / max_diff if max_diff > 0 else 1
        ela_image = ImageEnhance.Brightness(ela_image).enhance(scale)

        # "Tamper score": spread of the enhanced ELA pixel values
        std_dev = float(np.std(np.array(ela_image)))
        is_suspect = std_dev > self.ELA_STD_THRESHOLD

        self.report_data["checks"]["ELA"] = {
            "status": "Fail" if is_suspect else "Pass",
            "details": f"ELA Standard Deviation: {std_dev:.2f}. High variance suggests pasted elements.",
            "score": std_dev
        }

        self._ela_image = ela_image
        return ela_image

    def check_metadata(self):
        """
        Check 2: Metadata Analysis
        Looks for editing-software names in the EXIF fields read at load time.

        Side effects:
            Writes the "Metadata" entry of ``report_data["checks"]``.
        """
        found_tags = []
        for name, value in self.exif_fields.items():
            val_str = str(value)
            lowered = val_str.lower()
            # any() reports each field once even when several markers match
            # (e.g. "Adobe Photoshop" contains both "Adobe" and "Photoshop").
            if any(marker.lower() in lowered for marker in self.SUSPICIOUS_SOFTWARE):
                found_tags.append(f"{name}: {val_str}")

        is_tampered = bool(found_tags)
        self.report_data["checks"]["Metadata"] = {
            "status": "Fail" if is_tampered else "Pass",
            "details": f"Found editing software traces: {', '.join(found_tags)}" if is_tampered else "No editing software traces found."
        }

    def verify_logical_consistency(self):
        """
        Check 3: Logical Discrepancy (OCR)
        OCRs the image, collects every 2-3 digit number and treats the
        largest one as the candidate total.

        This is a heuristic: it does not know which numbers are subject marks,
        so "Pass" also covers the "not enough data" and "inconclusive" cases.

        Side effects:
            Writes the "Logic" entry of ``report_data["checks"]``.
        """
        # Preprocessing for OCR: grayscale + fixed binary threshold
        gray = cv2.cvtColor(self.cv_image, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

        text = pytesseract.image_to_string(thresh)

        # A more advanced backend would use layout analysis to identify the
        # marks column instead of gathering every number on the page.
        numbers = sorted(int(s) for s in re.findall(r'\b\d{2,3}\b', text))

        suspect = False
        details = "Could not extract sufficient data for logic check."

        if len(numbers) > 3:
            # Assumption: the largest number is the total.
            potential_total = numbers[-1]
            others = numbers[:-1]

            if sum(others) == potential_total:
                details = f"Logic Verified: Sum of marks equals {potential_total}."
            elif potential_total > 500 and len(others) < 5:
                # Plausibility check, e.g. 5 subjects of 100 marks each
                suspect = True
                details = f"Suspicious Total: {potential_total} does not seem to match subject count."
            else:
                details = "Logic check inconclusive (layout complex). Visually verify."

        self.report_data["checks"]["Logic"] = {
            "status": "Warn" if suspect else "Pass",
            "details": details
        }

    def generate_report(self, output_path="forgery_report.png"):
        """
        Generates a 1-page summary report image.

        Args:
            output_path: Where the PNG is written. Defaults to
                "forgery_report.png" in the working directory.

        Returns:
            The path the report was saved to.
        """
        # Make sure ELA has run *before* the verdict is computed, and reuse
        # the cached visualisation instead of recomputing it.
        ela_img = getattr(self, "_ela_image", None)
        if ela_img is None:
            ela_img = self.perform_ela()

        checks = self.report_data['checks']
        status_colors = {"Fail": "red", "Warn": "orange"}

        fig, ax = plt.subplots(figsize=(8.5, 11))  # US Letter, in inches
        try:
            ax.axis('off')

            # Header
            y = 1.0
            ax.text(0.5, y, "FORGERY DETECTION REPORT", ha='center', fontsize=20, weight='bold')
            y -= 0.05
            ax.text(0.5, y, f"File: {self.report_data['filename']} | Date: {self.report_data['timestamp']}", ha='center', fontsize=10)
            y -= 0.1

            # Overall status: a failure outranks a warning, and a warning must
            # not be presented as a clean bill of health.
            statuses = {info['status'] for info in checks.values()}
            if 'Fail' in statuses:
                overall, color = "TAMPERED", "red"
            elif 'Warn' in statuses:
                overall, color = "SUSPICIOUS", "orange"
            else:
                overall, color = "AUTHENTIC", "green"

            ax.text(0.5, y, f"STATUS: {overall}", ha='center', fontsize=24, color=color, weight='bold')
            y -= 0.1

            # Detailed Checks
            ax.text(0.1, y, "Detailed Analysis Results:", fontsize=14, weight='bold')
            y -= 0.05

            for check_name, info in checks.items():
                status_color = status_colors.get(info['status'], "green")
                ax.text(0.1, y, f"{check_name}:", fontsize=12, weight='bold')
                ax.text(0.3, y, f"{info['status']}", fontsize=12, color=status_color, weight='bold')
                y -= 0.03
                ax.text(0.1, y, f"  Details: {info['details']}", fontsize=10, style='italic', wrap=True)
                y -= 0.08

            # ELA visualisation thumbnail in the lower part of the page
            newax = fig.add_axes([0.3, 0.1, 0.4, 0.3])
            newax.imshow(ela_img)
            newax.axis('off')
            newax.set_title("ELA Visualization (White/colored noise indicates editing)")

            fig.savefig(output_path, dpi=100)
        finally:
            # pyplot keeps every figure alive until closed; release it so
            # repeated analyses do not accumulate memory.
            plt.close(fig)

        return output_path

In [None]:
# Example Usage Block
# Call analyze_marksheet() with the path to the marksheet image you want to check

def analyze_marksheet(file_path):
    """Run all forgery checks on one marksheet image and write its report.

    Args:
        file_path: Path of the image to analyse.

    Returns:
        The detector's ``report_data`` dict on success, or the string
        "Error: File not found." when *file_path* does not exist.
    """
    marksheet_exists = os.path.exists(file_path)
    if not marksheet_exists:
        return "Error: File not found."

    detector = ForgeryDetector(file_path)

    # The checks run in this order: ELA, then metadata, then the OCR logic check.
    pipeline = (
        detector.perform_ela,
        detector.check_metadata,
        detector.verify_logical_consistency,
    )
    for run_check in pipeline:
        run_check()

    # Render the report image and tell the user where it was written.
    saved_to = detector.generate_report()
    print(f"Analysis Complete. Report generated at: {saved_to}")

    return detector.report_data

# To run: 
# analyze_marksheet("path/to/your/image.png")