# Imports and Setup

## 1.1 Standard Library Imports

In [167]:
import io
import json
import os
import re
import time
import csv

## 1.2 Third-party Library Imports

In [168]:
### Image Processing
import cv2
from PIL import Image, ImageDraw

### Data Manipulation and Analysis
import numpy as np
import pandas as pd
from copy import deepcopy
from collections import defaultdict, Counter

### Text Processing
from spellchecker import SpellChecker
from Levenshtein import distance
from thefuzz import fuzz
import ast

### API and Environment
from dotenv import load_dotenv
from groq import Groq
from json_repair import repair_json

## 1.3 Google Cloud and Vertex AI Imports

In [169]:
from google.api_core.client_options import ClientOptions
from google.cloud import vision
import vertexai

# OCR and Text Processing
## OCR Functions


In [170]:
def extract_blocks(annotation):
    """Flatten a text annotation into a list of block dictionaries.

    The annotation is walked as pages -> blocks -> paragraphs -> words -> symbols.

    :param annotation: Object exposing `pages`; every block and word must expose
        `bounding_box.vertices` whose items have `x` and `y` attributes
    :return: One dict per block with keys
        'text'   - words joined with single spaces, one newline after each
                   paragraph, surrounding whitespace stripped;
        'bounds' - [(min_x, min_y), (max_x, max_y)] of the block vertices;
        'words'  - list of {'text', 'bounds'} dicts in reading order
    """
    def _corners(bounding_box):
        # Vertices may arrive in any order, so derive the corners from extremes.
        xs = [vertex.x for vertex in bounding_box.vertices]
        ys = [vertex.y for vertex in bounding_box.vertices]
        return [(min(xs), min(ys)), (max(xs), max(ys))]

    result = []
    for page in annotation.pages:
        for block in page.blocks:
            text_parts = []
            word_entries = []

            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    # A word's text is the concatenation of its symbols
                    token = ''.join(symbol.text for symbol in word.symbols)
                    word_entries.append({"text": token, "bounds": _corners(word.bounding_box)})
                    text_parts.append(token + ' ')

                # Paragraph boundary
                text_parts.append('\n')

            result.append({
                "text": ''.join(text_parts).strip(),
                "bounds": _corners(block.bounding_box),
                "words": word_entries
            })
    return result

In [171]:
def detect_text(image, project_id):
    """Run document text detection on one image and return its text blocks.

    :param image: Image object passed as `image=` to the Vision client
    :param project_id: Project ID used as the client's `quota_project_id`
    :return: Result of `extract_blocks` applied to the response's
        `full_text_annotation`
    """
    # Vision client configured with the quota project
    vision_client = vision.ImageAnnotatorClient(
        client_options=ClientOptions(quota_project_id=project_id)
    )

    # Detect text, then flatten the annotation into block dictionaries
    annotation = vision_client.document_text_detection(image=image).full_text_annotation
    return extract_blocks(annotation)

In [172]:
def batch_detect_text(images, project_id):
    """Run document text detection on several images in one batch request.

    :param images: Iterable of image objects, one annotate request is built per item
    :param project_id: Project ID used as the client's `quota_project_id`
    :return: List with one `extract_blocks` result per entry in the batch
        response's `responses`
    """
    # Vision client configured with the quota project
    vision_client = vision.ImageAnnotatorClient(
        client_options=ClientOptions(quota_project_id=project_id)
    )

    # One DOCUMENT_TEXT_DETECTION request per image
    annotate_requests = []
    for image in images:
        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
        annotate_requests.append(vision.AnnotateImageRequest(image=image, features=[feature]))

    batch_response = vision_client.batch_annotate_images(requests=annotate_requests)

    # Flatten every per-image annotation
    per_image_blocks = []
    for single_response in batch_response.responses:
        per_image_blocks.append(extract_blocks(single_response.full_text_annotation))
    return per_image_blocks

## OCR Cleanup

In [173]:
def calculate_bounds(words):
    """Return the smallest axis-aligned box that encloses every word.

    :param words: Sequence of dicts whose 'bounds' entry is
        [(left, top), (right, bottom)]
    :return: [(min_left, min_top), (max_right, max_bottom)], or
        [(0, 0), (0, 0)] when `words` is empty
    """
    # Nothing to enclose: zero-sized default box
    if not words:
        return [(0, 0), (0, 0)]

    boxes = [entry['bounds'] for entry in words]

    # Extremes over all top-left and bottom-right corners
    upper_left = (min(box[0][0] for box in boxes), min(box[0][1] for box in boxes))
    lower_right = (max(box[1][0] for box in boxes), max(box[1][1] for box in boxes))

    return [upper_left, lower_right]

In [174]:
def merge_overlapping_words(paragraphs):
    """Collapse words whose boxes overlap heavily, across all paragraphs.

    Words are visited paragraph by paragraph. A word that overlaps an already
    kept word by more than 50% of the smaller box's area is merged with it:
    the two boxes are unioned and the word with the larger original area
    survives (keeping its text and its owning paragraph).

    :param paragraphs: List of paragraph dicts; each has a 'words' list of dicts
        with a 'bounds' entry [(left, top), (right, bottom)]
    :return: New list of shallow-copied paragraphs that still own at least one
        word, each with 'words' replaced by the surviving words and 'bounds'
        recomputed with `calculate_bounds`. The input dicts are not modified.
    """
    def calculate_area(box):
        """Calculate the area of a bounding box."""
        return (box[1][0] - box[0][0]) * (box[1][1] - box[0][1])

    def calculate_overlap(box1, box2):
        """Return intersection area divided by the smaller box's area."""
        # Intersection rectangle
        x1 = max(box1[0][0], box2[0][0])
        y1 = max(box1[0][1], box2[0][1])
        x2 = min(box1[1][0], box2[1][0])
        y2 = min(box1[1][1], box2[1][1])

        # Disjoint (or merely touching) boxes
        if x2 <= x1 or y2 <= y1:
            return 0.0

        smaller_area = min(calculate_area(box1), calculate_area(box2))
        # Defensive: malformed boxes must not trigger a division by zero
        if smaller_area <= 0:
            return 0.0

        return (x2 - x1) * (y2 - y1) / smaller_area

    def merge_boxes(box1, box2):
        """Merge two bounding boxes into one."""
        return [
            (min(box1[0][0], box2[0][0]), min(box1[0][1], box2[0][1])),
            (max(box1[1][0], box2[1][0]), max(box1[1][1], box2[1][1]))
        ]

    # (word, owning paragraph) pairs that survived so far
    merged_words = []
    for paragraph in paragraphs:
        for word in paragraph['words']:
            # Position of the first kept word that overlaps significantly.
            # Tracking the index avoids equality-based list lookups later.
            match_idx = None
            for idx, (kept_word, _owner) in enumerate(merged_words):
                if calculate_overlap(word['bounds'], kept_word['bounds']) > 0.5:  # Adjust overlap threshold as needed
                    match_idx = idx
                    break

            if match_idx is None:
                # No overlap: keep the word as is
                merged_words.append((word, paragraph))
                continue

            kept_word, kept_owner = merged_words[match_idx]
            merged_bounds = merge_boxes(word['bounds'], kept_word['bounds'])

            # The word with the larger area wins and keeps its paragraph
            if calculate_area(word['bounds']) > calculate_area(kept_word['bounds']):
                del merged_words[match_idx]
                merged_words.append(({**word, 'bounds': merged_bounds}, paragraph))
            else:
                merged_words[match_idx] = ({**kept_word, 'bounds': merged_bounds}, kept_owner)

    # Rebuild paragraphs. Ownership is decided by identity, not equality:
    # two distinct paragraphs with equal content (duplicate OCR blocks) must
    # not each receive the other's words.
    new_paragraphs = []
    for paragraph in paragraphs:
        new_words = [word for word, owner in merged_words if owner is paragraph]
        if new_words:
            new_paragraph = paragraph.copy()
            new_paragraph['words'] = new_words
            new_paragraph['bounds'] = calculate_bounds(new_words)
            new_paragraphs.append(new_paragraph)

    return new_paragraphs

In [175]:
def merge_overlapping_paragraphs(paragraphs, word_tolerance=1, paragraph_overlap_threshold=0.5):
    """Merge fragmented words inside paragraphs, then merge overlapping paragraphs.

    Steps:
      1. Within each paragraph, glue a word onto an earlier word when the two
         touch horizontally (within `word_tolerance`) on the same line, unless
         both are dictionary words on their own.
      2. Merge paragraphs whose boxes overlap by more than
         `paragraph_overlap_threshold` of the smaller box's area.
      3. Run `merge_overlapping_words` on the result and recompute bounds.

    :param paragraphs: List of dicts with 'text', 'bounds' and 'words' keys
    :param word_tolerance: Maximum horizontal gap (pixels) between words to glue
    :param paragraph_overlap_threshold: Overlap ratio above which paragraphs merge
    :return: List of merged paragraph dicts
    """
    # The spell checker is created at most once per call, and only if a lookup
    # is actually needed; each distinct word is looked up only once.
    spell_checker = None
    validity_cache = {}

    def contains_special_characters(word):
        """Check if a word contains any non-alphabetic characters."""
        return bool(re.search(r'[^a-zA-Z]', word))

    def is_valid_word(word):
        """Check if a word is purely alphabetic and known to the spell checker."""
        nonlocal spell_checker
        if contains_special_characters(word):
            return False

        if word not in validity_cache:
            if spell_checker is None:
                spell_checker = SpellChecker()
            validity_cache[word] = word in spell_checker
        return validity_cache[word]

    def overlap_horizontally(box1, box2):
        """Check if two bounding boxes overlap horizontally within the word tolerance."""
        return (box1[1][0] + word_tolerance >= box2[0][0] and box2[1][0] + word_tolerance >= box1[0][0])

    def on_same_line(box1, box2, line_height_tolerance=8):
        """Check if two boxes' top edges are within the line height tolerance."""
        return abs(box1[0][1] - box2[0][1]) <= line_height_tolerance

    def merge_boxes(box1, box2):
        """Merge two bounding boxes into one."""
        return [
            (min(box1[0][0], box2[0][0]), min(box1[0][1], box2[0][1])),
            (max(box1[1][0], box2[1][0]), max(box1[1][1], box2[1][1]))
        ]

    def calculate_overlap(box1, box2):
        """Return intersection area divided by the smaller box's area (0 if degenerate)."""
        x_overlap = max(0, min(box1[1][0], box2[1][0]) - max(box1[0][0], box2[0][0]))
        y_overlap = max(0, min(box1[1][1], box2[1][1]) - max(box1[0][1], box2[0][1]))
        overlap_area = x_overlap * y_overlap

        area1 = (box1[1][0] - box1[0][0]) * (box1[1][1] - box1[0][1])
        area2 = (box2[1][0] - box2[0][0]) * (box2[1][1] - box2[0][1])

        if min(area1, area2) == 0:
            return 0

        return overlap_area / min(area1, area2)

    def merge_overlapping_words_in_paragraph(paragraph):
        """Glue adjacent word fragments within a single paragraph."""
        words = paragraph['words']
        if not words:
            return paragraph

        merged_words = [words[0]]

        for current_word in words[1:]:
            # Depends only on the current word, so compute it once per word
            cleaned_current_word = re.sub(r'[^a-zA-Z0-9-]', '', current_word['text'])

            merge_found = False
            for i, merged_word in enumerate(merged_words):
                cleaned_merged_word = re.sub(r'[^a-zA-Z0-9-]', '', merged_word['text'])

                # Two words that are each valid on their own are never glued
                if cleaned_current_word != '' and cleaned_merged_word != '':
                    if is_valid_word(cleaned_current_word) and is_valid_word(cleaned_merged_word):
                        continue

                if overlap_horizontally(merged_word['bounds'], current_word['bounds']) and on_same_line(merged_word['bounds'], current_word['bounds']):
                    if current_word['bounds'][0][0] - merged_word['bounds'][1][0] <= word_tolerance:
                        merged_text = merged_word['text'].strip() + current_word['text'].strip()
                        merged_text = re.sub(r'\s+', '', merged_text)  # Remove any remaining spaces
                        merged_words[i] = {
                            'text': merged_text,
                            'bounds': merge_boxes(merged_word['bounds'], current_word['bounds'])
                        }
                        merge_found = True
                        break

            if not merge_found:
                merged_words.append(current_word)

        return {
            'text': paragraph['text'],
            'bounds': paragraph['bounds'],
            'words': merged_words
        }

    # First, merge overlapping words within each paragraph
    merged_word_paragraphs = [merge_overlapping_words_in_paragraph(p) for p in paragraphs]

    # Now, merge overlapping paragraphs
    merged_paragraphs = []
    for paragraph in merged_word_paragraphs:
        merge_found = False
        for i, merged_paragraph in enumerate(merged_paragraphs):
            overlap = calculate_overlap(merged_paragraph['bounds'], paragraph['bounds'])
            if overlap > paragraph_overlap_threshold:
                merged_paragraphs[i] = {
                    'text': merged_paragraph['text'] + ' ' + paragraph['text'],
                    'bounds': merge_boxes(merged_paragraph['bounds'], paragraph['bounds']),
                    'words': merged_paragraph['words'] + paragraph['words']
                }
                merge_found = True
                break

        if not merge_found:
            merged_paragraphs.append(paragraph)

    # Merge overlapping words again after paragraph merging
    merged_paragraphs = merge_overlapping_words(merged_paragraphs)

    # Calculate new bounds for paragraphs
    for paragraph in merged_paragraphs:
        paragraph['bounds'] = calculate_bounds(paragraph['words'])

    return merged_paragraphs

In [176]:
def draw_bounding_boxes(image_path, output_path, paragraphs):
    """Draw paragraph and word boxes on an image and save the result.

    :param image_path: Path of the image to annotate
    :param output_path: Path the annotated image is written to
    :param paragraphs: List of dicts with 'bounds' and 'words'; every 'bounds'
        is [(left, top), (right, bottom)]
    :return: The annotated image array
    :raises ValueError: If the image cannot be read or the output cannot be saved
    """
    # Read the input image (cv2.imread signals failure by returning None)
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Image {image_path} not found.")

    for paragraph in paragraphs:
        # Draw green rectangle for paragraph bounds
        top_left, bottom_right = paragraph['bounds']
        cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

        for word in paragraph['words']:
            # Draw blue rectangle for word bounds
            top_left, bottom_right = word['bounds']
            cv2.rectangle(img, top_left, bottom_right, (255, 0, 0), 1)

    try:
        # Save the image with bounding boxes
        saved = cv2.imwrite(output_path, img)
    except Exception as e:
        raise ValueError(f"Error saving image with bounding boxes: {e}") from e

    # cv2.imwrite can report failure through its return value without raising,
    # so the boolean result must be checked as well.
    if not saved:
        raise ValueError(f"Error saving image with bounding boxes: could not write {output_path}")

    return img

## Text Removal

In [177]:
def color_distance(c1, c2, rgb_margin):
    """Return True when every paired channel of two colors differs by at most `rgb_margin`.

    Channels are converted with `int()` before subtracting. Despite the name,
    the result is a boolean, not a distance.
    """
    for channel_a, channel_b in zip(c1, c2):
        if not (abs(int(channel_a) - int(channel_b)) <= rgb_margin):
            return False
    return True

def remove_text_from_image(image_path, paragraphs, output_path, x_padding=2, y_padding=2, rgb_margin=10):
    """Paint over every word box with the dominant color found just outside it.

    For each word the box is widened by the padding, the pixels bordering the
    padded box are sampled, similar colors (per `color_distance` and
    `rgb_margin`) are counted together, and the box is filled with the most
    frequent one. Black is used when no border pixel lies inside the image.

    :param image_path: Path of the source image
    :param paragraphs: List of dicts with 'bounds' and 'words'; every word has
        'bounds' [(left, top), (right, bottom)]
    :param output_path: Path the cleaned image is saved to
    :param x_padding: Pixels added to the left and right of every word box
    :param y_padding: Pixels added above and below every word box
    :param rgb_margin: Per-channel tolerance when grouping border colors
    """
    # Sort paragraphs top-to-bottom, then left-to-right. A tuple key keeps this
    # ordering correct for any image width (a combined `y * 1000 + x` key
    # misorders paragraphs once x reaches 1000).
    paragraphs = sorted(paragraphs, key=lambda p: (p['bounds'][0][1], p['bounds'][0][0]))

    # Open the image
    with Image.open(image_path) as img:
        # Store original mode
        original_mode = img.mode

        # Convert image to RGB if it's not already
        if original_mode != 'RGB':
            img = img.convert('RGB')

        # Convert image to numpy array for faster processing
        img_array = np.array(img)

        # Iterate through paragraphs and words
        for paragraph in paragraphs:
            for word in paragraph['words']:
                # Get the bounding box coordinates
                top_left, bottom_right = word['bounds']
                x1, y1 = top_left
                x2, y2 = bottom_right

                # Widen the box by padding, clamped to the image
                x1 = max(0, x1 - x_padding)
                y1 = max(0, y1 - y_padding)
                x2 = min(img.width, x2 + x_padding)
                y2 = min(img.height, y2 + y_padding)

                # Get points right outside the bounding box:
                # a row above and below, then a column left and right
                points = [
                    (x, y) for x in range(x1-1, x2+2) for y in [y1-1, y2+1]
                ] + [
                    (x, y) for y in range(y1, y2+1) for x in [x1-1, x2+1]
                ]

                # Collect colors from these points, skipping those off-image
                colors = [tuple(img_array[y, x]) for x, y in points if 0 <= y < img.height and 0 <= x < img.width]

                # Find the most common color considering the RGB margin:
                # a color is counted under the first earlier color it resembles
                color_counts = Counter()
                for color in colors:
                    for existing_color in color_counts:
                        if color_distance(color, existing_color, rgb_margin):
                            color_counts[existing_color] += 1
                            break
                    else:
                        color_counts[color] = 1

                counts_most_common = color_counts.most_common(1)
                if len(counts_most_common) == 0:
                    most_common_color = (0, 0, 0)
                else:
                    most_common_color = counts_most_common[0][0]

                # Fill the bounding box with the most common color
                img_array[y1:y2, x1:x2] = most_common_color

        # Create a new image from the modified array
        result = Image.fromarray(img_array)

        # Convert back to original mode if necessary
        if original_mode != 'RGB':
            result = result.convert(original_mode)

        # Save the result
        result.save(output_path)

# Image Sectioning
## Vertical Line Detection

In [178]:
def merge_vertical_lines(lines, y_merge_threshold=10, x_merge_threshold=5, overlap_only=False):
    """
    Greedily fuse vertical lines that lie close to one another.

    Lines are processed in order of their first x-coordinate. Each seed line
    repeatedly absorbs the first remaining line whose x is within
    `x_merge_threshold` of the running x and which overlaps (or, unless
    `overlap_only`, nearly touches) it vertically. On absorption the running x
    becomes the mean of the two x values and the y-range is extended.

    :param lines: List of lines, each represented as [x1, y1, x2, y2]
    :param y_merge_threshold: Maximum vertical distance to consider merging non-overlapping lines
    :param x_merge_threshold: Maximum horizontal distance to consider lines for merging
    :param overlap_only: If True, only merge lines that vertically overlap
    :return: Numpy array of merged lines as [x, y_min, x, y_max] rows;
             an empty list when `lines` is None or empty
    """
    if lines is None or len(lines) == 0:
        return []

    # Float array ordered by the first x-coordinate
    remaining = np.array(lines, dtype=float)
    remaining = remaining[np.argsort(remaining[:, 0])]

    result = []

    while len(remaining) > 0:
        # Pop the left-most line and use it as the seed
        seed, remaining = remaining[0], remaining[1:]
        x = seed[0]
        y_top, y_bottom = min(seed[1], seed[3]), max(seed[1], seed[3])

        # Keep absorbing until a full pass finds nothing to merge
        while True:
            nearby = remaining[np.abs(remaining[:, 0] - x) <= x_merge_threshold]
            absorbed = False

            for candidate in nearby:
                cand_x = candidate[0]
                cand_top = min(candidate[1], candidate[3])
                cand_bottom = max(candidate[1], candidate[3])

                # Vertical overlap, optionally relaxed to near-contact
                touches = (min(y_bottom, cand_bottom) - max(y_top, cand_top)) > 0
                if not overlap_only:
                    touches = (touches
                               or abs(y_bottom - cand_top) <= y_merge_threshold
                               or abs(y_top - cand_bottom) <= y_merge_threshold)

                if touches:
                    x = (x + cand_x) / 2
                    y_top = min(y_top, cand_top)
                    y_bottom = max(y_bottom, cand_bottom)
                    # Drop every remaining row identical to the absorbed line
                    matching_rows = np.where(np.all(remaining == candidate, axis=1))[0]
                    remaining = np.delete(remaining, matching_rows, axis=0)
                    absorbed = True
                    break

            if not absorbed:
                break

        result.append([x, y_top, x, y_bottom])

    return np.array(result)

In [179]:
def hough_transform(gray, img_shape, min_height_percent, threshold, max_line_gap, edge_min, edge_max):
    """
    Detect near-vertical line segments with a Gabor filter, Canny edges and
    the probabilistic Hough transform.

    :param gray: Grayscale input image
    :param img_shape: Shape of the input image; only `img_shape[0]` is used
    :param min_height_percent: Minimum segment length as a fraction of `img_shape[0]`
    :param threshold: Accumulator threshold passed to `cv2.HoughLinesP`
    :param max_line_gap: Maximum gap between line segments to be connected
    :param edge_min: Lower threshold for Canny edge detection
    :param edge_max: Upper threshold for Canny edge detection
    :return: List of [x1, y1, x2, y2] segments with |x2 - x1| < 5, or an empty
             numpy array when the Hough transform finds nothing
    """
    # Gabor filtering to enhance vertical edges, then back to 3 channels
    gabor_kernel = cv2.getGaborKernel((21, 21), 5, 0, 10, 1, 0, ktype=cv2.CV_32F)
    enhanced = cv2.cvtColor(cv2.filter2D(gray, cv2.CV_8UC3, gabor_kernel), cv2.COLOR_GRAY2BGR)

    # Edge map for the Hough transform
    edge_map = cv2.Canny(enhanced, edge_min, edge_max, apertureSize=3)

    # Probabilistic Hough transform; segments shorter than this are discarded
    min_length = int(img_shape[0]*min_height_percent)
    segments = cv2.HoughLinesP(edge_map, 1, np.pi/180, threshold,
                               minLineLength=min_length,
                               maxLineGap=max_line_gap)

    if (segments is None) or len(segments) == 0:
        return np.array([])

    # Keep only segments whose horizontal extent is under 5 pixels
    return [
        [x1, y1, x2, y2]
        for x1, y1, x2, y2 in (segment[0] for segment in segments)
        if abs(x2 - x1) < 5
    ]

In [180]:
def lsd_transform(gray):
    """
    Detect near-vertical line segments with OpenCV's Line Segment Detector.

    :param gray: Grayscale input image
    :return: List of [x1, y1, x2, y2] segments with |x2 - x1| < 5, or an empty
             numpy array when the detector finds nothing
    """
    # Run the detector; the first element of the result holds the segments
    detector = cv2.createLineSegmentDetector(0)
    segments = detector.detect(gray)[0]

    # Nothing detected
    if (segments is None) or (len(segments) == 0):
        return np.array([])

    # Keep only segments whose horizontal extent is under 5 pixels
    return [
        [x1, y1, x2, y2]
        for x1, y1, x2, y2 in (segment[0] for segment in segments)
        if abs(x2 - x1) < 5
    ]

In [181]:
def save_vertical_lines(image_path, vertical_lines, output_path):
    """
    Overlay vertical lines on an image and write the result to disk.

    Each line gets its own color; the first and last color channels vary
    with the line's position in the list.

    :param image_path: Path to the original image
    :param vertical_lines: List of (x, y1, y2) vertical lines to draw
    :param output_path: Path to save the output image
    """
    canvas = cv2.imread(image_path)

    for index, (x, y1, y2) in enumerate(vertical_lines):
        # Color shifts with the line index
        shade = 255/len(vertical_lines)*index
        color = (shade, 0, 255-shade)
        cv2.line(canvas, (int(x), int(y1)), (int(x), int(y2)), color, 2)

    # Save the result
    cv2.imwrite(output_path, canvas)

In [182]:
def add_vertical_lines(image_path, output_path, threshold=100, min_height_percent=0.3, max_line_gap=20, edge_min=50, edge_max=150):
    """
    Detect vertical lines with both LSD and Hough, merge them, and save an
    image with the surviving lines drawn on it.

    :param image_path: Path to the input image
    :param output_path: Path to save the output image
    :param threshold: Threshold for Hough transform
    :param min_height_percent: Minimum line height as a fraction of image height
    :param max_line_gap: Maximum gap between line segments to be connected
    :param edge_min: Lower threshold for edge detection
    :param edge_max: Upper threshold for edge detection
    :return: Tuple of (image as read from disk, list of (x, y1, y2) lines)
    """
    # Load the image and derive its grayscale version
    img = cv2.imread(image_path)
    img_height, img_width, _ = img.shape
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # LSD candidates: merge, then keep only lines taller than 100 pixels
    lsd_merged = merge_vertical_lines(lsd_transform(gray), y_merge_threshold=10, x_merge_threshold=10)
    lsd_lines = [segment for segment in lsd_merged if abs(segment[1] - segment[3]) > 100]

    # Hough candidates
    hough_raw = hough_transform(gray, img.shape, min_height_percent, threshold, max_line_gap, edge_min, edge_max)
    h_lines = merge_vertical_lines(hough_raw, y_merge_threshold=15, x_merge_threshold=15)

    # Combine whichever detectors produced output
    if len(lsd_lines) == 0:
        combined = h_lines
    elif len(h_lines) == 0:
        combined = lsd_lines
    else:
        combined = np.concatenate((lsd_lines, h_lines), axis=0)

    # Final merge pass: only vertically overlapping lines are fused
    combined = merge_vertical_lines(combined, x_merge_threshold=20, overlap_only=True)

    # Keep lines that are tall enough and away from the left/right borders
    minimum_height = min_height_percent*img_height
    vertical_lines = []
    if combined is not None:
        for segment in combined:
            x1, y1, x2, y2 = map(int, segment)
            too_short = abs(y2 - y1) < minimum_height
            if not too_short and abs(x2 - x1) < 20 and x1 > 10 and x2 < img_width - 10:
                vertical_lines.append((x1, y1, y2))

    # Save the image with detected vertical lines
    save_vertical_lines(image_path, vertical_lines, output_path)

    return img, vertical_lines

## Horizontal Line Detection

In [183]:
def get_horizontal_lines(img_path, output_path, vertical_lines):
    """
    Derive horizontal lines from the end points of vertical lines and draw them.

    For both ends (y1 and y2) of every vertical line, a horizontal line is
    stretched to the nearest vertical line on each side whose y-range contains
    that y, or to the image edge (0 / image width) when none is found.

    Note: `vertical_lines` is sorted in place by x-coordinate.

    :param img_path: Path to the input image
    :param output_path: Path to save the output image
    :param vertical_lines: List of (x, y1, y2) vertical lines
    :return: Tuple of (modified image, list of (x1, y, x2, y) horizontal lines)
    """
    img = cv2.imread(img_path)
    _, img_width, _ = img.shape

    # Sort vertical lines by x-coordinate
    vertical_lines.sort(key=lambda x: x[0])

    def _find_span(index, x, y):
        """Return (left_x, right_x) of the horizontal line through (x, y)."""
        # Walk leftwards to the closest line spanning y
        left_x = x
        for j in range(index - 1, -1, -1):
            if vertical_lines[j][1] <= y <= vertical_lines[j][2]:
                left_x = vertical_lines[j][0]
                break
        # An unchanged value means nothing was found: fall back to the edge
        if left_x == x:
            left_x = 0

        # Walk rightwards to the closest line spanning y
        right_x = x
        for j in range(index + 1, len(vertical_lines)):
            if vertical_lines[j][1] <= y <= vertical_lines[j][2]:
                right_x = vertical_lines[j][0]
                break
        if right_x == x:
            right_x = img_width

        return left_x, right_x

    horizontal_lines = []

    for i, (x, y1, y2) in enumerate(vertical_lines):
        # Top end first, then bottom end
        for y in (y1, y2):
            left_x, right_x = _find_span(i, x, y)
            horizontal_lines.append((left_x, y, right_x, y))
            cv2.line(img, (left_x, y), (right_x, y), (0, 255, 0), 2)

    # Save the image with horizontal lines
    cv2.imwrite(output_path, img)

    return img, horizontal_lines

## Section Creation

In [184]:
def save_adjusted_lines(folder_dir, adjusted_vertical_lines, adjusted_horizontal_lines):
    """
    Write two debug images showing the adjusted grid lines.

    `new_vertical_lines.png` is `merged_removed.png` with the vertical lines
    drawn on it; `new_horizontal_lines.png` adds the horizontal lines on top.
    Both files live in `folder_dir`.

    :param folder_dir: Directory holding `merged_removed.png` and receiving the output images
    :param adjusted_vertical_lines: List of adjusted vertical lines
    :param adjusted_horizontal_lines: List of adjusted horizontal lines
    """
    # Vertical lines first, via the shared helper
    vertical_output = f"{folder_dir}/new_vertical_lines.png"
    save_vertical_lines(f"{folder_dir}/merged_removed.png", adjusted_vertical_lines, vertical_output)

    # Re-open that image and overlay the horizontal lines
    canvas = cv2.imread(vertical_output)
    for h_line in adjusted_horizontal_lines:
        x_start, y, x_end = h_line[0], h_line[1], h_line[2]
        cv2.line(canvas, (x_start, y), (x_end, y), (255, 255, 0), 2)

    # Save the image with both vertical and horizontal lines
    cv2.imwrite(f"{folder_dir}/new_horizontal_lines.png", canvas)

In [185]:
def adjust_lines(folder_dir, vertical_lines, horizontal_lines, img_width, img_height, min_height):
    """
    Adjust and merge detected lines to create a more coherent grid structure.

    Horizontal lines closer than `min_height` to the previously kept line are
    merged into it; the ends of vertical lines are then snapped to any
    adjusted horizontal line within `min_height`.

    Note: `horizontal_lines` is modified in place - the top and bottom image
    borders are appended and the list is sorted by y-coordinate.

    :param folder_dir: Directory to save output images
    :param vertical_lines: List of detected vertical lines as (x, y1, y2)
    :param horizontal_lines: List of detected horizontal lines as (x1, y, x2, y)
    :param img_width: Width of the image
    :param img_height: Height of the image
    :param min_height: Distance below which lines are merged / snapped
    :return: Tuple of adjusted vertical and horizontal lines
    """
    # Add the top and bottom of the image as horizontal lines
    horizontal_lines.append((0, 0, img_width, 0))
    horizontal_lines.append((0, img_height, img_width, img_height))

    # Sort horizontal lines by y-coordinate
    horizontal_lines.sort(key=lambda x: x[1])

    # Merge close horizontal lines
    adjusted_horizontal_lines = [horizontal_lines[0]]
    for curr_line in horizontal_lines[1:]:
        prev_line = adjusted_horizontal_lines[-1]
        if abs(prev_line[1] - curr_line[1]) < min_height:
            # Merge: widest x-extent, midpoint y. Both y slots carry the merged
            # y so the tuple keeps the (x1, y, x2, y) shape of every other line.
            merged_y = int((prev_line[1] + curr_line[1])/2)
            adjusted_horizontal_lines[-1] = (
                min(prev_line[0], curr_line[0]),
                merged_y,
                max(prev_line[2], curr_line[2]),
                merged_y
            )
        else:
            adjusted_horizontal_lines.append(curr_line)

    # Adjust vertical lines based on new horizontal lines
    adjusted_vertical_lines = []
    for v_line in vertical_lines:
        v_x, v_y1, v_y2 = v_line[0], min(v_line[1], v_line[2]), max(v_line[1], v_line[2])
        for h_line in adjusted_horizontal_lines:
            # Snap either end to a horizontal line that is close enough
            if abs(h_line[1] - v_y1) < min_height:
                v_y1 = h_line[1]
            if abs(h_line[1] - v_y2) < min_height:
                v_y2 = h_line[1]
        adjusted_vertical_lines.append((v_x, v_y1, v_y2))

    # Save the adjusted lines
    save_adjusted_lines(folder_dir, adjusted_vertical_lines, adjusted_horizontal_lines)

    return adjusted_vertical_lines, adjusted_horizontal_lines

In [186]:
def create_rectangular_sections(folder_dir, vertical_lines, horizontal_lines, img_width, img_height, min_width=10, min_height=10):
    """
    Partition the image into rectangles using the detected grid lines.

    The image is first cut into full-width bands between consecutive adjusted
    horizontal lines; each adjusted vertical line then splits every band it
    crosses. Pieces narrower than `min_width` are folded into the section that
    starts at their right edge in the same band, or kept as-is when no such
    section exists.

    :param folder_dir: Directory to save output images
    :param vertical_lines: List of detected vertical lines
    :param horizontal_lines: List of detected horizontal lines
    :param img_width: Width of the image
    :param img_height: Height of the image
    :param min_width: Minimum width for a valid section
    :param min_height: Minimum height for a valid section
    :return: List of rectangular sections as (x1, y1, x2, y2)
    """
    # Clean up the grid before sectioning
    adjusted_vertical_lines, adjusted_horizontal_lines = adjust_lines(folder_dir, vertical_lines, horizontal_lines, img_width, img_height, min_height)

    # Full-width bands between consecutive horizontal lines
    band_edges = [h_line[1] for h_line in adjusted_horizontal_lines]
    sections = [
        (0, top, img_width, bottom)
        for top, bottom in zip(band_edges, band_edges[1:])
        if bottom - top >= min_height
    ]

    # Cut the bands along every vertical line
    for v_x, v_y1, v_y2 in adjusted_vertical_lines:
        kept = []
        slivers = []

        for rect in sections:
            x1, y1, x2, y2 = rect
            crosses = x1 < v_x < x2 and y1 < v_y2 and v_y1 < y2
            if not crosses:
                kept.append(rect)
                continue

            # Left piece, then right piece; too-narrow pieces are set aside
            left_piece = (x1, y1, v_x, y2)
            right_piece = (v_x, y1, x2, y2)
            (kept if v_x - x1 >= min_width else slivers).append(left_piece)
            (kept if x2 - v_x >= min_width else slivers).append(right_piece)

        sections = kept

        # Fold each sliver into the section starting at its right edge
        for sliver in slivers:
            s_left, s_top, s_right, s_bottom = sliver
            for idx, (n_left, n_top, n_right, n_bottom) in enumerate(sections):
                if s_top == n_top and s_bottom == n_bottom and n_left == s_right:
                    sections[idx] = (s_left, s_top, n_right, s_bottom)
                    break
            else:
                # No neighbour to absorb it: keep the sliver on its own
                sections.append(sliver)

    return sections

In [187]:
def merge_vertical_rectangles(sections, horizontal_lines, min_height=10):
    """
    Merge vertically adjacent rectangles that have the same left/right edges
    and no horizontal line between them.

    The caller's ``sections`` list is not modified; a sorted copy is processed.

    :param sections: List of rectangular sections, each represented as (x1, y1, x2, y2)
    :param horizontal_lines: List of horizontal lines; index 0 is read as the start x,
                             index 1 as the y position and index 2 as the end x
    :param min_height: Minimum height for a valid section
    :return: List of merged rectangular sections, ordered by x1 then y1
    """
    # Sort a copy (by x, then y) so stacked rectangles of one column are
    # consecutive without reordering the list owned by the caller.
    ordered = sorted(sections, key=lambda s: (s[0], s[1]))

    merged_sections = []
    i = 0
    while i < len(ordered):
        current = ordered[i]
        j = i + 1
        while j < len(ordered):
            next_section = ordered[j]

            # Only rectangles with identical x-extent that touch along
            # current's bottom edge are candidates for merging.
            stacked = (current[0] == next_section[0] and
                       current[2] == next_section[2] and
                       current[3] == next_section[1])
            if not stacked:
                break

            # A horizontal line at the shared edge that spans the full width
            # of the rectangle keeps the two sections apart.
            separated = any(h_line[1] == current[3] and
                            h_line[0] <= current[0] < current[2] <= h_line[2]
                            for h_line in horizontal_lines)
            if separated:
                break

            # Grow the current rectangle downwards and look at the next one
            current = (current[0], current[1], current[2], next_section[3])
            j += 1

        # Keep the (possibly merged) section only if it is tall enough
        if current[3] - current[1] >= min_height:
            merged_sections.append(current)
        i = j

    return merged_sections

In [188]:
def save_debug_sections(img_path, output_path, merged_sections):
    """
    Render every merged section on top of the source image and write the result to disk.

    Each section gets a random colour, an outline, its index as a label and a
    translucent fill of the same colour.

    :param img_path: Path to the original image
    :param output_path: Path to save the debug image
    :param merged_sections: List of merged rectangular sections (x1, y1, x2, y2)
    """
    canvas = cv2.imread(img_path)

    # Draw one random colour per section up front (three random draws each)
    palette = []
    for _ in merged_sections:
        palette.append((np.random.randint(0, 255),
                        np.random.randint(0, 255),
                        np.random.randint(0, 255)))

    for index, (section, tint) in enumerate(zip(merged_sections, palette)):
        left, top, right, bottom = section

        # Outline and index label
        cv2.rectangle(canvas, (left, top), (right, bottom), tint, 2)
        cv2.putText(canvas, str(index), (left + 5, top + 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, tint, 2)

        # Translucent fill: blend a solid-filled copy back into the canvas
        filled = canvas.copy()
        cv2.rectangle(filled, (left, top), (right, bottom), tint, -1)
        cv2.addWeighted(filled, 0.2, canvas, 0.8, 0, canvas)

    cv2.imwrite(output_path, canvas)

# Text Layout and Formatting
## Text Formatting

In [189]:
def word_sort_key(word, line_height, tolerance=0):
    """
    Build the reading-order sort key of a word.

    :param word: Dictionary with a 'bounds' entry whose first item is the top-left (x, y) corner
    :param line_height: Height of a text line, used to bucket y positions into line numbers
    :param tolerance: Offset added to y before bucketing
    :return: Tuple (line_number, x); words sort by line first, then left to right
    """
    left, top = word['bounds'][0]
    row = (top + tolerance) // line_height
    return (row, left)

In [190]:
def find_section(word_center_x, word_center_y, sections):
    """
    Locate the first section whose rectangle contains the given point (edges inclusive).

    :param word_center_x: X-coordinate of the word's center
    :param word_center_y: Y-coordinate of the word's center
    :param sections: List of sections, each represented as (x1, y1, x2, y2)
    :return: Index of the first containing section, or -1 if none contains the point
    """
    containing = (
        index
        for index, box in enumerate(sections)
        if box[0] <= word_center_x <= box[2] and box[1] <= word_center_y <= box[3]
    )
    # The generator is lazy, so the scan stops at the first hit
    return next(containing, -1)

In [191]:
def split_paragraph(paragraph, sections):
    """
    Distribute the words of one paragraph over the sections that contain them.

    A word belongs to the section returned by ``find_section`` for the center
    of its bounding box; words whose center lies in no section are dropped.

    :param paragraph: Dictionary containing paragraph information, including 'words'
    :param sections: List of sections, each represented as (x1, y1, x2, y2)
    :return: One dict per non-empty section (in section order) with keys
             'words', 'section' (index) and 'bounds' (section rectangle as a list)
    """
    # One word bucket per section, filled in paragraph word order
    buckets = [[] for _ in sections]

    for word in paragraph['words']:
        top_left, bottom_right = word['bounds'][0], word['bounds'][1]
        center_x = (top_left[0] + bottom_right[0]) / 2
        center_y = (top_left[1] + bottom_right[1]) / 2

        target = find_section(center_x, center_y, sections)
        if target != -1:
            buckets[target].append(word)

    # Emit only the sections that actually received words
    return [
        {'words': bucket, 'section': index, 'bounds': list(section)}
        for index, (section, bucket) in enumerate(zip(sections, buckets))
        if bucket
    ]

In [192]:
def format_words_with_layout(words, section_bounds, use_relative_tabs, use_ascii_only, line_height=10):
    """
    Turn positioned words into text whose line breaks and tabs mirror the layout.

    The input words are not modified. Tab width is derived from the average
    word length (in characters) of the unfiltered input words.

    :param words: List of word dictionaries containing text and position information
    :param section_bounds: Boundaries of the section (x1, y1, x2, y2)
    :param use_relative_tabs: Boolean to determine if relative tabbing should be used
    :param use_ascii_only: Boolean to filter out non-ASCII characters
    :param line_height: Height of a line for determining line breaks
    :return: Formatted text string
    """
    origin_x, origin_y = section_bounds[0], section_bounds[1]

    # Average characters per word, measured before any ASCII filtering
    mean_len = sum(len(entry['text']) for entry in words) / max(1, len(words))
    tab_width = 15 * mean_len

    pieces = []  # non-empty output fragments, concatenated at the end

    def indent_for(x):
        # Leading tabs for a word starting at x: capped at 8 stops, then halved
        stops = max(0, min(int((x - origin_x) / tab_width), 8))
        return "\t" * int(stops / 2)

    def needs_separator():
        # A space is only inserted when some output exists and it does not end a line
        return bool(pieces) and not pieces[-1].endswith("\n")

    prev_y = origin_y
    last_x = origin_x
    first_word = True

    for entry in words:
        text = entry['text']
        if use_ascii_only:
            text = ''.join(ch for ch in text if ord(ch) < 128)
        if not text:
            continue

        x, y = entry['bounds'][0]
        drop = y - prev_y

        # A large enough vertical jump starts a new line (two for big gaps)
        if drop > line_height * 0.75:
            pieces.append("\n\n" if drop > line_height * 2.5 else "\n")
            last_x = origin_x
            indent = indent_for(x)
            if indent:
                pieces.append(indent)

        if first_word:
            # The very first emitted word is always indented relative to the section
            indent = indent_for(x)
            if indent:
                pieces.append(indent)
            first_word = False
        elif use_relative_tabs:
            # Horizontal gap to the estimated end of the previous word, in tabs
            gap_tabs = max(0, min(int((x - last_x) / tab_width / 2), 4))
            if gap_tabs > 0:
                pieces.append("\t" * gap_tabs)
            elif needs_separator():
                pieces.append(" ")
        elif needs_separator():
            pieces.append(" ")

        pieces.append(text)

        prev_y = y
        last_x = x + len(text) * mean_len

    return "".join(pieces)

## Text in Sections

In [193]:
def format_text_in_sections(merged_sections, merged_paragraphs, use_ascii_only=False, use_relative_tabs=False):
    """
    Assign every word to its section and format each section's text.

    :param merged_sections: List of sections, each represented as (x1, y1, x2, y2)
    :param merged_paragraphs: List of paragraph dictionaries, each with a 'words' list
    :param use_ascii_only: Passed through to format_words_with_layout
    :param use_relative_tabs: Passed through to format_words_with_layout
    :return: List of {"words": sorted words, "text": formatted text}, one entry per
             section that contains at least one word (in section order);
             an empty list when there are no words at all
    """
    every_word = []
    for paragraph in merged_paragraphs:
        every_word.extend(paragraph['words'])

    if not every_word:
        return []

    # The most frequent word-box height serves as the line height
    heights = [w['bounds'][1][1] - w['bounds'][0][1] for w in every_word]
    line_height = max(set(heights), key=heights.count)

    # Group words by section index, keeping paragraph order within each group
    words_by_section = {}
    for paragraph in merged_paragraphs:
        for piece in split_paragraph(paragraph, merged_sections):
            words_by_section.setdefault(piece['section'], []).extend(piece['words'])

    section_data = []
    for index, section in enumerate(merged_sections):
        members = words_by_section.get(index)
        if not members:
            continue

        # Reading order: by line bucket, then left to right
        ordered = sorted(members, key=lambda w: word_sort_key(w, line_height))
        rendered = format_words_with_layout(ordered, section, use_relative_tabs, use_ascii_only, line_height)
        section_data.append({"words": ordered, "text": rendered})

    return section_data

## Text Saving

In [194]:
def save_text(folder_output_dir, file_name, text):
    """
    Write the section texts to ``<folder_output_dir>/<file_name>.txt``.

    Each entry is written under a "Section <index>:" heading and followed by a
    blank line. The file is written as UTF-8 so non-ASCII OCR characters do not
    depend on the platform's default encoding.

    :param folder_output_dir: Directory that receives the file
    :param file_name: File name without the ".txt" extension
    :param text: Iterable of section strings
    """
    with open(f"{folder_output_dir}/{file_name}.txt", 'w', encoding='utf-8') as f:
        for i, section in enumerate(text):
            f.write(f"Section {i}:\n")
            f.write(section)
            f.write("\n\n")

In [195]:
def save_ocr_sections(folder_output_dir, merged_sections, merged_paragraphs):
    """
    Write three text renderings of the sections plus a section count file.

    Files written to ``folder_output_dir``: processed_text.txt (initial tabs only),
    section_count.txt, processed_text_tabs.txt (relative tabs) and
    processed_text_ascii.txt (non-ASCII characters removed).

    :param folder_output_dir: Directory that receives the files
    :param merged_sections: List of sections, each represented as (x1, y1, x2, y2)
    :param merged_paragraphs: List of paragraph dictionaries, each with a 'words' list
    :return: The sections of the first rendering (initial tabs only, non-ASCII kept)
    """
    def export(stem, **layout_flags):
        # Format with the given flags, save the texts under `stem`, hand back the sections
        formatted = format_text_in_sections(merged_sections, merged_paragraphs, **layout_flags)
        save_text(folder_output_dir, stem, [entry['text'] for entry in formatted])
        return formatted

    sections = export("processed_text", use_ascii_only=False)

    with open(f"{folder_output_dir}/section_count.txt", 'w') as count_file:
        count_file.write(f"Number of sections: {len(sections)}")

    export("processed_text_tabs", use_relative_tabs=True, use_ascii_only=False)
    export("processed_text_ascii", use_ascii_only=True)

    # The plain rendering is the one callers receive
    return sections

# (Not used) Line Number Detection and Removal

In [196]:
def is_valid_number(text):
    """
    Decide whether an OCR token represents a number and extract its value.

    A circled digit (① to ⑨) counts as its digit. Otherwise the ASCII digits of
    the token are concatenated, and the token is accepted when it has at least
    one digit and at most one non-digit character.

    :param text: Token text
    :return: (True, value) for a number, (False, None) otherwise
    """
    circled = {'①': 1, '②': 2, '③': 3, '④': 4, '⑤': 5,
               '⑥': 6, '⑦': 7, '⑧': 8, '⑨': 9}
    if text in circled:
        return True, circled[text]

    # Split the token into ASCII digits and a count of everything else
    ascii_digits = '0123456789'
    found = []
    extras = 0
    for ch in text:
        if ch in ascii_digits:
            found.append(ch)
        else:
            extras += 1

    if not found or extras > 1:
        return False, None
    return True, int(''.join(found))

In [197]:
def detect_line_numbers(paragraphs, horizontal_threshold=50, vertical_threshold=50, min_consecutive=3):
    """
    Find runs of consecutive numbers stacked in a column (candidate line numbers).

    Words are shallow-copied; in the copies, 'text' is replaced by the parsed
    integer returned by ``is_valid_number``.

    :param paragraphs: List of paragraph dictionaries, each with a 'words' list
    :param horizontal_threshold: Maximum x offset between consecutive numbers, and
                                 between a sequence's first number and extra numbers
                                 added to it
    :param vertical_threshold: Maximum downward y offset between consecutive numbers
    :param min_consecutive: Minimum length for a run to be kept
    :return: List of sequences (lists of number words), each sorted top to bottom
    """
    # Map each numeric value to every word that parsed to it
    number_dict = defaultdict(list)

    # Flatten the list of words from all paragraphs (copies, so inputs keep their text)
    words = [word.copy() for paragraph in paragraphs for word in paragraph['words']]

    # First pass: group numbers
    for word in words:
        is_number, value = is_valid_number(word['text'])
        if is_number:
            word['text'] = value
            number_dict[value].append(word)

    valid_sequences = []
    current_sequence = []

    # Walk the values in ascending order to find consecutive sequences
    for number in sorted(number_dict.keys()):
        if not current_sequence:
            current_sequence.append(number_dict[number][0])
            continue

        prev_word = current_sequence[-1]

        if number == int(prev_word['text']) + 1:
            # Pick the occurrence best aligned with the previous number
            best_word = min(number_dict[number], key=lambda w: (
                abs(w['bounds'][0][0] - prev_word['bounds'][0][0]),  # x difference
                w['bounds'][0][1] - prev_word['bounds'][0][1]  # y difference
            ))

            # It must sit below the previous number and within both thresholds
            if (abs(best_word['bounds'][0][0] - prev_word['bounds'][0][0]) <= horizontal_threshold and
                    0 < best_word['bounds'][0][1] - prev_word['bounds'][0][1] <= vertical_threshold):
                current_sequence.append(best_word)
            else:
                # Not aligned or too far apart vertically, start a new sequence
                if len(current_sequence) >= min_consecutive:
                    valid_sequences.append(current_sequence)
                current_sequence = [best_word]
        else:
            # Not consecutive, start a new sequence
            if len(current_sequence) >= min_consecutive:
                valid_sequences.append(current_sequence)
            current_sequence = [number_dict[number][0]]

    # Check the last sequence
    if len(current_sequence) >= min_consecutive:
        valid_sequences.append(current_sequence)

    expanded_sequences = []
    # Add every other number whose right edge lines up with the sequence's first number
    for sequence in valid_sequences:
        new_sequence = sequence.copy()
        x_coord = sequence[0]['bounds'][1][0]
        for candidates in number_dict.values():
            for word in candidates:
                if abs(word['bounds'][1][0] - x_coord) <= horizontal_threshold and word not in sequence:
                    new_sequence.append(word)
        # Sort the expanded sequence (the one that is returned) by y-coordinate
        new_sequence.sort(key=lambda w: w['bounds'][0][1])
        expanded_sequences.append(new_sequence)

    return expanded_sequences

In [198]:
def remove_line_numbers(paragraphs, number_sequence):
    """
    Return a copy of ``paragraphs`` without the words listed in ``number_sequence``.

    Words are identified by their bounding box only. The text is deliberately
    not compared: ``detect_line_numbers`` replaces a word's text with the parsed
    integer, so a token such as "12." or "①" would never equal ``str(12)`` /
    ``str(1)`` and would be left in place.

    :param paragraphs: List of paragraph dictionaries, each with a 'words' list
    :param number_sequence: Words to remove (each with a 'bounds' entry)
    :return: Deep-copied paragraphs with the words removed, empty paragraphs
             dropped and 'bounds' recomputed via ``calculate_bounds``
    """
    # Work on a deep copy so the caller's paragraphs stay untouched
    paragraphs = deepcopy(paragraphs)

    def bounds_key(word):
        # Hashable form of the bounds, whether the points are tuples or lists
        return tuple(tuple(point) for point in word['bounds'])

    bounds_to_remove = {bounds_key(word) for word in number_sequence}

    for paragraph in paragraphs:
        if isinstance(paragraph, dict) and 'words' in paragraph:
            paragraph['words'] = [
                word for word in paragraph['words']
                if bounds_key(word) not in bounds_to_remove
            ]

    # Remove empty paragraphs and recalculate bounds
    paragraphs = [paragraph for paragraph in paragraphs if paragraph['words']]
    for paragraph in paragraphs:
        paragraph['bounds'] = calculate_bounds(paragraph['words'])

    return paragraphs

# Text Highlighting and Citation
## Flexible OCR Text Matcher

In [199]:
import re
from difflib import SequenceMatcher

class FlexibleOCRTextMatcher:
    """
    Fuzzy-match a text pattern against runs of consecutive OCR words.

    Text is compared after collapsing whitespace and (unless case_sensitive)
    lowercasing. Similarity is difflib's SequenceMatcher ratio plus a small
    bonus when the pattern occurs verbatim inside the candidate, capped at 1.0.
    """

    # Bonus added when the first string is contained in the second
    _CONTAINMENT_BONUS = 0.05

    def __init__(self, pattern, threshold=0.7, case_sensitive=False):
        """
        :param pattern: Text to search for
        :param threshold: Minimum similarity for a window of words to count as a match
        :param case_sensitive: Compare with case preserved when True
        """
        self.original_pattern = pattern
        self.pattern = self._preprocess_text(pattern, case_sensitive)
        self.threshold = threshold
        self.case_sensitive = case_sensitive

    def _preprocess_text(self, text, case_sensitive):
        """Strip the text, collapse whitespace runs to one space, optionally lowercase."""
        text = re.sub(r'\s+', ' ', text.strip())
        return text if case_sensitive else text.lower()

    def _calculate_similarity(self, s1, s2):
        """Return the SequenceMatcher ratio of s1 and s2 (+bonus if s1 is in s2), capped at 1.0."""
        offset = self._CONTAINMENT_BONUS if s1 in s2 else 0
        return min(1.0, SequenceMatcher(None, s1, s2).ratio() + offset)

    def _max_possible_similarity(self, candidate_len):
        """
        Upper bound of _calculate_similarity(self.pattern, candidate) for any
        candidate of the given length.

        The ratio is 2 * matched_chars / (len_a + len_b) and matched_chars can
        never exceed the shorter length, so this bound needs no string comparison.
        """
        pattern_len = len(self.pattern)
        total = pattern_len + candidate_len
        if not total:
            return 1.0
        best_ratio = 2.0 * min(pattern_len, candidate_len) / total
        return min(1.0, best_ratio + self._CONTAINMENT_BONUS)

    def find_fuzzy_matches(self, ordered_words_section):
        """
        Search all windows of consecutive words for the pattern.

        :param ordered_words_section: List of sections, each a dict with a 'words' list
        :return: List of (start_index, matched_text, similarity, words) tuples. An exact
                 match returns immediately as a single-element list; otherwise matches
                 at or above the threshold are returned best first with
                 near-duplicates removed
        """
        matches = []
        # Flatten the list of words from all sections
        all_words = [word for section in ordered_words_section for word in section['words']]
        pattern_len = len(self.pattern)

        for i in range(len(all_words)):
            for j in range(i + 1, len(all_words) + 1):
                window = all_words[i:j]
                original_text = ' '.join(word['text'] for word in window)
                preprocessed_text = self._preprocess_text(original_text, self.case_sensitive)

                if preprocessed_text == self.pattern:
                    # Exact match found, return immediately
                    return [(i, original_text.strip(), 1.0, window)]

                # Extending the window only makes the text longer. Once it is longer
                # than the pattern and even a perfect overlap could not reach the
                # threshold, no larger window starting at i can match either.
                text_len = len(preprocessed_text)
                if text_len > pattern_len and self._max_possible_similarity(text_len) < self.threshold:
                    break

                similarity = self._calculate_similarity(self.pattern, preprocessed_text)

                if similarity >= self.threshold:
                    matches.append((i, original_text.strip(), similarity, window))

        # Sort matches by similarity (descending) and length (ascending)
        matches.sort(key=lambda x: (-x[2], len(x[1])))

        # Remove duplicates and near-duplicates, keeping the better-ranked one
        unique_matches = []
        for match in matches:
            if not any(self._is_similar_match(match, existing) for existing in unique_matches):
                unique_matches.append(match)

        return unique_matches

    def _is_similar_match(self, match1, match2, similarity_threshold=0.95):
        """Return True when the texts of two matches are more similar than similarity_threshold."""
        text1 = self._preprocess_text(match1[1], self.case_sensitive)
        text2 = self._preprocess_text(match2[1], self.case_sensitive)
        return self._calculate_similarity(text1, text2) > similarity_threshold

## Highlighting Functions

In [200]:
def highlight_words(img, matches, highlight_color=(255, 0, 0)):
    """
    Draw a rectangle around every word of every match, directly on ``img``.

    :param img: Image to draw on (modified in place)
    :param matches: Match tuples whose fourth element is the list of matched word dicts
    :param highlight_color: Outline colour of the rectangles
    :return: The same image object
    """
    padding = 2
    canvas = ImageDraw.Draw(img)

    # Lazily walk the bounds of all matched words, match by match
    all_bounds = (word['bounds'] for match in matches for word in match[3])

    for top_left, bottom_right in all_bounds:
        left, top = top_left
        right, bottom = bottom_right

        # Pad the box slightly, clamped to the image area
        box = [
            max(0, left - padding),
            max(0, top - padding),
            min(img.width, right + padding),
            min(img.height, bottom + padding),
        ]
        canvas.rectangle(box, outline=highlight_color, width=2)

    return img

In [201]:
def clean_citation(citation):
    """Trim surrounding whitespace, then any single quotes left at either end."""
    trimmed = citation.strip()
    return trimmed.strip("'")

In [202]:
def highlight_response(img_id, so_question, response, accepted_answer, ordered_words_section, img_path, folder_output_dir):
    """
    Locate the response's citations in the OCR words, then save a CSV summary and an image.

    Writes ``citations.csv`` and ``citation_highlighted.png`` into ``folder_output_dir``.
    The image shows the best matches outlined on the original, or is entirely black
    when no citation matched.

    :param img_id: Identifier stored in the CSV's 'id' column
    :param so_question: Question text stored in the CSV
    :param response: Dict with 'citations' (iterable) and 'explanation'
    :param accepted_answer: Accepted answer text stored in the CSV
    :param ordered_words_section: List of sections, each a dict with a 'words' list
    :param img_path: Path of the image to highlight
    :param folder_output_dir: Directory that receives the outputs
    """
    citations = [clean_citation(str(raw)) for raw in response['citations']]
    explanation = response['explanation']

    all_matches = []
    citation_matches = []

    for citation in citations:
        found = FlexibleOCRTextMatcher(citation, threshold=0.7).find_fuzzy_matches(ordered_words_section)
        if not found:
            continue

        # Keep every match that ties for the highest similarity
        top_score = max(candidate[2] for candidate in found)
        winners = [candidate for candidate in found if candidate[2] == top_score]
        all_matches.extend(winners)

        for winner in winners:
            citation_matches.append({
                'citation': citation,
                'match_text': winner[1],
                'similarity': winner[2]
            })
            print(f"Match for '{citation}': '{winner[1]}' (Similarity: {winner[2]:.2f})")

    # One-row table with everything about this response
    summary = pd.DataFrame({
        'id': [img_id],
        'question': [so_question],
        'accepted_answer': [accepted_answer],
        'response': [explanation],
        'citations': [citations],
        'citation_matches': [citation_matches]
    })
    summary.to_csv(f"{folder_output_dir}/citations.csv", index=False)

    # Same output file either way: highlighted original, or a black placeholder
    output_path = f"{folder_output_dir}/citation_highlighted.png"
    with Image.open(img_path) as img:
        if all_matches:
            result_img = highlight_words(img, all_matches)
        else:
            result_img = Image.new('RGB', img.size, (0, 0, 0))
        result_img.save(output_path)


## Main Processing Loop

In [203]:
def process_image(row_content, img_dir, output_dir, project_id):
    """
    Run the OCR and section-detection pipeline for one image.

    All artefacts are written to ``<output_dir>/<row_content['id']>``.

    :param row_content: Mapping with at least 'id' and 'image_name'
    :param img_dir: Directory containing the source images
    :param output_dir: Root directory for per-image output folders
    :param project_id: Project id forwarded to detect_text
    :return: (status, num_sections); status 0 = success, 1 = error, 2 = skipped
    """
    # Imported locally so the cleanup path below does not depend on a module-level import
    import shutil

    folder_output_dir = f"{output_dir}/{row_content['id']}"
    img_path = f"{img_dir}/{row_content['image_name']}"
    input_path = f"{folder_output_dir}/input.png"

    print(f"Processing image {row_content['id']}")

    # NOTE(review): a directory listing never has length -1, so this skip is
    # effectively disabled; the original comment mentioned "17 files". Confirm the
    # intended file count before re-enabling it.
    if os.path.exists(folder_output_dir) and len(os.listdir(folder_output_dir)) == -1:
        print(f"Skipping image {row_content['id']}")
        return 2, 0  # Return status code and 0 sections

    try:
        with io.open(img_path, "rb") as image_file:
            content = image_file.read()
        VISION_image = vision.Image(content=content)
    except Exception as e:
        print(f"Could not load image {img_path}: {e}")
        return 1, 0  # Return error status and 0 sections

    # Make the directory if it doesn't exist
    os.makedirs(folder_output_dir, exist_ok=True)

    paragraphs = detect_text(VISION_image, project_id)
    print("Text detection completed")

    # One line per paragraph
    text = "".join(paragraph['text'] + '\n' for paragraph in paragraphs)

    # Save to file (UTF-8, since OCR text may contain non-ASCII characters)
    with open(f"{folder_output_dir}/unprocessed_text.txt", 'w', encoding='utf-8') as f:
        f.write(text)
    print("Unprocessed text saved")

    # Save a copy of the image; if it can't be saved, give up on this image
    try:
        with Image.open(img_path) as image:
            image.save(input_path)
    except Exception as e:
        print(f"Could not save a copy of {img_path}: {e}")
        return 1, 0  # Return error status and 0 sections

    merged_paragraphs = merge_overlapping_paragraphs(paragraphs, word_tolerance=3, paragraph_overlap_threshold=0.5)
    print("Paragraphs merged")

    # Draw bounding boxes
    output_path = f"{folder_output_dir}/highlighted.png"
    try:
        draw_bounding_boxes(img_path, output_path, paragraphs)
    except Exception as e:
        print(f"Could not draw bounding boxes for {img_path}: {e}")
        # Remove the partial output folder for this image
        shutil.rmtree(folder_output_dir)
        return 1, 0  # Return error status and 0 sections

    output_path = f"{folder_output_dir}/merged_highlighted.png"
    draw_bounding_boxes(img_path, output_path, merged_paragraphs)
    print("Bounding boxes drawn")

    # Text removal
    output_path = f"{folder_output_dir}/removed.png"
    remove_text_from_image(img_path, paragraphs, output_path, x_padding=1, y_padding=1)

    removed_text_path = f"{folder_output_dir}/merged_removed.png"
    remove_text_from_image(img_path, merged_paragraphs, removed_text_path, x_padding=1, y_padding=1)
    print("Text removed from image")

    # Add vertical lines (detected on the image with the merged paragraphs removed)
    vertical_lines_path = f"{folder_output_dir}/detected_vertical_lines.png"
    img, detected_lines = add_vertical_lines(removed_text_path, vertical_lines_path, 100, 0.35, 20, 20, 200)

    # Plot horizontal lines
    output_path = f"{folder_output_dir}/detected_horizontal_lines.png"
    img, horizontal_lines = get_horizontal_lines(vertical_lines_path, output_path, detected_lines)
    print("Vertical and horizontal lines detected")

    # Create rectangular sections
    img_height, img_width, _ = img.shape
    min_width = 40
    min_height = 15
    rectangular_sections = create_rectangular_sections(folder_output_dir, detected_lines, horizontal_lines, img_width, img_height, min_width, min_height)

    # Merge vertical rectangles, then order the result top to bottom
    merged_sections = merge_vertical_rectangles(rectangular_sections, horizontal_lines, min_height)
    merged_sections.sort(key=lambda x: x[1])

    output_path = f"{folder_output_dir}/debug_sections.png"
    save_debug_sections(input_path, output_path, merged_sections)
    print("Sections created and merged")

    # Return success status and number of sections
    return 0, len(merged_sections)


In [204]:
# Main processing loop with CSV updates
import pandas as pd

def process_images_and_update_csv(df, img_dir, output_dir, project_id, metadata_path="./Data/metadata.csv"):
    """
    Process every image listed in ``df`` and record its section count.

    ``df`` is updated in place (a 'num_sections' column is added if missing) and
    written to ``metadata_path`` after each image, so progress is kept if a
    later image fails.

    :param df: DataFrame with one row per image; rows are passed to process_image
    :param img_dir: Directory containing the source images
    :param output_dir: Root directory for per-image output folders
    :param project_id: Project id forwarded to process_image
    :param metadata_path: CSV file the DataFrame is saved to after each image
    :return: The same DataFrame
    """
    # Create a new column for number of sections if it doesn't exist
    if 'num_sections' not in df.columns:
        df['num_sections'] = 0

    for i, row in df.iterrows():
        status_code, num_sections = process_image(row, img_dir, output_dir, project_id)
        if status_code == 0:  # Only update if processing was successful
            df.at[i, 'num_sections'] = num_sections
            print(f"Image {row['id']} processed - {num_sections} sections found")
        else:
            print(f"Image {row['id']} processing failed with status code {status_code}")

        # Save the DataFrame after each image is processed
        df.to_csv(metadata_path, index=False)
        print("---\t---\t---\t---\t---")

    return df

# Initialization and Parameters

In [205]:
# Load environment settings before reading the project id
load_dotenv()

# Initialise Vertex AI for the project named in VERTEXAI_PROJECT_ID
project_id = os.environ.get('VERTEXAI_PROJECT_ID')
vertexai.init(location="us-central1", project=project_id)


In [206]:
# Source images, root of the per-image output folders, and the metadata table
img_dir = "./Data/images"
output_dir = "./Data/outputs"

df = pd.read_csv("./Data/metadata.csv")

In [207]:
# Run the pipeline over every metadata row; the returned frame carries the num_sections column
df = process_images_and_update_csv(df, img_dir, output_dir, project_id)

Processing image 79050514
Text detection completed
Unprocessed text saved
Paragraphs merged
Bounding boxes drawn
Text removed from image
Vertical and horizontal lines detected
Sections created and merged
Image 79050514 processed - 8 sections found
---	---	---	---	---
Processing image 79042213
Text detection completed
Unprocessed text saved
Paragraphs merged
Bounding boxes drawn
Text removed from image
Vertical and horizontal lines detected
Sections created and merged
Image 79042213 processed - 3 sections found
---	---	---	---	---
Processing image 79142755
Text detection completed
Unprocessed text saved
Paragraphs merged
Bounding boxes drawn
Text removed from image
Vertical and horizontal lines detected
Sections created and merged
Image 79142755 processed - 6 sections found
---	---	---	---	---
Processing image 79082901
Text detection completed
Unprocessed text saved
Paragraphs merged
Bounding boxes drawn
Text removed from image
Vertical and horizontal lines detected
Sections created and

In [1]:
import pandas as pd

# Load the filtered dataset and the processed-image metadata
filtered_data = pd.read_csv('Data/filtered_data.csv')
metadata = pd.read_csv('Data/metadata.csv')

# Keep only the rows whose Id also appears in the metadata's id column
id_is_known = filtered_data['Id'].isin(metadata['id'])
filtered_result = filtered_data.loc[id_is_known]

# Write the matched subset to its own CSV
filtered_result.to_csv('Data/filtered_data_matched.csv', index=False)