# In [2]:
import os
from bs4 import BeautifulSoup

# Folders containing the HTML files to compare: the predicted transcriptions
# and the ground-truth reference they are scored against.
pred_folder = "transcriptionsBG_gpt4/html"
gt_folder = "BGdataset/html"

# Function to find tags with positions in an HTML file
def find_tags_with_positions(html_content):
    """Locate every <b>, <u>, <i> and <s> element inside the document body.

    Positions are character offsets into ``str(body)`` -- the body as
    re-serialized by BeautifulSoup, including the ``<body>`` tag itself --
    and not offsets into the raw ``html_content``.

    Args:
        html_content: HTML markup to parse with the 'html.parser' backend.

    Returns:
        A list with one dict per matching element, in the order produced by
        ``find_all``. Each dict has the keys:
            'tag': the element name ('b', 'u', 'i' or 's'),
            'opening_position': offset where the element's markup starts,
            'closing_position': offset where its closing tag starts,
            'content': the element text from ``get_text(strip=True)``.
        An empty list is returned when the document has no <body>.

    Raises:
        ValueError: if an element's serialized markup cannot be located in
            the serialized body.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    body = soup.body

    if body is None:
        return []

    results = []
    body_str = str(body)

    # find_all yields elements in document order, so each element opens
    # strictly after the previous one. Resuming the search just past the
    # previous opening offset keeps repeated identical elements (for example
    # two separate "<b>x</b>") from all resolving to the first occurrence.
    search_from = 0

    for tag in body.find_all(['b', 'u', 'i', 's']):
        tag_str = str(tag)
        closing_tag = f"</{tag.name}>"

        # Get positions of opening and closing tags
        opening_pos = body_str.index(tag_str, search_from)
        search_from = opening_pos + 1

        # The serialized element ends with its closing tag, so that tag
        # starts len(closing_tag) characters before the element's end.
        closing_pos = opening_pos + len(tag_str) - len(closing_tag)

        results.append({
            'tag': tag.name,
            'opening_position': opening_pos,
            'closing_position': closing_pos,
            'content': tag.get_text(strip=True)
        })

    return results

# Function to compare two lists of tags
def compare_tag_lists(predicted, ground_truth):
    """Compare predicted tags against ground-truth tags pairwise, by index.

    Tag ``i`` of ``predicted`` is compared with tag ``i`` of ``ground_truth``.
    The checks are nested: positions are only counted when the tag names
    match, and content is only counted when tag name and both positions
    match. A per-tag diagnostic line and a final summary are printed.

    When the two lists differ in length, only the common prefix can be
    paired; the leftover tags are reported as a count mismatch instead of
    being dropped silently.

    Args:
        predicted: list of dicts with the keys 'tag', 'opening_position',
            'closing_position' and 'content'.
        ground_truth: list of dicts with the same keys.

    Returns:
        A dict with the counters that are also printed in the summary:
        'total_tags' (number of predicted tags), 'ground_truth_tags',
        'correct_tags', 'correct_positions', 'correct_content',
        'perfect_matches', 'mismatches' (indices whose tag names differ) and
        'unpaired_tags' (tags left over because the lists differ in length).
    """
    total_tags = len(predicted)
    print(f"Total predicted tags: {total_tags}")

    correct_tags = 0
    correct_positions = 0
    correct_content = 0
    perfect_matches = 0
    mismatches = []

    for i, (pred, gt) in enumerate(zip(predicted, ground_truth)):
        same_opening = pred['opening_position'] == gt['opening_position']
        same_closing = pred['closing_position'] == gt['closing_position']
        same_content = pred['content'] == gt['content']

        if pred['tag'] != gt['tag']:
            # Log the mismatch, then note any fields that agree anyway.
            mismatches.append(i)
            print(f"Tag {i}: Tag mismatch")
            if same_opening:
                print(f"Tag {i}: Opening position matches despite tag mismatch")
            if same_closing:
                print(f"Tag {i}: Closing position matches despite tag mismatch")
            if same_content:
                print(f"Tag {i}: Content matches despite tag mismatch")
            continue

        # Step 1: the tag names match
        correct_tags += 1

        # Step 2: both the opening and closing positions must match
        if not (same_opening and same_closing):
            print(f"Tag {i}: Tag matches, but positions differ")
            continue
        correct_positions += 1

        # Step 3: content is checked only once tag and positions agree
        if same_content:
            correct_content += 1
            perfect_matches += 1
            print(f"Tag {i}: Perfect match (tag, positions, and content)")
        else:
            print(f"Tag {i}: Tag and positions match, but content differs")

    # zip() stops at the shorter list; make the leftover tags visible.
    ground_truth_tags = len(ground_truth)
    unpaired_tags = abs(total_tags - ground_truth_tags)
    if unpaired_tags:
        print(
            f"Tag count mismatch: {total_tags} predicted vs "
            f"{ground_truth_tags} ground truth; {unpaired_tags} tag(s) "
            f"could not be paired and were not compared"
        )

    # Summary of the results
    print("\nSummary:")
    print(f"Correct tags: {correct_tags} out of {total_tags}")
    print(f"Correct positions: {correct_positions} out of {total_tags}")
    print(f"Correct content: {correct_content} out of {total_tags}")
    print(f"Perfect matches: {perfect_matches} out of {total_tags}")

    if mismatches:
        print(f"Mismatched tags at indices: {mismatches}")

    return {
        'total_tags': total_tags,
        'ground_truth_tags': ground_truth_tags,
        'correct_tags': correct_tags,
        'correct_positions': correct_positions,
        'correct_content': correct_content,
        'perfect_matches': perfect_matches,
        'mismatches': mismatches,
        'unpaired_tags': unpaired_tags,
    }

# Index the predicted files by base name (file name without extension) once,
# rather than re-listing the predicted folder for every ground-truth file.
pred_files_by_stem = {}
for pred_filename in sorted(os.listdir(pred_folder)):
    pred_filepath = os.path.join(pred_folder, pred_filename)

    # Only regular files can be opened; skip sub-directories.
    if not os.path.isfile(pred_filepath):
        continue

    pred_stem = os.path.splitext(pred_filename)[0]
    pred_files_by_stem.setdefault(pred_stem, []).append(pred_filepath)

# iterate over files in the ground truth folder (sorted, because os.listdir
# returns entries in arbitrary order and the report should be reproducible)
for gt_filename in sorted(os.listdir(gt_folder)):
    gt_filepath = os.path.join(gt_folder, gt_filename)

    # Skip sub-directories such as notebook checkpoint folders.
    if not os.path.isfile(gt_filepath):
        continue

    # predicted files whose names match (without extensions)
    matching_pred_filepaths = pred_files_by_stem.get(os.path.splitext(gt_filename)[0], [])
    if not matching_pred_filepaths:
        print(f"No predicted file found for {gt_filepath}")
        continue

    # open and read the ground truth file
    with open(gt_filepath, 'r', encoding='utf-8') as gt_file:
        gt_content = gt_file.read()

    # The ground-truth tags are the same for every matching prediction.
    listtags_gt = find_tags_with_positions(gt_content)

    for pred_filepath in matching_pred_filepaths:
        # open and read the predicted file
        with open(pred_filepath, 'r', encoding='utf-8') as pred_file:
            pred_content = pred_file.read()

        listtags_predicted = find_tags_with_positions(pred_content)

        # Say which pair the following report belongs to; otherwise the
        # per-file summaries cannot be told apart.
        print(f"\nComparing {pred_filepath} against {gt_filepath}")

        # Compare the tag lists
        compare_tag_lists(listtags_predicted, listtags_gt)
