In [6]:
import os
from bs4 import BeautifulSoup

# assign directories
gt_folder = 'BGdataset/filtered100html'
pred_folder = 'transcriptionsBG_gpt4/html'

# Function to find tags with positions in an HTML file
def find_tags_with_positions(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    body = soup.body

    if not body:
        return []

    results = []
    body_str = str(body)

    for tag in body.find_all(['b', 'u', 'i', 's']):
        tag_str = str(tag)

        # Find opening tag
        opening_tag = f"<{tag.name}"
        closing_tag = f"</{tag.name}>"

        # Get positions of opening and closing tags
        opening_pos = body_str.index(tag_str)
        closing_pos = opening_pos + len(tag_str) - len(closing_tag)

        # Extract text content
        text_content = tag.get_text(strip=True)

        results.append({
            'tag': tag.name,
            'opening_position': opening_pos,
            'closing_position': closing_pos,
            'content': text_content
        })

    return results

# Function to compare two lists of tags
def compare_tag_lists(predicted, ground_truth):
    total_tags = len(predicted)
    print(f"Total predicted tags: {total_tags}")

    correct_tags = 0
    correct_positions = 0
    correct_content = 0
    perfect_matches = 0
    mismatches = []

    for i, (pred, gt) in enumerate(zip(predicted, ground_truth)):
        # Step 1: Check if the tags match
        if pred['tag'] == gt['tag']:
            correct_tags += 1

            # Step 2: Check if both the opening and closing positions match
            if pred['opening_position'] == gt['opening_position'] and pred['closing_position'] == gt['closing_position']:
                correct_positions += 1

                # Step 3: Check if the content matches after confirming tag and position match
                if pred['content'] == gt['content']:
                    correct_content += 1
                    perfect_matches += 1
                    print(f"Tag {i}: Perfect match (tag, positions, and content)")
                else:
                    print(f"Tag {i}: Tag and positions match, but content differs")
            else:
                print(f"Tag {i}: Tag matches, but positions differ")
        else:
            # Log mismatches
            mismatches.append(i)
            print(f"Tag {i}: Tag mismatch")

            # Check if any other fields match in case of tag mismatch
            if pred['opening_position'] == gt['opening_position']:
                print(f"Tag {i}: Opening position matches despite tag mismatch")
            if pred['closing_position'] == gt['closing_position']:
                print(f"Tag {i}: Closing position matches despite tag mismatch")
            if pred['content'] == gt['content']:
                print(f"Tag {i}: Content matches despite tag mismatch")

    # Summary of the results
    print(f"\nSummary:")
    print(f"Correct tags: {correct_tags} out of {total_tags}")
    print(f"Correct positions: {correct_positions} out of {total_tags}")
    print(f"Correct content: {correct_content} out of {total_tags}")
    print(f"Perfect matches: {perfect_matches} out of {total_tags}")

    if mismatches:
        print(f"Mismatched tags at indices: {mismatches}")

# iterate over files in the ground truth folder
for gt_filename in os.listdir(gt_folder):
    gt_filepath = os.path.join(gt_folder, gt_filename)

    # open and read the ground truth file
    with open(gt_filepath, 'r', encoding='utf-8') as gt_file:
        gt_content = gt_file.read()

    # iterate over files in the predicted folder
    for pred_filename in os.listdir(pred_folder):
        pred_filepath = os.path.join(pred_folder, pred_filename)

        # check if the file names match (without extensions)
        if os.path.splitext(gt_filename)[0] == os.path.splitext(pred_filename)[0]:
            # open and read the predicted file
            with open(pred_filepath, 'r', encoding='utf-8') as pred_file:
                pred_content = pred_file.read()

            # Find tags and positions in both ground truth and predicted content
            listtags_gt = find_tags_with_positions(gt_content)
            listtags_predicted = find_tags_with_positions(pred_content)

            # Compare the tag lists
            compare_tag_lists(listtags_predicted, listtags_gt)


Total predicted tags: 4
Tag 0: Tag mismatch
Tag 1: Tag matches, but positions differ
Tag 2: Tag mismatch

Summary:
Correct tags: 1 out of 4
Correct positions: 0 out of 4
Correct content: 0 out of 4
Perfect matches: 0 out of 4
Mismatched tags at indices: [0, 2]
Total predicted tags: 2
Tag 0: Tag matches, but positions differ
Tag 1: Tag matches, but positions differ

Summary:
Correct tags: 2 out of 2
Correct positions: 0 out of 2
Correct content: 0 out of 2
Perfect matches: 0 out of 2
Total predicted tags: 0

Summary:
Correct tags: 0 out of 0
Correct positions: 0 out of 0
Correct content: 0 out of 0
Perfect matches: 0 out of 0
Total predicted tags: 10
Tag 0: Tag mismatch
Tag 1: Tag mismatch
Tag 2: Tag matches, but positions differ
Tag 3: Tag matches, but positions differ
Tag 4: Tag mismatch
Tag 5: Tag matches, but positions differ

Summary:
Correct tags: 3 out of 10
Correct positions: 0 out of 10
Correct content: 0 out of 10
Perfect matches: 0 out of 10
Mismatched tags at indices: [0, 1,

In [11]:
import os
from bs4 import BeautifulSoup
from collections import defaultdict

# assign directories
gt_folder = 'BGdataset/filtered100html'
pred_folder = 'transcriptions_BG_internVL/html'

def find_tags_with_positions(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    body = soup.body
    if not body:
        return []

    results = []
    body_str = str(body)

    for tag in body.find_all(['b', 'u', 'i', 's']):
        tag_str = str(tag)
        opening_pos = body_str.index(tag_str)
        closing_pos = opening_pos + len(tag_str) - len(f"</{tag.name}>")
        text_content = tag.get_text(strip=True)

        results.append({
            'tag': tag.name,
            'opening_position': opening_pos,
            'closing_position': closing_pos,
            'content': text_content
        })
    print(results)

    return results

def compare_tag_lists(predicted, ground_truth):
    metrics = defaultdict(int)
    metrics['total_tags'] = len(predicted)

    for pred, gt in zip(predicted, ground_truth):
        if pred['tag'] == gt['tag']:
            metrics['correct_tags'] += 1
            if pred['opening_position'] == gt['opening_position'] and pred['closing_position'] == gt['closing_position']:
                metrics['correct_positions'] += 1
                if pred['content'] == gt['content']:
                    metrics['correct_content'] += 1
                    metrics['perfect_matches'] += 1
    print(metrics)
    return metrics

def process_files(gt_folder, pred_folder):
    total_metrics = defaultdict(int)

    for gt_filename in os.listdir(gt_folder):
        gt_basename = os.path.splitext(gt_filename)[0]
        gt_filepath = os.path.join(gt_folder, gt_filename)
        pred_filepath = os.path.join(pred_folder, f"{gt_basename}.html")

        if not os.path.exists(pred_filepath):
            print(f"Warning: No matching predicted file for {gt_filename}")
            continue

        with open(gt_filepath, 'r', encoding='utf-8') as gt_file, open(pred_filepath, 'r', encoding='utf-8') as pred_file:
            gt_content = gt_file.read()
            pred_content = pred_file.read()

        listtags_gt = find_tags_with_positions(gt_content)
        listtags_predicted = find_tags_with_positions(pred_content)

        file_metrics = compare_tag_lists(listtags_predicted, listtags_gt)

        for key, value in file_metrics.items():
            total_metrics[key] += value

    return total_metrics

def calculate_percentages(metrics):
    total_tags = metrics['total_tags']
    if total_tags == 0:
        return {k: 0 for k in metrics}

    return {
        'correct_tags': (metrics['correct_tags'] / total_tags) * 100,
        'correct_positions': (metrics['correct_positions'] / total_tags) * 100,
        'correct_content': (metrics['correct_content'] / total_tags) * 100,
        'perfect_matches': (metrics['perfect_matches'] / total_tags) * 100
    }

def main():
    total_metrics = process_files(gt_folder, pred_folder)
    percentages = calculate_percentages(total_metrics)

    print(f"\nRisultati finali su tutti i file processati:")
    print(f"Totale tag processati: {total_metrics['total_tags']}")
    print(f"Percentuale di tag correttamente riconosciuti: {percentages['correct_tags']:.2f}%")
    print(f"Percentuale di posizioni corrette: {percentages['correct_positions']:.2f}%")
    print(f"Percentuale di contenuti corrispondenti: {percentages['correct_content']:.2f}%")
    print(f"Percentuale di corrispondenze perfette (tag, contenuto e posizione): {percentages['perfect_matches']:.2f}%")

if __name__ == "__main__":
    main()

[{'tag': 'u', 'opening_position': 92, 'closing_position': 99, 'content': 'real'}, {'tag': 'u', 'opening_position': 182, 'closing_position': 191, 'content': 'almost'}, {'tag': 'u', 'opening_position': 271, 'closing_position': 284, 'content': 'dreadfully'}]
[{'tag': 'b', 'opening_position': 51, 'closing_position': 55, 'content': '/'}, {'tag': 'u', 'opening_position': 71, 'closing_position': 84, 'content': 'how I feel'}, {'tag': 's', 'opening_position': 102, 'closing_position': 262, 'content': "Sister confessed hateMy dear sweet dearly I'm not happy — thanmorehere —that'sthe reason of my love — Jai a gore from a very true"}, {'tag': 'i', 'opening_position': 197, 'closing_position': 206, 'content': "that's"}, {'tag': 'u', 'opening_position': 305, 'closing_position': 319, 'content': 'Mr. Dickens'}, {'tag': 'b', 'opening_position': 359, 'closing_position': 377, 'content': 'Sherlock Holmes'}]
defaultdict(<class 'int'>, {'total_tags': 6, 'correct_tags': 1})
[{'tag': 'u', 'opening_position': 28