In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Notebook parameters. The None defaults are placeholders: later cells pass these
# values to Path(), open() and scanbotsdk.initialize(), so they have to be
# overridden before the notebook is run.
# NOTE(review): presumably injected by a notebook runner (e.g. papermill) — confirm.

# License key handed to scanbotsdk.initialize().
scanbotsdk_license_key: str = None
# Root directory of the training data; training samples are loaded from it and
# image paths are displayed relative to it.
training_dir: str = None
# Path of the image that this report analyzes.
explain_image_path: str = None
# Path of a pickle file that provides the 'pixel_clustering' and 'clustering' entries.
config_debug_path: str = None
# Forwarded as num_jobs to load_samples_from_training_dir().
num_jobs: int = 4

# Document Quality Analyzer - Debug Analysis

In [None]:
from pathlib import Path

# Show the image path relative to the training directory when possible.
# Path.relative_to() raises ValueError when the image is not located inside
# training_dir; in that case fall back to the path as given instead of
# aborting the whole report in its first cell.
try:
    displayed_image_path = Path(explain_image_path).relative_to(training_dir)
except ValueError:
    displayed_image_path = Path(explain_image_path)

print(f"For {displayed_image_path}:")

The following image shows the text that the Document Quality Analyzer (DoQA) detects in your image.<br>
Based on this information, DoQA compares the characteristics of your image to the characteristics of the images seen during training.<br>
If large parts of the document's text (more than 25%) are not detected at all or are incorrectly annotated, please contact Scanbot SDK support and provide the image and this analysis report.

In [None]:
import scanbotsdk

from character_annotations_plot import plot_annotations

# Convert the string parameters to Path objects; the cells below rely on Path
# operations such as relative_to().
training_dir = Path(training_dir)
explain_image_path = Path(explain_image_path)

# Configure the SDK, then create the annotator and run it on the image.
# NOTE(review): set_logging(False) presumably silences SDK log output — confirm.
scanbotsdk.set_logging(False)
scanbotsdk.initialize(scanbotsdk_license_key)
document_quality_analyzer = scanbotsdk.DocumentQualityAnalyzerTrainingDataAnnotator()
image = scanbotsdk.ImageRef.from_path(explain_image_path)
# `annotations` is reused further down (its character_level_annotations feed the
# similarity analysis), and `document_quality_analyzer` is passed to the
# training-sample loader.
annotations = document_quality_analyzer.run(image=image)

# Plot the annotations for the analyzed image.
# NOTE(review): plot_annotations presumably returns the matplotlib.pyplot module
# (only .show() is used on the result) — confirm.
plt = plot_annotations(explain_image_path, annotations)
plt.show(block=False)

In [None]:
import pickle
from CharacterClusteringTransformer import CharacterClusteringTransformer
from PixelClustering import PixelClusteringTransformer
import pandas as pd
from configurator_utils import  load_samples_from_training_dir, get_character_properties, all_features
import numpy as np
import matplotlib.pyplot as plt
import configurator_utils

# Load the fitted pixel clustering from the debug configuration. The file is
# opened in a context manager so the handle is closed right after unpickling
# instead of being left to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point config_debug_path at files from a trusted source.
with open(config_debug_path, "rb") as config_file:
    pixel_clustering: PixelClusteringTransformer = pickle.load(config_file)['pixel_clustering']

# Colors used for the swatches drawn above the bars
# (presumably one representative RGB color per histogram bin — confirm).
rgb_color_bin_representatives = pixel_clustering.rgb_color_bin_representatives()

# Histogram of the analyzed image's gray pixels over the pixel clusters.
gray_pixels = configurator_utils.load_gray_pixels_from_image(explain_image_path)
pixel_hist = pixel_clustering.create_hist(gray_pixels)

if len(pixel_hist) > 0:
    # The tallest bar drives the swatch size and the y-axis limit; compute it once
    # instead of re-scanning the histogram for every bar.
    hist_peak = max(pixel_hist)
    swatch_height = hist_peak * 0.05

    fig, ax = plt.subplots(figsize=(6, 4))

    x_positions = range(len(pixel_hist))
    bars = ax.bar(x_positions, pixel_hist, alpha=0.7, color='lightblue', edgecolor='black')

    # Draw a color swatch slightly above each bar, as wide as the bar itself.
    for i, (bar, rgb_color) in enumerate(zip(bars, rgb_color_bin_representatives)):
        swatch_y = bar.get_height() + swatch_height
        swatch = plt.Rectangle((i - 0.4, swatch_y), height=swatch_height, width=.8,
                               facecolor=rgb_color, edgecolor='black', linewidth=1, zorder=3)
        ax.add_patch(swatch)

    ax.set_xlabel('Pixel Color')
    ax.set_ylabel('Normalized Frequency')
    ax.set_title('Pixel Brightness Histogram')
    ax.set_xticks(x_positions)
    # Leave headroom above the tallest bar for its swatch.
    ax.set_ylim(0, hist_peak * 1.2)

    plt.tight_layout()
    plt.show(block=False)

## Similar images in training data

The following images from the training data are most similar to your image.<br>
The similarity is in the range 0% (not similar at all) to 100% (identical).<br>
Using this information, you can fine-tune your training data. For example, if you want your image to be classified as "acceptable" but DoQA currently reports it as "unacceptable", try adding more similar images to the "good" training data or removing very similar images from the "bad" training data.

In [None]:
# Describe the analyzed image as a single sample record for the clustering
# transformer (presumably the same layout as the loaded training samples — confirm).
sample = {
    "image_path": explain_image_path,
    "character_level_annotations": pd.DataFrame(
        get_character_properties(annotations.character_level_annotations, all_features)
    ),
    f"pixel_histogram_{pixel_clustering.n_pixel_clusters}": pixel_hist,
}

# Load the training samples to compare the analyzed image against.
samples = load_samples_from_training_dir(
    training_dir=training_dir,
    smoke_test=False,
    document_quality_analyzer=document_quality_analyzer,
    num_jobs=num_jobs,
    cache_enabled=True,
    show_progress=False,
)
X = pd.DataFrame(samples)

# Load the fitted character clustering. The file is opened in a context manager so
# the handle is closed right after unpickling instead of being leaked.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point config_debug_path at files from a trusted source.
with open(config_debug_path, "rb") as config_file:
    clustering: CharacterClusteringTransformer = pickle.load(config_file)['clustering']

# Cluster histograms for every training sample and for the analyzed image.
good_bad_cluster_hists = clustering.transform(X)
sample_cluster_hist = clustering.transform(pd.DataFrame([sample]))[0]

# Euclidean distance between the histograms, divided by sqrt(number of bins).
# NOTE(review): this normalization maps to [0, 1] only if every histogram entry
# lies in [0, 1] — confirm against CharacterClusteringTransformer.
max_distance = np.sqrt(len(sample_cluster_hist))
X['distance'] = [
    np.linalg.norm(sample_cluster_hist - np.asarray(feature_clusters)) / max_distance
    for feature_clusters in good_bad_cluster_hists
]

# Grid layout: two bands of five images. Each band consists of a thin row for the
# title and similarity bar followed by a tall row for the image; the middle row
# of the 5x5 grid only acts as a spacer between the two bands.
IMAGES_PER_BAND = 5
ROWS_PER_BAND = 3

fig, axs = plt.subplots(5, 5, figsize=(13, 9), height_ratios=[0.03, 1, 0.1, 0.03, 1], tight_layout=True)
for ax in axs.flat:
    ax.axis('off')

nearest_samples = X.nsmallest(10, 'distance')
for rank, (_, row) in enumerate(nearest_samples.iterrows()):
    band, col = divmod(rank, IMAGES_PER_BAND)
    title_row = band * ROWS_PER_BAND

    similarity = (1 - row['distance']) * 100
    rel_path = Path(row['image_path']).relative_to(training_dir)
    if row['label'] == 1:
        bar_color = 'green'
    else:
        bar_color = 'red'

    # Title plus a horizontal bar filled up to the similarity percentage,
    # outlined by an empty full-width (100%) bar.
    ax_title = axs[title_row, col]
    ax_title.set_title(f"{rel_path}\nSimilarity: {similarity:.2f}%", fontsize=8)
    ax_title.barh([0], [similarity], height=0.5, color=bar_color)
    ax_title.barh([0], [100], height=0.5, color='none', edgecolor='black', linewidth=1)
    ax_title.set_xlim(0, 100)
    ax_title.set_ylim(-0.5, 0.5)
    ax_title.set_anchor('S')

    # The image goes into the row directly below its title.
    ax_img = axs[title_row + 1, col]
    ax_img.imshow(plt.imread(row['image_path']))
    ax_img.set_anchor('S')

plt.tight_layout()
plt.show(block=False)