In [None]:
# Select the inline matplotlib backend so figures are embedded in the cell output.
%matplotlib inline

In [None]:
# Notebook parameters. Only the types are declared here; no values are assigned
# in this cell, so the names must already be bound when the later cells run
# (presumably injected by whatever executes this notebook — TODO confirm).

# License key passed to scanbotsdk.initialize().
scanbotsdk_license_key: str
# Root directory of the training data; image paths are displayed relative to it.
training_dir: str
# Path of the image this report analyses.
explain_image_path: str
# Path of a pickle file; its 'clustering' entry is loaded for the similarity search.
config_debug_path: str

# Document Quality Analyzer - Debug Analysis

In [None]:
from pathlib import Path

# Announce which image this report is about. Prefer the path relative to the
# training directory; Path.relative_to raises ValueError when the image does
# not live under training_dir, so fall back to the path as given instead of
# aborting the whole report.
_explain_path = Path(explain_image_path)
try:
    _explain_label = _explain_path.relative_to(training_dir)
except ValueError:
    _explain_label = _explain_path
print(f"For {_explain_label}:")

The following image visualizes the text that the Document Quality Analyzer (DoQA) detects in your image.<br>
Based on this information, DoQA will compare the characteristics of your image to the characteristics of images seen during training.
If there seems to be something wrong with the text detection, please contact Scanbot SDK support. Please provide the image and this analysis report.

In [None]:
import scanbotsdk

from character_annotations_plot import plot_annotations

# The notebook parameters are declared as str; rebind them as Path objects for
# the path handling in this and the following cells.
training_dir = Path(training_dir)
explain_image_path = Path(explain_image_path)

# Configure the SDK and initialize it with the supplied license key before
# creating the annotator.
scanbotsdk.set_logging(False)
scanbotsdk.initialize(scanbotsdk_license_key)
# This annotator instance is reused by the similarity cell when it loads the
# training samples.
document_quality_analyzer = scanbotsdk.DocumentQualityAnalyzerTrainingDataAnnotator()
image = scanbotsdk.ImageRef.from_path(explain_image_path)
# The result's character_level_annotations attribute is read by the similarity cell.
annotations = document_quality_analyzer.run(image=image)

# NOTE(review): the return value is bound to `plt` and later used as
# plt.subplots / plt.imread / plt.tight_layout, so plot_annotations presumably
# returns the matplotlib.pyplot module — confirm.
plt = plot_annotations(explain_image_path, annotations)
plt.show(block=False)

## Similar images in training data

The following images from the training data are most similar to your image.<br>
The similarity is in the range 0% (not similar at all) to 100% (identical).<br>
Using this information, you can fine-tune your training data. For example, if you want your image to be classified as "acceptable" but DoQA currently reports it as "unacceptable", try adding more similar images to the "good" training data or removing very similar images from the "bad" training data.

In [None]:
import pickle
import pandas as pd
from configurator_utils import load_samples_from_training_dir, get_character_properties, all_features
from CharacterClusteringTransformer import CharacterClusteringTransformer
import numpy as np

# Record for the image under analysis: its path plus a DataFrame of the
# per-character properties extracted from the annotations computed earlier.
sample = dict(
    image_path=explain_image_path,
    character_level_annotations=pd.DataFrame(
        get_character_properties(annotations.character_level_annotations, all_features)
    ),
)

# Load the training samples, reusing the annotator created in the previous cell.
samples = load_samples_from_training_dir(
    training_dir=training_dir,
    smoke_test=False,
    document_quality_analyzer=document_quality_analyzer,
    num_jobs=4,
    cache_enabled=True,
    show_progress=False,
)
X = pd.DataFrame(samples)

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point config_debug_path at a file from a trusted source.
# The context manager closes the file handle instead of leaking it.
with open(config_debug_path, "rb") as config_debug_file:
    clustering: CharacterClusteringTransformer = pickle.load(config_debug_file)['clustering']

# Transform the training samples and the analysed image with the same
# clustering object so the resulting vectors are comparable.
good_bad_cluster_hists = clustering.transform(X)
sample_cluster_hist = np.asarray(clustering.transform(pd.DataFrame([sample]))[0])

# Euclidean distance between the analysed image and every training sample.
X['distance'] = [np.linalg.norm(sample_cluster_hist - np.asarray(reference)) for reference in good_bad_cluster_hists]

# Show the (up to) ten closest training images in a 2 x 5 grid.
fig, axs = plt.subplots(2, 5, figsize=(13, 8))
grid_axes = axs.flatten()
for ax, (_, row) in zip(grid_axes, X.nsmallest(10, 'distance').iterrows()):
    rel_path = Path(row['image_path']).relative_to(training_dir)
    # Nothing in this cell bounds the distance by 1, so clamp at zero to keep
    # the reported value inside the documented 0%-100% range.
    similarity = max(0.0, 1 - row['distance']) * 100
    title = f"{rel_path}\nSimilarity: {similarity:.2f}%"
    ax.set_title(title, fontsize=8)
    ax.imshow(plt.imread(row['image_path']))
# Switch off the axes of every cell, including those left empty when the
# training set holds fewer than ten samples.
for ax in grid_axes:
    ax.axis('off')
plt.tight_layout()
plt.show(block=False)


