# Test Parser Against Labelled Set

In [None]:
import json
import math
import os
from collections import defaultdict
from typing import List

import editdistance
import matplotlib.pyplot as plt
from shapely.geometry import Polygon, Point


def load_parser_data(json_path: str) -> List[dict]:
    """Load every ``*.json`` file found directly inside a directory.

    Args:
        json_path: Path of the directory that holds the parser output files.
            Sub-directories are not searched.

    Returns:
        The parsed content of each JSON file, ordered by file name.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    parser_data = []
    # os.listdir() yields entries in arbitrary order; sort so that repeated
    # runs produce the documents in the same order.
    for file_name in sorted(os.listdir(json_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(json_path, file_name)
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(file_path, "r", encoding="utf-8") as f:
            parser_data.append(json.load(f))
    return parser_data


def load_labelstudio_data(json_path: str) -> List[dict]:
    """Load the labelled data from a single JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)


def merge_data(labelstudio_data: List[dict], parser_data: List[dict]) -> List[dict]:
    """Join labelled tasks with the parser output of the same document.

    A task is matched to a parser record when the last ``/``-separated segment
    of its ``"ocr"`` value, with ``"%20"`` replaced by a space and ``".png"``
    removed, equals the record's ``"document_id"``. Tasks without a
    ``"label"`` key are ignored.

    Args:
        labelstudio_data: Labelled tasks; each labelled task must have an
            ``"ocr"`` entry.
        parser_data: Parser records; each must have a ``"document_id"`` entry.

    Returns:
        One merged dict per matching (task, parser record) pair, in task order
        and then parser-record order. On key collisions the parser record's
        value wins.

    Raises:
        KeyError: If a labelled task lacks ``"ocr"`` or a parser record lacks
            ``"document_id"``.
    """
    merged_list = []
    for task in labelstudio_data:
        # Unlabelled tasks can never produce a match, so skip them before
        # touching any other field or scanning the parser output.
        if "label" not in task:
            continue
        # The document name only depends on the task: derive it once rather
        # than once per parser record.
        image_name = task["ocr"].split("/")[-1]
        document_id = image_name.replace("%20", " ").replace(".png", "")
        merged_list.extend(
            {**task, **parsed}
            for parsed in parser_data
            if parsed["document_id"] == document_id
        )
    return merged_list


def dict_to_polygon(d: dict) -> Polygon:
    """Turn a box described by a dict into a rectangular polygon.

    Args:
        d: Mapping with ``"x"``, ``"y"``, ``"width"`` and ``"height"`` entries.

    Returns:
        A polygon whose corners are ``(x, y)``, ``(x + width, y)``,
        ``(x + width, y + height)`` and ``(x, y + height)``, in that order.
    """
    x0, y0 = d["x"], d["y"]
    # opposite corner of the rectangle
    x1 = x0 + d["width"]
    y1 = y0 + d["height"]
    corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
    return Polygon(corners)


def coords_to_polygon(coords: List[tuple], x_scale=1, y_scale=1) -> Polygon:
    """Build a polygon from a list of points after rescaling them.

    Args:
        coords: Points of the polygon; the first two items of each point are
            used as its x and y values.
        x_scale: Divisor applied to every x value.
        y_scale: Divisor applied to every y value.

    Returns:
        The polygon through the rescaled points.
    """
    scaled_points = []
    for point in coords:
        # shrink each axis by its own scaling factor
        scaled_points.append((point[0] / x_scale, point[1] / y_scale))
    return Polygon(scaled_points)


def flatten(lst: List) -> List:
    """Return the items of an arbitrarily nested list as one flat list.

    Only ``list`` instances are expanded; any other item (tuples and strings
    included) is kept as a single element. Item order is preserved.
    """
    flat = []
    for element in lst:
        # a nested list contributes its own flattened items, anything else itself
        flat += flatten(element) if isinstance(element, list) else [element]
    return flat


def plot_overlaps(boxes_ls: List[Polygon], boxes_lp: List[Polygon]) -> None:
    """Draw two sets of polygons on one figure and show it.

    Args:
        boxes_ls: Polygons filled in semi-transparent red (drawn first).
        boxes_lp: Polygons filled in semi-transparent blue (drawn second).
    """
    _, axes = plt.subplots()
    # first set in red, then second set in blue, each at 50% opacity
    for polygons, colour in ((boxes_ls, "red"), (boxes_lp, "blue")):
        for shape in polygons:
            xs, ys = shape.exterior.xy
            axes.fill(xs, ys, alpha=0.5, fc=colour)
    # display the plot
    plt.show()


def highest_iou(polygons1: List[Polygon], polygons2: List[Polygon]) -> List[dict]:
    """Find, for each polygon of the first list, its best overlap in the second.

    The overlap measure is intersection-over-union (IoU): the area of the
    intersection divided by the area of the union of the two polygons.

    Args:
        polygons1: Polygons to find a partner for.
        polygons2: Candidate partner polygons.

    Returns:
        One dict per polygon of ``polygons1`` that overlaps at least one
        candidate, with the keys ``"label_poly_ix"`` (index into
        ``polygons1``), ``"parser_poly_ix"`` (index into ``polygons2`` of the
        candidate with the highest IoU; the first one on ties) and
        ``"frac_iou"`` (that IoU). Polygons without any overlap are omitted.
    """
    result = []
    for i, polygon1 in enumerate(polygons1):
        max_iou = 0
        max_index = None
        for j, polygon2 in enumerate(polygons2):
            union_area = polygon1.union(polygon2).area
            if union_area == 0:
                # Both polygons are degenerate (zero area): the IoU is
                # undefined, so skip the pair instead of dividing by zero.
                continue
            iou = polygon1.intersection(polygon2).area / union_area
            # strict comparison keeps the first candidate on ties
            if iou > max_iou:
                max_iou = iou
                max_index = j
        if max_iou > 0:
            result.append(
                {"label_poly_ix": i, "parser_poly_ix": max_index, "frac_iou": max_iou}
            )
    return result


def calculate_levenshtein_distances(
    dic: dict, ious: List[dict], same_block_threshold=0.9
) -> dict:
    """Compare label and parser text for sufficiently overlapping blocks.

    For every entry of ``ious`` whose ``"frac_iou"`` is strictly above
    ``same_block_threshold``, the label text
    ``dic["transcription"][label_poly_ix]`` is compared with the parser text
    ``dic["pdf_data"]["text_blocks"][parser_poly_ix]["text"][0]``.

    Args:
        dic: Merged record holding ``"transcription"`` and ``"pdf_data"``.
        ious: Matches carrying ``"label_poly_ix"``, ``"parser_poly_ix"`` and
            ``"frac_iou"``.
        same_block_threshold: Minimum IoU (exclusive) for two blocks to be
            treated as the same block.

    Returns:
        A dict with ``"distances"`` (edit distance per compared pair) and
        ``"normalised_distances"`` (distance divided by the length of the
        longer text; ``0.0`` when both texts are empty).
    """
    distances = []
    normalised_distances = []
    for iou in ious:
        if iou["frac_iou"] <= same_block_threshold:
            continue
        ls_text = dic["transcription"][iou["label_poly_ix"]]
        # NOTE(review): only the first element of the block's "text" list is
        # compared - confirm that blocks never hold more than one element.
        parser_text = dic["pdf_data"]["text_blocks"][iou["parser_poly_ix"]]["text"][0]
        longest = max(len(ls_text), len(parser_text))
        if longest == 0:
            # Two empty texts are identical; handling them here also avoids
            # the 0 / 0 division in the normalisation below.
            distance = 0
            normalised_distance = 0.0
        else:
            distance = editdistance.eval(ls_text, parser_text)
            normalised_distance = distance / longest
        distances.append(distance)
        normalised_distances.append(normalised_distance)
    levenshtein_dict = {
        "distances": distances,
        "normalised_distances": normalised_distances,
    }
    return levenshtein_dict

In [128]:
# NOTE(review): both paths are specific to one machine - adjust before running.
label_json_path = "/home/stefan/Downloads/project-1-at-2023-02-15-15-45-7541cc40.json"
parser_json_path = (
    "/home/stefan/PycharmProjects/navigator-document-parser-pdf/app/data/processed/"
)
parser_data = load_parser_data(parser_json_path)

labelstudio_data = load_labelstudio_data(label_json_path)
merged_list = merge_data(labelstudio_data, parser_data)

# Enrich every merged record in place with geometry, coverage and text metrics.
for d in merged_list:
    # NOTE(review): only the first page's dimensions are read - presumably each
    # record covers a single page; confirm for multi-page documents.
    width_lp, height_lp = d["pdf_data"]["page_metadata"][0]["dimensions"]
    boxes_ls = [dict_to_polygon(dd) for dd in d["label"]]
    # Parser coordinates are divided by (dimension / 100), i.e. expressed as a
    # percentage of the page size - presumably the unit the label boxes use.
    boxes_lp = [
        coords_to_polygon(dd["coords"], width_lp / 100, height_lp / 100)
        for dd in d["pdf_data"]["text_blocks"]
    ]
    d["label_polys"] = boxes_ls
    d["parser_polys"] = boxes_lp
    # Summed box areas; overlapping boxes are counted more than once.
    labels_area = sum(b.area for b in boxes_ls)
    parser_area = sum(b.area for b in boxes_lp)
    # 1e4 is the area of a 100 x 100 page; 1e2 converts the fraction to percent.
    d["percentage_coverage_labels"] = 1e2 * labels_area / 1e4
    d["percentage_coverage_parser"] = 1e2 * parser_area / 1e4
    d["len_text_labels"] = sum(map(len, d["transcription"]))
    d["len_text_parser"] = sum(len(t["text"][0]) for t in d["pdf_data"]["text_blocks"])
    # The ratio is undefined when the parser extracted no text; store NaN
    # rather than aborting the whole loop with a ZeroDivisionError.
    d["labels_vs_parser_text_len_ratio"] = (
        d["len_text_labels"] / d["len_text_parser"]
        if d["len_text_parser"]
        else math.nan
    )
    ious_dict = highest_iou(boxes_ls, boxes_lp)
    d["intersection_over_unions"] = ious_dict
    d["levenshtein_distances"] = calculate_levenshtein_distances(d, ious_dict)
    # TODO: add d['weighted_normalised_levenshtein']