# Cleaning

## Images

In [None]:
import os
from pathlib import Path

import imagehash
from PIL import Image

In [None]:
def get_image_hash(image_path: Path) -> "imagehash.ImageHash":
    """Compute the perceptual hash (pHash) of an image file.

    Args:
        image_path: Path (or path string) of the image to hash.

    Returns:
        The ``imagehash.ImageHash`` produced by ``imagehash.phash``. This is
        not a string: callers subtract two hashes to get their distance.
    """
    # The context manager closes the underlying file as soon as hashing is done.
    with Image.open(image_path) as img:
        return imagehash.phash(img)


def reference_images(references_dir: Path) -> dict:
    """Compute the perceptual hash of every file in a reference directory.

    Args:
        references_dir: Directory holding the reference images.

    Returns:
        A dict mapping each file name (not the full path) to the hash returned
        by ``get_image_hash``. Empty when the directory has no regular files.
    """
    img_hashes = {}
    # Sorted so the resulting dict has a deterministic order across platforms.
    for img_file in sorted(os.listdir(references_dir)):
        img_path = os.path.join(references_dir, img_file)
        # Sub-directories cannot be opened as images; skip them rather than crash.
        if not os.path.isfile(img_path):
            continue
        img_hashes[img_file] = get_image_hash(img_path)
    return img_hashes


def is_similar_to_reference(references_dir: Path, image_path: Path, threshold: int = 5):
    """Tell whether an image lies within `threshold` of any reference image.

    The image hash is compared against the hash of every file found in
    `references_dir`; the difference between two hashes must be at most
    `threshold` for the image to count as similar.

    Returns:
        True as soon as one reference is close enough, otherwise False
        (including when there are no references at all).
    """
    candidate = get_image_hash(image_path)
    known = reference_images(references_dir=references_dir)
    return any(candidate - known_hash <= threshold for known_hash in known.values())


def filter_images_in_document(
    references_dir: Path, images_dir: Path, threshold: int = 5
):
    """Delete the images in a directory that look like a reference image.

    Only files whose name ends in ``.png``, ``.jpg`` or ``.jpeg`` (in any
    letter case) are considered. Each one is hashed and compared with every
    reference hash; if the difference to any reference is at most
    `threshold`, the file is removed from disk.

    Args:
        references_dir: Directory holding the reference images.
        images_dir: Directory whose images are checked and possibly deleted.
        threshold: Largest hash difference still treated as "similar".

    Returns:
        None. The function works through its side effect (file deletion).
    """
    image_names = [
        name
        for name in os.listdir(images_dir)
        # Case-insensitive so ``.PNG`` / ``.JPG`` files are not silently skipped.
        if name.lower().endswith((".png", ".jpg", ".jpeg"))
    ]
    if not image_names:
        # Nothing to compare: do not hash the references for no reason.
        return

    # Hash the references once instead of once per candidate image.
    reference_hashes = list(reference_images(references_dir=references_dir).values())

    for name in image_names:
        img_path = os.path.join(images_dir, name)
        img_hash = get_image_hash(img_path)
        if any(img_hash - ref_hash <= threshold for ref_hash in reference_hashes):
            os.remove(img_path)
In [None]:
# WARNING: destructive. Images in `images_dir` that resemble a reference image
# are deleted from disk.
filter_images_in_document(
    references_dir=r"/mnt/mydisk/Projects/plu/references/images",
    images_dir=r"/mnt/mydisk/Projects/plu/data/raw/Grenoble/Par Zone Agricoles/images",
    threshold=5,
)

## Text

In [None]:
import multiprocessing
import os
import re

import markdown
import pandas as pd
import spacy
from IPython.display import Markdown, display
from tqdm import tqdm
from pathlib import Path

In [None]:
# Locations of the OCR output (Markdown) and of the images extracted with it.
md_path = r"/mnt/mydisk/Projects/plu/data/raw/Grenoble/Dispositions_Generales/ocr_results.md"
img_path = r"/mnt/mydisk/Projects/plu/data/raw/Grenoble/Dispositions_Generales/images"

# Read the whole Markdown document into memory as UTF-8 text.
with open(md_path, encoding="utf-8") as file:
    md_content = file.read()

# Names of the entries in the images directory (the files are not opened here).
img_files = os.listdir(img_path)

# Render the Markdown in the notebook output.
display(Markdown(md_content))