# Calculate the overall Jaccard score (intersection over union) across all predictions

In [1]:
from pathlib import Path
from loguru import logger
import numpy as np
from tifffile import imread
import typer
from tqdm import tqdm

NA_VALUE = 255

In [2]:
def iterate_through_mask_pairs(submission_dir: Path, actual_dir: Path):
    """
    Lazily yield ``(pred, actual)`` array pairs, one per predicted mask.

    For each ``*.tif`` in ``submission_dir``, look up the file with the same name
    in ``actual_dir``, read both with ``imread`` and yield them as a tuple.

    Args:
        submission_dir: directory containing the predicted ``*.tif`` files.
        actual_dir: directory containing the label files, matched by file name.

    Yields:
        Tuple ``(pred, actual)`` of whatever ``imread`` returns for each file.

    Raises:
        AssertionError: if a prediction has no same-named file in ``actual_dir``.
    """
    for predicted_path in submission_dir.glob("*.tif"):
        filename = predicted_path.name
        label_path = actual_dir / filename
        # Explicit raise instead of ``assert`` so the check is not stripped under
        # ``python -O``; AssertionError is kept so callers see the same exception.
        if not label_path.exists():
            raise AssertionError(f"Could not find expected file: {filename}")
        actual = imread(label_path)
        pred = imread(predicted_path)
        yield pred, actual

In [3]:
def intersection_over_union(df, total=None):
    """
    Calculate the pooled intersection-over-union (Jaccard) score.

    Intersection and union pixel counts are accumulated over all pairs and divided
    once at the end, so the result is a single overall score rather than a mean of
    per-image scores. Pixels where the label equals ``NA_VALUE`` are ignored.

    Args:
        df: iterable of ``(pred, actual)`` array pairs. Despite the name this is
            not a DataFrame; the name is kept for backward compatibility.
        total: optional number of pairs, passed to ``tqdm`` for the progress bar.

    Returns:
        ``intersection / union`` over all valid pixels.

    Raises:
        ValueError: if the accumulated union is zero, i.e. no pairs were supplied
            or no valid pixel is non-zero in either array.
    """
    intersection = 0
    union = 0
    # Iterate the argument itself; the previous code ignored it and read the
    # module-level ``array_pairs`` global instead.
    for pred, actual in tqdm(df, total=total):
        valid = actual != NA_VALUE
        # Plain boolean masks instead of masked arrays: ``.sum()`` of a fully masked
        # array returns ``np.ma.masked``, which would poison both running totals
        # and slip past the ``union < 1`` guard below.
        intersection += (np.logical_and(actual, pred) & valid).sum()
        union += (np.logical_or(actual, pred) & valid).sum()
    if union < 1:
        raise ValueError("At least one image must be in the actual data set")
    return intersection / union

In [4]:
# Predictions and labels are both resolved relative to the parent of the
# current working directory.
project_root = Path.cwd().parent
submission_dir = project_root / "output_data"
actual_dir = project_root.joinpath("data", "raw", "train_features", "train_labels")

# Count the predictions up front so the progress bar knows its length; the
# pair generator itself is lazy and reads nothing until it is consumed.
n_expected = sum(1 for _ in submission_dir.glob("*.tif"))
array_pairs = iterate_through_mask_pairs(submission_dir, actual_dir)
logger.info(f"calculating score for {n_expected} image pairs ...")
score = intersection_over_union(array_pairs, total=n_expected)
logger.success(f"overall score: {score}")

2021-10-06 10:44:40.424 | INFO     | __main__:<module>:6 - calculating score for 30 image pairs ...
100%|██████████| 30/30 [00:00<00:00, 44.64it/s]
2021-10-06 10:44:41.145 | SUCCESS  | __main__:<module>:8 - overall score: 0.6553508561416426
