# SignVer Test Harness

In [None]:
import os
import sys
import random
import itertools
from typing import Tuple

import PIL
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# %load_ext nb_black

## Load data and generate pairs

In [None]:
class SignatureDataset:
    """
    Captures a signature dataset into an abstract, representative collection.

    Currently supported datasets are: CEDAR Signatures

    NOTE: `dataset_name` is only stored; the CEDAR folder layout is always used.

    Attributes:
        root_dir: path to the root directory location of the dataset
        dataset: naming identifier for the dataset ('cedar')
        images: list returned by collect_cedar_images(); currently always empty
            because eager image loading is disabled
        image_metadata: dataframe with one row per signature file and columns
            author, label, sig_number, image_path, train_test
        img_path_lookup: dict mapping image_metadata index -> image_path
        verification_pairs: dataframe produced by generate_verification_pairs()

    """

    def __init__(self, root_dir, dataset_name):
        self.root_dir = root_dir
        self.dataset = dataset_name

        self.images, self.image_metadata = self.collect_cedar_images(
            root_dir, target_shape=(224, 224)
        )
        self.img_path_lookup = self.image_metadata.image_path.to_dict()
        self.assign_train_test_split()
        self.verification_pairs = self.generate_verification_pairs(self.image_metadata)

    @classmethod
    def collect_cedar_images(
        cls,
        root_path: str,
        target_shape: Tuple,
        num_authors: int = 55,
        sigs_per_author: int = 24,
    ):
        """
        Collates CEDAR Signature Dataset file paths into a metadata dataframe.

        No image file is opened here: paths are only constructed from the
        naming scheme `<subfolder>/<prefix>_<author>_<sig_number>.png`.

        Args:
            root_path: the root path of a locally saved CEDAR dataset (https://cedar.buffalo.edu/NIJ/data/)
            target_shape: desired image shape for loading images (currently
                unused, because images are not loaded eagerly)
            num_authors: number of authors to enumerate, numbered from 1
            sigs_per_author: number of signatures per author and per subfolder,
                numbered from 1

        Returns:
            images: an empty list (placeholder for eagerly loaded image arrays)
            image_metadata: dataframe with columns author, label
                ("original" / "forgery"), sig_number and image_path

        """

        # Eager image loading is disabled, so `images` stays empty and only
        # the file paths are recorded.
        images = []
        records = []

        # (subfolder, filename prefix, metadata label) for each signature kind
        signature_kinds = (
            ("full_org", "original", "original"),
            ("full_forg", "forgeries", "forgery"),
        )

        for author in range(1, num_authors + 1):
            for subfolder, prefix, label in signature_kinds:
                for sig_number in range(1, sigs_per_author + 1):
                    filename = f"{prefix}_{author}_{sig_number}.png"
                    img_path = os.path.join(root_path, subfolder, filename)
                    records.append((author, label, sig_number, img_path))

        image_metadata = pd.DataFrame(
            records, columns=["author", "label", "sig_number", "image_path"]
        )

        return images, image_metadata

    def assign_train_test_split(self, train_size=0.8):
        """
        Assign train vs test labels by author to self.image_metadata

        Adds (or overwrites) a "train_test" column in place. All signatures of
        an author land in the same split.

        Args:
            train_size: fraction of authors assigned to "train" (rounded down)

        Side effects:
            Re-seeds Python's global `random` module with 42.

        """
        random.seed(42)

        # Shuffle a plain list: random.shuffle is specified for mutable
        # sequences and yields the same permutation as on the 1-D array.
        authors = self.image_metadata.author.unique().tolist()
        random.shuffle(authors)

        n_train = int(np.floor(train_size * len(authors)))
        train_authors = set(authors[:n_train])

        self.image_metadata["train_test"] = self.image_metadata.author.apply(
            lambda author: "train" if author in train_authors else "test"
        )

    @classmethod
    def generate_verification_pairs(cls, image_metadata):
        """
        Generates a list of verification pairs (genuine/genuine, genuine/forged, genuine/unskilled_forged) for each author
        from a given metadata dataframe.

        To maintain a balanced dataset, genuine/forged and genuine/unskilled pairs are randomly sampled down
        to maintain the same number of examples as genuine/genuine.

        Args:
            image_metadata: dataframe with "author" and "label" columns; its
                index values are what the returned pairs refer to

        Returns:
            dataframe with columns anchor_idx, alt_idx, author, label, where
            label is 0 (genuine/forged), 1 (genuine/genuine) or
            2 (genuine/unskilled). The per-author frames are concatenated
            as-is, so the returned index is not unique.

        Raises:
            ValueError: (from np.random.choice) if an author has fewer
                candidate forged/unskilled pairs than genuine/genuine pairs.

        Side effects:
            Re-seeds NumPy's global random state with 42.

        TO-DO - return this as a shuffled, useable set of data for training/inference

        """

        np.random.seed(42)
        author_dfs = []

        for author in image_metadata.author.unique():

            author_subset = image_metadata[image_metadata.author == author]

            genuine_idx_array = author_subset[
                author_subset.label == "original"
            ].index.values

            forged_idx_array = author_subset[
                author_subset.label == "forgery"
            ].index.values

            # every row that does not belong to this author (originals and
            # forgeries of all other authors)
            unskilled_index_array = image_metadata[
                ~image_metadata.index.isin(
                    np.concatenate([genuine_idx_array, forged_idx_array])
                )
            ].index.values

            # identify (genuine, genuine) combinatorial pairs
            genuine_idx_combinations = list(
                itertools.combinations(genuine_idx_array, 2)
            )
            n_pairs = len(genuine_idx_combinations)

            # identify (genuine, forged) cartesian product pairs; sample down to have same number pairs as genuine
            forged_idx_combinations = np.array(
                list(itertools.product(genuine_idx_array, forged_idx_array))
            )
            forged_indices = np.random.choice(
                len(forged_idx_combinations), size=n_pairs, replace=False
            )
            forged_idx_combinations = forged_idx_combinations[forged_indices].tolist()

            # identify (genuine, unskilled_forged) pairs by randomly sampling from list of OTHER authors signatures
            # (anchors are drawn first, then alternates, to keep the random stream order stable)
            unskilled_anchors = np.random.choice(
                genuine_idx_array, size=n_pairs, replace=True
            )
            unskilled_alternates = np.random.choice(
                unskilled_index_array, size=n_pairs, replace=False
            )
            unskilled_idx_combinations = list(
                zip(unskilled_anchors, unskilled_alternates)
            )

            sig_pairs = {
                0: forged_idx_combinations,  # for contrastive loss, dissimilar pairs are label 0
                1: genuine_idx_combinations,
                2: unskilled_idx_combinations,
            }

            for label, pairs in sig_pairs.items():
                author_df = pd.DataFrame(pairs, columns=["anchor_idx", "alt_idx"])
                author_df["author"] = author
                author_df["label"] = label

                author_dfs.append(author_df)

        return pd.concat(author_dfs)

In [None]:
%%time

# CEDAR_PATH = "/content/gdrive/MyDrive/FF20: SignVer/data/cedar"
# Root folder of the local CEDAR copy; SignatureDataset joins it with the
# "full_org" / "full_forg" subfolders when building image paths.
CEDAR_PATH = "/content/local_data/cedar"

# Builds the metadata table, the train/test split and the verification pairs.
cedar_dataset = SignatureDataset(root_dir=CEDAR_PATH, dataset_name="cedar")

## Identify Validation Pairs Only

In [None]:
def gather_validation_pairs(verification_pairs, image_metadata, idx_filename_mapping):
    """
    Select the evaluation pairs and replace their indices with image file paths.

    Only pairs whose author is in the "test" split are kept, and
    genuine/unskilled pairs (label 2) are dropped.

    Args:
        verification_pairs: dataframe with columns anchor_idx, alt_idx, author, label
        image_metadata: dataframe with columns author and train_test
        idx_filename_mapping: mapping from pair index value to image file path

    Returns:
        A new dataframe (the input is not modified) with the same columns as
        `verification_pairs`, where anchor_idx and alt_idx hold file paths.

    Raises:
        KeyError: if a pair index is missing from `idx_filename_mapping`.

    """
    test_authors = image_metadata[image_metadata.train_test == "test"].author.unique()
    keep = verification_pairs.author.isin(test_authors) & (verification_pairs.label != 2)
    selected = verification_pairs[keep]

    # assign() builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' chained-assignment (SettingWithCopy) pitfalls.
    return selected.assign(
        anchor_idx=selected.anchor_idx.map(lambda idx: idx_filename_mapping[idx]),
        alt_idx=selected.alt_idx.map(lambda idx: idx_filename_mapping[idx]),
    )

In [None]:
# Preview the evaluation pairs: test-split authors only, unskilled-forgery
# pairs (label 2) removed, pair indices replaced by image file paths.
gather_validation_pairs(cedar_dataset.verification_pairs,
                        cedar_dataset.image_metadata,
                        cedar_dataset.img_path_lookup)

## Calculate similarity

NOTE - I don't have a function that does this for you, but basically for the image pairs in the dataframe above, you'll need to:
1. load the image from its filepath
2. preprocess each image
3. get embeddings from your model for each image
4. calculate cosine distance on each pair


This gives you y_pred (the pairwise distances); y_true is the "label" column of the dataframe.

## Characterize Performance

Then, using y_pred and y_true, compute the ROC curve (e.g. with `sklearn.metrics.roc_curve`) and pass its outputs to the following functions:

In [None]:
def find_eer(fprs, tprs, thresholds):
    """
    Given a set of False Positive Rates, True Positive Rates, and corresponding thresholds output by
    sklearn.metrics.roc_curve(), calculate the Equal Error Rate (EER).

    EER (also known as Crossover Error Rate) is the point on the ROC curve where FPR==FNR.

    The first sign change of (FPR - FNR) is located, and of the two points
    around it the one with the smaller |FPR - FNR| is reported. If the
    difference never changes sign, the point with the globally smallest
    |FPR - FNR| is used instead.

    Args:
        fprs: 1-D sequence of false positive rates
        tprs: 1-D sequence of true positive rates, same length as fprs
        thresholds: sequence of thresholds, indexable by position

    Returns:
        crossover_idx: int position of the chosen operating point
        crossover_thresh: thresholds[crossover_idx]
        crossover_error: FNR at crossover_idx

    Raises:
        ValueError: if the inputs are empty.

    """
    fprs = np.asarray(fprs, dtype=float)
    fnrs = 1 - np.asarray(tprs, dtype=float)
    gap = fprs - fnrs

    sign_changes = np.flatnonzero(np.diff(np.sign(gap)))

    if sign_changes.size:
        # np.diff marks the index *before* the change; the neighbor after it
        # may sit exactly on (or closer to) FPR == FNR, so compare both.
        before = int(sign_changes[0])
        after = before + 1
        crossover_idx = before if abs(gap[before]) <= abs(gap[after]) else after
    else:
        crossover_idx = int(np.argmin(np.abs(gap)))

    crossover_thresh = thresholds[crossover_idx]
    crossover_error = fnrs[crossover_idx]

    return crossover_idx, crossover_thresh, crossover_error

In [None]:
def find_max_accuracy(fprs, tprs, thresholds):
    """
    Return the best balanced accuracy over all ROC operating points.

    For every (fpr, tpr) pair the score is the mean of TPR and TNR, with
    TNR = 1 - FPR. `thresholds` is only used to bound the number of points
    that are considered.

    Args:
        fprs: false positive rates
        tprs: true positive rates
        thresholds: thresholds matching fprs / tprs

    Returns:
        The largest score found.

    """
    scores = [
        0.5 * (tp_rate + (1 - fp_rate))
        for fp_rate, tp_rate, _ in zip(fprs, tprs, thresholds)
    ]

    best_position = np.argmax(scores)

    return scores[best_position]

In [None]:
def get_metrics(fprs, tprs, thresholds):
    """
    Bundle the EER results and the maximum accuracy into one dictionary.

    Args:
        fprs: false positive rates
        tprs: true positive rates
        thresholds: thresholds matching fprs / tprs

    Returns:
        dict with keys crossover_idx, crossover_thresh, crossover_error
        (from find_eer) and max_accuracy (from find_max_accuracy).

    """
    eer_result = find_eer(fprs, tprs, thresholds)
    best_accuracy = find_max_accuracy(fprs, tprs, thresholds)

    eer_keys = ('crossover_idx', 'crossover_thresh', 'crossover_error')
    metrics = dict(zip(eer_keys, eer_result))
    metrics['max_accuracy'] = best_accuracy

    return metrics


In [None]:
# NOTE(review): roc_curve and records_triplet are not defined in the visible
# cells — presumably sklearn.metrics.roc_curve and a dataframe holding the
# pair "label" and model "distance" columns; confirm before running.
# pos_label=0 makes label 0 the positive class, scored by distance.
fprs, tprs, thresholds = roc_curve(
    y_true=records_triplet.label, y_score=records_triplet.distance, pos_label=0
)

# False negative rate at each threshold.
fnrs = 1 - tprs

In [None]:
# Plot TPR against FPR for the operating points computed above.
plt.plot(fprs, tprs)
plt.title("ROC Curve")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")

In [None]:
# Summarize the curve: EER index / threshold / error and the best accuracy.
get_metrics(fprs, tprs, thresholds)