## Image Comparison Using Perceptual Hashing and Nearest Neighbors
This notebook describes a personal project on strategies for comparing images. Three methods are used to find matching images between two folders (referred to as the "target" folder and the "reference" folder):

1) Perceptual hashing alone (Hamming distance = 0) <br>
2) Matching names (No hashing, just unique image names) <br>
3) Nearest Neighbors (Hamming distance)

In [None]:
import os
import sys
import glob
from PIL import Image
import imagehash
import csv
from annoy import AnnoyIndex
import pandas as pd

In [None]:
# Collect every path that sits below an "images" directory in each target root.
# glob.escape stops special characters in the root path from acting as wildcards.
target_roots = [r'C:\Users\dramadas\target1a', r'C:\Users\dramadas\target1b']
files_target = []
for root in target_roots:
    files_target += glob.glob(glob.escape(root) + '/**/images/**/*', recursive=True)
print("\n".join(files_target))

In [None]:
# Collect every path below the "reference" folder into a list.
# The path must be a raw string: in a normal literal "\U" starts a unicode
# escape (a SyntaxError for this path) and "\r" would become a carriage return.
ref_dir = r'C:\Users\dramadas\reference'
# glob.escape stops special characters in the directory name from acting as wildcards.
files_reference = glob.glob(glob.escape(ref_dir) + '/**/*', recursive=True)
print(*files_reference, sep="\n")

### 1) The following code will find exact matches, i.e. Hamming distance = 0

This method loops through each image in the reference folder and looks for an image with an identical perceptual hash in the target folders.

In [None]:
# Map each target image's perceptual hash to its path so reference images can be
# looked up by hash later. When several targets share a hash, the last one wins.
target_dict = {}
for target_image in files_target:
    try:
        # The context manager closes the file even if hashing raises.
        with Image.open(target_image) as img:
            img_hash = imagehash.phash(img)
    except OSError:
        # Paths that cannot be opened or decoded are reported and skipped.
        print("Error: ", target_image)
        continue
    target_dict[img_hash] = target_image

In [None]:
# For every reference image, record whether some target image has the same
# perceptual hash. One CSV row per reference path: Pass, Fail or Error.
with open('results_reference.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Image directory', 'Matching Image', 'Result', 'Size'])

    for ref_image in files_reference:
        try:
            # The context manager closes the file even if hashing raises.
            with Image.open(ref_image) as img1:
                img1_hash = imagehash.phash(img1)
                img1_size = img1.size
        except OSError:
            print("Error:", ref_image)
            # Pad the Size column so every row has as many fields as the header.
            writer.writerow([ref_image, 'NA', 'Error', 'NA'])
            continue

        if img1_hash in target_dict:
            print("Success", ref_image)
            writer.writerow([ref_image, target_dict[img1_hash], 'Pass', img1_size])
        else:
            writer.writerow([ref_image, 'NA', 'Fail', img1_size])

In [None]:
#For debug only, if need to find files of a certain extension
# for i in files_reference:
#     if i.endswith('.tiff'):
#         print(i)

### 2) Next, we try to find matching images based on their unique names
This method was viable because the images within the reference and target folders are uniquely named. It is quick but has limitations: it requires post-processing and some manual filtering of the resulting CSV. <br>
In the future, this method could be updated to match only if exactly one image with that name exists in the target folder.

In [None]:
# Index the target paths by bare file name (the last path wins when a name repeats).
target_name_dict = {os.path.basename(path): path for path in files_target}


In [None]:
# Name-based matching for reference images that had no exact hash match.
# This only works while file names are unique within the target folders.
with open('results_reference_names.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Image directory', 'Matching Name'])

    for ref_image in files_reference:
        try:
            # The context manager closes the file even if hashing raises.
            with Image.open(ref_image) as img1:
                img1_hash = imagehash.phash(img1)
        except OSError:
            writer.writerow([ref_image, 'Error'])
            continue

        if img1_hash in target_dict:
            # Already matched by hash; no row is written for these images.
            continue

        # Compare file names only.
        img1_name = os.path.basename(ref_image)
        if img1_name in target_name_dict:
            print(img1_name)
            print(target_name_dict[img1_name])
            writer.writerow([ref_image, target_name_dict[img1_name]])
        else:
            writer.writerow([ref_image, 'Fail'])

### 3) Finally, attempt to find matches using Nearest Neighbors

This method uses Annoy to create a forest to find nearest neighbors.

In [None]:
# Load the consolidated results (a file filtered and assembled by hand from the
# steps above) and keep only the rows still marked "Fail".
df_remaining = pd.read_csv('results_reference_consolidated.csv').loc[
    lambda frame: frame["Result"] == "Fail"
]
df_remaining["Result"].describe()

In [None]:
# Paths of the reference images that are still unmatched, as a plain Python list
remaining_paths = df_remaining["Image directory"].to_numpy()
ref_remaining = remaining_paths.tolist()

In [None]:
# Hash every target image into a flat 0/1 vector for the Annoy nearest-neighbour index.
# Keys are positions in files_target, so a neighbour id maps straight back to a path.
vec_length = 64  # length of the flattened hash (imagehash.phash defaults to an 8x8 hash)
nn_dict = {}
for count, f in enumerate(files_target):
    try:
        # The context manager closes the file even if hashing raises.
        with Image.open(f) as img:
            img_hash = imagehash.phash(img)
    except OSError:
        # Same policy as the exact-match step: report unreadable paths and move on.
        # The key for this position is simply left out of nn_dict.
        print("Error: ", f)
        continue
    nn_dict[count] = img_hash.hash.astype('int').flatten()
    # Progress output; the last value printed is the highest index in use.
    print(count)

In [None]:
# Find, for every still-unmatched reference image, the target image whose
# perceptual hash is closest in Hamming distance.
num_trees = 200
num_neighbors = 5  # candidates fetched per query; only the closest one is recorded

# Build the index once from the target hashes. The reference hash is passed in
# as a query vector, so it never has to be inserted under a hardcoded item id
# and the forest is not rebuilt for every reference image.
# Only keys that are valid positions in files_target are indexed, because the
# neighbour ids are used to look paths up in that list.
target_items = {
    key: value for key, value in nn_dict.items() if 0 <= key < len(files_target)
}
target_index = AnnoyIndex(vec_length, "hamming")
for key, value in target_items.items():
    target_index.add_item(key, value)
if target_items:
    target_index.build(num_trees)

with open('results_reference_hamming.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Image directory', 'Nearest Neighbors 1', 'Hamming Distances'])

    for ref_image in ref_remaining:
        print(ref_image)
        try:
            # The context manager closes the file even if hashing raises.
            with Image.open(ref_image) as img1:
                img1_hash = imagehash.phash(img1)
        except OSError:
            writer.writerow([ref_image, 'NA', 'Error'])
            continue

        hash_array = img1_hash.hash.astype('int').flatten()
        if target_items:
            neighbor_ids, distances = target_index.get_nns_by_vector(
                hash_array, num_neighbors, include_distances=True
            )
        else:
            neighbor_ids, distances = [], []

        if not neighbor_ids:
            # Nothing to compare against (no target image could be hashed).
            writer.writerow([ref_image, 'NA', 'NA'])
            continue

        # Results are ordered by distance, so the first entry is the closest target.
        closest_path = files_target[neighbor_ids[0]]
        print(closest_path)
        print("Closest distance: ", distances[0])
        writer.writerow([ref_image, closest_path, distances[0]])

### This section below is to filter/merge matches by Hamming distance and can be run independently of code above

In [None]:
# Load the consolidated results and the nearest-neighbour results so that the
# rows at Hamming distance 2.0 can be merged into the consolidated table.
df_consolidated, df_ham = map(
    pd.read_csv,
    ['results_reference_consolidated.csv', 'results_reference_hamming.csv'],
)

In [None]:
# Keep only the rows whose nearest neighbour is exactly 2 bits away
# (lower values indicate closer matches).
# Rows for unreadable images store the text 'Error' in this column, which can make
# pandas load the whole column as strings; a direct comparison with 2.0 would then
# match nothing. Coerce to numbers first (non-numeric entries become NaN).
hamming_distance = pd.to_numeric(df_ham["Hamming Distances"], errors="coerce")
df_ham = df_ham[hamming_distance == 2.0]
df_ham

In [None]:
# Outer join on the image path: every consolidated row is kept, and the
# nearest-neighbour columns are filled in where a match exists.
outer_merged = df_consolidated.merge(df_ham, how="outer", on="Image directory")

In [None]:
# Save the merged table (the DataFrame index is written as the first column)
merged_csv_path = 'results_reference_consolidated_hamming2.csv'
outer_merged.to_csv(merged_csv_path)