# Find Similar Images

In [3]:
import numpy as np
import numpy.linalg as linalg
import pickle, os, time
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import cv2

### Extract features using ResNet 50 and calculate Euclidean loss

In [4]:
model = ResNet50(weights='imagenet', include_top=False,
                 input_shape=(224, 224, 3))



In [5]:
# function that gets features from resnet
def get_features(model, img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = model.predict(img_data).flatten()
    features = features / linalg.norm(features)
    return features 

In [6]:
# gets the euclidean distance and cosine loss between a pair of feature representation
def get_difference(feature1, feature2):
    euclidean_loss = np.power(feature1 - feature2, 2)
    euclidean_loss = np.sqrt(np.sum(euclidean_loss))
#     cosine_loss = (feature1.T @ feature2)
    return euclidean_loss

In [13]:
mypath = '../images/all images/'
files_path = [mypath + x for x in os.listdir(mypath) if not x.startswith('.')]

In [14]:
coco_m, coco_f, sim_m, sim_f, coco_m_files, coco_f_files, sim_m_files, sim_f_files = ([] for i in range(8))

for file in files_path:
    base = os.path.basename(file).split('_')
    features = get_features(model, file)
    if base[0] is 'f' and base[1] is 'f':
        sim_f.append(features)
        sim_f_files.append(file)
    elif base[0] is 'f' and base[1] is 'm':
        sim_m.append(features)
        sim_m_files.append(file)
    elif base[0] is 'm' and base[1] is 'm':
        coco_m.append(features)
        coco_m_files.append(file)
    else: 
        coco_f.append(features)
        coco_f_files.append(file)

In [15]:
import csv
with open('similarMatch.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['COCO Image', 'Match Image', 'Euclidean Loss'])
    for i, male in enumerate(coco_m):
        diffs = []
        for j, female in enumerate(sim_f):
            euc = get_difference(male, female)
            diffs.append(euc)
        best_i = np.argsort(diffs)[0]
        coco_file = os.path.basename(coco_m_files[i])
        sim_f_file = os.path.basename(sim_f_files[best_i])
        print('COCO File: {0}, Similar Match: {1}, Euclidean Loss: {2}'.format(coco_file, sim_f_file, diffs[best_i]))
        writer.writerow([coco_file, sim_f_file, diffs[best_i]])
    for i, female in enumerate(coco_f):
        diffs = []
        for j, male in enumerate(sim_m):
            euc = get_difference(female, male)
            diffs.append(euc)
        best_i = np.argsort(diffs)[0]
        coco_file = os.path.basename(coco_f_files[i])
        sim_m_file = os.path.basename(sim_m_files[best_i])
        print('COCO File: {0}, Similar Match: {1}, Euclidean Loss: {2}'.format(coco_file, sim_m_file, diffs[best_i]))
        writer.writerow([coco_file, sim_m_file, diffs[best_i]])

COCO File: m_m_frisbee_127263.jpg, Similar Match: f_f_sports_135604_4.jpg, Euclidean Loss: 1.0684254169464111
COCO File: m_m_frisbee_227482.jpg, Similar Match: f_f_sports_135604_4.jpg, Euclidean Loss: 1.1695038080215454
COCO File: m_m_frisbee_291619.jpg, Similar Match: f_f_frisbee_291619_2.jpg, Euclidean Loss: 1.0963425636291504
COCO File: m_m_frisbee_325991.jpg, Similar Match: f_f_sports_135604_4.jpg, Euclidean Loss: 1.0964782238006592
COCO File: m_m_frisbee_328238.jpg, Similar Match: f_f_sports_135604_4.jpg, Euclidean Loss: 1.1178133487701416
COCO File: m_m_frisbee_88485.jpg, Similar Match: f_f_frisbee_88485_2.jpg, Euclidean Loss: 1.1360191106796265
COCO File: m_m_racket_127530.jpg, Similar Match: f_f_racket_64718_1.jpg, Euclidean Loss: 1.102043867111206
COCO File: m_m_racket_19432.jpg, Similar Match: f_f_racket_19432_2.jpg, Euclidean Loss: 1.2175593376159668
COCO File: m_m_racket_55950.jpg, Similar Match: f_f_racket_55950_3.jpg, Euclidean Loss: 1.109898567199707
COCO File: m_m_racke

### t-SNE Visualization of Images

In [16]:
image_features = []
for file in os.listdir(mypath):
    if not file.startswith('.'):
        filename = mypath + file
        features = get_features(model, filename)
        image_features.append(features)
image_features = np.array(image_features)

In [40]:
tsne = TSNE(n_components=2).fit_transform(image_features)

In [41]:
# normalize to [0, 1]
x, y = tsne[:,0], tsne[:,1]
x = (x-np.min(x)) / (np.max(x) - np.min(x)) * 1000
y = (y-np.min(y)) / (np.max(y) - np.min(y)) * 1000

In [42]:
# plot the images
tsne_plot = 255 * np.ones((1000, 1000, 3), np.uint8)
for i, file in enumerate(os.listdir(mypath)):
    if not file.startswith('.'):
        filename = mypath + file
        image = cv2.imread(filename)
        dim = (50, 50)
        # scale the image to 50 x 50
        image = cv2.resize(image, dim)
        ty = int(y[i])
        tx = int(x[i])

        try:
            tsne_plot[ty:ty + 50, tx:tx + 50, :] = image
        except:
            print(filename)

cv2.imshow('t-SNE', tsne_plot)
cv2.waitKey()

/Users/dorazhao/Desktop/images/f_f_surfboard_177357_4.jpg
/Users/dorazhao/Desktop/images/f_f_skateboard_31093_2.jpg
/Users/dorazhao/Desktop/images/f_f_surfboard_115898_5.jpg
/Users/dorazhao/Desktop/images/m_m_surfboard_7278.jpg
/Users/dorazhao/Desktop/images/f_f_surfboard_4765_2.jpg
/Users/dorazhao/Desktop/images/f_m_refrigerator_456662_5.jpg


102

In [44]:
# save tsne plot
cv2.imwrite('../results/tsne_plot.jpg', tsne_plot)

True