# Find Similar Images

In [36]:
import numpy as np
import numpy.linalg as linalg
import pickle, os, time
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import cv2

### Extract features using ResNet 50 and calculate Euclidean loss

In [29]:
model = ResNet50(weights='imagenet', include_top=False,
                 input_shape=(224, 224, 3))



In [37]:
# function that gets features from resnet
def get_features(model, img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = model.predict(img_data).flatten()
    features = features / linalg.norm(features)
    return features 

In [5]:
# gets the euclidean distance and cosine loss between a pair of feature representation
def get_difference(feature1, feature2):
    euclidean_loss = np.power(feature1 - feature2, 2)
    euclidean_loss = np.sqrt(np.sum(euclidean_loss))
#     cosine_loss = (feature1.T @ feature2)
    return euclidean_loss

In [7]:
mypath = '/Users/dorazhao/Desktop/images/'
files_path = [mypath + x for x in os.listdir(mypath) if not x.startswith('.')]

In [168]:
coco_m, coco_f, sim_m, sim_f, coco_m_files, coco_f_files, sim_m_files, sim_f_files = ([] for i in range(8))

for file in files_path:
    base = os.path.basename(file).split('_')
    features = get_features(model, file)
    if base[0] is 'f' and base[1] is 'f':
        sim_f.append(features)
        sim_f_files.append(file)
    elif base[0] is 'f' and base[1] is 'm':
        sim_m.append(features)
        sim_m_files.append(file)
    elif base[0] is 'm' and base[1] is 'm':
        coco_m.append(features)
        coco_m_files.append(file)
    else: 
        coco_f.append(features)
        coco_f_files.append(file)

In [61]:
import csv
with open('similarMatch.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['COCO Image', 'Match Image', 'Euclidean Loss'])
    for i, male in enumerate(coco_m):
        diffs = []
        for j, female in enumerate(sim_f):
            euc = get_difference(male, female)
            diffs.append(euc)
        best_i = np.argsort(diffs)[0]
        coco_file = os.path.basename(coco_m_files[i])
        sim_f_file = os.path.basename(sim_f_files[best_i])
        print('COCO File: {0}, Similar Match: {1}, Euclidean Loss: {2}'.format(coco_file, sim_f_file, diffs[best_i]))
        writer.writerow([coco_file, sim_f_file, diffs[best_i]])

COCO File: m_m_frisbee_227482.jpg, Similar Match: f_f_sports_135604_4.jpg, Euclidean Loss: 1.1695038080215454
COCO File: m_m_frisbee_88485.jpg, Similar Match: f_f_frisbee_88485_2.jpg, Euclidean Loss: 1.1360191106796265
COCO File: m_m_tie_152214.jpg, Similar Match: f_f_tie_131444_1.jpg, Euclidean Loss: 1.1849696636199951
COCO File: m_m_surfboard_115898.jpg, Similar Match: f_f_surfboard_115898_1.jpg, Euclidean Loss: 0.9874610900878906
COCO File: m_m_frisbee_328238.jpg, Similar Match: f_f_sports_135604_4.jpg, Euclidean Loss: 1.1178133487701416
COCO File: m_m_sports_135604.jpg, Similar Match: f_f_sports_371552_4.jpg, Euclidean Loss: 1.1168785095214844
COCO File: m_m_tie_170099.jpg, Similar Match: f_f_frisbee_325991_1.jpg, Euclidean Loss: 1.2257771492004395
COCO File: m_m_racket_88970.jpg, Similar Match: f_f_sports_429690_1.jpg, Euclidean Loss: 1.1238709688186646
COCO File: m_m_tie_21604.jpg, Similar Match: f_f_tie_21604_1.jpg, Euclidean Loss: 1.1359667778015137
COCO File: m_m_surfboard_325

In [62]:
with open('similarMatch.csv', 'a') as f:
    writer = csv.writer(f)
    for i, female in enumerate(coco_f):
        diffs = []
        for j, male in enumerate(sim_m):
            euc = get_difference(female, male)
            diffs.append(euc)
        best_i = np.argsort(diffs)[0]
        coco_file = os.path.basename(coco_f_files[i])
        sim_m_file = os.path.basename(sim_m_files[best_i])
        print('COCO File: {0}, Similar Match: {1}, Euclidean Loss: {2}'.format(coco_file, sim_m_file, diffs[best_i]))
        writer.writerow([coco_file, sim_m_file, diffs[best_i]])

COCO File: m_f_handbag_81594.jpg, Similar Match: f_m_handbag_81594_3.jpg, Euclidean Loss: 1.198861002922058
COCO File: m_f_toothbrush_445999.jpg, Similar Match: f_m_toothbrush_445999_2.jpg, Euclidean Loss: 1.1346672773361206
COCO File: m_f_refrigerator_22705.jpg, Similar Match: f_m_refrigerator_456662_1.jpg, Euclidean Loss: 1.1000168323516846
COCO File: m_f_hairdrier_350002_2.jpg, Similar Match: f_m_hairdrier_350002_5.jpg, Euclidean Loss: 1.1814556121826172
COCO File: m_f_refrigerator_280930.jpg, Similar Match: f_m_refrigerator_150417_1.jpg, Euclidean Loss: 1.2233757972717285
COCO File: m_f_refrigerator_280918.jpg, Similar Match: f_m_refrigerator_280930_1.jpg, Euclidean Loss: 1.212421178817749
COCO File: m_f_toothbrush_465179.jpg, Similar Match: f_m_toothbrush_465179_1.jpg, Euclidean Loss: 1.215056300163269
COCO File: m_f_refrigerator_150417.jpg, Similar Match: f_m_refrigerator_150417_1.jpg, Euclidean Loss: 1.138541579246521
COCO File: m_f_toothbrush_324614.jpg, Similar Match: f_m_toot

In [172]:
mypath = '/Users/dorazhao/Desktop/images/'
basepath = 'm_m_skateboard_125472.jpg'
file = mypath + basepath
male = get_features(model, file)
diffs = []
for j, female in enumerate(sim_f):
    euc = get_difference(male, female)
    diffs.append(euc)

In [173]:
indices = np.argsort(diffs)
for i in indices:
    print('Filename: {0} Loss: {1}'.format(sim_f_files[i], diffs[i]))

Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_13201_4.jpg Loss: 1.1681889295578003
Filename: /Users/dorazhao/Desktop/images/f_f_frisbee_227482_4.jpg Loss: 1.199068546295166
Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_125472_1.jpg Loss: 1.2023388147354126
Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_72281_5.jpg Loss: 1.2032126188278198
Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_125472_2.jpg Loss: 1.2058347463607788
Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_31093_3.jpg Loss: 1.21902334690094
Filename: /Users/dorazhao/Desktop/images/f_f_surfboard_190007_1.jpg Loss: 1.2260627746582031
Filename: /Users/dorazhao/Desktop/images/f_f_sports_135604_2.jpg Loss: 1.2294954061508179
Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_72281_2.jpg Loss: 1.2304123640060425
Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_125472_4.jpg Loss: 1.2352534532546997
Filename: /Users/dorazhao/Desktop/images/f_f_skateboard_128699_1.jpg Loss: 

### t-SNE Visualization of Images

In [38]:
image_features = []
for file in os.listdir(mypath):
    if not file.startswith('.'):
        filename = mypath + file
        features = get_features(model, filename)
        image_features.append(features)

In [39]:
image_features = np.array(image_features)
image_features.shape

(303, 100352)

In [40]:
tsne = TSNE(n_components=2).fit_transform(image_features)

In [41]:
# normalize to [0, 1]
x, y = tsne[:,0], tsne[:,1]
x = (x-np.min(x)) / (np.max(x) - np.min(x)) * 1000
y = (y-np.min(y)) / (np.max(y) - np.min(y)) * 1000

In [42]:

# we'll put the image centers in the central area of the plot
# and use offsets to make sure the images fit the plot

# init the plot as white canvas
tsne_plot = 255 * np.ones((1000, 1000, 3), np.uint8)

# now we'll put a small copy of every image to its corresponding T-SNE coordinate
for i, file in enumerate(os.listdir(mypath)):
    if not file.startswith('.'):
        filename = mypath + file
        image = cv2.imread(filename)
        dim = (50, 50)
        # scale the image to put it to the plot
        image = cv2.resize(image, dim)
        ty = int(y[i])
        tx = int(x[i])
        # put the image to its t-SNE coordinates using numpy sub-array indices
        try:
            tsne_plot[ty:ty + 50, tx:tx + 50, :] = image
        except:
            print(filename)

cv2.imshow('t-SNE', tsne_plot)
cv2.waitKey()


/Users/dorazhao/Desktop/images/f_f_surfboard_177357_4.jpg
/Users/dorazhao/Desktop/images/f_f_skateboard_31093_2.jpg
/Users/dorazhao/Desktop/images/f_f_surfboard_115898_5.jpg
/Users/dorazhao/Desktop/images/m_m_surfboard_7278.jpg
/Users/dorazhao/Desktop/images/f_f_surfboard_4765_2.jpg
/Users/dorazhao/Desktop/images/f_m_refrigerator_456662_5.jpg


102

In [44]:
# save tsne plot
cv2.imwrite('../results/tsne_plot.jpg', tsne_plot)

True