# 이미지간 유사도 분석해보기

## 1. MNIST 손글씨 숫자이미지 데이터셋 살펴보기

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact
from sklearn.datasets import fetch_openml

In [2]:
# Download (or load from the local scikit-learn cache) the OpenML "mnist_784"
# dataset. `as_frame=True` is requested explicitly because later cells call
# `.to_numpy()` on both MNIST['data'] and MNIST['target'], which only works
# when they are pandas objects.
MNIST = fetch_openml('mnist_784', version=1, as_frame=True)

In [3]:
# Display the feature table: one row per image, one `pixelN` column per pixel.
MNIST.data

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# NOTE: numpy is already imported in the first cell; this re-import is harmless.
import numpy as np
# Each image has 784 pixel columns; the square root gives the side length of
# the square image (28), which is the shape used by the reshape(28, 28) calls
# in the plotting cells.
np.sqrt(784)

28.0

In [6]:
# Display the label column: a categorical series of digit strings '0'..'9'.
MNIST.target

0        5
1        0
2        4
3        1
4        9
        ..
69995    2
69996    3
69997    4
69998    5
69999    6
Name: class, Length: 70000, dtype: category
Categories (10, object): ['0', '1', '2', '3', ..., '6', '7', '8', '9']

In [4]:
# Pixel table as a float array, divided by 255 so that values fall in [0, 1]
# (assuming 8-bit intensities, i.e. a maximum raw value of 255).
images = np.divide(MNIST['data'].to_numpy(), 255.)
# Labels are stored as categorical strings; convert them to integers.
labels = (
    MNIST['target']
    .to_numpy()
    .astype(np.int64)
)

In [7]:
@interact(index=(0,1000), continuous_update=False)
def show_image(index=0):
    """Draw the image at position ``index`` with its label as the title.

    The flat pixel vector is reshaped to 28x28 before being displayed.
    """
    _, ax = plt.subplots(figsize=(2, 2))
    ax.set_title(f"label: {labels[index]}")
    ax.imshow(images[index].reshape(28, 28))
    plt.show()

interactive(children=(IntSlider(value=0, description='index', max=1000), Output()), _dom_classes=('widget-inte…

## 2. Euclidean distance 와 cosine similarity 구현해보기

In [8]:
# Euclidean distance between two image vectors
def distance(x0, x1):
    """Return the Euclidean (L2) distance between vectors ``x0`` and ``x1``.

    The difference vector is dotted with itself and the square root of that
    sum of squares is returned.
    """
    diff = x0 - x1
    return np.sqrt(diff @ diff)

In [18]:
# Angle between two image vectors
def angle(x0, x1):
    """Return the angle in radians between vectors ``x0`` and ``x1``.

    The cosine is computed as ``(x0 . x1) / sqrt((x0 . x0) * (x1 . x1))`` and
    passed to ``arccos``, so the result lies in ``[0, pi]``.

    If either vector is all zeros the cosine is 0/0 and the result is NaN.
    """
    numerator = x0 @ x1
    denominator = np.sqrt((x0 @ x0) * (x1 @ x1))
    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # for (anti)parallel vectors, which would make arccos return NaN.
    cosine = np.clip(numerator / denominator, -1.0, 1.0)
    return np.arccos(cosine)

In [10]:
# Pairwise distances among the first 500 image vectors: every ordered pair
# (including each image with itself), flattened in (i, j) order.
first_500 = images[:500]
distances = [distance(a, b) for a in first_500 for b in first_500]

In [14]:
# NOTE(review): `continuous_update` does not match any parameter of the
# decorated function, so interact most likely ignores it; to actually disable
# continuous updates an explicit IntSlider(..., continuous_update=False) would
# be needed -- confirm against the ipywidgets documentation.
@interact(first=(0, 499), second=(0, 499), continuous_update=False)
def show_dist_img(first, second):
    """Show two images beside the distance histogram and mark their distance.

    Left column: images ``first`` (top) and ``second`` (bottom), reshaped to
    28x28. Right column: histogram of the precomputed global ``distances``
    with a vertical line at the distance between the two selected images.
    """
    plt.figure(figsize=(8,4))
    f = images[first].reshape(28, 28)
    s = images[second].reshape(28, 28)

    ax0 = plt.subplot2grid((2, 2), (0, 0))
    ax1 = plt.subplot2grid((2, 2), (1, 0))
    ax2 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)

    ax0.imshow(f, cmap='gray')
    ax1.imshow(s, cmap='gray')
    ax2.hist(np.array(distances), bins=50)
    # Use the flat vectors directly; reshaping and ravelling back is a no-op.
    d = distance(images[first], images[second])
    # axvline's ymin/ymax are axes fractions in [0, 1], not data values, so the
    # defaults (0 and 1) already span the full height of the plot.
    ax2.axvline(x=d, color='C1', linewidth=4)
    # Anchor the label in axes coordinates so it stays visible whatever the
    # histogram's count range is.
    ax2.text(0.03, 0.95, "Distance is {:.2f}".format(d), size=12,
             transform=ax2.transAxes, va='top')
    ax2.set(xlabel='distance', ylabel='number of images')
    plt.show()

interactive(children=(IntSlider(value=249, description='first', max=499), IntSlider(value=249, description='se…

In [19]:
# Pairwise angles among the first 500 image vectors: every ordered pair
# (including each image with itself), flattened in (i, j) order.
first_500 = images[:500]
angles = [angle(a, b) for a in first_500 for b in first_500]

In [20]:
# NOTE(review): `continuous_update` does not match any parameter of the
# decorated function, so interact most likely ignores it; to actually disable
# continuous updates an explicit IntSlider(..., continuous_update=False) would
# be needed -- confirm against the ipywidgets documentation.
@interact(first=(0, 499), second=(0, 499), continuous_update=False)
def show_angle_img(first, second):
    """Show two images beside the angle histogram and mark their angle.

    Left column: images ``first`` (top) and ``second`` (bottom), reshaped to
    28x28. Right column: histogram of the precomputed global ``angles`` with a
    vertical line at the angle between the two selected images.
    """
    plt.figure(figsize=(8,4))
    f = images[first].reshape(28, 28)
    s = images[second].reshape(28, 28)

    ax0 = plt.subplot2grid((2, 2), (0, 0))
    ax1 = plt.subplot2grid((2, 2), (1, 0))
    ax2 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)

    ax0.imshow(f, cmap='gray')
    ax1.imshow(s, cmap='gray')
    ax2.hist(np.array(angles), bins=50)
    # Use the flat vectors directly; reshaping and ravelling back is a no-op.
    d = angle(images[first], images[second])
    # axvline's ymin/ymax are axes fractions in [0, 1], not data values, so the
    # defaults (0 and 1) already span the full height of the plot.
    ax2.axvline(x=d, color='C1', linewidth=4)
    # Anchor the label in axes coordinates so it stays visible whatever the
    # histogram's count range is.
    ax2.text(0.03, 0.95, "Angle is {:.2f}".format(d), size=12,
             transform=ax2.transAxes, va='top')
    ax2.set(xlabel='angle', ylabel='number of images')
    plt.show()

interactive(children=(IntSlider(value=249, description='first', max=499), IntSlider(value=249, description='se…

## 3. 가장 유사한 후보이미지 추출하기

In [22]:
# Find the index of the image at the smallest Euclidean distance
def most_similar_dist_image(idx, n_candidates=500):
    """Return the indices of the candidate image(s) closest to image ``idx``.

    Candidates are the first ``n_candidates`` rows of the global ``images``;
    closeness is measured with ``distance``. The query image itself is
    excluded by position rather than by assuming it sorts first, so exact
    duplicates of the query and queries outside the candidate range are
    handled correctly.

    Returns:
        1-D integer array with every candidate index tied for the minimum
        distance (empty when there is no candidate other than the query).
    """
    candidates = images[:n_candidates]
    dists = np.array([distance(images[idx], candidate) for candidate in candidates])

    # Mask of candidates that are allowed to win: everything except the query.
    others = np.ones(len(candidates), dtype=bool)
    self_pos = idx if idx >= 0 else idx + len(images)
    if 0 <= self_pos < len(candidates):
        others[self_pos] = False  # exclude the query image itself
    if not others.any():
        return np.array([], dtype=np.intp)

    nearest = dists[others].min()
    return np.where(others & (dists == nearest))[0]

In [26]:
# Find the index of the image at the smallest angle
def most_similar_angle_image(idx, n_candidates=500):
    """Return the indices of the candidate image(s) at the smallest angle to image ``idx``.

    Candidates are the first ``n_candidates`` rows of the global ``images``;
    closeness is measured with ``angle``. The query image itself is excluded
    by position rather than by assuming it sorts first, and candidates whose
    angle is NaN are ignored so they cannot disturb the minimum.

    Returns:
        1-D integer array with every candidate index tied for the minimum
        angle (empty when no valid candidate other than the query exists).
    """
    candidates = images[:n_candidates]
    angs = np.array([angle(images[idx], candidate) for candidate in candidates])

    # Mask of candidates that are allowed to win: defined angles only,
    # and never the query image itself.
    valid = ~np.isnan(angs)
    self_pos = idx if idx >= 0 else idx + len(images)
    if 0 <= self_pos < len(candidates):
        valid[self_pos] = False  # exclude the query image itself
    if not valid.any():
        return np.array([], dtype=np.intp)

    smallest = angs[valid].min()
    return np.where(valid & (angs == smallest))[0]

In [27]:
@interact(idx=(0, 499), continuous_update=False)
def show_most_similar(idx=0):
    """Plot image ``idx`` beside its nearest neighbour by Euclidean distance.

    The neighbour is the first index returned by ``most_similar_dist_image``.
    """
    _, (ax0, ax1) = plt.subplots(1, 2, figsize=(8, 4))

    similar_idx = most_similar_dist_image(idx)[0]
    original_img = images[idx].reshape(28, 28)
    similar_img = images[similar_idx].reshape(28, 28)

    ax0.imshow(original_img, cmap='gray')
    ax0.set_title(f"original: {idx}")
    ax1.imshow(similar_img, cmap='gray')
    ax1.set_title(f"most_similar: {similar_idx}")
    plt.show()

interactive(children=(IntSlider(value=0, description='idx', max=499), Output()), _dom_classes=('widget-interac…

In [28]:
# NOTE: this function reuses the name of the distance-based viewer defined in
# the previous cell, so it rebinds that name when this cell runs.
@interact(idx=(0, 499), continuous_update=False)
def show_most_similar(idx=0):
    """Plot image ``idx`` beside its nearest neighbour by angle.

    The neighbour is the first index returned by ``most_similar_angle_image``.
    """
    _, (ax0, ax1) = plt.subplots(1, 2, figsize=(8, 4))

    similar_idx = most_similar_angle_image(idx)[0]
    original_img = images[idx].reshape(28, 28)
    similar_img = images[similar_idx].reshape(28, 28)

    ax0.imshow(original_img, cmap='gray')
    ax0.set_title(f"original: {idx}")
    ax1.imshow(similar_img, cmap='gray')
    ax1.set_title(f"most_similar: {similar_idx}")
    plt.show()

interactive(children=(IntSlider(value=0, description='idx', max=499), Output()), _dom_classes=('widget-interac…