In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader
from PIL import Image
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle
import matplotlib.pyplot as plt
from PIL import Image
import shutil

### unzip_file(zip_file): 파일 압축 해제

In [1]:
import zipfile

def unzip_file(zip_file, extract_dir=None):
    """Extract every member of a zip archive.

    Args:
        zip_file: Path to the ``.zip`` archive to read.
        extract_dir: Directory to extract into. ``None`` (the default) keeps
            the original behaviour of extracting into the current working
            directory.
    """
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

In [None]:
# Extract angry_face_n.zip into the current working directory.
unzip_file('angry_face_n.zip')

In [None]:
# Extract the angry face / body archives into the current working directory.
unzip_file('angry_face.zip')
unzip_file('angry_body.zip')

### to_pkl(file_path, feature_vector): 특징벡터를 pkl 파일로 저장

In [3]:
def to_pkl(file_path,feature_vector):
    """Serialize ``feature_vector`` with pickle and write it to ``file_path``.

    The file is opened in binary write mode, so an existing file at
    ``file_path`` is overwritten.
    """
    with open(file_path, 'wb') as out:
        out.write(pickle.dumps(feature_vector))

### pkl_load(file_path): pkl 파일을 로드

In [None]:
def pkl_load(file_path):
    """Read ``file_path`` in binary mode and return the unpickled object.

    NOTE(review): unpickling can execute arbitrary code, so only load pickle
    files that come from a trusted source.
    """
    with open(file_path, 'rb') as src:
        return pickle.load(src)

### feature_dict(file_path): 이미지에 대해 resnet으로 feature 추출하고 저장

In [None]:
def feature_dict(file_path):
    """Extract a ResNet-18 feature vector for every file in a directory.

    Each regular file directly inside ``file_path`` is opened with PIL,
    converted to RGB, resized to 224x224, normalized, and passed through a
    pretrained ResNet-18 whose last child module (the classification layer)
    has been removed.

    Args:
        file_path: Directory containing the images. Sub-directories are
            skipped; every regular file is handed to ``Image.open``, so the
            directory is expected to contain only readable images.

    Returns:
        dict mapping each file name (basename, in sorted order) to the
        squeezed network output as a NumPy array.
    """
    # Preprocessing applied to every image before it is fed to the network.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # torchvision >= 0.13 deprecates ``pretrained=True`` in favour of the
    # ``weights`` argument; fall back to the old flag on older installs.
    weights_enum = getattr(models, "ResNet18_Weights", None)
    if weights_enum is not None:
        backbone = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        backbone = models.resnet18(pretrained=True)
    # Drop the final classification layer so the output is a feature vector.
    extractor = nn.Sequential(*list(backbone.children())[:-1])
    extractor.eval()

    features = {}
    for image_name in sorted(os.listdir(file_path)):
        img_path = os.path.join(file_path, image_name)
        if not os.path.isfile(img_path):
            continue
        # The context manager closes the underlying file handle right away.
        with Image.open(img_path) as raw:
            batch = preprocess(raw.convert("RGB")).unsqueeze(0)
        with torch.no_grad():
            features[image_name] = extractor(batch).squeeze().numpy()
    return features

### mean_vector(file_path): feature 값들의 mean vector 계산

In [6]:
def mean_vector(file_path):
    """Return the element-wise mean of the feature vectors of a directory.

    Args:
        file_path: Directory of images, passed unchanged to ``feature_dict``.

    Returns:
        The mean over all feature vectors, computed with ``np.mean(axis=0)``.

    Raises:
        ValueError: if ``feature_dict`` yields no features for ``file_path``.
    """
    vectors = list(feature_dict(file_path).values())
    if not vectors:
        # np.mean([]) only warns and returns NaN, which then fails later and
        # far from the cause; fail here with a clear message instead.
        raise ValueError(f"no image features found in '{file_path}'")
    return np.mean(vectors, axis=0)

# cor(file_path, feature_dict, mean_vector) 함수 정의
-> 평균 벡터와의 유사도 구하기

### cor(file_path, feature_dict, mean_vector): 전체 feature와 mean vector의 유사도를 계산 및 정렬

In [None]:
def cor(file_path,feature_dict, mean_vector):
    """Rank images by cosine similarity between their feature and a mean vector.

    Note that the parameters ``feature_dict`` and ``mean_vector`` shadow the
    module-level functions of the same names inside this function.

    Args:
        file_path: Directory that is joined in front of every image name.
        feature_dict: Mapping of image name -> feature vector.
        mean_vector: Reference vector every feature is compared against.

    Returns:
        List of ``os.path.join(file_path, image_name)`` for every entry of
        ``feature_dict``, ordered from most to least similar. Ties keep the
        dictionary's iteration order.
    """
    image_names = list(feature_dict)
    if not image_names:
        return []

    # One batched call instead of one sklearn call per image.
    feature_matrix = np.asarray(list(feature_dict.values()))
    similarities = cosine_similarity(feature_matrix, [mean_vector])[:, 0]

    ranked = sorted(zip(image_names, similarities), key=lambda pair: pair[1], reverse=True)
    return [os.path.join(file_path, image_name) for image_name, _ in ranked]

### cor_show(image_list, num): 유사도 높은 순으로 num 만큼 이미지 시각화

In [None]:
def cor_show(image_list, num):
    """Display the first ``num`` images of ``image_list`` in a 4-column grid.

    An empty spacer row is inserted between consecutive groups of 20 images.

    Args:
        image_list: Image file paths in the order they should be shown.
        num: Maximum number of images to display.
    """
    num_images = min(num, len(image_list))  # whichever is smaller: num or all images
    if num_images <= 0:
        # Nothing to draw; avoid creating a zero-height figure.
        return

    num_cols = 4
    group_size = 20
    num_rows = -(-num_images // num_cols)  # ceiling division
    # One spacer row between groups of 20, none after the last group.
    spacer_rows = (num_images - 1) // group_size
    total_rows = num_rows + spacer_rows

    plt.figure(figsize=(20, total_rows * 5))

    for i, img_path in enumerate(image_list[:num_images]):
        # Every completed group of 20 pushes the image one extra row down.
        row = (i // num_cols) + (i // group_size)
        col = i % num_cols

        plt.subplot(total_rows, num_cols, row * num_cols + col + 1)
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis('off')

    plt.tight_layout()
    plt.show()


### file_copy(img_list, num, new_folder): 폴더에 num개의 이미지 복사

In [None]:
def file_copy(img_list, num, new_folder):
    """Copy the first ``num`` images of ``img_list`` into ``new_folder``.

    Args:
        img_list: Sequence whose items are either image paths or
            ``(image_path, score)`` pairs; only the path is used.
        num: Maximum number of entries taken from the front of ``img_list``.
        new_folder: Destination directory; created if it does not exist so
            that ``shutil.copy`` never treats it as a target file name.
    """
    os.makedirs(new_folder, exist_ok=True)

    copied = 0
    for entry in img_list[:num]:
        img_path = entry if isinstance(entry, (str, os.PathLike)) else entry[0]
        if os.path.exists(img_path):
            shutil.copy(img_path, new_folder)
            copied += 1

    # Report the number of files actually copied, not the requested maximum.
    print(f"{new_folder} 폴더에 {copied}개의 이미지 복사가 완료되었습니다.")

### file_move(dest_folder, image_list, feature_dict, num): 파일을 dest_folder로 이동

In [None]:
def file_move(dest_folder, image_list, feature_dict, num):
    """Move the first ``num`` files of ``image_list`` into ``dest_folder``.

    Args:
        dest_folder: Destination directory; created if it does not exist.
        image_list: File paths, in priority order.
        feature_dict: Mapping of file name (basename) -> feature vector.
        num: Maximum number of entries of ``image_list`` to process. Paths
            that no longer exist are skipped and are not replaced by later
            entries.

    Returns:
        A new dict equal to ``feature_dict`` without the entries whose file
        was moved. The input dict is not modified.
    """
    os.makedirs(dest_folder, exist_ok=True)
    moved_files = set()  # set gives O(1) membership tests in the filter below

    for file_path in image_list[:num]:
        if os.path.exists(file_path):
            file_name = os.path.basename(file_path)
            shutil.move(file_path, os.path.join(dest_folder, file_name))
            moved_files.add(file_name)

    # Drop the moved files from the feature dictionary.
    return {key: value for key, value in feature_dict.items() if key not in moved_files}

# 학습 데이터 구성

## angry 학습 데이터 (500개)
-> angry_face, angry_body, angry_face_n에 대해 각각 유사한 이미지 추출

In [None]:
# Extract the angry image archives used in this section into the current
# working directory.
unzip_file('angry_face.zip')
unzip_file('angry_body.zip')
unzip_file('angry_face_n.zip')
unzip_file('dog_angry_crop_img.zip')

전체 angry 이미지에 대해 feature 추출 -> angry_feature_dict.pkl

In [None]:
# Load the cached features if the pickle exists; otherwise extract them and
# cache the result, so a fresh "Run All" does not depend on a file produced
# outside this notebook.
# NOTE(review): assumes the pickle was built from 'dog_angry_crop_img' - confirm.
ANGRY_FEATURES_PKL = 'angry_feature_dict.pkl'
if os.path.exists(ANGRY_FEATURES_PKL):
    angry_feature_dict = pkl_load(ANGRY_FEATURES_PKL)
else:
    angry_feature_dict = feature_dict('dog_angry_crop_img')
    to_pkl(ANGRY_FEATURES_PKL, angry_feature_dict)

대표 이미지들에 대해 mean_vector 계산

In [None]:
# Mean ResNet feature vector of each representative-image folder.
# Each mean_vector() call runs feature_dict(), which builds the model again.
ang_face_vector=mean_vector('angry_face')
ang_body_vector=mean_vector('angry_body')
ang_face_n_vector=mean_vector('angry_face_n')

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 183MB/s]


### ang_face_vector와 유사한 이미지 220장 추출

In [None]:
# cor() takes (file_path, feature_dict, mean_vector) and returns every image
# path ranked by similarity; the count is limited by file_move()'s ``num``.
ang_face_list = cor('dog_angry_crop_img', angry_feature_dict, ang_face_vector)

In [None]:
# Report how many regular files are in the folder (sub-directories are not counted).
folder_path = '/content/dog_angry_crop_img'

if not os.path.exists(folder_path):
    print(f"폴더 '{folder_path}'가 존재하지 않습니다.")
else:
    file_count = sum(
        os.path.isfile(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    print(f"폴더 '{folder_path}'에 있는 파일 개수: {file_count}")


폴더 '/content/dog_angry_crop_img'에 있는 파일 개수: 740


전체 angry 데이터 중에서 ang_face_vector와 유사한 데이터 220개를 새로운 폴더로 이동

In [None]:
# file_move() requires ``num``: move only the 220 most similar images.
angry_feature_dict = file_move('/content/angry_train_img', ang_face_list, angry_feature_dict, 220)

In [None]:
# Count the regular files left in the source folder after the move.
folder_path = '/content/dog_angry_crop_img'

if not os.path.exists(folder_path):
    print(f"폴더 '{folder_path}'가 존재하지 않습니다.")
else:
    file_count = sum(
        os.path.isfile(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    print(f"폴더 '{folder_path}'에 있는 파일 개수: {file_count}")


폴더 '/content/dog_angry_crop_img'에 있는 파일 개수: 520


### ang_body_vector와 유사한 이미지 220장 추출

In [None]:
# cor() takes (file_path, feature_dict, mean_vector) and returns every image
# path ranked by similarity; the count is limited by file_move()'s ``num``.
ang_body_list = cor('dog_angry_crop_img', angry_feature_dict, ang_body_vector)

남은 angry 데이터 중에서 ang_body_vector와 유사한 데이터 220개를 새로운 폴더로 이동

In [None]:
# Move the 220 images ranked against ang_body_vector. This must use
# ang_body_list: the files in ang_face_list were already moved above.
angry_feature_dict = file_move('/content/angry_train_img', ang_body_list, angry_feature_dict, 220)

### ang_face_n_vector와 유사한 이미지 60장 추출

In [None]:
# cor() takes (file_path, feature_dict, mean_vector) and returns every image
# path ranked by similarity; the count is limited by file_move()'s ``num``.
ang_face_n_list = cor('dog_angry_crop_img', angry_feature_dict, ang_face_n_vector)

[('Md5JkFuXpbftuCAxmpqIeTpqNNwbSB423_crop_1.jpg', 0.91637385), ('7p0ocgRn42u21i2utwENqotyqJoX6P751_crop_1.jpg', 0.915748), ('j5KdqDqt6W928Mmokf8zWXwj7tH6Hi884_crop_1.jpg', 0.8943096), ('lwJ10itB6vdK8vtXVBCw9VjAkQJi4o76_crop_1.jpg', 0.8867295), ('SoVoK90zyhVmYP3AdEldtMmXQTHWJm643_crop_1.jpg', 0.88565063), ('LATTp1CcV5vi3ZgCXcTkDDOnq7scFb167_crop_1.jpg', 0.8824178), ('wKtKH0fj4VG0ic2BlEyKgpGO6YGOm1225_crop_1.jpg', 0.8760853), ('PmM7uMpAn8pQn9rWOG455vhSsUT0Xv730_crop_1.jpg', 0.87140036), ('XiL5OnGXaN7icBLe7Y3GTIecOMKy8z566_crop_1.jpg', 0.86924374), ('N1tCByCeHkRvMiECUoUelkZULbo4Aq359_crop_1.jpg', 0.8626189), ('a4GsaAzQhY00zmyR14x7BX1K8LaACM586_crop_1.jpg', 0.8602503), ('cc79mOhzoG4YClBSctDepqTNwwN7Ld632_crop_1.jpg', 0.8584822), ('OwyzX6VUIHYHejdMcEb7GK6IMRR5VA774_crop_2.jpg', 0.8577877), ('Z1LvV6pZShKUjgtpuTqjPpQixzJWXi148_crop_1.jpg', 0.85765666), ('iCn0DKqylzUMJGWkVR81Muk6HIMrvc919_crop_1.jpg', 0.8504272), ('0rdx3OrwENAIWpp9Bunro1ku6Zwbqv6_crop_1.jpg', 0.8497025), ('7kgoRu1DY8ZxkMR5NFI6

남은 angry 데이터 중에서 ang_face_n_vector와 유사한 데이터 60개를 새로운 폴더로 이동

In [None]:
# file_move() requires ``num``: move only the 60 most similar images.
angry_feature_dict = file_move('/content/angry_train_img', ang_face_n_list, angry_feature_dict, 60)

In [None]:
# Count the regular files collected in the angry training folder.
folder_path = '/content/angry_train_img'

if not os.path.exists(folder_path):
    print(f"폴더 '{folder_path}'가 존재하지 않습니다.")
else:
    file_count = sum(
        os.path.isfile(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    print(f"폴더 '{folder_path}'에 있는 파일 개수: {file_count}")


폴더 '/content/angry_train_img'에 있는 파일 개수: 500


In [None]:
# Zip the contents of the angry_train_img directory into angry_train_img.zip.
# NOTE(review): relative paths here vs. '/content/angry_train_img' used for
# file_move - this assumes the working directory is /content.
shutil.make_archive('angry_train_img', 'zip', 'angry_train_img')

'/content/angry_train_img.zip'

## happy 학습 데이터 (500개)
-> happy_face, happy_body, play에 대해 각각 유사한 이미지 추출

In [None]:
# Extract the happy image archives used in this section into the current
# working directory.
unzip_file('happy_face.zip')
unzip_file('happy_body.zip')
unzip_file('dog_happy_crop_img.zip')

전체 happy 이미지에 대해 feature 추출 -> happy_feature_dict.pkl

In [None]:
# Load the cached features if the pickle exists; otherwise extract them and
# cache the result, so a fresh "Run All" does not depend on a file produced
# outside this notebook.
# NOTE(review): assumes the pickle was built from 'dog_happy_crop_img' - confirm.
HAPPY_FEATURES_PKL = 'happy_feature_dict.pkl'
if os.path.exists(HAPPY_FEATURES_PKL):
    happy_feature_dict = pkl_load(HAPPY_FEATURES_PKL)
else:
    happy_feature_dict = feature_dict('dog_happy_crop_img')
    to_pkl(HAPPY_FEATURES_PKL, happy_feature_dict)

대표 이미지들에 대해 mean_vector 계산

In [None]:
# Mean ResNet feature vector of each representative-image folder.
# Each mean_vector() call runs feature_dict(), which builds the model again.
hap_face_vector=mean_vector('happy_face')
hap_body_vector=mean_vector('happy_body')

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 133MB/s]


### hap_face_vector와 유사한 이미지 200장 추출

In [None]:
# cor() takes (file_path, feature_dict, mean_vector) and returns every image
# path ranked by similarity; the count is limited by file_move()'s ``num``.
hap_face_list = cor('dog_happy_crop_img', happy_feature_dict, hap_face_vector)

[('lXz4bZwzWzQwtaWAjGqj3cmaCjg6Iv606_crop_1.jpg', 0.8713993), ('happy_dog_830_crop_1.jpg', 0.8648623), ('happy_dog_89_crop_1.jpg', 0.86461633), ('happy_dog_831_crop_1.jpg', 0.86442626), ('xt37zG27DNAubuAtHNh2FRPe42Xa8M677_crop_1.jpg', 0.86386025), ('2gUa6Q1EmPsDanb1TRdPt6Im0llmf7616_crop_1.jpg', 0.8611378), ('happy_dog_747_crop_1.jpg', 0.860144), ('happy_dog_600_crop_1.jpg', 0.85986435), ('tUfV37QU3HSxfyz6LBpktlZ53SPwqc835_crop_1.jpg', 0.8597549), ('happy_dog_254_crop_1.jpg', 0.85746825), ('happy_dog_78_crop_1.jpg', 0.8557918), ('happy_dog_560_crop_1.jpg', 0.8551856), ('happy_dog_673_crop_1.jpg', 0.8544166), ('happy_dog_34_crop_1.jpg', 0.8528702), ('happy_dog_682_crop_1.jpg', 0.851732), ('PNBP1LhXU0LIbTu35bp0E5ft6xMEH0696_crop_1.jpg', 0.85120404), ('o1xdQR6vUihH6EgnM6vsq4LD1Miry5673_crop_1.jpg', 0.85119766), ('pJe6KmRDKqiuNMvjoV0I8t9gBg3Ps6713_crop_1.jpg', 0.8506112), ('KtX6RcyAAB71RmzQ22vIfBGFhKw451691_crop_1.jpg', 0.85057217), ('happy_dog_68_crop_1.jpg', 0.8502045), ('happy_dog_754_c

In [None]:
# Report how many regular files are in the folder (sub-directories are not counted).
folder_path = '/content/dog_happy_crop_img'

if not os.path.exists(folder_path):
    print(f"폴더 '{folder_path}'가 존재하지 않습니다.")
else:
    file_count = sum(
        os.path.isfile(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    print(f"폴더 '{folder_path}'에 있는 파일 개수: {file_count}")


폴더 '/content/dog_happy_crop_img'에 있는 파일 개수: 1810


전체 happy 데이터 중에서 hap_face_vector와 유사한 데이터 200개를 새로운 폴더로 이동

In [None]:
# file_move() requires ``num``: move only the 200 most similar images.
happy_feature_dict = file_move('/content/happy_train_img', hap_face_list, happy_feature_dict, 200)

In [None]:
# Count the regular files left in the source folder after the move.
folder_path = '/content/dog_happy_crop_img'

if not os.path.exists(folder_path):
    print(f"폴더 '{folder_path}'가 존재하지 않습니다.")
else:
    file_count = sum(
        os.path.isfile(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    print(f"폴더 '{folder_path}'에 있는 파일 개수: {file_count}")


폴더 '/content/dog_happy_crop_img'에 있는 파일 개수: 1610


In [None]:
# Sanity check: number of feature entries remaining after the move.
len(happy_feature_dict)

1610

### hap_body_vector와 유사한 이미지 200장 추출

In [None]:
# cor() takes (file_path, feature_dict, mean_vector) and returns every image
# path ranked by similarity; the count is limited by file_move()'s ``num``.
hap_body_list = cor('dog_happy_crop_img', happy_feature_dict, hap_body_vector)

[('ZzHR7fc5ORV22IzGJtHf7YXkwcV1uZ98_crop_1.jpg', 0.8664421), ('happy_dog_90_crop_1.jpg', 0.864267), ('wiR5KjD68669Ni32zj4BhlKlErKs3T88_crop_1.jpg', 0.86399114), ('2bVW67KtR6xL7eph5kzXcLeBMoGojl273_crop_1.jpg', 0.860564), ('happy_dog_778_crop_1.jpg', 0.86004966), ('WhSn5FCAx5qwZFBp1BQ7t4L3aoC9Wt661_crop_1.jpg', 0.85774165), ('happy_dog_165_crop_2.jpg', 0.8548385), ('WRxBcDAqPgvAcOTiERQvkzPFk0yiWW886_crop_1.jpg', 0.85358083), ('lC5nVPgr6J0Q2dmEITgCdkBQz8Vdbp115_crop_1.jpg', 0.8523338), ('PKUrWSe458HpovDqVNJNvy5kcjYm1w729_crop_1.jpg', 0.8518263), ('happy_dog_724_crop_1.jpg', 0.84917414), ('qtRovxpfYJcWpJHkWUBjLOVGhNJ4Ko612_crop_1.jpg', 0.84673613), ('bf4J9U1GphPID6C1PxuodJ3BLj4dsR232_crop_1.jpg', 0.8454961), ('happy_dog_740_crop_1.jpg', 0.84437805), ('HQsw77F0DThSuw0W7cFX3h5n4oI5gB752_crop_1.jpg', 0.84431815), ('happy_dog_903_crop_1.jpg', 0.84385514), ('happy_dog_360_crop_1.jpg', 0.8429209), ('KjTXSxVswZ5LFeuY1XWOStfUbOCmXs85_crop_1.jpg', 0.842752), ('ZEASq8FxIstHibdJjQXsom1DtEfKAk625_cro

남은 happy 데이터 중에서 hap_body_vector와 유사한 데이터 200개를 새로운 폴더로 이동

In [None]:
# file_move() requires ``num``: move only the 200 most similar images.
happy_feature_dict = file_move('/content/happy_train_img', hap_body_list, happy_feature_dict, 200)

### play와 유사한 이미지 100장 추출

In [None]:
# Extract the play image archives into the current working directory.
unzip_file('play.zip')
unzip_file('dog_play_crop_img.zip')

In [None]:
# Load the cached features if the pickle exists; otherwise extract them and
# cache the result, so a fresh "Run All" does not depend on a file produced
# outside this notebook.
# NOTE(review): assumes the pickle was built from 'dog_play_crop_img' - confirm.
PLAY_FEATURES_PKL = 'play_feature_dict.pkl'
if os.path.exists(PLAY_FEATURES_PKL):
    play_feature_dict = pkl_load(PLAY_FEATURES_PKL)
else:
    play_feature_dict = feature_dict('dog_play_crop_img')
    to_pkl(PLAY_FEATURES_PKL, play_feature_dict)

In [None]:
# Mean ResNet feature vector of the images in the 'play' folder.
hap_play_vector=mean_vector('play')



In [None]:
# cor() takes (file_path, feature_dict, mean_vector) and returns every image
# path ranked by similarity; the count is limited by file_move()'s ``num``.
hap_play_list = cor('dog_play_crop_img', play_feature_dict, hap_play_vector)

[('dog_351_crop_1.jpg', 0.86603165), ('dog_38_crop_1.jpg', 0.8505391), ('dog_107_crop_1.jpg', 0.8478582), ('dog_368_crop_1.jpg', 0.8478396), ('dog_383_crop_2.jpg', 0.84776986), ('dog_225_crop_1.jpg', 0.8464482), ('dog_360_crop_1.jpg', 0.8448675), ('dog_361_crop_1.jpg', 0.84449637), ('dog_359_crop_3.jpg', 0.8433934), ('dog_709_crop_1.jpg', 0.8422948), ('dog_710_crop_1.jpg', 0.84181464), ('dog_70_crop_1.jpg', 0.83996695), ('dog_590_crop_1.jpg', 0.83825207), ('dog_448_crop_1.jpg', 0.836641), ('dog_618_crop_1.jpg', 0.8362254), ('dog_392_crop_1.jpg', 0.8362202), ('dog_473_crop_1.jpg', 0.83475226), ('dog_347_crop_1.jpg', 0.8335177), ('dog_364_crop_1.jpg', 0.8330883), ('dog_60_crop_1.jpg', 0.83275723), ('dog_411_crop_1.jpg', 0.8322841), ('dog_118_crop_2.jpg', 0.83192533), ('dog_451_crop_1.jpg', 0.83185416), ('dog_288_crop_1.jpg', 0.8312609), ('dog_178_crop_1.jpg', 0.8310127), ('dog_243_crop_1.jpg', 0.8307017), ('dog_725_crop_1.jpg', 0.8303301), ('dog_389_crop_1.jpg', 0.8302338), ('dog_967_cro

play 데이터 중에서 hap_play_vector와 유사한 데이터 100개를 happy 학습 폴더로 이동

In [None]:
# file_move() requires ``num``: move only the 100 most similar images.
play_feature_dict = file_move('/content/happy_train_img', hap_play_list, play_feature_dict, 100)

In [None]:
# Count the regular files collected in the happy training folder.
folder_path = '/content/happy_train_img'

if not os.path.exists(folder_path):
    print(f"폴더 '{folder_path}'가 존재하지 않습니다.")
else:
    file_count = sum(
        os.path.isfile(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    print(f"폴더 '{folder_path}'에 있는 파일 개수: {file_count}")


폴더 '/content/happy_train_img'에 있는 파일 개수: 500


In [None]:
# Zip the contents of the happy_train_img directory into happy_train_img.zip.
# NOTE(review): relative paths here vs. '/content/happy_train_img' used for
# file_move - this assumes the working directory is /content.
shutil.make_archive('happy_train_img', 'zip', 'happy_train_img')

'/content/happy_train_img.zip'