# 상위 100명의 선호도 정보 저장

In [1]:
import pandas as pd

def _df_load(path, state='train'):
    """Read a preference CSV and count how many rows each respondent has.

    Args:
        path: Location of the CSV file; it must contain a '응답자 ID' column.
        state: Split label used as the prefix of the count Series' name.

    Returns:
        A ``(frame, per_respondent)`` tuple: the loaded DataFrame and a
        Series of row counts indexed by '응답자 ID', named
        ``'<state> 설문 응답 수'``.
    """
    frame = pd.read_csv(path)
    per_respondent = (
        frame.groupby(['응답자 ID'])
        .size()
        .rename(f'{state} 설문 응답 수')
    )
    return frame, per_respondent

def _top100_filtering(df, top100_ids, state='train', save=True):
    """Keep only the rows whose respondent ID is in ``top100_ids``.

    Args:
        df: Preference DataFrame with a '응답자 ID' column.
        top100_ids: Collection of respondent IDs to keep.
        state: Split label inserted into the output file name.
        save: When true, the filtered rows are also written to
            ``top100_<state>_preference.csv`` (without the index).

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    keep_mask = df['응답자 ID'].isin(top100_ids)
    selected = df.loc[keep_mask].reset_index(drop=True)

    if not save:
        return selected

    selected.to_csv(f'top100_{state}_preference.csv', index=False)
    return selected

def make_top100_csv(train_csv, val_csv, save=True):
    """Restrict the train/val preference data to the 100 most active respondents.

    Respondents are ranked by their total number of rows across both files
    (train count + val count); the first 100 of that ranking are kept.

    Args:
        train_csv: Path of the train preference CSV.
        val_csv: Path of the validation preference CSV.
        save: Forwarded to ``_top100_filtering``; when true the filtered
            frames are also written to disk.

    Returns:
        A ``(top100_train_df, top100_val_df)`` tuple of filtered DataFrames.
    """
    # Load both splits together with their per-respondent row counts
    df_train, train_count = _df_load(train_csv, 'train')
    df_val, val_count = _df_load(val_csv, 'val')

    # Align the two count Series on respondent ID; an ID missing from one
    # split shows up as NaN and is counted as 0
    counts = pd.concat([train_count, val_count], axis=1).fillna(0).astype(int)
    counts['합계'] = counts['train 설문 응답 수'] + counts['val 설문 응답 수']

    # Rank by the total, largest first, and take the first 100 respondent IDs
    ranked = counts.sort_values(by='합계', ascending=False)
    top100_ids = ranked.head(100).index.tolist()

    # Keep only the rows that belong to those respondents
    return (
        _top100_filtering(df_train, top100_ids, 'train', save=save),
        _top100_filtering(df_val, top100_ids, 'val', save=save),
    )


# Paths of the csv files generated in Mission 2-2
t_pref = 'train_preference.csv'
v_pref = 'val_preference.csv'

# Left commented out: uncomment to build the top-100 frames (save=True also writes them to csv)
#t_top100_pref, v_top100_pref = make_top100_csv(t_pref, v_pref, save=True)

In [2]:
# 결과 보기 좋게 HTML편집
from IPython.display import display_html
def display_left(*args):
    """Render the given DataFrames next to each other in a single flex row.

    Each positional argument must provide ``to_html()``; every table is
    wrapped in a div with a 30px right margin.
    """
    cells = [
        f'<div style="margin-right:30px;">{frame.to_html()}</div>'
        for frame in args
    ]
    html_str = ''.join(cells)
    display_html(f'<div style="display: flex;">{html_str}</div>', raw=True)

# Preview the first rows of both top-100 preference frames
display_left(t_top100_pref.head(), v_top100_pref.head())

Unnamed: 0,응답자 ID,파일명,스타일 선호 여부
0,368,W_06753_60_mods_M.jpg,스타일 선호
1,368,W_06686_70_hippie_M.jpg,스타일 선호
2,368,W_15453_70_hippie_M.jpg,스타일 비선호
3,368,W_06843_60_mods_M.jpg,스타일 선호
4,368,W_06896_10_sportivecasual_M.jpg,스타일 선호

Unnamed: 0,응답자 ID,파일명,스타일 선호 여부
0,368,W_04622_60_mods_M.jpg,스타일 선호
1,368,W_04678_50_ivy_M.jpg,스타일 선호
2,368,W_15791_70_hippie_M.jpg,스타일 비선호
3,368,W_16034_80_bold_M.jpg,스타일 비선호
4,368,W_06551_60_mods_M.jpg,스타일 비선호


# train data로부터 feature vector 추출

In [1]:
# 필요 패키지 import 
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torchvision.transforms as transforms


# Fix the random seeds so that model results can be reproduced
def set_random_seed(seed_value=42):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN.

    Args:
        seed_value: Seed passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,                   # Python built-in RNG
        np.random.seed,                # NumPy RNG
        torch.manual_seed,             # PyTorch RNG (CPU)
        torch.cuda.manual_seed,        # PyTorch RNG (GPU)
        torch.cuda.manual_seed_all,    # PyTorch RNG (all GPUs)
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    # cuDNN settings: deterministic mode on, benchmark mode off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_random_seed()

In [2]:
import torchvision.models as models

# Check whether a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Build the network and restore the trained weights
model = models.resnet18(weights=None, num_classes=31)
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU still loads when this notebook falls back to the CPU
model.load_state_dict(
    torch.load('./best_model.pth', map_location=device, weights_only=True)
)
model = model.to(device)
model.eval()    # no training happens here, so switch to eval mode

cuda


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [3]:
# Class used for feature extraction
class FeatureExtractor(nn.Module):
    """Run a model's direct child modules, except the last one, in sequence.

    The retained children are the same module objects as in
    ``original_model`` (they are not copied).
    """

    def __init__(self, original_model):
        super().__init__()
        # Collect the direct children and drop the final one (the fc layer)
        backbone_layers = list(original_model.children())
        del backbone_layers[-1:]
        self.features = nn.Sequential(*backbone_layers)

    def forward(self, x):
        """Return the retained layers' output flattened from dim 1 onwards."""
        return torch.flatten(self.features(x), 1)

# Wrap the loaded model and move the wrapper to the selected device.
# eval() is called explicitly so that inference mode does not depend on
# `model.eval()` having been executed in an earlier cell.
feature_extractor = FeatureExtractor(model)
feature_extractor = feature_extractor.to(device).eval()

In [8]:
# Image preprocessing (meant to match the preprocessing applied during model training)
def _preprocess_image(image_path):
    """Load an image file and convert it into a normalized single-image batch.

    Args:
        image_path: Path of the image file to open.

    Returns:
        Tensor of shape (1, 3, 224, 224): the RGB image resized to 224x224,
        converted with ``ToTensor`` and normalized per channel.
    """
    steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    pipeline = transforms.Compose(steps)

    rgb_image = Image.open(image_path).convert('RGB')
    tensor = pipeline(rgb_image)
    return tensor.unsqueeze(0)    # add the batch dimension

# Feature vector extraction for a single image
def _extract_features(image_path):
    """Return the feature vector of one image as a flat NumPy array.

    The image is preprocessed, moved to ``device`` and passed through the
    module-level ``feature_extractor`` with gradient tracking disabled.

    Args:
        image_path: Path of the image file.

    Returns:
        1-D NumPy array holding the extractor output for this image.
    """
    batch = _preprocess_image(image_path).to(device)
    with torch.no_grad():
        embedding = feature_extractor(batch).cpu().numpy()
    return embedding.flatten()

# Feature vector extraction for several images
def extract_features_from_images(dir_path, image_list):
    """Extract a feature vector for every file name in ``image_list``.

    Args:
        dir_path: Directory that contains the images.
        image_list: Iterable of file names relative to ``dir_path``.

    Returns:
        Dict mapping each file name to its feature vector (NumPy array).
    """
    return {
        name: np.array(_extract_features(os.path.join(dir_path, name)))
        for name in image_list
    }

In [23]:
# Extract feature vectors for every train image (this can take quite a while)
t_img_dir = '../resized_data/training_image'    # directory of the train images resized to 224x224
t_img_list = os.listdir(t_img_dir)    # every entry of the directory is treated as an image file name

extracted_features = extract_features_from_images(t_img_dir, t_img_list)

# Check the feature shape (the layers before the fc layer yield 512 features)
print(f"이미지 수: {len(extracted_features)}, feature shape: {extracted_features[t_img_list[0]].shape}")

이미지 수: 4070, feature shape: (512,)


In [14]:
# Convert the extracted features to a DataFrame (transposed so each image is one row), then save
t_feature_vectors = pd.DataFrame(extracted_features).T
# The save call is left commented out
#t_feature_vectors.to_csv('train_feature_vectors.csv')
t_feature_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
T_00253_60_popart_W.jpg,0.365814,0.082958,0.774777,0.791083,0.003264,0.087018,0.000000,0.040257,0.988961,0.008170,...,0.115817,0.080808,0.229180,0.081813,0.124989,0.251451,0.001143,0.188494,0.417812,0.065942
T_00456_10_sportivecasual_M.jpg,0.018442,0.066147,0.927391,0.520467,0.223312,0.907714,0.097888,0.498306,0.010099,0.008688,...,0.067645,0.500632,0.274380,0.369760,0.220593,0.196583,0.070142,0.441210,1.036374,0.039431
T_00588_10_sportivecasual_M.jpg,0.005348,0.042649,0.914440,0.391033,0.064130,0.579048,0.026233,0.301474,0.009904,0.011196,...,0.000101,0.092515,0.268349,0.056357,0.084848,0.077214,0.000000,0.550048,1.008492,0.000000
T_00770_60_minimal_W.jpg,0.216130,0.663869,0.053755,0.408991,0.197909,0.579666,0.715446,0.414065,0.164051,0.739263,...,0.083022,0.270680,1.212233,1.322776,0.176348,0.136355,0.385658,0.110632,0.024413,0.050312
T_00893_90_hiphop_W.jpg,0.355945,0.115561,0.849082,0.085283,0.073014,0.212951,0.261763,0.222841,0.357590,0.142900,...,0.228384,0.698955,1.060397,0.105515,0.087108,0.291533,0.009923,0.020875,0.406595,0.138964
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W_71923_60_mods_M.jpg,0.020825,0.035797,0.042857,0.096171,0.355142,0.511621,0.820499,0.409456,0.175342,0.266595,...,0.517299,0.290398,0.186518,0.426478,0.174009,0.091237,0.105439,0.490526,0.180422,0.073446
W_71933_60_mods_M.jpg,0.018558,0.002600,0.026073,0.223916,0.385836,0.955653,0.754085,0.436177,0.103581,0.119609,...,0.580206,0.308451,0.063685,0.364826,0.125350,0.227154,0.100034,0.797877,0.370006,0.132271
W_71934_60_mods_M.jpg,0.022694,0.012713,0.069024,0.155497,0.307939,0.725009,0.945096,0.428299,0.061789,0.219381,...,0.546403,0.475295,0.181790,0.414278,0.132369,0.092862,0.048592,0.623932,0.317617,0.071168
W_71935_60_mods_M.jpg,0.013106,0.000000,0.100428,0.154640,0.332283,0.940560,1.173493,0.397911,0.145460,0.201914,...,0.703312,0.531796,0.049949,0.476040,0.028925,0.113300,0.057774,0.763710,0.536583,0.129568


# 이미지 유사도를 활용한 스타일 선호 예측

In [15]:
# Load the top-100 preference data
t_top100_pref = pd.read_csv('top100_train_preference.csv')
v_top100_pref = pd.read_csv('top100_val_preference.csv')

# Encode the preference label in place: '스타일 선호' -> 1, anything else -> 0
for pref_df in (t_top100_pref, v_top100_pref):
    pref_df['스타일 선호 여부'] = pref_df['스타일 선호 여부'].eq('스타일 선호').astype('int64')

In [25]:
# Preprocess the feature vector file
t_feature_vectors = pd.read_csv('train_feature_vectors.csv', index_col=0)
t_feature_vectors['feature_vector'] = t_feature_vectors.values.tolist()     # pack each row's feature columns into one list
# Name the index explicitly before resetting it, so the file-name column is
# always called '파일명' whether or not the CSV's index column had a header
t_feature_reset = t_feature_vectors.rename_axis('파일명').reset_index()
t_feature_simplified = t_feature_reset[['파일명', 'feature_vector']]    # keep only the file name and the feature vector
t_feature_simplified

Unnamed: 0,파일명,feature_vector
0,T_00253_60_popart_W.jpg,"[0.36581373, 0.082958266, 0.7747775, 0.7910825..."
1,T_00456_10_sportivecasual_M.jpg,"[0.018442497, 0.06614705, 0.92739135, 0.520467..."
2,T_00588_10_sportivecasual_M.jpg,"[0.0053476035, 0.042649195, 0.91444, 0.3910330..."
3,T_00770_60_minimal_W.jpg,"[0.21612972, 0.66386896, 0.05375537, 0.4089910..."
4,T_00893_90_hiphop_W.jpg,"[0.35594502, 0.11556094, 0.84908164, 0.0852832..."
...,...,...
4065,W_71923_60_mods_M.jpg,"[0.020825205, 0.035796657, 0.042856712, 0.0961..."
4066,W_71933_60_mods_M.jpg,"[0.018558364, 0.0026001367, 0.026072832, 0.223..."
4067,W_71934_60_mods_M.jpg,"[0.022693925, 0.012712655, 0.06902375, 0.15549..."
4068,W_71935_60_mods_M.jpg,"[0.013106279, 0.0, 0.10042794, 0.15464038, 0.3..."


In [17]:
# Attach the feature vectors to the top-100 preference data (left merge on the file name,
# so every preference row is kept even when no feature vector matches)
t_merged_df = pd.merge(t_top100_pref, t_feature_simplified, on='파일명', how='left')
t_merged_df

Unnamed: 0,응답자 ID,파일명,스타일 선호 여부,feature_vector
0,368,W_06753_60_mods_M.jpg,1,"[0.07831427, 0.0, 0.05156516, 0.08783708, 0.39..."
1,368,W_06686_70_hippie_M.jpg,1,"[0.31076533, 0.010389824, 0.35296226, 0.687552..."
2,368,W_15453_70_hippie_M.jpg,0,"[0.1519923, 0.014068721, 0.42429367, 0.6144725..."
3,368,W_06843_60_mods_M.jpg,1,"[0.02600474, 0.0, 0.1528273, 0.0393925, 0.2958..."
4,368,W_06896_10_sportivecasual_M.jpg,1,"[0.017528027, 0.08091339, 0.7443924, 0.5389731..."
...,...,...,...,...
4448,67975,T_17798_19_normcore_M.jpg,1,"[0.0031676951, 0.08184793, 0.05668509, 0.33175..."
4449,67975,T_17799_19_normcore_M.jpg,1,"[0.2709516, 0.2439873, 0.010093305, 0.5108831,..."
4450,67975,W_06915_00_metrosexual_M.jpg,0,"[0.030314397, 0.1267456, 0.011266054, 0.424923..."
4451,67975,T_17801_19_normcore_M.jpg,1,"[0.094845966, 0.12566389, 0.0325013, 0.1055609..."


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Predict each validation preference from the most similar train image that the
# same respondent has already rated (1-nearest-neighbour on cosine similarity).

# Directory of the validation images (constant, so defined once outside the loop)
val_img_path = "../resized_data/validation_image/"

# Rows for which the left merge found no feature vector cannot be compared
t_rated_df = t_merged_df.dropna(subset=['feature_vector'])

# Per-respondent lookup built once: (train feature matrix, preference labels)
user_profiles = {
    user_id: (
        np.array(rows['feature_vector'].tolist()),
        rows['스타일 선호 여부'].to_numpy(),
    )
    for user_id, rows in t_rated_df.groupby('응답자 ID')
}

# A validation image rated by several respondents is only run through the model once
val_feature_cache = {}

pred_list = []

for userID, img_name in zip(v_top100_pref['응답자 ID'], v_top100_pref['파일명']):
    if userID not in user_profiles:
        raise ValueError(
            f"No rated train images with feature vectors for respondent {userID}"
        )
    train_features, train_prefs = user_profiles[userID]

    # Feature vector of the validation image
    if img_name not in val_feature_cache:
        val_feature_cache[img_name] = _extract_features(os.path.join(val_img_path, img_name))
    feature_vector = val_feature_cache[img_name]

    # Cosine similarity between the validation image and every train image this respondent rated
    similarity = cosine_similarity([feature_vector], train_features)[0]

    # Use the preference of the most similar train image as the prediction.
    # np.argmax returns the first index on ties and indexes the label array
    # positionally, so repeated file names cannot select the wrong row.
    # (The scheme could be changed, e.g. voting over the top 5.)
    pred_list.append(train_prefs[np.argmax(similarity)])

In [21]:
# Add the predictions for v_top100_pref as a new column (pred_list holds one value per row, in row order)
v_top100_pref['선호도 예측'] = pred_list
v_top100_pref

Unnamed: 0,응답자 ID,파일명,스타일 선호 여부,선호도 예측
0,368,W_04678_50_ivy_M.jpg,1,1
1,368,W_16848_19_normcore_M.jpg,0,0
2,368,W_06864_10_sportivecasual_M.jpg,1,0
3,368,W_15340_50_ivy_M.jpg,0,0
4,368,W_16034_80_bold_M.jpg,0,0
...,...,...,...,...
1096,67975,W_26965_90_hiphop_M.jpg,0,0
1097,67975,W_06985_00_metrosexual_M.jpg,0,1
1098,67975,T_21992_70_hippie_M.jpg,0,0
1099,67975,T_21988_70_hippie_M.jpg,1,0


In [22]:
# Evaluate the performance metrics (Accuracy / Precision / Recall)
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Ground truth is the '스타일 선호 여부' column; predictions are the '선호도 예측' column
accuracy = accuracy_score(v_top100_pref['스타일 선호 여부'], v_top100_pref['선호도 예측'])
precision = precision_score(v_top100_pref['스타일 선호 여부'], v_top100_pref['선호도 예측'])
recall = recall_score(v_top100_pref['스타일 선호 여부'], v_top100_pref['선호도 예측'])

print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

Accuracy: 0.8429, Precision: 0.8131, Recall: 0.7891
