# 상위 100명의 선호도 정보 저장

In [17]:
import pandas as pd

def make_top100_csv(train_csv, val_csv, top_n=100,
                    train_out='top100_train_preference.csv',
                    val_out='top100_val_preference.csv'):
    """Keep only the most active respondents and save their preference rows.

    Respondents are ranked by the total number of answers they gave across
    the train and validation CSVs; the rows of the ``top_n`` highest-ranked
    respondents are written to two new CSV files.

    Args:
        train_csv: Path of the train preference CSV.
        val_csv: Path of the validation preference CSV.
        top_n: Number of respondents to keep (default 100).
        train_out: Output path for the filtered train rows.
        val_out: Output path for the filtered validation rows.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    id_col = '응답자 ID'
    answer_col = '스타일 선호 여부'

    df_train = pd.read_csv(train_csv)
    df_val = pd.read_csv(val_csv)

    # Answers per respondent in each split; the two Series get distinct
    # names so they stay distinguishable after being placed side by side.
    train_count = df_train.groupby(id_col)[answer_col].count()
    val_count = df_val.groupby(id_col)[answer_col].count()
    train_count.name = 'train 설문 응답 수'
    val_count.name = 'val 설문 응답 수'

    # A respondent missing from one split gets NaN there -> treat as 0.
    df_sum = pd.concat([train_count, val_count], axis=1)
    df_sum = df_sum.fillna(0).astype(int)
    df_sum['합계'] = df_sum['train 설문 응답 수'] + df_sum['val 설문 응답 수']
    df_sum = df_sum.sort_values(by='합계', ascending=False)

    # IDs of the respondents with the largest total answer counts.
    top_ids = df_sum.head(top_n).index.tolist()

    # Keep only the rows that belong to the selected respondents.
    top_train_df = df_train[df_train[id_col].isin(top_ids)].reset_index(drop=True)
    top_val_df = df_val[df_val[id_col].isin(top_ids)].reset_index(drop=True)

    top_train_df.to_csv(train_out, index=False)
    top_val_df.to_csv(val_out, index=False)


# Paths of the CSV files generated in Mission 2-2
t_pref = 'train_preference.csv'
v_pref = 'val_preference.csv'

# Generation step is currently disabled (commented out).
#make_top100_csv(t_pref, v_pref)

# Item-based filtering

### (1) 데이터 준비: user-item 행렬 생성

In [1]:
import pandas as pd

# Load the preference data of the top-100 respondents
t_top100_pref = pd.read_csv('top100_train_preference.csv')
v_top100_pref = pd.read_csv('top100_val_preference.csv')

def make_user_item_matrix(df):
    """Build an item-user rating matrix and a mask of unrated cells.

    The input frame is left untouched, so the function can be called
    repeatedly on the same data and always yields the same result.

    Args:
        df: Preference rows with the columns '응답자 ID', '파일명' and
            '스타일 선호 여부'.

    Returns:
        A tuple ``(item_user_matrix, mask)``. ``item_user_matrix`` has one
        row per file name and one column per respondent ID, holding 1 for
        '스타일 선호', 0 otherwise, and 0 for cells without an answer.
        ``mask`` has the same shape and is True where no answer existed.
    """
    id_col = '응답자 ID'
    item_col = '파일명'
    pref_col = '스타일 선호 여부'

    # Encode on a copy: overwriting the caller's column would make a second
    # call see 0/1 values instead of labels and map every row to 0.
    ratings = df[[id_col, item_col, pref_col]].copy()
    ratings[pref_col] = (ratings[pref_col] == '스타일 선호').astype(int)

    # user-item matrix: respondents as rows, files as columns
    user_item_matrix = ratings.pivot_table(index=id_col, columns=item_col, values=pref_col)

    # Transposed to item-user because sklearn's cosine_similarity compares rows.
    item_user_matrix = user_item_matrix.T

    # Remember which cells had no answer before they are filled.
    mask = item_user_matrix.isna()

    # Missing answers become 0 (open question in the notebook: is 0 the right fill?).
    item_user_matrix = item_user_matrix.fillna(0)

    return item_user_matrix, mask

# Build the item-user matrix and the unrated-cell mask for each split
t_item_user_matrix, t_mask = make_user_item_matrix(t_top100_pref)
v_item_user_matrix, v_mask = make_user_item_matrix(v_top100_pref)

In [2]:
t_item_user_matrix      # item-user matrix of the train data

응답자 ID,368,837,7658,7905,9096,20768,21432,22324,28371,28571,...,64633,64662,64747,65071,65139,66469,66513,66592,66731,67975
파일명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T_00253_60_popart_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
T_00456_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_00588_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_00770_60_minimal_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
T_00893_90_hiphop_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W_71923_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
W_71933_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
W_71934_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
W_71935_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
v_item_user_matrix    # item-user matrix of the validation data

응답자 ID,368,837,7658,7905,9096,20768,21432,22324,28371,28571,...,64633,64662,64747,65071,65139,66469,66513,66592,66731,67975
파일명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T_00253_60_popart_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
T_00456_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_01123_90_hiphop_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_01514_50_ivy_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_06910_50_classic_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W_63188_90_kitsch_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
W_63644_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
W_64047_10_sportivecasual_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
W_64332_80_powersuit_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### (2) 유사도 계산: 아이템 간 유사도를 계산하여 비슷한 사용자나 아이템을 찾음

In [3]:
# Compute cosine similarity (open question: is cosine similarity the right choice?)
from sklearn.metrics.pairwise import cosine_similarity

# The train-side similarity is disabled; only the validation matrix is computed here.
#t_item_similarity_matrix = cosine_similarity(t_item_user_matrix)
v_item_similarity_matrix = cosine_similarity(v_item_user_matrix)

In [4]:
# Wrap the similarity matrix in a DataFrame labelled by file name on both axes
# so it can be looked up by item
v_item_similarity_df = pd.DataFrame(v_item_similarity_matrix, index=v_item_user_matrix.index, columns=v_item_user_matrix.index)

# Show the first rows of the item similarity matrix as a sanity check
v_item_similarity_df.head()

파일명,T_00253_60_popart_W.jpg,T_00456_10_sportivecasual_M.jpg,T_01123_90_hiphop_M.jpg,T_01514_50_ivy_M.jpg,T_06910_50_classic_W.jpg,T_07990_60_mods_M.jpg,T_14538_00_cityglam_W.jpg,T_21986_70_hippie_M.jpg,T_21988_70_hippie_M.jpg,T_21992_70_hippie_M.jpg,...,W_60553_00_cityglam_W.jpg,W_61255_00_cityglam_W.jpg,W_61790_10_sportivecasual_W.jpg,W_62253_19_lounge_W.jpg,W_62313_00_oriental_W.jpg,W_63188_90_kitsch_W.jpg,W_63644_10_sportivecasual_M.jpg,W_64047_10_sportivecasual_W.jpg,W_64332_80_powersuit_W.jpg,W_65122_10_sportivecasual_W.jpg
파일명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T_00253_60_popart_W.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_00456_10_sportivecasual_M.jpg,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_01123_90_hiphop_M.jpg,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_01514_50_ivy_M.jpg,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_06910_50_classic_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### (3) 추천

In [141]:
# 한 번에 추천하도록 함수 정의(자세한 과정은 아래의 셀들을 참고)
import random

def _get_unrated_items(user_id, mask_df):
    user_mask_df = mask_df.loc[:, [user_id]]
    unrated_items = user_mask_df[user_mask_df[user_id] == True].index.tolist()
    return unrated_items

def _predict_rating(user_id, user_ratings, unrated_items, mask_df, item_similarity_df):
    predicted_ratings = {}      # 미평가된 아이템의 평점 예측 정보

    for item in unrated_items:
        similar_items = item_similarity_df[item]    # 미평가된 아이템의 유사도만 추출

        # 유사도 값의 상위 5%를 기준으로 선택(!5%가 적절한지 고민해보기)
        # (현재 데이터는 한 명의 user가 기껏해야 15개의 item을 평가했으므로 5%로 설정)
        similar_items_sorted = similar_items.sort_values(ascending=False)
        top_n_items = int(len(similar_items_sorted) * 0.05)
        top_similar_items = similar_items_sorted[:top_n_items]

        # top_similar_items의 유사도에 사용자 평점을 가중 평균하여 예측 평점 계산
        weighted_ratings_sum = 0    # 가중 평점의 합(분자)
        similarity_sum = 0          # 모든 유사도의 합(분모)

        for item_id in top_similar_items.index:
            # 사용자가 평가한 아이템만 고려
            if mask_df.loc[item_id, user_id] == False:
                similarity = top_similar_items[item_id]
                rating = user_ratings.loc[item_id].values
                weighted_ratings_sum += similarity * rating
                similarity_sum += similarity

        if similarity_sum != 0:
            predicted_rating = weighted_ratings_sum / similarity_sum
        else:
            predicted_rating = 0

        # 예측된 평점 저장
        predicted_ratings[item] = predicted_rating
    
    return predicted_ratings


# Main 함수
def get_item_based_recommendation(item_user_martix, mask_df, item_similarity_df, user_id=64747):
    """Predict ratings for a user's unrated items and print the recommendations.

    Args:
        item_user_martix: Item-by-user rating matrix (parameter name kept as
            is for compatibility with existing callers).
        mask_df: Item-by-user frame, True where the user has not rated the item.
        item_similarity_df: Square item-by-item similarity frame.
        user_id: Column label of the user to recommend for. Defaults to
            64747, the user that was previously hard-coded in the body.

    Returns:
        List of ``(item, predicted_rating)`` tuples sorted by predicted
        rating, highest first. Items with a non-zero prediction are printed
        as the recommendation.
    """
    def _score_value(score):
        # _predict_rating yields either a plain 0 or a one-element array;
        # reduce both to a float so comparing does not depend on the shape.
        try:
            return float(score[0])
        except (TypeError, IndexError):
            return float(score)

    # The user's ratings as a one-column frame
    user_ratings = item_user_martix[[user_id]]

    # Items the user has not rated yet
    unrated_items = _get_unrated_items(user_id, mask_df)

    # Predicted rating for each of those items
    predicted_ratings = _predict_rating(user_id, user_ratings, unrated_items, mask_df, item_similarity_df)

    # Highest predicted rating first
    top_recommendations = sorted(
        predicted_ratings.items(),
        key=lambda pair: _score_value(pair[1]),
        reverse=True,
    )

    # Recommend only the unrated items whose predicted rating is non-zero
    recommended_items = [item for item, score in top_recommendations if _score_value(score) != 0]
    print(f"사용자 {user_id}에게 추천하는 item:\n{recommended_items}")

    return top_recommendations

# Predict style preference for a respondent in the validation data
# (builds the list of recommended items)
top_recommendations = get_item_based_recommendation(v_item_user_matrix, v_mask, v_item_similarity_df)

사용자 64747에게 추천하는 item:
['T_00253_60_popart_W.jpg', 'W_10028_50_classic_W.jpg', 'W_44918_60_minimal_W.jpg']


In [144]:
top_recommendations[:10]    # inspect the 10 highest-ranked items

[('T_00253_60_popart_W.jpg', array([1.])),
 ('W_10028_50_classic_W.jpg', array([1.])),
 ('W_44918_60_minimal_W.jpg', array([1.])),
 ('T_00456_10_sportivecasual_M.jpg', 0),
 ('T_01123_90_hiphop_M.jpg', 0),
 ('T_01514_50_ivy_M.jpg', 0),
 ('T_06910_50_classic_W.jpg', 0),
 ('T_07990_60_mods_M.jpg', 0),
 ('T_14538_00_cityglam_W.jpg', 0),
 ('T_21986_70_hippie_M.jpg', 0)]

#### 코드 상세히 살펴보기

In [5]:
# Columns of the item-user matrix are the respondent IDs
v_item_user_matrix.columns

Index([  368,   837,  7658,  7905,  9096, 20768, 21432, 22324, 28371, 28571,
       28912, 30790, 35514, 58251, 59083, 59506, 59523, 59637, 59642, 59704,
       59812, 60173, 60184, 60234, 60465, 61104, 61250, 61493, 61859, 62113,
       62155, 62264, 62349, 62361, 62525, 62625, 62653, 62868, 62952, 63057,
       63156, 63207, 63316, 63359, 63369, 63392, 63405, 63424, 63430, 63435,
       63473, 63479, 63481, 63505, 63508, 63526, 63545, 63569, 63571, 63583,
       63601, 63644, 63740, 63742, 63748, 63759, 63769, 63910, 63913, 63927,
       63930, 63934, 64216, 64221, 64223, 64252, 64280, 64295, 64310, 64336,
       64345, 64346, 64364, 64397, 64441, 64460, 64503, 64561, 64571, 64598,
       64633, 64662, 64747, 65071, 65139, 66469, 66513, 66592, 66731, 67975],
      dtype='int64', name='응답자 ID')

In [121]:
import random

# Pick the user to recommend for (random choice is disabled; a fixed ID is used)
#user_id = random.choice(v_item_user_matrix.columns)
user_id = 64747

# Predicted ratings of unrated items, filled in by the cells below from item
# similarity and the user's past ratings
predicted_ratings = {}

In [122]:
# Get the user's item ratings (kept as a one-column DataFrame)
user_ratings = v_item_user_matrix[[user_id]]
user_ratings

응답자 ID,64747
파일명,Unnamed: 1_level_1
T_00253_60_popart_W.jpg,0.0
T_00456_10_sportivecasual_M.jpg,0.0
T_01123_90_hiphop_M.jpg,0.0
T_01514_50_ivy_M.jpg,0.0
T_06910_50_classic_W.jpg,0.0
...,...
W_63188_90_kitsch_W.jpg,0.0
W_63644_10_sportivecasual_M.jpg,0.0
W_64047_10_sportivecasual_W.jpg,0.0
W_64332_80_powersuit_W.jpg,0.0


In [123]:
# 사용자가 아직 평가하지 않은 아이템 찾기
def get_unrated_items(user_id, mask):
    """List the items whose mask entry for ``user_id`` is True (not yet rated).

    Args:
        user_id: Column label of the user in ``mask``.
        mask: Item-by-user frame with True marking unrated cells.

    Returns:
        Item labels in the row order of ``mask``.
    """
    user_mask = mask.loc[:, [user_id]]
    is_unrated = user_mask[user_id] == True
    # Selecting a boolean Series by itself keeps exactly its True rows.
    return is_unrated[is_unrated].index.tolist()

# Items the selected user has not rated in the validation data
unrated_items = get_unrated_items(user_id, v_mask)
print(unrated_items)

['T_00253_60_popart_W.jpg', 'T_00456_10_sportivecasual_M.jpg', 'T_01123_90_hiphop_M.jpg', 'T_01514_50_ivy_M.jpg', 'T_06910_50_classic_W.jpg', 'T_07990_60_mods_M.jpg', 'T_14538_00_cityglam_W.jpg', 'T_21986_70_hippie_M.jpg', 'T_21988_70_hippie_M.jpg', 'T_21992_70_hippie_M.jpg', 'W_00004_50_ivy_M.jpg', 'W_00012_50_ivy_M.jpg', 'W_00028_50_ivy_M.jpg', 'W_00033_60_mods_M.jpg', 'W_00073_50_ivy_M.jpg', 'W_00103_70_hippie_M.jpg', 'W_00117_19_normcore_M.jpg', 'W_00152_50_feminine_W.jpg', 'W_00161_60_space_W.jpg', 'W_00191_10_sportivecasual_W.jpg', 'W_00299_50_feminine_W.jpg', 'W_00351_70_hippie_W.jpg', 'W_00359_90_grunge_W.jpg', 'W_00366_60_minimal_W.jpg', 'W_00492_50_ivy_M.jpg', 'W_00496_60_mods_M.jpg', 'W_00511_90_hiphop_M.jpg', 'W_00539_10_sportivecasual_M.jpg', 'W_00540_90_hiphop_M.jpg', 'W_00551_19_normcore_M.jpg', 'W_00598_19_normcore_W.jpg', 'W_00624_80_bodyconscious_W.jpg', 'W_00625_80_bodyconscious_W.jpg', 'W_00709_60_popart_W.jpg', 'W_00716_60_minimal_W.jpg', 'W_00804_50_ivy_M.jpg', 'W

In [131]:
# Compute a predicted rating for every item in unrated_items
for item in unrated_items:
    # Similarities of all items to the unrated item
    similar_items = v_item_similarity_df[item]

    # Keep the top 5% most similar items (open question: is 5% adequate?)
    # (5% was chosen because a single user rated at most about 15 items in this data)
    similar_items_sorted = similar_items.sort_values(ascending=False)
    top_n_items = int(len(similar_items_sorted) * 0.05)
    top_similar_items = similar_items_sorted.iloc[:top_n_items]

    # Predicted rating = average of the user's ratings weighted by item similarity
    weighted_ratings_sum = 0 # numerator
    similarity_sum = 0 # denominator

    for item_id in top_similar_items.index:
        # Only items the user has actually rated contribute
        if v_mask.loc[item_id, user_id] == False:
            similarity = top_similar_items[item_id]
            # .values on a row of the one-column frame yields a 1-element array,
            # so the prediction below is an array whenever this branch runs
            rating = user_ratings.loc[item_id].values
            weighted_ratings_sum += similarity * rating
            similarity_sum += similarity

    if similarity_sum != 0:
        predicted_rating = weighted_ratings_sum / similarity_sum
    else:
        # No rated neighbour with non-zero similarity: fall back to a plain 0
        predicted_rating = 0

    # Store the predicted rating
    predicted_ratings[item] = predicted_rating

In [132]:
# Sort the predicted ratings in descending order and look at the top entries
top_recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
top_recommendations[:10]

[('T_00253_60_popart_W.jpg', array([1.])),
 ('W_10028_50_classic_W.jpg', array([1.])),
 ('W_44918_60_minimal_W.jpg', array([1.])),
 ('T_00456_10_sportivecasual_M.jpg', 0),
 ('T_01123_90_hiphop_M.jpg', 0),
 ('T_01514_50_ivy_M.jpg', 0),
 ('T_06910_50_classic_W.jpg', 0),
 ('T_07990_60_mods_M.jpg', 0),
 ('T_14538_00_cityglam_W.jpg', 0),
 ('T_21986_70_hippie_M.jpg', 0)]

In [139]:
# Among the items the user has not rated, recommend those whose predicted rating is non-zero.
# `score != 0` short-circuits for the plain-int 0 entries, so `score[0]` is only
# evaluated on the array-valued scores.
recommended_items = [item for item, score in top_recommendations if score != 0 and score[0] != 0]
recommended_items

['T_00253_60_popart_W.jpg',
 'W_10028_50_classic_W.jpg',
 'W_44918_60_minimal_W.jpg']

In [140]:
# Report the recommended items for the selected user
print(f"사용자 {user_id}에게 추천하는 item:\n{recommended_items}")

사용자 64747에게 추천하는 item:
['T_00253_60_popart_W.jpg', 'W_10028_50_classic_W.jpg', 'W_44918_60_minimal_W.jpg']


### 추천한 이미지가 해당 user가 선호한 이미지와 비슷한지 확인해보자