# 상위 100명의 선호도 정보 저장

In [17]:
import pandas as pd

def make_top100_csv(train_csv, val_csv, top_n=100,
                    train_out='top100_train_preference.csv',
                    val_out='top100_val_preference.csv'):
    """Write the preference rows of the most active respondents to CSV.

    Respondents are ranked by how many non-null '스타일 선호 여부' entries they
    have in the train and validation files combined. The rows belonging to
    the ``top_n`` highest-ranked respondents are then written out, one file
    per split.

    Both input files must contain the columns '응답자 ID' and
    '스타일 선호 여부'.

    Args:
        train_csv: Path of the training preference CSV.
        val_csv: Path of the validation preference CSV.
        top_n: Number of respondents to keep. Defaults to 100.
        train_out: Destination path for the filtered training rows.
        val_out: Destination path for the filtered validation rows.

    Returns:
        None. The results are written to ``train_out`` and ``val_out``.
    """
    id_col = '응답자 ID'
    pref_col = '스타일 선호 여부'
    train_total = 'train 설문 응답 수'
    val_total = 'val 설문 응답 수'

    df_train = pd.read_csv(train_csv)
    df_val = pd.read_csv(val_csv)

    # Per-respondent response counts; renamed so the two series do not
    # collide when placed side by side.
    train_count = df_train.groupby(id_col)[pref_col].count().rename(train_total)
    val_count = df_val.groupby(id_col)[pref_col].count().rename(val_total)

    # A respondent present in only one split gets NaN for the other one;
    # treat that as zero responses.
    df_sum = pd.concat([train_count, val_count], axis=1).fillna(0).astype(int)
    df_sum['합계'] = df_sum[train_total] + df_sum[val_total]

    # A stable sort keeps tied respondents in their existing index order, so
    # the selection at the cut-off is reproducible between runs.
    df_sum = df_sum.sort_values(by='합계', ascending=False, kind='stable')

    top_ids = df_sum.head(top_n).index.tolist()

    # Keep only the rows of the selected respondents in each split.
    top_train_df = df_train[df_train[id_col].isin(top_ids)].reset_index(drop=True)
    top_val_df = df_val[df_val[id_col].isin(top_ids)].reset_index(drop=True)

    top_train_df.to_csv(train_out, index=False)
    top_val_df.to_csv(val_out, index=False)


# Paths of the CSV files produced in Mission 2-2
t_pref = 'train_preference.csv'
v_pref = 'val_preference.csv'

# One-off generation of the top-100 CSVs. Left commented out -- presumably
# because the output files already exist; uncomment to regenerate them.
#make_top100_csv(t_pref, v_pref)

# Item-based filtering

### (1) 데이터 준비: user-item 행렬 생성

In [26]:
import pandas as pd

# Load the preference rows of the top-100 respondents (train / validation)
t_top100_pref = pd.read_csv('top100_train_preference.csv')
v_top100_pref = pd.read_csv('top100_val_preference.csv')

# Binarise the preference label: 1 for '스타일 선호', 0 for anything else
t_top100_pref['스타일 선호 여부'] = (
    t_top100_pref['스타일 선호 여부'].eq('스타일 선호').astype('int64')
)
v_top100_pref['스타일 선호 여부'] = (
    v_top100_pref['스타일 선호 여부'].eq('스타일 선호').astype('int64')
)

# Build the user-item matrices (rows: respondent, columns: image file).
# Respondent/file pairs with no response are filled with 0.
t_user_item_matrix = t_top100_pref.pivot_table(
    values='스타일 선호 여부', index='응답자 ID', columns='파일명'
).fillna(0)
v_user_item_matrix = v_top100_pref.pivot_table(
    values='스타일 선호 여부', index='응답자 ID', columns='파일명'
).fillna(0)

# Item-based filtering compares items, and sklearn's cosine similarity works
# row by row, so flip the matrices to item-user orientation.
t_item_user_matrix = t_user_item_matrix.transpose()
v_item_user_matrix = v_user_item_matrix.transpose()

In [33]:
# Inspect the train item-user matrix (rows: 파일명, columns: 응답자 ID)
t_item_user_matrix

응답자 ID,368,837,7658,7905,9096,20768,21432,22324,28371,28571,...,64633,64662,64747,65071,65139,66469,66513,66592,66731,67975
파일명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T_00253_60_popart_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
T_00456_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_00588_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_00770_60_minimal_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
T_00893_90_hiphop_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W_71923_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
W_71933_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
W_71934_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
W_71935_60_mods_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
# Inspect the validation item-user matrix (rows: 파일명, columns: 응답자 ID)
v_item_user_matrix

응답자 ID,368,837,7658,7905,9096,20768,21432,22324,28371,28571,...,64633,64662,64747,65071,65139,66469,66513,66592,66731,67975
파일명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T_00253_60_popart_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
T_00456_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_01123_90_hiphop_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_01514_50_ivy_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
T_06910_50_classic_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W_63188_90_kitsch_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
W_63644_10_sportivecasual_M.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
W_64047_10_sportivecasual_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
W_64332_80_powersuit_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### (2) 유사도 계산: 아이템 간 유사도를 계산하여 비슷한 사용자나 아이템을 찾음

In [35]:
# Compute cosine similarity between items.
# TODO(review): confirm that cosine is the appropriate similarity metric here.
from sklearn.metrics.pairwise import cosine_similarity

# Each input has one row per item (파일명), so each result is an
# item-by-item similarity array.
t_item_similarity_matrix = cosine_similarity(t_item_user_matrix)
v_item_similarity_matrix = cosine_similarity(v_item_user_matrix)

In [41]:
# Wrap the validation similarity array in a DataFrame labelled by item
# (파일명) on both axes so it is easier to look up and manipulate.
v_item_similarity_df = pd.DataFrame(v_item_similarity_matrix, index=v_item_user_matrix.index, columns=v_item_user_matrix.index)

# Show the first rows of the item similarity matrix as a sanity check
v_item_similarity_df.head()

파일명,T_00253_60_popart_W.jpg,T_00456_10_sportivecasual_M.jpg,T_01123_90_hiphop_M.jpg,T_01514_50_ivy_M.jpg,T_06910_50_classic_W.jpg,T_07990_60_mods_M.jpg,T_14538_00_cityglam_W.jpg,T_21986_70_hippie_M.jpg,T_21988_70_hippie_M.jpg,T_21992_70_hippie_M.jpg,...,W_60553_00_cityglam_W.jpg,W_61255_00_cityglam_W.jpg,W_61790_10_sportivecasual_W.jpg,W_62253_19_lounge_W.jpg,W_62313_00_oriental_W.jpg,W_63188_90_kitsch_W.jpg,W_63644_10_sportivecasual_M.jpg,W_64047_10_sportivecasual_W.jpg,W_64332_80_powersuit_W.jpg,W_65122_10_sportivecasual_W.jpg
파일명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T_00253_60_popart_W.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_00456_10_sportivecasual_M.jpg,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_01123_90_hiphop_M.jpg,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_01514_50_ivy_M.jpg,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T_06910_50_classic_W.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
