## 선호장르 순위에 점수부여
1. 선호장르에 순위를 매기고 점수를 부여해서 각각 사용자별로 유사도 점수계산
2. 입력받은 임의의 사용자와 비슷한 유사도 가진 사람 10명의 청취기록 추출
   (유사한 사람의 청취기록이 없을 시 그 다음으로 유사한 사람불러와서 10명채움)
3. 임의의 사용자의 선호장르 순위 가져와서 청취목록중에서 가장높은 점수가진 20곡 추천

In [29]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# CSV 파일 읽기
like_genre_df = pd.read_csv('like_genre_df_a.csv')
mml_info_df = pd.read_csv('mml_info_a.csv')


# 점수 매핑을 위한 딕셔너리 생성
priority_scores = {1: 5, 2: 4, 3: 3, 4: 2, 5: 1}

# PRIORITY 컬럼에 따라 SCORE 컬럼 생성
like_genre_df['SCORE'] = like_genre_df['PRIORITY'].map(priority_scores)

# 유저별 장르 점수 피벗 테이블 생성
user_genre_score_table = like_genre_df.pivot_table(index='USER_ID_NEW', 
                                                   columns='GENRE_NAME', 
                                                   values='SCORE', 
                                                   fill_value=0, 
                                                   aggfunc='sum')

# 코사인 유사도 계산
cosine_sim = cosine_similarity(user_genre_score_table)


# 유사도 행렬을 DataFrame으로 변환
cosine_sim_df = pd.DataFrame(cosine_sim, 
                             index=user_genre_score_table.index, 
                             columns=user_genre_score_table.index)


In [30]:
cosine_sim_df

USER_ID_NEW,00C9LUAB,00GV0hH5,00VkSb7E,00Wrdzp6,00Y4Fyc0,00ZDycPL,00daK56D,00qKj1UD,00uJu9nC,012crtJW,...,zzDzO5DE,zzEm7r9J,zzN5QFnv,zzRcO1Oq,zzXqgiZT,zziHtQ88,zzp13r0n,zzq2i5A7,zzu5cY0U,zzxq1lih
USER_ID_NEW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00C9LUAB,1.000000,0.672727,0.563636,0.527273,0.436364,0.618182,0.327273,0.272727,0.345455,0.654545,...,0.800000,0.672727,0.509091,0.490909,0.945455,0.727273,0.472727,0.236364,0.763636,0.400000
00GV0hH5,0.672727,1.000000,0.454545,0.600000,0.800000,0.781818,0.654545,0.709091,0.345455,0.581818,...,0.927273,0.945455,0.709091,0.709091,0.854545,0.745455,0.618182,0.509091,0.454545,0.600000
00VkSb7E,0.563636,0.454545,1.000000,0.345455,0.327273,0.618182,0.509091,0.527273,0.236364,0.472727,...,0.400000,0.454545,0.400000,0.636364,0.509091,0.727273,0.618182,0.527273,0.509091,0.654545
00Wrdzp6,0.527273,0.600000,0.345455,1.000000,0.454545,0.436364,0.236364,0.290909,0.363636,0.600000,...,0.672727,0.672727,0.527273,0.381818,0.563636,0.654545,0.363636,0.200000,0.381818,0.381818
00Y4Fyc0,0.436364,0.800000,0.327273,0.454545,1.000000,0.854545,0.854545,0.818182,0.254545,0.418182,...,0.690909,0.781818,0.618182,0.654545,0.618182,0.618182,0.636364,0.672727,0.236364,0.654545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zziHtQ88,0.727273,0.745455,0.727273,0.654545,0.618182,0.781818,0.418182,0.436364,0.436364,0.763636,...,0.781818,0.854545,0.709091,0.490909,0.745455,1.000000,0.454545,0.290909,0.563636,0.509091
zzp13r0n,0.472727,0.618182,0.618182,0.363636,0.636364,0.581818,0.818182,0.854545,0.236364,0.400000,...,0.545455,0.545455,0.472727,0.981818,0.545455,0.454545,1.000000,0.927273,0.236364,0.981818
zzq2i5A7,0.236364,0.509091,0.527273,0.200000,0.672727,0.563636,0.909091,0.927273,0.109091,0.181818,...,0.345455,0.400000,0.345455,0.909091,0.345455,0.290909,0.927273,1.000000,0.072727,0.927273
zzu5cY0U,0.763636,0.454545,0.509091,0.381818,0.236364,0.381818,0.145455,0.090909,0.454545,0.709091,...,0.581818,0.472727,0.218182,0.236364,0.690909,0.563636,0.236364,0.072727,1.000000,0.163636


In [32]:
def get_top_10_similar_users_listening(user_id, mml_info_df, cosine_sim_df):
    # 입력받은 사용자와 가장 유사도가 높은 사용자들 찾기 (충분히 큰 숫자로 범위 확장)
    similar_users = cosine_sim_df[user_id].sort_values(ascending=False).index.tolist()
    similar_users.remove(user_id)  # 입력받은 사용자 제거

    # 유사한 사용자들의 청취 목록 찾기
    listening_lists = []
    for similar_user in similar_users:
        if len(listening_lists) < 10:
            user_listening = mml_info_df[mml_info_df['USER_ID_NEW'] == similar_user]
            if not user_listening.empty:
                listening_lists.append(user_listening)
        else:
            break

    return pd.concat(listening_lists) if listening_lists else pd.DataFrame()  # 청취 목록 반환

# 예제 사용자 ID (임의의 사용자 ID를 여기에 입력)
user_id = '012crtJW'  # 이 부분을 실제 사용자 ID로 변경

# 유사한 사용자들의 청취 목록 가져오기
top_10_similar_users_listening = get_top_10_similar_users_listening(user_id, mml_info_df, cosine_sim_df)

# 결과 출력
print(top_10_similar_users_listening)


                       Title                    Artist     Genre  play_time  \
8017                Crusader  Washington Marching Band  OST / 전체        273   
8740             El Capitano  Washington Marching Band  OST / 전체        157   
15087            BE YOURSELF           Various Artists   POP / 락        175   
17152               NO ENTRY           Various Artists   POP / 락        253   
18611   What A Beautiful Day             The Levellers   POP / 락        225   
...                      ...                       ...       ...        ...   
109930           그대 사랑안에 머물러                       김정민  가요 / 발라드        133   
121790                 난 너에게                       민들레  가요 / 발라드         94   
121878            오비이락(烏飛梨落)                     더 클래식  가요 / 발라드         90   
136263            아껴둔 사랑을 위해                       이주원   가요 / 전체        237   
137574                  자유시대                      모자이크   가요 / 전체         90   

              created_at USER_ID_NEW  
8017    2021

In [33]:
def get_top_20_songs_by_genre_preference(user_id, user_genre_score_table, listening_lists):
    # 예제 사용자의 장르 선호도 순위 가져오기
    user_preferences = user_genre_score_table.loc[user_id].sort_values(ascending=False)

    # 청취 목록에서 선호 장르에 해당하는 곡들 찾기
    preferred_songs = listening_lists[listening_lists['Genre'].isin(user_preferences.index)]

    # 장르 선호도를 기준으로 곡들을 정렬하고 상위 20곡 선택
    preferred_songs['genre_preference_score'] = preferred_songs['Genre'].map(user_preferences)
    top_20_songs = preferred_songs.sort_values(by='genre_preference_score', ascending=False).head(20)

    return top_20_songs

# 예제 사용자의 장르 순위가 가장 높은 곡 20곡 추출
top_20_songs_by_genre_preference = get_top_20_songs_by_genre_preference(user_id, user_genre_score_table, top_10_similar_users_listening)

# 결과 출력
print(top_20_songs_by_genre_preference)


                Title        Artist     Genre  play_time        created_at  \
121878     오비이락(烏飛梨落)         더 클래식  가요 / 발라드         90  2022-12-24 13:46   
116077            씽씽씽           동물원  가요 / 발라드        229  2021-03-15 18:43   
103694            은하수           박상민  가요 / 발라드        268  2021-08-09 14:08   
104285             미련           노영심  가요 / 발라드        259  2021-08-15 16:23   
108837     Summertime           신효범  가요 / 발라드        191   2021-08-10 5:37   
111824            긴 잠           INO  가요 / 발라드        410  2021-08-21 22:50   
116653  너의 마음을 내게 준다면  핑클 (Fin.K.L)  가요 / 발라드        223   2021-08-10 4:27   
113690          깊은 슬픔           양진석  가요 / 발라드        264   2021-05-20 9:55   
109240         너의 뒤에서  핑클 (Fin.K.L)  가요 / 발라드        177  2021-05-18 19:03   
107649            눈동자  핑클 (Fin.K.L)  가요 / 발라드        213  2021-05-18 19:07   
118526        다시 찾은 꿈           홍성수  가요 / 발라드        202   2021-08-10 3:26   
120811          너의 사랑            제이  가요 / 발라드        146  2021-0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preferred_songs['genre_preference_score'] = preferred_songs['Genre'].map(user_preferences)
