In [92]:
import pandas as pd

# Load and display the first few rows of the dataset to understand its structure
file_path = 'like_genre_df_new.csv'
data = pd.read_csv(file_path)
data.head(10)


Unnamed: 0,PRIORITY,GENRE_NAME,USER_ID_NEW
0,1,Ballade,00C9LUAB
1,2,rock & metal,00C9LUAB
2,3,R&B/Soul,00C9LUAB
3,4,electronic,00C9LUAB
4,5,dance,00C9LUAB
5,1,dance,00daK56D
6,2,J-POP,00daK56D
7,3,rap & hiphop,00daK56D
8,4,Ballade,00daK56D
9,5,R&B/Soul,00daK56D


In [68]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# 데이터 전처리
# 각 사용자별로 장르에 대한 선호도 벡터를 생성

# 우선 순위에 따라 점수를 반대로 부여 (낮은 우선 순위에 높은 점수)
max_priority = data['PRIORITY'].max()
data['SCORE'] = max_priority - data['PRIORITY'] + 1

# 사용자별로 장르 선호도 행렬을 생성
user_genre_matrix = data.pivot_table(index='USER_ID_NEW', columns='GENRE_NAME', values='SCORE', fill_value=0)

# MinMaxScaler를 사용하여 각 사용자의 점수를 0과 1 사이로 정규화
scaler = MinMaxScaler()
user_genre_matrix_scaled = pd.DataFrame(
    scaler.fit_transform(user_genre_matrix),
    index=user_genre_matrix.index,
    columns=user_genre_matrix.columns
)

user_genre_matrix_scaled.head()


GENRE_NAME,Ballade,CCM,Gugak,J-POP,New Age,OST,POP,R&B/Soul,children,classic,...,electronic,fork & blues,game,indie,jazz,musical,rap & hiphop,rock & metal,trot,world music
USER_ID_NEW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00C9LUAB,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0
00GV0hH5,0.8,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.6,0.0,0.0
00VkSb7E,0.6,0.0,0.0,0.0,0.0,0.4,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0
00Wrdzp6,0.8,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,1.0,0.0
00Y4Fyc0,0.6,0.0,0.0,0.8,0.0,0.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0


In [69]:
from sklearn.metrics.pairwise import cosine_similarity

# 사용자 간의 코사인 유사도 계산
cosine_sim = cosine_similarity(user_genre_matrix_scaled)
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_genre_matrix_scaled.index, columns=user_genre_matrix_scaled.index)

# 결과 확인
cosine_sim_df.head()


USER_ID_NEW,00C9LUAB,00GV0hH5,00VkSb7E,00Wrdzp6,00Y4Fyc0,00ZDycPL,00daK56D,00qKj1UD,00uJu9nC,012crtJW,...,zzDzO5DE,zzEm7r9J,zzN5QFnv,zzRcO1Oq,zzXqgiZT,zziHtQ88,zzp13r0n,zzq2i5A7,zzu5cY0U,zzxq1lih
USER_ID_NEW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00C9LUAB,1.0,0.672727,0.563636,0.527273,0.436364,0.618182,0.327273,0.272727,0.345455,0.654545,...,0.8,0.672727,0.509091,0.490909,0.945455,0.727273,0.472727,0.236364,0.763636,0.4
00GV0hH5,0.672727,1.0,0.454545,0.6,0.8,0.781818,0.654545,0.709091,0.345455,0.581818,...,0.927273,0.945455,0.709091,0.709091,0.854545,0.745455,0.618182,0.509091,0.454545,0.6
00VkSb7E,0.563636,0.454545,1.0,0.345455,0.327273,0.618182,0.509091,0.527273,0.236364,0.472727,...,0.4,0.454545,0.4,0.636364,0.509091,0.727273,0.618182,0.527273,0.509091,0.654545
00Wrdzp6,0.527273,0.6,0.345455,1.0,0.454545,0.436364,0.236364,0.290909,0.363636,0.6,...,0.672727,0.672727,0.527273,0.381818,0.563636,0.654545,0.363636,0.2,0.381818,0.381818
00Y4Fyc0,0.436364,0.8,0.327273,0.454545,1.0,0.854545,0.854545,0.818182,0.254545,0.418182,...,0.690909,0.781818,0.618182,0.654545,0.618182,0.618182,0.636364,0.672727,0.236364,0.654545


In [93]:
def find_similar_users(user_id, similarity_matrix, top_n=10):
    """
    주어진 사용자 ID에 대해 가장 유사한 사용자 top_n명을 찾는 함수.

    :param user_id: 찾고자 하는 사용자의 ID
    :param similarity_matrix: 사용자 간의 유사도를 나타내는 DataFrame
    :param top_n: 반환할 상위 유사 사용자의 수
    :return: 가장 유사한 사용자 top_n명의 ID와 유사도
    """
    if user_id in similarity_matrix.index:
        # 사용자 ID에 대한 유사도 점수를 모두 가져옴 (자기 자신 제외)
        sim_scores = similarity_matrix[user_id].drop(index=[user_id])
        # 가장 유사한 사용자 top_n명을 찾음
        top_users = sim_scores.sort_values(ascending=False).head(top_n)
        return top_users
    else:
        return "입력된 사용자 ID가 데이터에 없습니다."

# 사용자로부터 입력받기
input_user_id = input("사용자 ID를 입력하세요: ")

# 입력받은 사용자 ID에 대한 유사한 사용자 찾기
similar_users = find_similar_users(input_user_id, cosine_sim_df, top_n=10)
print(similar_users)


사용자 ID를 입력하세요: 012crtJW
USER_ID_NEW
guBgDmqd    1.000000
34JJMMw4    0.981818
EnbckhnR    0.981818
onxMA9oN    0.981818
ro3FOKBs    0.981818
ThmZH6GH    0.981818
F0jOVvJY    0.981818
RDKDAWCW    0.981818
wnBgkeyV    0.981818
dahZpJYS    0.981818
Name: 012crtJW, dtype: float64


In [94]:
# user_music_log.csv 파일 로드 및 첫 몇 줄을 표시하여 데이터 구조 확인
music_log_path = 'user_music_log.csv'
music_log_data = pd.read_csv(music_log_path)
music_log_data.head()


Unnamed: 0,SONG_TITLE,ARTIST_NAME,play_time,created_at,USER_ID_NEW
0,Back Porch (Live),Presidents Of The United States Of America,306,2021-03-24 21:26,00daK56D
1,Back Porch (Live),Presidents Of The United States Of America,306,2021-02-24 21:25,00daK56D
2,Back Porch (Live),Presidents Of The United States Of America,306,2021-02-12 15:09,00daK56D
3,Back Porch (Live),Presidents Of The United States Of America,306,2021-02-01 20:03,00daK56D
4,Back Porch (Live),Presidents Of The United States Of America,306,2021-01-24 14:35,00daK56D


In [95]:
# 이전 단계에서 찾은 유사한 사용자 10명의 ID 추출
similar_user_ids = similar_users.index.tolist()

# 유사한 사용자들의 음악 로그 필터링
similar_users_music_log = music_log_data[music_log_data['USER_ID_NEW'].isin(similar_user_ids)]

# 각 곡별로 청취 횟수 계산
top_songs = similar_users_music_log.groupby(['SONG_TITLE', 'ARTIST_NAME']).size().sort_values(ascending=False).head(10)
top_songs


SONG_TITLE                  ARTIST_NAME     
Release (Live)              Vashawn Mitchell    22
Steps In The Sand           Wayne Gratz         21
행복한 추억의 시간                  엄지은                 19
심청가 -- 심청이 인당수에 뛰어드는데       김수연                 16
한오백년                        김영임                  6
Butterfly Masters           Various Artists      4
Et Moi...                   Edith Piaf           4
나랑 산책할래요? (Vietato Fumare)  델리스파이스               4
깊은 슬픔                       양진석                  4
PERFUME                     Various Artists      3
dtype: int64

In [96]:
top_songs_per_user = {}
number_of_users_to_check = 10  # 확인할 유사 사용자 수
top_songs_per_user_limit = 10  # 추출할 노래의 상한선

# `music_log_data`와 `find_similar_users` 함수가 정의되었다고 가정합니다.
# `similar_user_ids`가 정의되었다고 가정합니다.

# 유사한 사용자 ID 목록을 초기화합니다.
checked_user_count = 0

for user_id in similar_user_ids:
    # 해당 사용자의 청취 기록 필터링
    user_music_log = music_log_data[music_log_data['USER_ID_NEW'] == user_id]   

    # 청취 기록이 있는 경우에만 진행
    if not user_music_log.empty:
        # 각 곡별로 청취 횟수를 계산하고 상위 10곡을 추출
        top_songs_user = user_music_log.groupby(['SONG_TITLE', 'ARTIST_NAME']).size().sort_values(ascending=False).head(top_songs_per_user_limit)
        top_songs_per_user[user_id] = top_songs_user
        checked_user_count += 1  # 처리된 유사 사용자 수를 업데이트합니다.
    else:
        # 청취 기록이 없는 유사한 사용자에 대한 처리
        print(f"No data available for user {user_id}. Moving to next similar user.")

    # 유사 사용자 처리 수가 목표치에 도달하면 반복문 종료
    if checked_user_count >= number_of_users_to_check:
        break

# 모든 유사 사용자를 확인한 후, 원하는 수의 사용자 데이터를 얻지 못한 경우 처리
if checked_user_count < number_of_users_to_check:
    print(f"Only found data for {checked_user_count} similar users out of the desired {number_of_users_to_check}.")

top_songs_per_user


No data available for user guBgDmqd. Moving to next similar user.
No data available for user EnbckhnR. Moving to next similar user.
No data available for user ThmZH6GH. Moving to next similar user.
No data available for user wnBgkeyV. Moving to next similar user.
No data available for user dahZpJYS. Moving to next similar user.
Only found data for 5 similar users out of the desired 10.


{'34JJMMw4': SONG_TITLE                  ARTIST_NAME    
 Butterfly Masters           Various Artists    1
 I Got A Reason              Journey            1
 나랑 산책할래요? (Vietato Fumare)  델리스파이스             1
 사랑은 계절따라                    Various Artists    1
 dtype: int64,
 'onxMA9oN': SONG_TITLE  ARTIST_NAME
 한오백년        김영임            4
 Dat         Alexian        1
 dtype: int64,
 'ro3FOKBs': SONG_TITLE                    ARTIST_NAME     
 Release (Live)                Vashawn Mitchell    22
 Steps In The Sand             Wayne Gratz         21
 행복한 추억의 시간                    엄지은                 19
 심청가 -- 심청이 인당수에 뛰어드는데         김수연                 16
 Et Moi...                     Edith Piaf           4
 With This Love                Joy Enriquez         3
 PERFUME                       Various Artists      3
 PITTSBURGH                    Various Artists      3
 Polvere Di Stelle (Stardust)  Henghel Gualdi       3
 Porno Star                    Buckcherry           3
 dtype: int64,
 