In [11]:
import sqlite3
import numpy as np
import time

In [2]:
import pandas as pd

def load_data(path=None):
    if path is None:
        path = "../data/train_triplets.txt"

    data = pd.read_csv(path, sep="\t", names=["user_id", "song_id", "play_count"])
    print(data.head(5))

    return data

In [3]:
data = load_data()

                                    user_id             song_id  play_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995           1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9           1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22           1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494           1


In [4]:
print(data.shape)

(48373586, 3)


In [None]:
def analyse_play_count_by(data, column_to_group, column_to_sum, ascending=False):
    """
    按照指定的列分组统计，并进行降序排列
    """
    result = data.groupby(column_to_group)[column_to_sum].sum().reset_index()
    result = result.sort_values(by=column_to_sum, ascending=ascending)
    return result

In [6]:
# 以用户为维度，统计播放量
play_count_by_user = analyse_play_count_by(data, "user_id", "play_count", False)
print(play_count_by_user.head(5))

                                         user_id  play_count
36591   093cb74eb3c517c5179ae24caf0ebec51b24d2a2       13132
69497   119b7c88d58d0c6eb051365c103da5caf817bea6        9884
252820  3fa44653315697f42410a30cb766a4eb102080bb        8210
646483  a2679496cd0af9779a92a13ff7c6af5c81ea8c7b        7015
859158  d7d2d888ae04d16e994d6964214a1de81392ee04        6494


In [7]:
# 以歌曲为维度，统计播放量
play_count_by_song = analyse_play_count_by(data, "song_id", "play_count", False)
print(play_count_by_song.head(5))

                   song_id  play_count
25043   SOBONKR12A58A7A7E0      726885
12936   SOAUWYT12A81C206F1      648239
287415  SOSXLTC12AF72A7F54      527893
90798   SOFRQTD12A81C233C0      425463
67917   SOEGIYH12A6D4FC0E3      389880


In [14]:
# 取前10W个用户、前3W首歌曲
user_id_list = play_count_by_user.head(100000) # 活跃用户
song_id_list = play_count_by_song.head(10000) # 热门歌曲

print(user_id_list.shape)
print(song_id_list.shape)

(100000, 2)
(10000, 2)


In [9]:
# 读取歌曲详细信息数据
# 处理 track_metadata.db
db_file = "../data/track_metadata.db"
# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# table_names = cursor.fetchall()
# print("数据库中的表:", table_names)
# 获取表的创建语句
# sql_column = "select sql from sqlite_master where type='table' and name='songs'"
# cursor.execute(sql_column)
# create_statement = cursor.fetchall()[0]
# print(f"表songs的创建语句：{create_statement}")

# head_sql = "select * from songs limit 5"
# cursor.execute(head_sql)
# result = cursor.fetchall()
# print(f"songs表的前5行数据：{result}")

def get_song_info(db_file, table_name, column_names, condition):
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()

    sql = ""
    if 1 == len(column_names):
        sql = f"select {column_names} from {table_name} where condition"
    else:
        sql = f"select {", ".join(column_names)} from {table_name} where condition"
    
    cursor.execute(sql)
    result = cursor.fetchall()

    cursor.close()
    conn.close()

    return result


In [15]:
# 统计歌曲相似度
def calc_song_played_user_count(song_played_id, play_data):
    """
    获取歌曲的播放用户id列表
    """
    song_played_user_list = play_data[play_data["song_id"] == song_played_id]["user_id"].to_list()

    return song_played_user_list

def calc_similarity(song_played_id, recommend_list, play_data):
    """
    根据用户的播放记录，统计已播放歌曲，与推荐列表中的歌曲的相似度
    按照歌曲为维度，统计歌曲相似度：两首歌的相同用户数 / 两首歌的总用户数
    """
    max_similarity = 0
    max_id = 0
    start_time = time.time()

    song_played_user_list = calc_song_played_user_count(song_played_id, play_data)

    for index, song_id in enumerate(recommend_list):
        if index % 1000 == 0:
            end_time = time.time()
            print(f"比较进度: {index + 1} | 当前耗时：{end_time - start_time:.2f}")
            start_time = time.time()
        
        if song_played_id == song_id: # 相同歌曲不比较
            continue

        other_played_list = calc_song_played_user_count(song_id, play_data)

        common_count = len(set(song_played_user_list) & set(other_played_list))
        total_count = len(set(song_played_user_list) | set(other_played_list))

        similarity = common_count / total_count
        if similarity > max_similarity:
            max_similarity = similarity
            max_id = song_id

    return max_similarity, max_id

unique_hot_song = song_id_list["song_id"].unique() # 以热门歌曲为潜在推荐列表，从其中找出与用户已听音乐相似的歌曲
# unique_hot_song = song_id_list.head(10)["song_id"].unique()
print(f"开始加载活跃用户的播放记录……")
hot_play_data = data.loc[data["user_id"].isin(user_id_list["user_id"])]
print(f"加载活跃用户的播放记录：完成")

result, most_recomend_song_id = calc_similarity("SOBONKR12A58A7A7E0", unique_hot_song, hot_play_data)

print(result)
print(most_recomend_song_id)

开始加载活跃用户的播放记录……
加载活跃用户的播放记录：完成
比较进度: 1 | 当前耗时：2.32
比较进度: 1001 | 当前耗时：1397.81
比较进度: 2001 | 当前耗时：1348.02
比较进度: 3001 | 当前耗时：1402.26
比较进度: 4001 | 当前耗时：1419.25
比较进度: 5001 | 当前耗时：1439.88
比较进度: 6001 | 当前耗时：1243.34
比较进度: 7001 | 当前耗时：1352.83
比较进度: 8001 | 当前耗时：1252.67
比较进度: 9001 | 当前耗时：1309.55
0.49661746617466174
SOAUWYT12A81C206F1


In [None]:
import time
start_time = time.time()
end_time = time.time()
print(f"当前耗时：{end_time - start_time}")