In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import sqlite3
from sklearn.preprocessing import LabelEncoder
from scipy.sparse.linalg import svds

In [4]:
def load_play_data(path=None):
    """
    加载播放数据
    """
    if path is None:
        path = "../data/train_triplets.txt"
    
    data = pd.read_csv(path, sep="\t", names=["user_id", "song_id", "play_count"
    ])

    return data

In [3]:
play_data = load_play_data()

In [5]:
def read_song_info(table_name, song_columns, song_db=None):
    """
    读取歌曲数据库的指定字段，并转换成dataframe对象
    """
    if song_db is None:
        song_db = "../data/track_metadata.db"
    
    conn = sqlite3.connect(song_db)
    cursor = conn.cursor()
    sql = f"select {', '.join(song_columns)} from {table_name}"
    song_df = pd.read_sql_query(sql, conn)

    conn.close()

    return song_df

In [5]:
song_colunms = ["song_id", "title"]
song_data = read_song_info("songs", song_colunms)

In [6]:
print(play_data.shape)
print(song_data.shape)

(48373586, 3)
(1000000, 2)


In [7]:
print(play_data.head(5))

                                    user_id             song_id  play_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995           1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9           1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22           1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494           1


In [8]:
print(song_data.head(5))

              song_id              title
0  SOQMMHC12AB0180CB8       Silent Night
1  SOVFVAK12A8C1350D9        Tanssi vaan
2  SOGTUKN12AB017F4F1  No One Could Ever
3  SOBNYVR12A8C13558C      Si Vos Querés
4  SOHSBXH12A8C13B0DF   Tangle Of Aspens


In [6]:
def calc_song_score(play_data):
    # 1. 转换字符串列为分类类型 (减少内存+加速分组)
    play_data["user_id"] = play_data["user_id"].astype("category")
    play_data["song_id"] = play_data["song_id"].astype("category")

    # 2. 按用户+歌曲分组求和
    grouped = play_data.groupby(["user_id", "song_id"], as_index=False, observed=True)["play_count"].sum()
    
    # 3. 计算用户级总播放次数
    grouped["user_total"] = grouped.groupby("user_id", observed=True)["play_count"].transform("sum")
    
    # 4. 计算占比
    grouped["score"] = grouped["play_count"] / grouped["user_total"]
    
    return grouped[["user_id", "song_id", "score"]]

In [10]:
score_data = calc_song_score(play_data)

In [11]:
print(score_data.shape)

(48373586, 3)


In [13]:
print(score_data.head(10))

                                    user_id             song_id     score
0  00000b722001882066dff9d2da8a775658053ea0  SOBQJJX12A6D4F7F01  0.153846
1  00000b722001882066dff9d2da8a775658053ea0  SOBSSGK12A6D4F9EF1  0.076923
2  00000b722001882066dff9d2da8a775658053ea0  SOCTXQW12A6D4F70AD  0.076923
3  00000b722001882066dff9d2da8a775658053ea0  SOCZQCY12AC468E40F  0.076923
4  00000b722001882066dff9d2da8a775658053ea0  SOFLJQZ12A6D4FADA6  0.076923
5  00000b722001882066dff9d2da8a775658053ea0  SOJOJUN12A8AE47E1D  0.076923
6  00000b722001882066dff9d2da8a775658053ea0  SOKBXYC12A6D4F59D6  0.076923
7  00000b722001882066dff9d2da8a775658053ea0  SOMRTLE12A58A78D26  0.076923
8  00000b722001882066dff9d2da8a775658053ea0  SORDKNX12A8C13A45F  0.076923
9  00000b722001882066dff9d2da8a775658053ea0  SOTCWRK12AB017E47D  0.076923


In [12]:
print(score_data[score_data["user_id"].isin(["00000b722001882066dff9d2da8a775658053ea0", "00001638d6189236866af9bbf309ae6c2347ffdc"])])

                                     user_id             song_id     score
0   00000b722001882066dff9d2da8a775658053ea0  SOBQJJX12A6D4F7F01  0.153846
1   00000b722001882066dff9d2da8a775658053ea0  SOBSSGK12A6D4F9EF1  0.076923
2   00000b722001882066dff9d2da8a775658053ea0  SOCTXQW12A6D4F70AD  0.076923
3   00000b722001882066dff9d2da8a775658053ea0  SOCZQCY12AC468E40F  0.076923
4   00000b722001882066dff9d2da8a775658053ea0  SOFLJQZ12A6D4FADA6  0.076923
5   00000b722001882066dff9d2da8a775658053ea0  SOJOJUN12A8AE47E1D  0.076923
6   00000b722001882066dff9d2da8a775658053ea0  SOKBXYC12A6D4F59D6  0.076923
7   00000b722001882066dff9d2da8a775658053ea0  SOMRTLE12A58A78D26  0.076923
8   00000b722001882066dff9d2da8a775658053ea0  SORDKNX12A8C13A45F  0.076923
9   00000b722001882066dff9d2da8a775658053ea0  SOTCWRK12AB017E47D  0.076923
10  00000b722001882066dff9d2da8a775658053ea0  SOTDOKZ12A8C137FCD  0.076923
11  00000b722001882066dff9d2da8a775658053ea0  SOUBEXV12AB01804A4  0.076923
12  00001638d6189236866af

In [7]:
def fill_song_info(play_data, song_data, join_column):
    """
    填充歌曲信息
    """
    # 预处理歌曲数据
    song_lookup = (song_data
                   .drop_duplicates(join_column)  # 确保键唯一
                   .set_index(join_column)  # 建立索引
                   .astype({"title": "category"}))  # 类型优化
    
    # 分列映射避免内存溢出
    play_data["title"] = play_data[join_column].map(song_lookup["title"])
    # play_data["release"] = play_data[join_column].map(song_lookup["release"])
    # play_data["artist_name"] = play_data[join_column].map(song_lookup["artist_name"])
    # play_data["artist_familiarity"] = play_data[join_column].map(song_lookup["artist_familiarity"])
    # play_data["artist_hotttnesss"] = play_data[join_column].map(song_lookup["artist_hotttnesss"])
    
    return play_data

In [14]:
user_song_data = fill_song_info(score_data, song_data, "song_id")

In [15]:
print(user_song_data.head(5))

                                    user_id             song_id     score  \
0  00000b722001882066dff9d2da8a775658053ea0  SOBQJJX12A6D4F7F01  0.153846   
1  00000b722001882066dff9d2da8a775658053ea0  SOBSSGK12A6D4F9EF1  0.076923   
2  00000b722001882066dff9d2da8a775658053ea0  SOCTXQW12A6D4F70AD  0.076923   
3  00000b722001882066dff9d2da8a775658053ea0  SOCZQCY12AC468E40F  0.076923   
4  00000b722001882066dff9d2da8a775658053ea0  SOFLJQZ12A6D4FADA6  0.076923   

                            title  
0                  Rods And Cones  
1                           Heela  
2  Babe_ You Turn Me On (Paris 2)  
3                     Ela É Bamba  
4                        Tive Sim  


In [8]:
def create_label_encoder(data, column_name_list):
    """
    创建将user_id和song_id编码成数字的labelEncoder
    """
    encoder_map = {}
    for column_name in column_name_list:
        encoder = LabelEncoder()
        encoder.classes_ = data[column_name].cat.categories
        encoder_map[column_name] = encoder
    
    return encoder_map

In [17]:
encoder_map = create_label_encoder(user_song_data, ["user_id", "song_id"])

In [18]:
user_song_data["user_id"] = encoder_map["user_id"].transform(user_song_data["user_id"])

In [19]:
user_song_data["song_id"] = encoder_map["song_id"].transform(user_song_data["song_id"])

In [20]:
print(user_song_data.head(5))

   user_id  song_id     score                           title
0        0    26214  0.153846                  Rods And Cones
1        0    27715  0.076923                           Heela
2        0    44239  0.076923  Babe_ You Turn Me On (Paris 2)
3        0    47860  0.076923                     Ela É Bamba
4        0    87066  0.076923                        Tive Sim


In [9]:
def build_user_song_score_matrix(data, index, column, value):
    """
    构建用户-歌曲评分矩阵
    """
    rows = data[index].values
    cols = data[column].values
    scores = data[value].values

    # 获取矩阵维度
    n_users = data[index].max() + 1
    n_items = data[column].max() + 1
    

    return csr_matrix((scores, (rows, cols)), shape=(n_users, n_items))


In [22]:
user_song_csr_matrix = build_user_song_score_matrix(user_song_data, "user_id", "song_id", "score")

In [23]:
print(user_song_csr_matrix.shape)

(1019318, 384546)


In [24]:
def svd_divide(data, n_components):
    """
    将矩阵进行svd分解
    """
    U, s, Vt = svds(data, n_components)
    S = np.diag(s)

    return U, S, Vt

In [25]:
U, S, Vt = svd_divide(user_song_csr_matrix, 50)

In [26]:
print(U.shape)
print(S.shape)
print(Vt.shape)

(1019318, 50)
(50, 50)
(50, 384546)


In [27]:
def get_recomendation(user_id, U, S, Vt, user_encoder, song_encoder, used_items, recommd_num=10):
    """
    获取用户推荐列表
    """
    user_encoded_id = user_encoder.transform([user_id])[0]
    item_scores = U[user_encoded_id] @ S @ Vt

    used_items_encoded_id = song_encoder.transform(used_items)
    item_scores[used_items_encoded_id] = 0

    recommend_items = np.argsort(item_scores)[::-1][:recommd_num]

    orig_item_id = song_encoder.inverse_transform(recommend_items)

    return orig_item_id


In [28]:
user_id = "00000b722001882066dff9d2da8a775658053ea0"
user_encoder = encoder_map["user_id"]
song_encoder = encoder_map["song_id"]
used_items = play_data[play_data["user_id"] == user_id]["song_id"].values
recommend_list = get_recomendation(user_id, U, S, Vt, user_encoder, song_encoder, used_items, recommd_num=10)

In [29]:
print(recommend_list)
songs = song_data[song_data["song_id"].isin(recommend_list)]["title"].values

Index(['SOVWBYM12A6D4F8A22', 'SONHWUN12AC468C014', 'SOSYOHI12A8C144584',
       'SOIBCIC12A58A7B55B', 'SOHFJAQ12AB017E4AF', 'SODCNJX12A6D4F93CB',
       'SOQPGDF12AB01858C5', 'SOWRREB12A6D4FA7CB', 'SOIKQFR12A6310F2A6',
       'SOEOLGZ12A8C134706'],
      dtype='object')


In [30]:
print("\n".join(songs))

Rock Star
You'll Never Know (My Love) (Bovellian 07 Mix)
West One (Shine On Me)
Video Killed The Radio Star
Jamaica Roots II(Agora E Sempre)
Eternal Flame (Single Version)
Rianna
Esisti Tu
Robot Soul (Radio Edit)
Lord I Guess I'll Never Know


In [10]:
def load_clean_data(path=None):
    """
    加载数据并进行清理
    """
    action_score_data = calc_song_score(load_play_data(path))
    
    label_encoder = create_label_encoder(action_score_data, ["user_id", "song_id"])
    action_score_data["user_id"] = label_encoder["user_id"].transform(action_score_data["user_id"])
    action_score_data["song_id"] = label_encoder["song_id"].transform(action_score_data["song_id"])

    score_matrix = build_user_song_score_matrix(action_score_data, "user_id", "song_id", "score")

    return score_matrix, label_encoder

In [11]:
score_matrix, label_encoder = load_clean_data()

In [12]:
print(score_matrix.shape)

(1019318, 384546)


In [15]:
class ScoringBySVD:
    """
    通过SVD矩阵分解，求得“用户-物品”的评分
    """
    def __init__(self, n_components):
        """
        初始化方法
        """
        self.n_components = n_components
        self.U = None # 用户潜在特征矩阵
        self.S = None # 特征重要性矩阵
        self.Vt = None # 物品潜在特征矩阵的转置

    def fit(self, X, y=None):
        """
        将传入的数据，分解为U、S、V三个矩阵
        """
        U, s, Vt = svds(X, self.n_components)
        S = np.diag(s)

        self.U = U
        self.S = S
        self.Vt = Vt

    def predict(self, X):
        """
        预测评分
        """
        scores = self.U[X] @ self.S @ self.Vt

        return scores


In [16]:
svd_transformer = ScoringBySVD(n_components=50)
svd_transformer.fit(score_matrix)

In [23]:
user_id_2 = ["00000b722001882066dff9d2da8a775658053ea0", ]
user_id_2_encodered = label_encoder["user_id"].transform(user_id_2)[0]
scores = svd_transformer.predict(user_id_2_encodered)

In [26]:
print(label_encoder["song_id"].transform(np.argsort(scores)[::-1][:10]))

ValueError: y contains previously unseen labels: 87066