In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import sqlite3

In [2]:
def load_play_data(path=None):
    """
    加载播放数据
    """
    if path is None:
        path = "../data/train_triplets.txt"
    
    data = pd.read_csv(path, sep="\t", names=["user_id", "song_id", "play_count"
    ])

    return data

In [3]:
play_data = load_play_data()

In [4]:
def read_song_info(table_name, song_columns, song_db=None):
    """
    读取歌曲数据库的指定字段，并转换成dataframe对象
    """
    if song_db is None:
        song_db = "../data/track_metadata.db"
    
    conn = sqlite3.connect(song_db)
    cursor = conn.cursor()
    sql = f"select {', '.join(song_columns)} from {table_name}"
    song_df = pd.read_sql_query(sql, conn)

    conn.close()

    return song_df

In [10]:
song_colunms = ["song_id", "title", "release", "artist_name", "artist_familiarity", "artist_hotttnesss"]
song_data = read_song_info("songs", song_colunms)

In [11]:
print(play_data.shape)
print(song_data.shape)

(48373586, 3)
(1000000, 6)


In [8]:
print(play_data.head(5))

                                    user_id             song_id  play_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995           1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9           1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22           1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494           1


In [12]:
print(song_data.head(5))

              song_id              title  \
0  SOQMMHC12AB0180CB8       Silent Night   
1  SOVFVAK12A8C1350D9        Tanssi vaan   
2  SOGTUKN12AB017F4F1  No One Could Ever   
3  SOBNYVR12A8C13558C      Si Vos Querés   
4  SOHSBXH12A8C13B0DF   Tangle Of Aspens   

                                release       artist_name  artist_familiarity  \
0                 Monster Ballads X-Mas  Faster Pussy cat            0.649822   
1                           Karkuteillä  Karkkiautomaatti            0.439604   
2                                Butter    Hudson Mohawke            0.643681   
3                               De Culo       Yerba Brava            0.448501   
4  Rene Ablaze Presents Winter Sessions        Der Mystic            0.000000   

   artist_hotttnesss  
0           0.394032  
1           0.356992  
2           0.437504  
3           0.372349  
4           0.000000  


In [13]:
def fill_song_info(play_data, song_data, join_column):
    """
    填充歌曲信息
    """
    user_song_data = play_data.merge(song_data, on=join_column, how="left")

    return user_song_data

In [14]:
user_song_data = fill_song_info(play_data, song_data, "song_id")

In [15]:
print(user_song_data.shape)

(49664528, 8)


In [16]:
print(user_song_data.head(5))

                                    user_id             song_id  play_count  \
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995           1   
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9           1   
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2   
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22           1   
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494           1   

                             title  \
0                         The Cove   
1             Nothing from Nothing   
2                  Entre Dos Aguas   
3            Under Cold Blue Stars   
4  Riot Radio (Soundtrack Version)   

                                             release    artist_name  \
0                                 Thicker Than Water   Jack Johnson   
1                                         To Die For  Billy Preston   
2                                Flamenco Para Niños  Paco De Lucia   
3           

In [19]:
def calc_song_score(play_data):
    """
    计算当前用户歌单中的歌曲得分
    """
    return play_data.groupby(by=["user_id", "song_id"])["play_count"].sum() / play_data["user_id"].sum()

In [None]:
score_data = calc_song_score(play_data)

In [None]:
print(score_data.head(5))