# 融合LDA主题模型的歌单相似度和歌曲推荐

## 基于歌单的相似度进行推荐

###  数据预处理

#### 筛选需要的数据项

首先对15G的歌单数据进行处理，选取了数据集中歌单的信息，包括歌单名、歌单标签、歌单id、歌单描述、歌单收藏数、歌曲信息，每一个歌单下包含着多首歌曲，每一首歌曲的信息包括歌曲id、歌曲名称、歌手、歌曲热度。  
（15G的数据量较大，在jupyter跑不起来，python文件可以）  
最终输出一个名为 playlist_info_utf8.txt 的文本文件，每行对应一个歌单。  
经过筛选（只保留收藏数不少于100的歌单），只剩下24951个歌单。

In [None]:
import os
import tarfile
import pickle as pk
import sys
import json

def readPlaylist(in_line):
    """Convert one raw playlist JSON line into a single flat text record.

    The record layout is
    ``name##tags##playlist_id##description##subscribed_count`` followed by
    one tab-prefixed ``song_id:::song_name:::artist:::popularity`` entry per
    track.

    Args:
        in_line: One line of the raw dump; a JSON document whose ``result``
            object holds the playlist.

    Returns:
        The flattened record as a ``str``, or ``False`` when the playlist
        has fewer than 100 subscribers.

    Raises:
        ValueError: If ``in_line`` is not valid JSON.
        KeyError: If ``result`` or one of its required keys is missing.
    """
    # Tabs separate the header from the songs and every record must stay on
    # a single line, so tabs / CR / LF embedded in free text become spaces.
    # A lone "\r" matters as well: text-mode reads treat it as a line break.
    flatten = str.maketrans("\t\r\n", "   ")

    result = json.loads(in_line)['result']
    name = result['name']  # playlist name
    tags = ",".join(result['tags'])  # tags stored as a comma-separated string
    subscribed_count = result['subscribedCount']  # number of subscribers
    description = result.get('description')  # playlist description
    if description is None:
        # A missing key and a JSON null both mean "no description".
        description = " "
    if subscribed_count < 100:  # drop playlists with fewer than 100 subscribers
        return False
    playlist_id = result['id']

    # Collect the song entries in a list and join once instead of growing
    # (and re-scanning) one string per song.
    song_entries = []
    for song in result['tracks']:
        try:
            entry = ":::".join([
                str(song['id']),
                song['name'],
                song['artists'][0]['name'],
                str(song['popularity']),
            ])
        except (KeyError, IndexError, TypeError):
            # Best effort: a track with missing or null fields is skipped.
            continue
        song_entries.append(entry.translate(flatten))

    header = "##".join([
        name, tags, str(playlist_id), str(description), str(subscribed_count),
    ])
    return "\t".join([header.translate(flatten)] + song_entries)
   
def readFile(in_file, out_file):
    """Flatten every playlist line of ``in_file`` into ``out_file``.

    Each input line is passed to ``readPlaylist``; truthy results are
    written to ``out_file`` one per line. A progress counter is printed for
    every input line, whether or not it was kept.

    Args:
        in_file: Path of the UTF-8 input file, one JSON document per line.
        out_file: Path of the UTF-8 output file (overwritten).
    """
    # ``with`` guarantees both handles are closed even if a line fails to
    # parse. The input is opened first so a missing input file does not
    # truncate an existing output file.
    with open(in_file, encoding='utf-8') as src, \
            open(out_file, 'w', encoding="utf-8") as out:
        for num, line in enumerate(src, 1):
            result = readPlaylist(line)
            print("------------------",num,"--------------------")
            if result:
                out.write(str(result)+"\n")
    
if __name__=="__main__":
    # Script entry point: flatten the raw playlist dump into the
    # one-record-per-line text file.
    readFile(
        in_file="./playlistdetail.all.json",
        out_file="./playlist_info_utf8.txt",
    )

#### 构建歌曲（歌单）id和名称的字典

在这一步构建了歌曲和歌单的两个属性id和名称对应的字典。

In [None]:
import pickle as pk
import sys

def getDict(i, dic_playlist, dic_song):
    """Add the id/name pairs found in one flattened playlist record.

    Record layout: ``name##tags##playlist_id##description##subscribed_count``
    followed by tab-separated ``song_id:::song_name:::artist:::popularity``
    entries.

    Args:
        i: One line of the flattened playlist file.
        dic_playlist: Updated in place with ``playlist_id -> playlist name``.
        dic_song: Updated in place with ``song_id -> "song name\\tartist"``.

    Raises:
        IndexError: If the header has fewer than three ``##`` fields.
    """
    header, *song_entries = i.strip().split("\t")
    playlist = header.split("##")
    dic_playlist[playlist[2]] = playlist[0]
    for entry in song_entries:
        song_info = entry.split(":::")
        try:
            dic_song[song_info[0]] = song_info[1]+"\t"+song_info[2]
        except IndexError:
            # Entry with fewer than three ":::" fields: report and skip it.
            print("error"+entry+"\n")


def readFile(in_file, out_playlist, out_song):
    """Build the playlist and song lookup tables and pickle them.

    Every line of ``in_file`` is passed to ``getDict``; a progress counter
    is printed per line.

    Args:
        in_file: Path of the UTF-8 flattened playlist file.
        out_playlist: Path of the pickle receiving the playlist dictionary.
        out_song: Path of the pickle receiving the song dictionary.
    """
    dic_playlist = {}  # filled by getDict: playlist id -> playlist name
    dic_song = {}  # filled by getDict: song id -> song info
    with open(in_file, encoding="utf-8") as src:
        for num, line in enumerate(src, 1):
            print("------------------", num, "--------------------")
            getDict(line, dic_playlist, dic_song)
    # Close each pickle explicitly so the data is flushed to disk before
    # the function returns.
    with open(out_playlist, "wb") as fh:
        pk.dump(dic_playlist, fh)
    with open(out_song, "wb") as fh:
        pk.dump(dic_song, fh)


if __name__ == "__main__":
    # Script entry point: derive both lookup pickles from the flattened
    # playlist file.
    readFile(
        in_file="./playlist_info_utf8.txt",
        out_playlist="playlist.pkl",
        out_song="song.pkl",
    )

#### 歌单文本信息分词（utf-8存储）
为了后续的LDA主题模型，对歌单的文本信息进行分词。
歌单的文本信息包括歌单名称、歌单标签和歌单描述。歌单的文本信息中除了中英文，还会有很多表情符（使用unicode编码），虽然表情符一定程度上可以表征歌单的情感以及歌曲的情感，当然要具体细究那就是后续的工作了，此处需要将这些表情符进行过滤。  
1.将需要使用的文本信息进行提取，提取后存储在utf8_playlist.txt中。

In [None]:
import os
import tarfile
import pickle as pk
import sys
import json

def readPlaylist(in_line):
    """Extract the text fields of one raw playlist JSON line.

    Args:
        in_line: One line of the raw dump; a JSON document whose ``result``
            object holds the playlist.

    Returns:
        ``name##tags##playlist_id##description`` as a single-line ``str``,
        or ``False`` when the playlist has fewer than 100 subscribers.

    Raises:
        ValueError: If ``in_line`` is not valid JSON.
        KeyError: If ``result`` or one of its required keys is missing.
    """
    result = json.loads(in_line)['result']
    name = result['name']  # playlist name
    tags = ",".join(result['tags'])  # tags stored as a comma-separated string
    description = result.get('description')  # playlist description
    if description is None:
        # A missing key and a JSON null both mean "no description".
        description = "---"
    description = str(description).replace("\n", "  ")
    subscribed_count = result['subscribedCount']  # number of subscribers
    if subscribed_count < 100:
        return False
    playlist_id = result['id']
    record = "##".join([name, tags, str(playlist_id), description])
    # The record must occupy exactly one line and must not contain tabs
    # (later steps split on them). Newlines in the name/tags and any stray
    # "\r" (a line break under universal-newline reads) or "\t" are blanked.
    return record.translate(str.maketrans("\t\r\n", "   "))

def readFile(in_file, out_file):
    """Write the text fields of every kept playlist to ``out_file``.

    Each input line is passed to ``readPlaylist``; truthy results are
    written one per line. A progress counter is printed for every input
    line.

    Args:
        in_file: Path of the UTF-8 input file, one JSON document per line.
        out_file: Path of the UTF-8 output file (overwritten).
    """
    # ``with`` closes both handles even when a line fails to parse. The
    # input is opened first so a missing input file does not truncate an
    # existing output file.
    with open(in_file, encoding='utf-8') as src, \
            open(out_file, 'w', encoding="utf-8") as out:
        for num, line in enumerate(src, 1):
            result = readPlaylist(line)
            print("------------------",num,"--------------------")
            if result:
                out.write(str(result)+"\n")
    
if __name__=="__main__":
    # Raw string: "\p" is not a valid escape sequence, so the non-raw
    # literal triggers an invalid-escape warning on current Python versions.
    # The path value itself is unchanged.
    readFile(r"D:\playlistdetail.all.json", "./utf8_playlist.txt")

2.将所有的文本信息（标签、名称、描述）合在一起，由此汇成一个“文档”，这一个文档可认为是对歌单的总体描述，然后使用jieba分词，每一个文档分词的结果用list存储起来，最后歌单id和分词后的list以字典形式存储在dic_splited_sentence.pkl。  
（这里也可以考虑加入歌单里的每一首歌的名称，甚至歌的歌词进行主题建模，但由于收集歌词的难度较大，就没有做到这一步）  
由于歌单的描述中会混合非中文的其他语言文字或者一些颜文字、表情等，为了简便分词，将这些词都过滤，并且也根据导入的停用词表，过滤停用词，在这里会根据后面LDA模型的生成情况，添加一些针对歌单描述的特定停用词，如“歌曲”、“专辑”、“歌手”、“年度”、“单曲”、“歌单”等。

In [None]:
import json
import sys
import chardet
import jieba
import pickle as pk
import re
import emoji
import pandas as pd

def getSplitedSentence(i, dic_splited_sentence, stopwords):
    """Tokenize the text of one playlist record and store the tokens.

    The playlist name, tags and description (``##`` fields 0, 1 and 3 of the
    part before the first tab) are joined, every character outside the
    U+4E00..U+9FA5 range is removed, and the remainder is segmented with
    ``jieba.lcut``. Tokens of length 1 and tokens found in ``stopwords`` are
    dropped.

    Args:
        i: One line of the flattened playlist file.
        dic_splited_sentence: Updated in place with
            ``playlist_id -> list of tokens``.
        stopwords: Container of stop words, tested with ``in``.
    """
    header = i.strip().split("\t")[0]
    fields = header.split("##")
    text = " ".join([fields[0], fields[1], fields[3]])
    # Keep only characters in the CJK range used by the pattern below.
    chinese_only = re.sub('[^\u4e00-\u9fa5]', '', text)
    dic_splited_sentence[fields[2]] = [
        token
        for token in jieba.lcut(chinese_only)
        if len(token) > 1 and token not in stopwords
    ]


def readFile(in_file, out_pk, stopwords):
    """Tokenize every playlist record and pickle the resulting dictionary.

    Args:
        in_file: Path of the UTF-8 flattened playlist file.
        out_pk: Path of the pickle receiving ``playlist_id -> tokens``.
        stopwords: Container of stop words, forwarded to
            ``getSplitedSentence``.
    """
    dic_splited_sentence = {}
    with open(in_file, encoding="utf-8") as src:
        # The progress counter is zero-based and printed before the line
        # is processed.
        for num, line in enumerate(src):
            print("------------", num, "-------------")
            getSplitedSentence(line, dic_splited_sentence, stopwords)
    # Close the pickle explicitly so it is flushed before returning.
    with open(out_pk, "wb") as fh:
        pk.dump(dic_splited_sentence, fh)


if __name__ == "__main__":
    # Load the stop-word list: one word per line, quoting disabled
    # (quoting=3 is csv.QUOTE_NONE).
    stopwords = pd.read_csv("./stopwords.txt", index_col=False,
                            quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
    # A set gives constant-time membership tests; testing ``in`` against the
    # raw ndarray scans the whole array for every token.
    stopwords = set(stopwords['stopword'].values)
    readFile("./playlist_info_utf8.txt",
             "./dic_splited_sentence.pkl", stopwords)

### 歌单文本信息LDA主题建模


In [None]:
from gensim import corpora, models, similarities
import gensim
import pickle as pkl

# Load the playlist_id -> token-list dictionary and keep the token lists as
# the training documents. NOTE: only unpickle files you produced yourself.
with open("dic_splited_sentence.pkl", "rb") as fh:
    dic = pkl.load(fh)
all_sentence = list(dic.values())

# Bag-of-words vocabulary.
dictionary = corpora.Dictionary(all_sentence)
print(type(dictionary))
# Close the handle explicitly so the vocabulary is flushed to disk.
with open("./model/dictionary.pkl", "wb") as fh:
    pkl.dump(dictionary, fh)

corpus = [dictionary.doc2bow(i) for i in all_sentence]
print(corpus[5])

# Train the LDA model with 30 topics and show the top 5 words per topic.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus, id2word=dictionary, num_topics=30)
for topic in lda.print_topics(num_topics=30, num_words=5):
    print(topic[1])

lda.save('./model/playlist_lda.model')

### 协同过滤推荐
在进行了LDA建模后，可以看到建模效果较好，因此考虑使用基于用户的协同过滤算法实现推荐。  
这样解决的问题是，作为用户，当我听到了一个很喜欢的歌单时，我可以通过相似推荐得到与这个歌单相似的歌曲。 
这里具体的实现使用surprise库
#### 构建符合surprise的数据类型
读取模型和构建好的dictionary 根据这一模型 能够获得每一个歌单最符合的主题。
![](./pic1.png)

In [None]:
# Reload the trained model and show the top 5 words of each of 30 topics.
lda = models.ldamodel.LdaModel.load('./model/playlist_lda.model')
for topic in lda.print_topics(num_topics=30, num_words=5):
    print(topic[1])

# Look up the token list of one sample playlist. The handle is closed by
# ``with`` instead of being left to the garbage collector.
with open("dic_splited_sentence.pkl","rb") as fh:
    dic = pkl.load(fh)
doc_list=dic['486555874']
print(doc_list)
with open("./model/dictionary.pkl","rb") as fh:
    dictionary = pkl.load(fh)
# Convert the sample to a bag of words and print its topic distribution.
bow = [dictionary.doc2bow(doc_list)]
print(bow)
print(list(lda.get_document_topics(bow)))

这里将每首歌曲解析成“歌单id,歌曲id,评分”的格式（以逗号分隔）。  
由于借用的是基于用户的协同过滤，歌单之间的相似性通过评分来决定，这个评分就是LDA模型为该歌单给出的概率最大的主题编号（共30个主题）。
将每一首歌按照上述格式存入formatted_music.txt中，每行一条记录。

In [None]:
#coding: utf-8
import json
import sys
from gensim import corpora, models, similarities
import pickle as pkl

# Load the trained LDA model.
lda = models.ldamodel.LdaModel.load('./model/playlist_lda.model')
# Load the playlist_id -> token-list dictionary and the gensim vocabulary.
# ``with`` closes each handle once the pickle has been read.
with open("dic_splited_sentence.pkl", "rb") as fh:
    dic = pkl.load(fh)
with open("./model/dictionary.pkl", "rb") as fh:
    dictionary = pkl.load(fh)


def isNull(s):
    """Return True when ``s`` is a complete ``playlist,song,score`` row.

    Despite its name, this is the *keep* predicate used with ``filter``: a
    row is kept when it has at least three comma-separated fields and none
    of them is empty. Counting fields alone would accept ``"pid,,score"``,
    a row whose song id is missing.

    Args:
        s: Comma-separated row.

    Returns:
        ``True`` if the row should be kept, ``False`` otherwise.
    """
    fields = s.split(",")
    return len(fields) > 2 and all(fields)


def getSongId(song_info):
    """Return the song id, i.e. the text before the first ``:::``.

    Song entries look like ``song_id:::song_name:::artist:::popularity``.
    Any failure (for example a non-string argument) yields ``""``.
    """
    try:
        song_id, _, _ = song_info.partition(":::")
    except Exception as e:
        return ""
    return song_id


def getSongScore(x):
    """Return the most probable LDA topic id for playlist ``x``.

    The token list stored under ``x`` in the module-level ``dic`` is turned
    into a bag of words with ``dictionary`` and scored by ``lda``. The id of
    the topic with the strictly largest probability is returned; if no topic
    has a probability above 0 the result is 0.

    Raises:
        KeyError: If ``x`` is not a key of ``dic``.
    """
    tokens = dic[x]
    topic_rows = list(lda.get_document_topics([dictionary.doc2bow(tokens)]))
    best_topic, best_prob = 0, 0
    for topic_id, prob in topic_rows[0]:
        # Strict comparison: on ties the first topic seen wins.
        if prob > best_prob:
            best_topic, best_prob = topic_id, prob
    return best_topic


def formatInfo(in_line):
    """Turn one playlist record into ``playlist_id,song_id,score`` rows.

    The part before the first tab is the ``##``-separated playlist header
    (field 2 is the playlist id); each later tab-separated part is one
    ``:::``-separated song entry. The score is the value returned by
    ``getSongScore`` for the playlist and is the same for all of its songs.
    Rows rejected by ``isNull`` are dropped.

    Returns:
        The rows joined with newlines (possibly ``""``), or ``False`` after
        printing the error when anything goes wrong.
    """
    try:
        header, *song_entries = in_line.strip().split("\t")
        playlist_id = header.split("##")[2]
        score_text = str(getSongScore(playlist_id))
        rows = []
        for entry in song_entries:
            row = ",".join((playlist_id, str(getSongId(entry)), score_text))
            if isNull(row):
                rows.append(row)
        return "\n".join(rows)
    except Exception as e:
        print(e)
        return False


def readFile(in_file, out_file):
    """Write the rating rows of every playlist record to ``out_file``.

    Each input line is passed to ``formatInfo``; truthy results are written
    (stripped, newline-terminated). A zero-based progress counter is printed
    before each line is processed.

    Args:
        in_file: Path of the UTF-8 flattened playlist file.
        out_file: Path of the UTF-8 output file (overwritten).
    """
    # ``with`` closes both handles on every exit path. The input is opened
    # first so a missing input file does not truncate an existing output.
    with open(in_file, encoding="utf-8") as src, \
            open(out_file, 'w', encoding="utf-8") as out:
        for num, line in enumerate(src):
            print("-----", num, "-------")
            result = formatInfo(line)
            if result:
                out.write(result.strip()+"\n")


# Generate the rating file from the flattened playlist file.
readFile(
    in_file="./playlist_info_utf8.txt",
    out_file="./formatted_music.txt",
)


####  协同过滤算法
本来使用已经写好的surprise是非常方便的，但是surprise在我的两台电脑都跑不了，绝了，我决定手写。

In [None]:
import os
import io
import pickle as pkl
import pandas as pd
import numpy as np
from surprise import dataset
from surprise import KNNBaseline, Reader

# Load the playlist id -> name table. NOTE: only unpickle trusted files.
with open("playlist.pkl", "rb") as fh:
    id2name = pkl.load(fh)
# Reverse lookup. Playlist names are not guaranteed to be unique, so when
# several ids share a name the last one wins.
name2id = {name: playlist_id for playlist_id, name in id2name.items()}

file_path = os.path.expanduser('./formatted_music.txt')
reader = Reader(line_format='user item rating', sep=',')
# ``load_from_file`` is a classmethod of the ``Dataset`` class that lives in
# the imported ``surprise.dataset`` module, not a module-level function.
music = dataset.Dataset.load_from_file(file_path, reader=reader)
trainset = music.build_full_trainset()  # use every rating for training
algo = KNNBaseline()  # KNN algorithm
# ``fit`` replaces the ``train`` method of older Surprise releases.
algo.fit(trainset)

# Try one playlist. Dict views cannot be indexed on Python 3, so the names
# are materialized into a list first.
test = list(name2id)[39]
test_id = name2id[test]
test_inner_id = algo.trainset.to_inner_uid(test_id)
print("歌单名称:", test,"歌单id:", test_id,"内部id:", test_inner_id)
playlist_neighbors = algo.get_neighbors(test_inner_id, k=10)

print(test, "最接近的10个歌单为：\n")
for inner_id in playlist_neighbors:
    # Map the inner id straight to the raw id and its name. Going back
    # through name2id would pick the wrong playlist for duplicated names.
    raw_id = algo.trainset.to_raw_uid(inner_id)
    print(id2name[raw_id], inner_id)
