### 读取数据 创建字典索引

In [1]:
from collections import defaultdict 

aid_to_name = defaultdict(list) 
ait_to_picture = defaultdict(list) 
aname_to_id = defaultdict(list) 
tid_to_name = defaultdict(list) 
tname_to_id = defaultdict(list) 

with open('artists.dat', 'r', encoding='utf-8', errors='ignore') as f: 
    for line in f.readlines(): 
        line=line.strip('\n')
        try:
            artist_id,name,url,picture = line.split('\t')
        except ValueError: 
            print("Record: ", line)
            raise Exception("Failed while unpacking. Not enough arguments to supply.") 
        aid_to_name[artist_id]=name
        aname_to_id[name]=artist_id

with open('tags.dat', 'r', encoding='utf-8', errors='ignore') as f: 
    for line in f.readlines(): 
        line=line.strip('\n')
        try:
            tag_id,name = line.split('\t')
        except ValueError: 
            print("Record: ", line)
            raise Exception("Failed while unpacking. Not enough arguments to supply.") 
        tid_to_name[tag_id]=name
        tname_to_id[name]=tag_id
        

### 导入数据，标准化rating到1-100之间

In [458]:
import numpy as np
def rescale_linear(array):
    minimum, maximum = np.min(array), np.max(array)
    m = (100 - 1) / (maximum - minimum)
    b = 1 - m * minimum
    return m * array + b

array = np.array([1,4,11,5,7])
re_array = rescale_linear(array)
# np.percentile(re_array, 75)

array([  1. ,  30.7, 100. ,  40.6,  60.4])

In [459]:
import pandas as pd
df = pd.read_csv("u_a.csv")
uid_list = df.uid.unique().tolist()
new_df = pd.DataFrame(columns=['uid', 'iid', 'weight','thred'])

for uid in uid_list:
    iid_weight = df.loc[df['uid'] == uid, ['iid','weight']]
    iid_list = iid_weight['iid'].tolist()
    weight_list = iid_weight['weight'].tolist()
    new_weight_list = rescale_linear(np.array(weight_list))
    thred = np.percentile(new_weight_list, 75)
    for i in range(0,len(iid_list)):
        new_df = new_df.append({'uid': uid, 'iid': iid_list[i], 'weight':new_weight_list[i],'thred':thred}, ignore_index=True)
    

  after removing the cwd from sys.path.
  
  interpolation=interpolation)


In [460]:
new_df['uid']=new_df['uid'].astype('object')
new_df['iid']=new_df['iid'].astype('int')
new_df['weight'] = new_df['weight'].values.astype(np.int64)
new_df.to_csv('new_weight.dat', index=False, sep='\t', header=None)
new_df.head(5)

Unnamed: 0,uid,iid,weight,thred
0,2,51,100,19.883494
1,2,52,82,19.883494
2,2,53,80,19.883494
3,2,54,71,19.883494
4,2,55,61,19.883494


### 将数据导入dataset 定义user item 和rating

In [466]:
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io

from surprise import KNNBaseline, Reader
from surprise import Dataset
from surprise import Dataset, print_perf
from surprise.model_selection import cross_validate

columns=['userID', 'itemID', 'rating','thred']
data_artist_df = pd.read_csv("new_weight.dat",header=None, sep='\t',names=columns)
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 100))

# The columns must correspond to user id, item id and ratings (in that order).
data_artist = Dataset.load_from_df(data_artist_df[['userID', 'itemID', 'rating']], reader)


In [469]:
data_artist_df.loc[data_artist_df['userID'] == 2]['thred'].values.tolist()[0]

19.883493793761932

### 用损失最小的模型做预测

#### 交叉验证计算准确率和召回率

In [314]:
from collections import defaultdict

from surprise import Dataset
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise.model_selection import KFold


kf = KFold(n_splits=10)
algo_nmf = KNNBasic()
algo_knn = KNNWithMeans()
algo_svd = KNNWithZScore()
algo_bl = KNNBaseline()

precision_list_nmf = []
result_pre_nmf = []
result_rec_nmf = []

precision_list_knn = []
result_pre_knn = []
result_rec_knn = []

precision_list_svd = []
result_pre_svd = []
result_rec_svd = []

precision_list_bl = []
result_pre_bl = []
result_rec_bl = []

for trainset, testset in kf.split(data_artist):
    algo_nmf.fit(trainset)
    predictions_nmf = algo_nmf.test(testset)
    precision_list_nmf.append(predictions_nmf)
    
    algo_knn.fit(trainset)
    predictions_knn = algo_knn.test(testset)
    precision_list_knn.append(predictions_knn)
    
    algo_svd.fit(trainset)
    predictions_svd = algo_svd.test(testset)
    precision_list_svd.append(predictions_svd)
    
    algo_bl.fit(trainset)
    predictions_bl = algo_bl.test(testset)
    precision_list_bl.append(predictions_bl)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity ma

In [None]:
def precision_recall_at_k(predictions,k):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        threshold = data_artist_df.loc[data_artist_df['userID'] == uid]['thred'].values.tolist()[0]
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

for P in precision_list_knn:
    precisions, recalls = precision_recall_at_k(P,10)
    result_pre_knn.append(sum(prec for prec in precisions.values()) / len(precisions))
    result_rec_knn.append(sum(rec for rec in recalls.values()) / len(recalls))
    
for P in precision_list_nmf:
    precisions, recalls = precision_recall_at_k(P,10)
    result_pre_nmf.append(sum(prec for prec in precisions.values()) / len(precisions))
    result_rec_nmf.append(sum(rec for rec in recalls.values()) / len(recalls))

for p in precision_list_svd:
    precisions, recalls = precision_recall_at_k(p,10)
    result_pre_svd.append(sum(prec for prec in precisions.values()) / len(precisions))
    result_rec_svd.append(sum(rec for rec in recalls.values()) / len(recalls)) 

for p in precision_list_bl:
    precisions, recalls = precision_recall_at_k(p,10)
    result_pre_bl.append(sum(prec for prec in precisions.values()) / len(precisions))
    result_rec_bl.append(sum(rec for rec in recalls.values()) / len(recalls)) 

In [316]:
p_knn = mean(result_pre_knn)
r_knn = mean(result_rec_knn)
f_knn=p_knn*r_knn*2/(p_knn+p_knn)
f_knn

0.7558744039825219

In [317]:
p_svd = mean(result_pre_svd)
r_svd = mean(result_rec_svd)
f_svd=p_svd*r_svd*2/(p_svd+r_svd)
f_svd

0.8370640353530666

In [318]:
p_nmf = mean(result_pre_nmf)
r_nmf = mean(result_rec_nmf)
f_nmf=p_nmf*r_nmf*2/(p_nmf+r_nmf)
f_nmf

0.8372158751947157

In [319]:
p_bl = mean(result_pre_bl)
r_bl = mean(result_rec_bl)
f_bl=p_bl*r_bl*2/(p_bl+r_bl)
f_bl

0.5027735935342803

### 选KNNBasic 调整相似函数

In [365]:
from collections import defaultdict
from surprise import Dataset
from surprise import KNNBasic
from surprise.model_selection import KFold
from surprise import accuracy


trainset = data_artist.build_full_trainset()
testset = trainset.build_testset()

sim_options = {'name': 'cosine', 'user_based': True}
algo_c = KNNBasic(sim_options=sim_options)
algo_c.fit(trainset)
precision_c = algo_c.test(testset)

sim_options = {'name': 'msd', 'user_based': True}
algo_m = KNNBasic(sim_options=sim_options)
algo_m.fit(trainset)
precision_m = algo_m.test(testset)

sim_options = {'name': 'pearson', 'user_based': True}
algo_p = KNNBasic(sim_options=sim_options)
algo_p.fit(trainset)
precision_p = algo_p.test(testset)

sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo_pb = KNNBasic(sim_options=sim_options)
algo_pb.fit(trainset)
precision_pb = algo_pb.test(testset)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [366]:
p_c = []
for i in range(0,len(precision_c)):
    item = precision_c[i]
    if item.r_ui > 0:
        p.append(item)
accuracy.rmse(p, verbose=True)

RMSE: 15.2396


15.239648987957029

In [367]:
p_p = []
for i in range(0,len(precision_p)):
    item = precision_p[i]
    if item.r_ui > 0:
        p_p.append(item)
accuracy.rmse(p_p, verbose=True)

RMSE: 12.3623


12.362319319774915

In [368]:
p_m = []
for i in range(0,len(precision_m)):
    item = precision_m[i]
    if item.r_ui > 0:
        p_m.append(item)
accuracy.rmse(p_m, verbose=True)

RMSE: 0.8837


0.8837125574353569

In [369]:
p_pb = []
for i in range(0,len(precision_pb)):
    item = precision_c[i]
    if item.r_ui > 0:
        p_pb.append(item)
accuracy.rmse(p_pb, verbose=True)

RMSE: 15.2396


15.239648987957029

#### 用KNNBasic建模预测, 对每个user推荐N个artist

In [371]:
from collections import defaultdict

from surprise import KNNBasic

def get_top_n(predictions, n=10):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


trainset = data_artist.build_full_trainset()
sim_options = {'name': 'msd', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Computing the msd similarity matrix...
Done computing similarity matrix.
2.0 [634, 1250, 2862, 5017, 5026, 5097, 5596, 6638, 3528, 7062]
3.0 [51, 447, 3502, 5960, 6149, 7010, 7641, 9761, 11303, 16317]
4.0 [141, 634, 2263, 2354, 3130, 4324, 4440, 4737, 5017, 5201]
5.0 [141, 2263, 2354, 4440, 4518, 4625, 5017, 5325, 6060, 6638]
6.0 [1270, 1685, 2217, 2301, 2309, 4234, 4491, 4822, 5017, 5325]
7.0 [634, 1153, 2301, 2497, 6060, 6638, 7062, 4669, 8308, 8966]
8.0 [634, 1153, 1879, 2300, 2301, 2497, 2691, 6060, 6638, 4669]
9.0 [634, 2263, 2497, 2697, 4887, 5017, 5325, 6060, 7533, 8173]
10.0 [2100, 2263, 3130, 3906, 4324, 4440, 4518, 4625, 5017, 5026]
11.0 [2301, 2497, 2697, 4174, 4440, 4781, 5017, 6060, 6638, 7062]
12.0 [141, 2301, 2354, 4440, 5017, 5097, 5201, 5325, 5596, 5960]
13.0 [634, 751, 1879, 2300, 4440, 5017, 5026, 5097, 5325, 6060]
14.0 [1696, 1981, 2991, 3334, 3769, 4208, 5367, 6638, 7062, 7532]
15.0 [2263, 2354, 2697, 4324, 4437, 4440, 4518, 5017, 5097, 5574]
16.0 [1153, 1216, 1879

1154.0 [549, 1288, 2236, 3196, 3299, 6076, 6149, 6962, 7010, 7062]
1155.0 [141, 576, 2263, 2944, 3334, 3748, 3798, 4324, 4437, 4440]
1156.0 [141, 1288, 2263, 2354, 2749, 4440, 5017, 5097, 5201, 5596]
1157.0 [1685, 1696, 2332, 2354, 2390, 4625, 5017, 5201, 5574, 5596]
1158.0 [141, 610, 2354, 2390, 2862, 3248, 4440, 5017, 5026, 5201]
1159.0 [141, 1250, 2354, 5017, 5097, 5201, 5596, 5960, 6060, 6797]
1160.0 [751, 2263, 2309, 2354, 2390, 2697, 3798, 4440, 5201, 5325]
1161.0 [634, 1879, 2497, 4440, 5201, 5247, 5325, 5596, 6638, 7533]
1163.0 [141, 1250, 2263, 2354, 2862, 2991, 5017, 5026, 5097, 5201]
1164.0 [2497, 4437, 4440, 4822, 5017, 5097, 5201, 5596, 6149, 6638]
1165.0 [2263, 2354, 2862, 2991, 4440, 4518, 4625, 5017, 5026, 5201]
1166.0 [141, 2236, 2301, 2354, 3692, 3798, 5017, 5325, 5596, 6351]
1167.0 [62, 1288, 1994, 2236, 2263, 2332, 2471, 4348, 4440, 4518]
1170.0 [634, 1879, 2697, 4440, 4737, 5017, 5026, 5325, 5596, 5960]
1171.0 [634, 1879, 1944, 2301, 2354, 2497, 4440, 4625, 5017, 6

In [215]:
rec_list = [634, 1250, 2862, 5017, 5026, 5097, 5596, 6638, 3528, 7062]
for rec in rec_list:
    print(aid_to_name[str(rec)])

Μιχάλης Χατζηγιάννης
Alpha Quadrant
Dudley Taft
F.L.A.B
Eraplee Noisewall Orchestra
X-Patriate (Alan J. Lipman)
SMA
Noel Rosa
Bohren & der Club of Gore
Deja Vue


## 利用user-tag.dat 对user推荐10 个artist

In [158]:
import pandas as pd
df = pd.read_csv("user_taggedartists-timestamps.dat",header=None, sep='\t',names=['uid', 'aid','tid', 'time'])

user_artist = df[['uid','tid']]
user_artist['rating'] = 1
user_artist.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,uid,tid,rating
0,2,13,1
1,2,15,1
2,2,18,1
3,2,21,1
4,2,41,1


In [59]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader()

# The columns must correspond to user id, item id and ratings (in that order).
data_tag = Dataset.load_from_df(user_artist[['uid', 'tid', 'rating']], reader)


#### 用KNNBasic建模 对user推荐50个artist

In [372]:
from collections import defaultdict

from surprise import KNNBasic

def get_top_n(predictions, n=10):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
trainset = data_artist.build_full_trainset()
sim_options = {'name': 'msd', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=50)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Computing the msd similarity matrix...
Done computing similarity matrix.
2.0 [634, 1250, 2862, 5017, 5026, 5097, 5596, 6638, 3528, 7062, 7533, 7641, 8388, 8579, 9207, 9634, 10797, 8403, 11245, 11246, 11892, 12400, 12547, 12795, 13162, 13686, 14154, 15075, 15330, 15644, 15663, 15813, 16317, 16356, 16504, 16857, 16952, 11272, 17379, 17571, 17846, 17913, 17960, 18158, 18229, 18305, 18336, 2301, 6149, 8459]
3.0 [51, 447, 3502, 5960, 6149, 7010, 7641, 9761, 11303, 16317, 17379, 8459, 10900, 12358, 2446, 6755, 197, 6920, 12007, 14361, 1075, 3685, 3248, 9143, 18706, 7011, 6046, 223, 906, 13595, 5961, 1035, 3413, 2200, 5962, 1298, 5411, 11304, 13953, 7643, 3026, 3280, 7642, 1718, 3249, 15338, 5067, 15880, 227, 3681]
4.0 [141, 634, 2263, 2354, 3130, 4324, 4440, 4737, 5017, 5201, 5596, 6638, 7010, 7062, 7533, 7641, 8388, 8457, 8579, 9207, 9634, 10797, 8403, 11303, 11732, 11892, 11977, 12400, 12547, 12795, 13162, 13686, 13865, 13999, 14057, 14092, 14840, 15075, 15250, 15384, 15644, 15663, 15813, 

1273.0 [141, 2263, 2991, 4440, 4625, 5017, 5097, 5201, 5325, 5596, 6638, 6797, 7010, 7062, 7641, 8388, 8457, 8966, 10250, 10299, 10797, 8403, 11245, 11246, 11303, 11732, 12516, 12547, 12795, 12866, 13162, 13211, 13686, 13865, 13947, 13999, 14057, 14092, 14123, 14154, 13468, 15113, 15330, 15384, 15644, 15663, 15813, 15933, 16356, 16504]
1274.0 [141, 2354, 2862, 2991, 4440, 4518, 5017, 5097, 5201, 5325, 5596, 5960, 6060, 6638, 6797, 7062, 7641, 7735, 7736, 7822, 8308, 8388, 9363, 9476, 10299, 10797, 8403, 12516, 12547, 13162, 13351, 13637, 13686, 13865, 13999, 14092, 14123, 15250, 15384, 15813, 16356, 16504, 16857, 11272, 17424, 17846, 17913, 18336, 18630, 8966]
1275.0 [141, 2263, 2354, 2862, 2991, 4324, 4440, 5026, 5097, 5165, 5201, 5574, 5596, 5960, 6060, 6638, 7062, 7641, 8308, 8388, 8457, 8579, 8966, 9634, 9751, 5038, 10797, 8403, 10993, 11245, 11246, 11732, 11892, 12516, 12547, 12795, 13162, 13351, 13686, 13865, 13999, 14057, 14092, 14123, 13468, 15075, 15330, 15384, 15933, 16857]
1

In [456]:
from collections import Counter
def findTagsByArtist(aid):
    try:
        tags = tag_artist.loc[tag_artist['aid'] == aid]['tid'].values.tolist()
        return tags
    except KeyError:
        return []

from collections import Counter 
tag_weight = pd.DataFrame()
colums = ['uid']

df_artist = data_artist.df
artist_li = df_artist.loc[df_artist['userID'] == 2]['itemID'].values.tolist()
tag_li = []
tag_dic={}

for artist in artist_li:
    tags = findTagsByArtist(artist)
    tag_li.append(tags)


tag_dic = Counter(tag_li[0])

tag_dic

Counter({16: 38,
         25: 41,
         17: 6,
         18: 6,
         56: 2,
         195: 8,
         213: 2,
         230: 1,
         231: 2,
         395: 14,
         508: 2,
         662: 1,
         801: 1,
         831: 1,
         835: 1,
         885: 1,
         887: 1,
         889: 1,
         892: 1,
         895: 1,
         896: 2,
         897: 1,
         899: 1,
         900: 2,
         919: 3,
         956: 1,
         957: 1,
         960: 1,
         961: 1,
         962: 1,
         963: 1,
         964: 1,
         965: 1,
         968: 1,
         969: 1,
         972: 1,
         201: 1,
         200: 2,
         238: 2,
         74: 9,
         642: 1,
         1886: 1,
         24: 17,
         73: 8,
         79: 4,
         81: 2,
         121: 3,
         192: 1,
         210: 4,
         352: 6,
         389: 3,
         850: 3,
         2006: 11,
         509: 1,
         134: 2,
         505: 1,
         363: 1,
         3049: 1,
         3129: 1

In [439]:
#for userId = 2
artist_list = [634, 1250, 2862, 5017, 5026, 5097, 5596, 6638, 3528, 7062, 7533, 7641, 8388, 8579, 9207, 9634, 10797, 8403, 11245, 11246, 11892, 12400, 12547, 12795, 13162, 13686, 14154, 15075, 15330, 15644, 15663, 15813, 16317, 16356, 16504, 16857, 16952, 11272, 17379, 17571, 17846, 17913, 17960, 18158, 18229, 18305, 18336, 2301, 6149, 8459]
d = {'uid':[2],'297':[0.9],'14':[0.81],'18':[0.9],'1264':[0.4]}
tag_weight = pd.DataFrame(data=d)

tag_weight.head(3)
art_weght_dic = {}

def findWeightByTags(tag):
    try:
        weight = tag_weight.loc[tag_weight['uid'] == 2][str(tag)]
        return float(weight)
    except KeyError:
        return -1


for artist in artist_list:
    tags = findTagsByArtist(artist)
    if len(tags) > 0:
        total_weight = 0
        for tag in tags:
            weight = findWeightByTags(tag)
            if weight != -1:
                total_weight = total_weight + weight
        if total_weight>0:
            art_weght_dic[artist]=total_weight
    
print(art_weght_dic) 


{634: 0.9, 1250: 1.71, 5026: 0.4, 3528: 2.43, 7062: 0.9, 8388: 1.71, 11892: 0.81, 12400: 0.9, 15075: 0.9, 15813: 1.71, 16857: 0.9, 17379: 1.71, 17913: 0.9, 17960: 0.9, 18158: 2.61, 18229: 2.61, 6149: 1.71}


In [425]:
sorted_x = sorted(art_weght_dic.items(), key=lambda x: (-x[1], x[0]))
sorted_x

[(18158, 2.61),
 (18229, 2.61),
 (3528, 2.43),
 (1250, 1.71),
 (6149, 1.71),
 (8388, 1.71),
 (15813, 1.71),
 (17379, 1.71),
 (634, 0.9),
 (7062, 0.9),
 (12400, 0.9),
 (15075, 0.9),
 (16857, 0.9),
 (17913, 0.9),
 (17960, 0.9),
 (11892, 0.81),
 (5026, 0.4)]

### 比较artist之间相似度，找到最相似的10个邻居

In [10]:
# 用KNNBaseline建模预测, get the top-N recommendations for an item
# 计算ARTIST和ARTIST之间的相似度
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine','user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x47254b6a0>

In [11]:
# 根据item做预测
itemid = int(aname_to_id['Coldplay'])
item_innerid = algo.trainset.to_inner_iid(itemid)
# 找到最近的10个邻居
item_neighbors = algo.get_neighbors(item_innerid, k=10)
item_neighbors

[9, 23, 27, 28, 31, 32, 36, 41, 43, 62]

In [12]:
# 从近邻的id映射回artist名称
item_rids = []
item_names=[]

for inner_id in item_neighbors:
    a = algo.trainset.to_raw_iid(inner_id)
    item_rids.append(a)                        

for rid in item_rids:
    a = rid_to_name[str(rid)]
    item_names.append(a)

print()
print('The 10 nearest neighbors of Coldplay are:')
for artist in item_names:
    print(artist)

NameError: name 'rid_to_name' is not defined

### 不用看

In [None]:
# # 根据user做预测
# # 设置超参数搜索
# param_grid = {'bsl_options': {'method': ['als', 'sgd'],
#                               'reg': [1, 2]},
#               'k': [2, 3],
#               'sim_options': {'name': ['msd', 'cosine'],
#                               'min_support': [1, 5],
#                               'user_based': [True]}
#               }
# # KNNWithMeans 基于的一个假设也是用户和item的评分有高低，去除一个平均值后再计算。 KNNBaseline基于KNNWithMeans，将均值换成baseline的值
# # 使用网格搜索交叉验证
# grid_search = GridSearch(KNNBaseline, param_grid, measures=['RMSE', 'FCP'])
# # 在数据集上找到最好的参数
# grid_search.evaluate(data)