In [1]:
import pandas as pd

In [11]:
data = pd.read_csv('ratings.csv')
data = data[['userId', 'movieId','rating']]

### 训练集，数据集拆分
把rating按照用户id分组，按0.8:0.2拆分成train_set 和 test_set
保证每个用户看过的电影数量在训练集和测试集中的比例都是0.8:0.2

In [84]:
def train_test_split(data, seed):
    train_set = {}
    test_set = {}
    for user, movies in data.groupby('userId'):
        movies = movies.sample(frac=1, random_state=seed).reset_index(drop=True)
        train = movies[:int(0.8*len(movies))]
        test = movies[int(0.8*len(movies)):]
        train_set[user] = train[['movieId', 'rating']]
        test_set[user] = test[['movieId', 'rating']]
    #print('Data preparation finished')
    return train_set, test_set

train_set, test_set = train_test_split(data, seed = 7)
print(train_set[1].shape[0]/test_set[1].shape[0])

3.9361702127659575


### User-base 推荐系统原理
计算余弦相似度，找到前k个最相似的用户
计算可以推荐电影（k个用户看过，但目标用户没看过的电影）的推荐价值分（某用户的评分乘以他和目标用户的相似度），找到前N个分数最高的电影作为输出。

In [104]:
from operator import itemgetter
import numpy as np

def User_Similarity(train_set):
    # Build inverse table
    item_user = {}
    rating = {}
    for u, items in train_set.items():
        rating[u] = {}
        movies = items['movieId'].tolist()
        ratings = items['rating'].tolist()
        for inum in range(len(movies)):
            i = movies[inum]
            rating[u][i] = ratings[inum]
            if i not in item_user.keys():
                item_user[i] = set()
            item_user[i].add(u)
    print('Inverse table finished')
    # Co-rated items between users
    C = {}
    N = {}
    W = {}
    for u in train_set.keys():
        N[u] = 0
        C[u] = {}
        W[u] = {}
        for v in train_set.keys():
            if v == u:
                continue
            C[u][v] = 0
            W[u][v] = 0
    for i, users in item_user.items():
        for u in users:
            N[u] += 1
            for v in users:
                if v == u:
                    continue
                #urating = rating[u][i]
                #vrating = rating[v][i]
                #corating = -abs(urating - vrating) + 5
                C[u][v] += 1/np.log(1+len(users)*1.0)
    print('Co-rated items count finished')
    # Calculate similarity matrix
    for u, related_users in C.items():
        for v, cuv in related_users.items():
            W[u][v] = cuv/(N[u]*N[v])**0.5
    print('Similarity calculation finished')
    return W

def Recommend(user, train, N, K, W):
    rank = {}
    watched = {}
    already_items = train[user]['movieId']
    for v, wuv in sorted(W[user].items(), key=itemgetter(1), reverse=True)[:K]:
        for i in train[v]['movieId']:
            if i in already_items:
                continue
            if i not in rank.keys():
                rank[i] = 0
                watched[i] = 0
            rank[i] += wuv
            watched[i] += 1
    #for i in rank.keys():
        #rank[i] = rank[i]/watched[i]
    recommend = sorted(rank.items(), key = itemgetter(1), reverse=True)[:N]
    print([movie[1] for movie in recommend])
    recommend = [movie[0] for movie in recommend]
    return recommend

user_simularity = User_Similarity(train_set)
recommend_movies = Recommend(1, train_set, 20, 10 ,user_simularity)
print(recommend_movies)


Inverse table finished
Co-rated items count finished
Similarity calculation finished
[0.6682759700803873, 0.6682759700803873, 0.6049559497088604, 0.6045868714056453, 0.6041357201939179, 0.5412668510341183, 0.5408685716254223, 0.5408685716254223, 0.5408685716254223, 0.5400483421104799, 0.539203212810112, 0.5391503410070806, 0.5378131733417608, 0.5374440950385456, 0.537439072683243, 0.537439072683243, 0.5369929438268183, 0.5366717149713318, 0.5362640988610213, 0.5362640988610213]
[2571, 480, 1391, 1580, 780, 457, 1270, 1909, 2628, 1036, 2700, 858, 380, 2174, 2916, 1371, 1240, 924, 1222, 2640]


### 1.Recall
召回率是指实际发生交互的商品中被预测出来部分的比例<br>

### 2.Precision
准确率是指实推荐的商品中用户真正发生交互商品的比例<br>

In [100]:
def Recall(train, test, N, K, W):
    hit = 0
    realall = 0
    for user in train.keys():
        tu = set(test[user]['movieId'].tolist())
        recommend_list = Recommend(user, train, N, K, W)
        user_hit = 0
        for item in recommend_list:
            if item in tu:
                user_hit += 1
                hit += 1
        #print('For user:', user, ', recall is ', user_hit/len(tu))
        realall += len(tu)
    return hit/(realall*1.0)

def Precision(train, test, N, K, W):
    hit = 0
    realall = 0
    for user in train.keys():
        tu = set(test[user]['movieId'].tolist())
        recommend_list = Recommend(user, train, N, K, W)
        user_hit = 0
        for item in recommend_list:
            if item in tu:
                hit += 1
                user_hit += 1
        #print('For user:', user, ', precision is ', user_hit/N)
        realall += N
    return hit/(realall*1.0)

for K in range(5, 31, 5):
    recall = Recall(train_set, test_set, 20, K, user_simularity)
    precision = Precision(train_set, test_set, 20, K, user_simularity)
    print('K:', K, 'Recall: ', recall, 'Precision: ', precision)

K: 5 Recall:  0.050839986285938187 Precision:  0.08508196721311476
K: 10 Recall:  0.055639907919870696 Precision:  0.09311475409836066
K: 15 Recall:  0.05588480188078562 Precision:  0.09352459016393443
K: 20 Recall:  0.057599059607190084 Precision:  0.09639344262295083
K: 25 Recall:  0.06048880834598619 Precision:  0.10122950819672132
K: 30 Recall:  0.059509232502326495 Precision:  0.09959016393442623
