In [1]:
# 尝试自行实现基于ItemCF的TopN推荐
# 数据源为movielens的1M老数据

In [2]:
# 定几个global变量
K = 80 #推荐中最相似的Top用户数
N = 10 #推荐结果中最Top的item数
R = 1 # 正反馈系数
SEED = 10
TRAIN_RATE = 0.8 # 训练集比例

In [3]:
## 读取数据，数据源为movielens的1M数据
import pandas as pd
rnames = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_table('dataset/ml-1m/ratings.dat', sep='::', header=None, names=rnames, engine='python')

## 截取其中的user_id和movie_id列
df = pd.DataFrame(data = ratings, columns = ['user_id','item_id'])


## 按比例拆分train_set和test_set
## 这只是简单拆分，实际按书中描述，应当分组反复训练
import random

train_set = []
test_set = []
for index, row in df.iterrows():
    if random.random()>TRAIN_RATE:
        test_set.append([row['user_id'], row['item_id']])
    else:
        train_set.append([row['user_id'], row['item_id']])
        
train_set = pd.DataFrame(data = train_set, columns = ['user_id','item_id'])
test_set = pd.DataFrame(data = test_set, columns = ['user_id','item_id'])


In [4]:
user_list = train_set['user_id']
user_list_test = test_set['user_id']
item_list = train_set['item_id']
item_list_test = test_set['item_id']

## 建用户到物品的倒排表user_item
## 同时建立item_user的关系，用于计算模长
user_item = dict()
item_user = dict()


for k in range(item_list.count()):
    u = user_list.iloc[k]
    i = item_list.iloc[k]
    if i not in item_user:
        item_user[i] = set()
    item_user[i].add(u)
    
    if u not in user_item:
        user_item[u] = set()
    user_item[u].add(i)

## 测试集
item_user_test = dict()
user_item_test = dict()
for k in range(user_list_test.count()):
    u = user_list_test.iloc[k]
    i = item_list_test.iloc[k]
    if i not in item_user_test:
        item_user_test[i] = set()
    item_user_test[i].add(u)
    
    if u not in user_item_test:
        user_item_test[u] = set()
    user_item_test[u].add(i)


In [6]:
## 对用户，建立物品之间的关联关系

import math

C = dict() # 普通的稀疏矩阵

for u, items in user_item.items():
    for i in items:
        C.setdefault(i, dict())
        for j in items:
            if i == j:
                continue
            C[i].setdefault(j,0)
            C[i][j] += 1

## 计算用户相关性

W = C.copy() # 余弦相似度

for i, related_items in C.items():
    for j, c_ij in related_items.items():
        W[i][j] = c_ij/math.sqrt(len(item_user[i])*len(item_user[j]))

In [62]:
## ItemCF推荐函数
from operator import itemgetter

def recommend(u, W , topN, relatedK):
    rank = dict()
    for i in user_item[u]:
        for j, w_j in sorted(W[i].items(), key = itemgetter(1), reverse = True)[:relatedK]:
            if j in user_item[u]:
                continue
            rank.setdefault(j,0)
            rank[j] += w_j
    r = sorted(rank.items(), key=itemgetter(1), reverse=True)[:topN]
    return set([i[0] for i in r])
#     return (rank)

    


In [60]:
## 带解释的ItemCF

from operator import itemgetter

class ItemCFRank(object):
    """docstring for ClassName"""
    def __init__(self):
        super(ItemCFRank, self).__init__()
        self.weight = 0
        self.reason = dict()
        

def recommend_with_explanation(u, W , topN, relatedK):
    rank = dict()
    for i in user_item[u]:
        for j, w_j in sorted(W[i].items(), key = itemgetter(1), reverse = True)[:relatedK]:
            if j in user_item[u]:
                continue
            rank.setdefault(j, ItemCFRank())
            rank[j].weight += w_j
            rank[j].reason.setdefault(i, 0)
            rank[j].reason[i] += w_j
    r = sorted(rank.items(), key=itemgetter(1), reverse=True)[:topN]
#     return set([i[0] for i in r])
    return (rank)


In [61]:
recommend_with_explanation(737, W, N, K)

{1265: <__main__.ItemCFRank at 0x15b1adf98>,
 588: <__main__.ItemCFRank at 0x15b1adba8>,
 3114: <__main__.ItemCFRank at 0x15b1adef0>,
 2355: <__main__.ItemCFRank at 0x15b1adf28>,
 356: <__main__.ItemCFRank at 0x15b1adbe0>,
 34: <__main__.ItemCFRank at 0x15b1adda0>,
 260: <__main__.ItemCFRank at 0x15b1addd8>,
 2396: <__main__.ItemCFRank at 0x15b1ade48>,
 364: <__main__.ItemCFRank at 0x15b1ade10>,
 1923: <__main__.ItemCFRank at 0x15b1ade80>,
 595: <__main__.ItemCFRank at 0x15b1adeb8>,
 1197: <__main__.ItemCFRank at 0x15b1adc50>,
 608: <__main__.ItemCFRank at 0x15b1adc88>,
 1517: <__main__.ItemCFRank at 0x15b1adcc0>,
 1784: <__main__.ItemCFRank at 0x15b1add30>,
 3253: <__main__.ItemCFRank at 0x15b1add68>,
 2797: <__main__.ItemCFRank at 0x15b1adcf8>,
 2174: <__main__.ItemCFRank at 0x15b1adc18>,
 480: <__main__.ItemCFRank at 0x15b1adb00>,
 2081: <__main__.ItemCFRank at 0x15b1adb38>,
 1641: <__main__.ItemCFRank at 0x15b1adb70>,
 1210: <__main__.ItemCFRank at 0x15b1ad9e8>,
 39: <__main__.Item

In [34]:
## 计算Recall、Precision、Coverage、Popularity
## 这里定义的流行度是曾经有K个用户进行过行为并取自然对数
## evaluate作用于测试集上

method_list = ['random','most_popular','itemcf']


def evaluate(method = 'itemcf', topN = N, relatedK = K):
    hit = 0
    n_recall = 0
    n_precision = 0
    all_items = len(set(item_list))
    recommended_items = []
    popularity_sum = 0
    
    d = train_set.groupby(['item_id']).size()
    d = sorted(d.items(), key = itemgetter(1), reverse=True)
    
    most_popular_items = set([i[0] for i in d[:topN]])
    
    uniq_item_list = set(item_list)
    
    gini_item_list = {}
    
    for user, items in user_item_test.items():
        if method == 'itemcf':
            rank = recommend(u = user, W = W , topN = topN, relatedK = relatedK)
        elif method == 'most_popular':
            rank = most_popular_items
        elif method == 'random':
            rank = set(random.sample(uniq_item_list, topN))
        else: 
            rank = set()
        hit += len(rank & items)
        n_recall += len(items)
        n_precision += topN
        recommended_items += [i for i in rank]
        popularity_sum += sum([len(item_user[i]) for i in rank])
        
        for i in rank:
            gini_item_list.setdefault(i,0)
            gini_item_list[i] += 1
    
    j = 1
    n = len(gini_item_list)
    G = 0
    gini_item_list = sorted(gini_item_list.items(), key = itemgetter(1), reverse=False)
    g_sum = sum([i[1] for i in gini_item_list])
    
    for i in range(n):
        G += (2 * j - n - 1) * gini_item_list[i][1]
        j += 1
    
    return {'recall': "{0:.2%}".format(hit / (1.0 * n_recall)),
            'precision': "{0:.2%}".format(hit / (1.0 * n_precision)),
            'coverage': "{0:.2%}".format(len(set(recommended_items))/all_items),
            'gini_index': "{0:.2%}".format(G/float(n-1)/g_sum),
            'popularity': "{:.3f}".format(math.log(popularity_sum / n_precision))
           }
    

In [35]:
import prettytable
tb = prettytable.PrettyTable()
tb.field_names = ["Method", "Recall", "Precision", "Coverage", "Gini Index", 'Popularity']
for i in method_list:
    l = []
    l.append(i)
    d = evaluate(method = i)
    tb.add_row(l + list(d.values()))
print(tb)

+--------------+--------+-----------+----------+------------+------------+
|    Method    | Recall | Precision | Coverage | Gini Index | Popularity |
+--------------+--------+-----------+----------+------------+------------+
|    random    | 0.28%  |   0.92%   | 100.00%  |   13.82%   |   5.382    |
| most_popular | 2.77%  |   9.21%   |  0.27%   |   0.00%    |   7.717    |
|    itemcf    | 8.52%  |   28.27%  |  13.66%  |   85.74%   |   7.389    |
+--------------+--------+-----------+----------+------------+------------+
