In [1]:
# 针对Delicious数据集，对SimpleTagBased算法进行改进（使用NormTagBased、TagBased-TFIDF算法）

In [2]:
import math
import random
import operator
import pandas as pd
from collections import defaultdict
import numpy as np

In [3]:
# Load the Delicious tagging table (tab-separated file).
# The file location lives in one named constant so it can be changed in a
# single place when the notebook is run on another machine.
DATA_PATH = 'F:/BI/第三周/名企班/code/delicious-2k/user_taggedbookmarks-timestamps.dat'
df = pd.read_csv(DATA_PATH, sep='\t')

In [34]:
# Tags each user put on each item: {user_id: {item_id: [tag1, tag2, ...]}}
records = dict()

# Training and test partitions
train_data = {}
test_data = {}

# --- count tables keyed by user ---
# {user u: {tag t: number of times u used t}}
user_tags = {}
# {user u: {item i: number of times u used item i}}
user_items = {}

# --- count tables keyed by tag ---
# {tag t: {item i: number of times t was put on i}}
tag_items = {}
# {tag t: {user u: number of times t was used by u}}
tag_users = dict()

# --- count tables keyed by item ---
# {item i: {user u: number of times i was tagged by u}}
item_users = dict()
# {item i: {tag t: number of times i received tag t}}
item_tags = dict()

In [36]:
%%time
# Build records = {user_id: {bookmark_id: [tag_id, ...]}} from the raw table.
# Start from an empty mapping: without this, running the cell a second time
# appends every tag again and all downstream tag counts are doubled.
records.clear()
# Walk the three columns in lockstep instead of doing a df[col][i] lookup for
# every row, which is much slower on a table of this size.
for uid, iid, tag in zip(df['userID'], df['bookmarkID'], df['tagID']):
    # setdefault creates the missing user / bookmark level on first sight
    records.setdefault(uid, {}).setdefault(iid, []).append(tag)

Wall time: 14.7 s


In [37]:
# Report the number of rows in the table and the number of users present in records.
print("数据集大小为 %d." % df.shape[0])
print("设置tag的人数 %d." % len(records))

数据集大小为 437593.
设置tag的人数 1867.


In [38]:
# 数据集抽取作训练

In [40]:
def train_test_split(ratio, seed=100):
    """Randomly split the global ``records`` into ``train_data`` and ``test_data``.

    Every (user, bookmark) pair is sent, together with all of its tags, to the
    test set when a fresh ``random.random()`` draw is below ``ratio`` and to
    the training set otherwise.

    Args:
        ratio: Threshold compared against ``random.random()``; the expected
            share of (user, bookmark) pairs that end up in the test set.
        seed: Value passed to ``random.seed`` so the split is repeatable.

    Returns:
        None. ``train_data`` and ``test_data`` are rebuilt in place and a short
        summary is printed.
    """
    random.seed(seed)
    # Rebuild both partitions from scratch; otherwise a second call would
    # append every tag again on top of the previous split.
    train_data.clear()
    test_data.clear()
    train_tag_count, test_tag_count = 0, 0
    # u is a user id, bookmarks maps that user's bookmark ids to tag lists
    for u, bookmarks in records.items():
        for i, tags in bookmarks.items():
            # One draw per (user, bookmark) pair keeps all tags of a pair together.
            if random.random() < ratio:
                test_data.setdefault(u, {})[i] = list(tags)
                test_tag_count += len(tags)
            else:
                train_data.setdefault(u, {})[i] = list(tags)
                train_tag_count += len(tags)
    # NOTE: len() of a partition is its number of users (top-level keys).
    print("训练集样本数 %d, 测试集样本数 %d" % (len(train_data), len(test_data)))
    print("测试集总标签数:%d" % test_tag_count)
    print("训练集总标签数:%d" % train_tag_count)
    total_tag_count = train_tag_count + test_tag_count
    # Guard the percentage so an empty ``records`` does not raise ZeroDivisionError.
    test_share = test_tag_count / total_tag_count * 100 if total_tag_count else 0.0
    print("测试集总标签数占总样本的{:.2f}%".format(test_share))

In [41]:
%%time
# Split records into train_data / test_data, using 0.2 as the test-set threshold.
train_test_split(ratio=0.2)

训练集样本数 1860, 测试集样本数 1793
测试集总标签数:173544
训练集总标签数:701642
测试集总标签数占总样本的19.83%
Wall time: 314 ms


In [42]:
# 初始化字典

In [44]:
def addValueToMat(mat, index, item, value=1):
    """Accumulate ``value`` into the nested counter ``mat[index][item]``.

    ``mat`` has the shape ``{index: {item: count}}``. Missing levels are
    created on demand: a new ``item`` starts at ``value``, an existing one is
    incremented by ``value``. ``mat`` is modified in place; nothing is returned.
    """
    # Fetch the inner dict for this index, creating it the first time.
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value


def initStat():
    """Rebuild the six global count tables from ``train_data``.

    For every (user, item, tag) occurrence in the training set, one count is
    added to each of ``user_tags``, ``tag_items``, ``user_items``,
    ``tag_users``, ``item_users`` and ``item_tags`` via ``addValueToMat``.
    The tables are emptied first, so calling this function again yields the
    same counts instead of doubling them. Prints the table sizes when done.
    """
    # Reset in place so existing references to the tables stay valid.
    for table in (user_tags, tag_items, user_items, tag_users, item_users, item_tags):
        table.clear()
    # u is a user id, items maps that user's bookmark ids to tag lists
    for u, items in train_data.items():
        # i is a bookmark id, tags is the list of tag ids put on it
        for i, tags in items.items():
            for tag in tags:
                # user -> tag
                addValueToMat(user_tags, u, tag, 1)
                # tag -> item
                addValueToMat(tag_items, tag, i, 1)
                # user -> item (one count per tag the user put on the item)
                addValueToMat(user_items, u, i, 1)
                # tag -> user
                addValueToMat(tag_users, tag, u, 1)
                # item -> user
                addValueToMat(item_users, i, u, 1)
                # item -> tag
                addValueToMat(item_tags, i, tag, 1)
    print("user_tags, tag_items, user_items初始化完成.")
    print("len(user_tags): %d, len(tag_items): %d, len(user_items): %d" % \
          (len(user_tags), len(tag_items), len(user_items)))
    print("len(tag_users):", len(tag_users))
    print("len(item_users)", len(item_users))

In [45]:
%%time 
# Fill the user/tag/item count tables from the training split.
initStat()

user_tags, tag_items, user_items初始化完成.
len(user_tags): 1860, len(tag_items): 36884, len(user_items): 1860
len(tag_users): 36884
len(item_users) 59555
Wall time: 2.4 s


In [46]:
# 算法实现
# 1、NormTagBased算法

In [47]:
def recommend_by_norm(user, N):
    """Return the top-``N`` items for ``user`` under the NormTagBased score.

    For every tag ``t`` the user has used and every item ``i`` carrying that
    tag, the item's score grows by
    ``(wut / len(user_tags[user])) * (wti / len(tag_users[t]))``, where ``wut``
    is the user's count for ``t`` and ``wti`` the tag's count on ``i``. Items
    already present in ``user_items[user]`` are never recommended.

    NOTE(review): both denominators are numbers of distinct keys (``len``),
    not summed counts — confirm this is the intended normalisation.

    Returns:
        A list of ``(item, score)`` tuples, highest score first, at most ``N`` long.
    """
    already_tagged = user_items[user]
    profile = user_tags[user]
    distinct_tag_count = len(profile)
    scores = defaultdict(int)
    for t, wut in profile.items():
        # The user-side factor is the same for every item under tag t.
        user_share = wut / distinct_tag_count
        for i, wti in tag_items[t].items():
            # Only score items the user has not tagged yet.
            if i not in already_tagged:
                scores[i] += user_share * (wti / len(tag_users[t]))
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[0:N]

In [48]:
%%time
# Spot check: top-3 NormTagBased recommendations for user 8.
recommend_by_norm(8, 3)

Wall time: 28 ms


[(23702, 0.043225773872645555),
 (66188, 0.043179118967625364),
 (28750, 0.04191389791961279)]

In [49]:
# 使用测试集，计算精确率和召回率
def precisionAndRecall(N):
    hit = 0
    h_recall = 0
    h_precision = 0
    # user用户，items是字典{书签ids: tag列表}
    for user, items in test_data.items():
        if user not in train_data:
            continue
        # 获取Top-N推荐列表
        rank = recommend_by_norm_1(user, N)
        # item商品(书签ID)，rui是兴趣分
        for item, rui in rank:
            # 如果推荐的商品在该用户的书签字典中，说明推荐对了，则hit+1
            if item in items:
                hit = hit + 1
        # len(items) 实际打过标签的物品数
        h_recall += len(items)
        h_precision += N
    # 返回精确率、召回率
    prec = hit / (h_precision * 1.0)
    rec = hit / (h_recall * 1.0)
    return prec, rec

In [50]:
%%time 
# Evaluate the NormTagBased recommender.
# NOTE(review): testRecommend() is not defined in this cell; presumably it
# reports precision/recall from precisionAndRecall for several N — confirm it
# is defined earlier when running on a fresh kernel.
print("NormTagBased")
testRecommend()

NormTagBased
推荐结果评估
  N        精确率        召回率
  5       0.907%       0.388%
 10       0.638%       0.546%
 20       0.507%       0.868%
 40       0.356%       1.218%
 60       0.287%       1.476%
 80       0.255%       1.750%
100       0.241%       2.061%
Wall time: 6min 20s


In [52]:
# 2、TagBased-TFIDF算法

In [53]:
def recommend_by_tfidf(user, N):
    """Return the top-``N`` items for ``user`` under the TagBased-TFIDF score.

    For every tag ``t`` the user has used and every item ``i`` carrying that
    tag, the item's score grows by
    ``(wut / log10(1 + len(tag_users[t]))) * wti``, where ``wut`` is the user's
    count for ``t`` and ``wti`` the tag's count on ``i``. Items already present
    in ``user_items[user]`` are never recommended.

    Returns:
        A list of ``(item, score)`` tuples, highest score first, at most ``N`` long.
    """
    tagged_items = user_items[user]
    recommend_items = defaultdict(int)
    for t, wut in user_tags[user].items():
        # The tag weight does not depend on the item, so compute it once per
        # tag rather than once per (tag, item) pair.
        tag_weight = wut / np.log10(1 + len(tag_users[t]))
        for i, wti in tag_items[t].items():
            # Skip items the user has already tagged.
            if i in tagged_items:
                continue
            recommend_items[i] += tag_weight * wti
    return sorted(recommend_items.items(),
                  key=operator.itemgetter(1),
                  reverse=True)[:N]

In [54]:
%%time
# Spot check: top-3 TagBased-TFIDF recommendations for user 8.
recommend_by_tfidf(8, 3) 

Wall time: 71 ms


[(1416, 113.06884912889319),
 (1526, 98.41355946187662),
 (4639, 90.13454458701486)]

In [55]:
def precisionAndRecall(N, recommend=None):
    """Compute precision and recall of Top-``N`` recommendations on ``test_data``.

    Users that appear in ``test_data`` but not in ``train_data`` are skipped.
    A hit is a recommended item that is among the user's test-set items.

    Args:
        N: Length of the recommendation list requested for each user.
        recommend: Callable ``(user, N) -> [(item, score), ...]``. Defaults to
            ``recommend_by_tfidf`` (looked up when the function is called).

    Returns:
        ``(precision, recall)`` as fractions; ``hits / (N * users)`` and
        ``hits / total test items``. Either value is 0.0 when its denominator
        is zero (e.g. no test user is present in the training set).
    """
    if recommend is None:
        recommend = recommend_by_tfidf
    hit = 0
    h_recall = 0
    h_precision = 0
    # items is {bookmark_id: [tag ids]} for this user in the test set
    for user, items in test_data.items():
        if user not in train_data:
            continue
        # Top-N list of (item, score) pairs
        rank = recommend(user, N)
        hit += sum(1 for item, _score in rank if item in items)
        # len(items) is the number of items the user actually tagged in the test set
        h_recall += len(items)
        h_precision += N
    prec = hit / (h_precision * 1.0) if h_precision else 0.0
    rec = hit / (h_recall * 1.0) if h_recall else 0.0
    return prec, rec

In [56]:
%%time
# Evaluate the TagBased-TFIDF recommender.
# NOTE(review): testRecommend() is not defined in this cell; presumably it
# reports precision/recall from the most recently defined precisionAndRecall
# for several N — confirm it is defined earlier when running on a fresh kernel.
print("TagBased-TFIDF算法:")
testRecommend()

TagBased-TFIDF算法:
推荐结果评估
  N        精确率        召回率
  5       1.008%       0.431%
 10       0.761%       0.652%
 20       0.549%       0.940%
 40       0.402%       1.376%
 60       0.328%       1.687%
 80       0.297%       2.033%
100       0.269%       2.306%
Wall time: 19min 37s
