In [2]:
# 导入包
import random
import math
import time
import numpy as np
import pandas as pd
import warnings
import os
import sys
import operator
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

In [3]:
def read_data(path):
    data = []
    with open(path,"r") as f:
        for line in f.readlines():
            user,item,tag = line.split("\t")[:3]
            data.append([user, item, tag])
        return data

In [4]:
def split_data(data, M=5, k=1):
    train = []
    test = []
    random.seed(42)
    for line in data:
        if random.randint(0, M) == k:
            test.append(line)
        else:
            train.append(line)
    return train, test

In [5]:
# convert data to dict format
def transform_data(data):   
    data_dict = {}
    for user, item, tag in data:
        if user not in data_dict:
            data_dict[user] = dict()
        data_dict[user][item] = tag 
    return data_dict        

In [6]:
def preprocess_data(path):
    raw_data = read_data(filePath)
    train_set, test_set = split_data(raw_data)
    # train = transform_data(train_set)
    # test = transform_data(test_set)
    return train_set, test_set

In [33]:
def GetRecommendation(result, user):
    rank = result[user]
    return rank
    
def Recall(train_user_items, test_user_items, result):
    hit = 0
    all = 0
    for user in train_user_items.keys():
        tu = test_user_items.get(user)
        if tu is None:
            continue
        rank = GetRecommendation(result, user)
        for item in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)
    
def Precision(train_user_items, test_user_items, result):
    hit = 0
    all = 0
    for user in train_user_items.keys():
        tu = test_user_items.get(user)
        if tu is None:
            continue    
        rank = GetRecommendation(result, user)
        for item in rank:
            if item in tu:
                hit += 1
        all += len(rank)
    return hit / (all * 1.0)
    
def Coverage(test_user_items, result):
    recommend_items = set()
    all_items = set()
    for user, items in test_user_items.items():
        for item in items:
            all_items.add(item)
        rank = GetRecommendation(result, user)
        for item in rank:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)
    
def Popularity(test_user_items, result):
    item_popularity = dict()
    for user, items in test_user_items.items():
        for item in items:
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1

    ret = 0
    n = 0
    for user in test_user_items.keys():
        rank = GetRecommendation(result, user)
        for item in rank:
            if item not in item_popularity:
                continue
            ret += math.log(1 + item_popularity[item])
            n += 1
        ret /= n * 1.0
    return ret

def CosSim(test_item_tags, i, j):
    ret = 0
    for tag, wib in test_item_tags[i].items():
        if tag in test_item_tags[j]:
            ret += wib * test_item_tags[j][tag]
            
    ni = 0
    nj = 0
    for tag, w in test_item_tags[i].items():
        ni += w*w
    for tag, w in test_item_tags[j].items():
        nj += w*w
    
    if ret == 0:
        return 0
    return ret / math.sqrt(ni * nj)

def Diversity(test_user_items, test_item_tags, result):
    ret = 0
    n = 0
    div = []
    for user in test_user_items.keys():
        rank = GetRecommendation(result, user)
        for i in rank:
            for j in rank:
                if i == j:
                    continue
                ret += CosSim(test_item_tags, i, j)
                n += 1
        ret = ret / (n * 1.0)
        div.append(ret)
    return sum(div) / len(div)

In [24]:
def rec_summary(result, numFold=5):
    precision =0
    recall = 0
    coverage = 0
    popularity = 0 
    diversity = 0

    for i in range(0, numFold):
        precision += Precision(train_user_items, test_user_items, result)
        recall += Recall(train_user_items, test_user_items, result)
        coverage += Coverage(test_user_items, result)
        popularity += Popularity(test_user_items, result)
        diversity += Diversity(test_user_items, test_item_tags, result)

    precision /= numFold
    recall /= numFold
    coverage /= numFold
    popularity /= numFold
    diversity /= numFold

    print('precision = %f' %precision)
    print('recall = %f' %recall)
    print('coverage = %f' %coverage)
    print('popularity = %f' %popularity)
    print('diversity = %f' %diversity)

In [9]:
from collections import defaultdict

def InitStat(data):
    user_tags = defaultdict(lambda: {})
    tag_users = defaultdict(lambda: set())
    tag_items = defaultdict(lambda: {})
    item_tags = defaultdict(lambda: {})
    user_items = defaultdict(lambda: list())
    item_users = defaultdict(lambda: list())
    for user, item, tag in data:
        user_tags[user].setdefault(tag, 0)
        user_tags[user][tag] += 1
        
        tag_users[tag].add(user)       

        tag_items[tag].setdefault(item, 0)
        tag_items[tag][item] += 1
        
        item_tags[item].setdefault(tag, 0)
        item_tags[item][tag] += 1

        user_items[user].append(item)
        
        item_users[item].append(user)
    
    return user_tags, tag_users, tag_items, item_tags, user_items, item_users

In [10]:
def SimpleTagRec(train, N):
    rank = dict()
    result = dict()
    
    for user, item, tag in train:
        item_list = []
        tagged_items = train_user_items[user]
        for tag, wut in train_user_tags[user].items():
            for item, wti in train_tag_items[tag].items():
                if item in tagged_items:
                    continue
                if item not in rank:
                    rank[item] = wut * wti
                else:
                    rank[item] += wut * wti
        rec_items = list(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])
        for tuple in rec_items:
            item_list.append(tuple[0])
        result[user] = item_list
    return result       
    

In [11]:
def TagBasedTFIDF(train, N):
    rank = dict()
    result = dict()
    
    for user, item, tag in train:
        item_list = []
        tagged_items = train_user_items[user]
        for tag, wut in train_user_tags[user].items():
            for item, wti in train_tag_items[tag].items():
                if item in tagged_items:
                    continue
                if item not in rank:
                    rank[item] = wut * wti / math.log(len(train_tag_users[tag]) + 1)
                else:
                    rank[item] += wut * wti / math.log(len(train_tag_users[tag]) + 1)
        rec_items = list(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])
        for tuple in rec_items:
            item_list.append(tuple[0])
        result[user] = item_list
    return result          

In [12]:
def TagBasedTFIDFPlus(train, N):
    rank = dict()
    result = dict()
    
    for user, item, tag in train:
        item_list = []
        tagged_items = train_user_items[user]
        for tag, wut in train_user_tags[user].items():
            for item, wti in train_tag_items[tag].items():
                if item in tagged_items:
                    continue
                if item not in rank:
                    rank[item] = wut * wti / math.log(len(train_tag_users[tag]) + 1) / math.log(len(train_item_users[item]) + 1)
                else:
                    rank[item] += wut * wti / math.log(len(train_tag_users[tag]) + 1) / math.log(len(train_item_users[item]) + 1)
        rec_items = list(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])
        for tuple in rec_items:
            item_list.append(tuple[0])
        result[user] = item_list
    return result       
    

In [13]:
filePath = "./dataset/delicious-2k/user_taggedbookmarks_small.dat"
train, test = preprocess_data(filePath)

In [14]:
train_user_tags, train_tag_users, train_tag_items, train_item_tags, train_user_items, train_item_users = InitStat(train)

In [15]:
test_user_tags, test_tag_users, test_tag_items, test_item_tags, test_user_items, test_item_users = InitStat(test)

In [16]:
simple_tag_result = SimpleTagRec(train, 10)

In [17]:
tfidf_result = TagBasedTFIDF(train, 10)

In [18]:
tfidfPlus_result = TagBasedTFIDFPlus(train, 10)

In [34]:
rec_summary(simple_tag_result)

precision = 0.010596
recall = 0.002181
coverage = 0.011269
popularity = 0.010928
diversity = 0.000832


In [35]:
rec_summary(tfidf_result)

precision = 0.010596
recall = 0.002181
coverage = 0.011269
popularity = 0.011077
diversity = 0.000857


In [36]:
rec_summary(tfidfPlus_result)

precision = 0.008609
recall = 0.001772
coverage = 0.013396
popularity = 0.011087
diversity = 0.000342
