In [1]:
# 导入包
import random
import math
import time
import numpy as np
import pandas as pd
import warnings
import os
import sys
import operator
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

In [2]:
def read_data(path):
    data = []
    with open(path,"r") as f:
        for line in f.readlines():
            user,movie,rating,_ = line.split("::")
            data.append([user, movie])
        return data

In [3]:
def split_data(data, M=5, k=1):
    train = []
    test = []
    random.seed(42)
    for line in data:
        if random.randint(0, M) == k:
            test.append(line)
        else:
            train.append(line)
    return train, test

In [4]:
# convert data to dict format
def transform_data(data):   
    data_dict = {}
    for user, movie in data:
        if user not in data_dict:
            data_dict[user] = set()
        data_dict[user].add(movie)
        
    data_dict = {user : list(data_dict[user]) for user in data_dict}   
    return data_dict        

In [5]:
def preprocess_data(path):
    raw_data = read_data(filePath)
    train_set, test_set = split_data(raw_data)
    train = transform_data(train_set)
    test = transform_data(test_set)
    return train, test

In [6]:
def GetRecommendation(result, user):
    rank = result[user]
    return rank
    
def Recall(train, test, result):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test.get(user)
        if tu is None:
            continue
        rank = GetRecommendation(result, user)
        for item in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)
    
def Precision(train, test, result):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test.get(user)
        if tu is None:
            continue    
        rank = GetRecommendation(result, user)
        for item in rank:
            if item in tu:
                hit += 1
        all += len(rank)
    return hit / (all * 1.0)
    
def Coverage(train, test, result):
    recommend_items = set()
    all_items = set()
    for user in train.keys():
        for item in train[user]:
            all_items.add(item)
        rank = GetRecommendation(result, user)
        for item in rank:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)
    
def Popularity(train, test, result):
    item_popularity = dict()
    for user, items in train.items():
        for item in items:
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1

    ret = 0
    n = 0
    for user in train.keys():
        rank = GetRecommendation(result, user)
        for item in rank:
            ret += math.log(1 + item_popularity[item])
            n += 1
        ret /= n * 1.0
    return ret

In [7]:
def rec_summary(result, numFold=5):
    precision =0
    recall = 0
    coverage = 0
    popularity =0

    for i in range(0, numFold):
        precision += Precision(train,test, result)
        recall += Recall(train,test,result)
        coverage += Coverage(train, test, result)
        popularity += Popularity(train, test, result)

    precision /= numFold
    recall /= numFold
    coverage /= numFold
    popularity /= numFold

    print('precision = %f' %precision)
    print('recall = %f' %recall)
    print('coverage = %f' %coverage)
    print('popularity = %f' %popularity)

In [8]:
def ItemSimilarity(train):
    #calculate co-rated users between items
    C = dict()
    N = dict()
    for u, items in train.items():
        for i in items:
            N.setdefault(i,0)
            N[i] += 1
            C.setdefault(i,{})
            for j in items:
                if i == j:
                    continue
                C[i].setdefault(j,0)
                C[i][j] += 1
        
    #calculate finial similarity matrix W 
    W = C.copy()
    for i, related_items in C.items():
        for j, cij in related_items.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
    return W

In [11]:
def ItemSimilarityIUF(train):
    #calculate co-rated users between items
    C = dict()
    N = dict()
    for u, items in train.items():
        for i in items:
            N.setdefault(i,0)
            N[i] += 1
            C.setdefault(i,{})
            for j in items:
                if i == j:
                    continue
                C[i].setdefault(j,0)
                C[i][j] += 1 / math.log(1 + len(items) * 1.0)
        
    #calculate finial similarity matrix W 
    W = C.copy()
    for i, related_items in C.items():
        for j, cij in related_items.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
    return W

In [16]:
from sklearn.preprocessing import normalize

def ItemSimilarityNorm(train):
    org_W = ItemSimilarity(train)
    W = org_W.copy()
    for i, related_items in W.items():
        max_num = max(related_items)
        W[i] = {key : value / max_num for key,value in related_items.items()}
    return W

In [17]:
def ItemCFRec(train, N):
        
    W = ItemSimilarity(train)
    rank = dict()
    result = dict()
    
    for user in train.keys():
        user_items = train[user]
        item_list = []
        for i in user_items:
            for j, wj in sorted(W[i].items(), key=operator.itemgetter(1), reverse=True)[:20]:
                if j in user_items:
                    continue
                rank.setdefault(j,0)
                rank[j] += wj
        rec_items = list(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])
        for tuple in rec_items:
            item_list.append(tuple[0])
        result[user] = item_list
    return result

In [18]:
def ItemCFIUFRec(train, N):
        
    W = ItemSimilarityIUF(train)
    rank = dict()
    result = dict()
    
    for user in train.keys():
        user_items = train[user]
        item_list = []
        for i in user_items:
            for j, wj in sorted(W[i].items(), key=operator.itemgetter(1), reverse=True)[:20]:
                if j in user_items:
                    continue
                rank.setdefault(j,0)
                rank[j] += wj
        rec_items = list(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])
        for tuple in rec_items:
            item_list.append(tuple[0])
        result[user] = item_list
    return result

In [19]:
def ItemCFNormRec(train, N):
        
    W = ItemSimilarityNorm(train)
    rank = dict()
    result = dict()
    
    for user in train.keys():
        user_items = train[user]
        item_list = []
        for i in user_items:
            for j, wj in sorted(W[i].items(), key=operator.itemgetter(1), reverse=True)[:20]:
                if j in user_items:
                    continue
                rank.setdefault(j,0)
                rank[j] += wj
        rec_items = list(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])
        for tuple in rec_items:
            item_list.append(tuple[0])
        result[user] = item_list
    return result

In [14]:
filePath = "./dataset/ratings.dat"
train, test = preprocess_data(filePath)

In [15]:
itemcf_result = ItemCFRec(train, 10)

In [16]:
rec_summary(itemcf_result)

precision = 0.067512
recall = 0.024517
coverage = 0.008684
popularity = 0.001263


In [17]:
itemcf_iuf_result = ItemCFIUFRec(train, 10)

In [18]:
rec_summary(itemcf_iuf_result)

precision = 0.071493
recall = 0.025963
coverage = 0.008684
popularity = 0.001270


In [22]:
# itemcf_norm_result = ItemCFNormRec(train, 10)

In [23]:
# rec_summary(itemcf_norm_result)