In [2]:
# 导入包
import random
import math
import time
import numpy as np
import pandas as pd
import warnings
import os
import sys
import operator
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

In [3]:
def read_data(path):
    data = []
    with open(path,"r") as f:
        for line in f.readlines():
            user,movie,rating,_ = line.split("::")
            data.append([user, movie])
        return data

In [4]:
def split_data(data, M=5, k=1):
    train = []
    test = []
    random.seed(42)
    for line in data:
        if random.randint(0, M) == k:
            test.append(line)
        else:
            train.append(line)
    return train, test

In [5]:
# convert data to dict format
def transform_data(data):   
    data_dict = {}
    for user, movie in data:
        if user not in data_dict:
            data_dict[user] = set()
        data_dict[user].add(movie)
        
    data_dict = {user : list(data_dict[user]) for user in data_dict}   
    return data_dict        

In [6]:
def preprocess_data(path):
    raw_data = read_data(filePath)
    train_set, test_set = split_data(raw_data)
    train = transform_data(train_set)
    test = transform_data(test_set)
    return train, test

In [7]:
def GetRecommendation(result, user):
    rank = result[user]
    return rank
    
def Recall(train, test, result):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test.get(user)
        if tu is None:
            continue
        rank = GetRecommendation(result, user)
        for item in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)
    
def Precision(train, test, result):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test.get(user)
        if tu is None:
            continue    
        rank = GetRecommendation(result, user)
        for item in rank:
            if item in tu:
                hit += 1
        all += len(rank)
    return hit / (all * 1.0)
    
def Coverage(train, test, result):
    recommend_items = set()
    all_items = set()
    for user in train.keys():
        for item in train[user]:
            all_items.add(item)
        rank = GetRecommendation(result, user)
        for item in rank:
            recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)
    
def Popularity(train, test, result):
    item_popularity = dict()
    for user, items in train.items():
        for item in items:
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1

    ret = 0
    n = 0
    for user in train.keys():
        rank = GetRecommendation(result, user)
        for item in rank:
            ret += math.log(1 + item_popularity[item])
            n += 1
        ret /= n * 1.0
    return ret

In [8]:
def rec_summary(result, numFold=5):
    precision =0
    recall = 0
    coverage = 0
    popularity =0

    for i in range(0, numFold):
        precision += Precision(train,test, result)
        recall += Recall(train,test,result)
        coverage += Coverage(train, test, result)
        popularity += Popularity(train, test, result)

    precision /= numFold
    recall /= numFold
    coverage /= numFold
    popularity /= numFold

    print('precision = %f' %precision)
    print('recall = %f' %recall)
    print('coverage = %f' %coverage)
    print('popularity = %f' %popularity)

In [9]:
def init_items_pool(train):
    print("Initialize items pool start")
    items_pool = set()
    for user, items in train.items():
        for item in items:
            items_pool.add(item)
    print("Initialize items pool complete")
    return list(items_pool)

In [10]:
def RandomSelectNegativeSample(items):
    ret = dict()
    for i in items:
        ret[i] = 1
    n = 0
    for i in range(0, len(items) * 3):
        item = items_pool[random.randint(0, len(items_pool) -1)]
        if item in ret:
            continue
        ret[item] = 0
        n += 1
        if n > len(items):
            break
    return ret

In [11]:
def Predict(user, item, P, Q):
#     rate = 0
#     for f, puf in P[user].items():
#         qif = Q[item][f]
#         rate += puf * qif
    return np.dot(P[user], Q[item])

In [12]:
def InitModel(train, F):
    P = dict()
    Q = dict()
    for user in train.keys():
        P[user] = np.random.random(F)
    
    items_pool = init_items_pool(train)
    for item in items_pool:
        Q[item] = np.random.random(F)

    return P, Q

In [13]:
def LatentFactorModel(train, F, N, alpha, lam):
    P, Q = InitModel(train, F)
    for step in range(0, N):
        for user, items in train.items():
            samples = RandomSelectNegativeSample(items)
            for item, rui in samples.items():
                eui = rui - Predict(user, item, P, Q)
                P[user] += alpha * (eui * Q[item] - lam * P[user])
                Q[item] += alpha * (eui * P[user] - lam * Q[item])
        alpha *= 0.9
    
    return P, Q

In [14]:
def LFMRec(train, N, P, Q):
        
    rank = dict()
    result = dict()
    
    for user in train.keys():
        user_items = train[user]
        item_list = []
        for i in Q:  
            if i in user_items:
                continue
            rank.setdefault(i, 0)
            rank[i] = np.dot(P[user], Q[i])
        
#         for f, qif in Q[i].items():
#             puf = P[user][f]
#             rank[i] += puf * qif
            
        rec_items = list(sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N])
        for tuple in rec_items:
            item_list.append(tuple[0])
        result[user] = item_list
    return result

In [15]:
filePath = "./dataset/ratings.dat"
train, test = preprocess_data(filePath)

In [16]:
items_pool = init_items_pool(train)

Initialize items pool start
Initialize items pool complete


In [25]:
P, Q = LatentFactorModel(train, 5, 5, 0.02, 0.02)

Initialize items pool start
Initialize items pool complete


In [26]:
lfm_result = LFMRec(train, 10, P, Q)

In [27]:
rec_summary(lfm_result)

precision = 0.010945
recall = 0.003975
coverage = 0.090909
popularity = 0.001162


In [17]:
P, Q = LatentFactorModel(train, 50, 5, 0.02, 0.02)

Initialize items pool start
Initialize items pool complete


In [18]:
lfm_result = LFMRec(train, 10, P, Q)

In [19]:
rec_summary(lfm_result)

precision = 0.014345
recall = 0.005209
coverage = 0.885210
popularity = 0.001118
