In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm, trange
from math import sqrt
import pandas as pd
import numpy as np
import heapq

In [2]:
# Matrix dimensions used by the later cells: n rows (users) by m columns (items).
n, m = 943, 1682
# Column names applied to the tab-separated rating files.
cols = ['user_id', 'item_id', 'rating', 'ts']
# Read the training split first, then the held-out split, with identical options.
base, test = (
    pd.read_csv(path, sep='\t', names=cols)
    for path in ('../../ml-100k/u1.base', '../../ml-100k/u1.test')
)

In [3]:
# Build the dense user-item rating matrices (row = user_id - 1, column = item_id - 1).
# A cell value of 0 means that no rating was recorded for that (user, item) pair.
train_matrix = np.zeros((n, m))
test_matrix = np.zeros((n, m))
for row in base.itertuples(index=False):
    train_matrix[row.user_id - 1, row.item_id - 1] = row.rating

for row in test.itertuples(index=False):
    test_matrix[row.user_id - 1, row.item_id - 1] = row.rating

# Indicator matrix y_ui: 1.0 where the training matrix holds a rating, 0.0 elsewhere.
# One vectorised comparison replaces the former n*m Python-level double loop and
# yields the same float array.
y_ui = (train_matrix != 0).astype(float)

In [4]:
# User-Based CF
# Set up the similarity matrix, the per-user rated-item sets and the baseline averages.
Swu = np.zeros((n, n))
Iv = [set(np.nonzero(train_matrix[u])[0]) for u in range(n)]
global_average_rating = train_matrix.sum() / y_ui.sum()
# Mean observed rating per user; a user without any rating falls back to the global mean.
avg_rating_user = np.array([
    global_average_rating if y_ui[u].sum() == 0
    else train_matrix[u].sum() / y_ui[u].sum()
    for u in range(n)
])
# Mean observed rating per item, with the same fallback for items nobody rated.
avg_rating_item = np.array([
    global_average_rating if y_ui[:, v].sum() == 0
    else train_matrix[:, v].sum() / y_ui[:, v].sum()
    for v in range(m)
])

In [5]:
# Build the user-user correlation matrix Swu. For each pair of users the similarity
# is the cosine of their mean-centred ratings, restricted to the items both rated.
# The centred matrix does not depend on the loop variables, so it is computed once
# here instead of re-deriving full-length centred rows three times per (i, j) pair.
user_centered = train_matrix - avg_rating_user[:, np.newaxis]
for i in tqdm(range(n)):
    row_i = user_centered[i]
    for j in range(n):
        if i == j:
            Swu[i, j] = 1.0
            continue
        common = list(Iv[i].intersection(Iv[j]))  # items rated by both users
        dev_i = row_i[common]
        dev_j = user_centered[j][common]
        # The built-in sum keeps the sequential accumulation order of the previous
        # implementation, so the resulting similarities are numerically unchanged.
        a = sum(dev_i * dev_j)
        b = np.sqrt(sum(dev_i ** 2))
        c = np.sqrt(sum(dev_j ** 2))
        if b * c == 0:
            # No common items, or one user's deviations are all zero on them.
            Swu[i, j] = 0.0
        else:
            # Negative similarities are clamped to 0; the original author notes
            # this restriction is used to improve recommendation quality.
            Swu[i, j] = max(a / (b * c), 0.0)

100%|██████████| 943/943 [00:53<00:00, 17.51it/s]


In [6]:
# Output buffer for the user-based predictions plus the two lookups the predictor needs.
UCF = np.zeros((n, m))
# Ui[j]: users holding a non-zero training rating for item j.
Ui = [set(np.nonzero(item_column)[0]) for item_column in train_matrix.T]
# Uu[u]: users whose similarity to user u in Swu is non-zero.
Uu = [set(np.nonzero(similarities)[0]) for similarities in Swu]

In [7]:
# Fill UCF for every (user, item) pair: start from the user's mean rating and add
# the similarity-weighted mean deviation of at most 50 neighbours.
for u in tqdm(range(n)):
    for item in range(m):
        # Candidate neighbours: users with non-zero similarity to u who rated the item.
        neighbours = list(Uu[u].intersection(Ui[item]))
        if len(neighbours) > 50:
            # Keep only the 50 candidates with the largest similarity.
            ranked = heapq.nlargest(50, [(Swu[u, v], v) for v in neighbours])
            neighbours = [v for _, v in ranked]
        weights = Swu[u, neighbours]
        sw = sum(np.abs(weights))
        prediction = avg_rating_user[u]
        if sw != 0:
            deviations = train_matrix[neighbours, item] - avg_rating_user[neighbours]
            prediction = prediction + sum(deviations * weights) / sw
        # Clamp the estimate into the interval [1, 5].
        UCF[u, item] = min(max(prediction, 1), 5)

100%|██████████| 943/943 [01:59<00:00,  7.92it/s]


In [8]:
# Item-Based CF
# Skj receives the item-item similarities; Uv[i] is the set of users with a
# non-zero training rating for item i.
Skj = np.zeros((m, m))
Uv = [set(np.nonzero(item_column)[0]) for item_column in train_matrix.T]

In [9]:
# Build the item-item correlation matrix Skj. For each pair of items the similarity
# is the cosine of their ratings after subtracting every rater's own mean, restricted
# to the users who rated both items.
# The centred matrix does not depend on the loop variables, so it is computed once
# here instead of re-deriving full-length centred columns three times per (i, j) pair.
item_centered = (train_matrix - avg_rating_user[:, np.newaxis]).T
for i in tqdm(range(m)):
    col_i = item_centered[i]
    for j in range(m):
        if i == j:
            Skj[i, j] = 1.0
            continue
        common = list(Uv[i].intersection(Uv[j]))  # users who rated both items
        dev_i = col_i[common]
        dev_j = item_centered[j][common]
        # The built-in sum keeps the sequential accumulation order of the previous
        # implementation, so the resulting similarities are numerically unchanged.
        a = sum(dev_i * dev_j)
        b = np.sqrt(sum(dev_i ** 2))
        c = np.sqrt(sum(dev_j ** 2))
        if b * c == 0:
            # No common raters, or one item's deviations are all zero on them.
            Skj[i, j] = 0.0
        else:
            # Negative similarities are clamped to 0; the original author notes
            # this restriction is used to improve recommendation quality.
            Skj[i, j] = max(a / (b * c), 0.0)

100%|██████████| 1682/1682 [02:55<00:00,  9.57it/s]


In [10]:
# Output buffer for the item-based predictions plus the two lookups the predictor needs.
ICF = np.zeros((n, m))
# Ju[u]: items for which user u has a non-zero training rating.
Ju = [set(np.nonzero(user_row)[0]) for user_row in train_matrix]
# Uj[j]: items whose similarity to item j in Skj is non-zero.
Uj = [set(np.nonzero(similarities)[0]) for similarities in Skj]

In [11]:
# Fill ICF for every (user, item) pair: the similarity-weighted mean of the user's
# own ratings on at most 50 similar items, or the user's mean rating when no
# usable neighbour exists.
for item in tqdm(range(m)):
    for u in range(n):
        # Candidate neighbours: items rated by u with non-zero similarity to `item`.
        candidates = list(Ju[u].intersection(Uj[item]))
        if len(candidates) > 50:
            # Keep only the 50 candidates with the largest similarity.
            ranked = heapq.nlargest(50, [(Skj[item, other], other) for other in candidates])
            candidates = [other for _, other in ranked]
        weights = Skj[item, candidates]
        sw = sum(weights)
        if sw != 0:
            prediction = sum(train_matrix[u, candidates] * weights) / sw
        else:
            prediction = avg_rating_user[u]
        # Clamp the estimate into the interval [1, 5].
        ICF[u, item] = min(max(prediction, 1), 5)

100%|██████████| 1682/1682 [01:51<00:00, 15.04it/s]


In [12]:
# Hybrid CF
# Linear blend of the two predictors; r_ucf is the weight given to the user-based
# scores and the remainder goes to the item-based scores.
r_ucf = 0.5
HCF = UCF * r_ucf + ICF * (1 - r_ucf)

In [13]:
# Evaluation function
def eva(prediction, truth):
    """Score predictions against the non-zero entries of a rating matrix.

    Only positions where ``truth`` is non-zero are compared; zero cells are
    skipped.

    Args:
        prediction: array of predicted ratings, indexable with the positions
            returned by ``truth.nonzero()``.
        truth: array of actual ratings.

    Returns:
        A ``(rmse, mae)`` tuple computed over the non-zero positions of ``truth``.
    """
    observed = truth.nonzero()  # computed once and reused for both arrays
    # Indexing with the nonzero() tuple already yields 1-D arrays.
    y_true = truth[observed]
    y_pred = prediction[observed]
    # The scikit-learn metrics are documented as metric(y_true, y_pred).
    return sqrt(mean_squared_error(y_true, y_pred)), mean_absolute_error(y_true, y_pred)

In [14]:
# Print the (RMSE, MAE) pair on the test matrix for each of the three predictors.
for label, predicted in (
    ("User-based CF RMSE、MAE:", UCF),
    ("Item-based CF RMSE、MAE:", ICF),
    ("Hybrid CF RMSE、MAE:", HCF),
):
    print(label + str(eva(predicted, test_matrix)))

User-based CF RMSE、MAE:(0.9647438280049679, 0.7563841146393855)
Item-based CF RMSE、MAE:(0.9749952435622402, 0.7664674321022189)
Hybrid CF RMSE、MAE:(0.952446338776516, 0.7500527032991285)
