In [86]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import networkx as nx

Data Loading

In [87]:
ratings= pd.read_csv("./Data/ratings.csv", low_memory=False)
train_data = pd.read_csv("./Data/train_data.csv",low_memory=False)
test_data = pd.read_csv("./Data/test_data.csv",low_memory=False)
# train_data, test_data = train_test_split(ratings, test_size=0.3, random_state=42)
# # 保存训练集为csv文件
# train_data.to_csv('./Data/train_data.csv',index=False)
# # 保存测试集为csv文件
# test_data.to_csv('./Data/test_data.csv',index=False)

In [88]:
datasets = ratings.dataset_id.unique()
models = ratings.model_id.unique()
datasets_train = train_data.dataset_id.unique()
model_train = train_data.model_id.unique()
datasets_test = test_data.dataset_id.unique()
model_test = test_data.model_id.unique()

Rating Matrix

In [89]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=model_train)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [90]:
for row in train_data.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]

In [91]:
for row in test_data.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]

Dataset Similarity Matrix

In [92]:
def cosine_similarity_func(ratings, user1, user2):
    # 找到两个用户共同评分的物品，并将这些评分放入一个向量中
    u1_ratings = ratings.loc[user1].dropna()
    u2_ratings = ratings.loc[user2].dropna()

    common_items = np.intersect1d(u1_ratings.index, u2_ratings.index).tolist()
    u1_common_ratings = u1_ratings.loc[common_items]
    u2_common_ratings = u2_ratings.loc[common_items]

    # 计算两个向量之间的余弦相似度
    if len(common_items) == 0:
        return 0
    else:
        cos_sim = np.dot(u1_common_ratings, u2_common_ratings) / (np.linalg.norm(u1_common_ratings) * np.linalg.norm(u2_common_ratings))
        return cos_sim

In [93]:
dataset_similarity = pd.DataFrame(index=datasets_train,columns=datasets_train)

In [94]:
def create_bipartite_adjacency_matrix(rating_matrix):
    n_users, n_items = rating_matrix.shape
    adjacency_matrix = np.zeros((n_users + n_items, n_users + n_items))
    adjacency_matrix[:n_users, n_users:] = rating_matrix
    adjacency_matrix[n_users:, :n_users] = rating_matrix.T
    return adjacency_matrix

In [95]:
def propagation_matrix(adjacency, lambda_):
    n = adjacency.shape[0]
    I = np.eye(n)
    # 将 NaN 视为 0
    adjacency = np.nan_to_num(adjacency)
    try:
        P = np.linalg.inv(I - lambda_ * adjacency)
    except np.linalg.LinAlgError:
        print("矩阵不可逆，无法计算传播矩阵")
        return None
    return P

In [96]:
def propagation_matrix_withWalkLength(adjacency_matrix, max_walk_length):
    adjacency_matrix = np.nan_to_num(adjacency_matrix)
    propagation_matrix = np.eye(adjacency_matrix.shape[0])
    sum_matrix = np.eye(adjacency_matrix.shape[0])

    for _ in range(max_walk_length):
        propagation_matrix = propagation_matrix @ adjacency_matrix
        sum_matrix += propagation_matrix

    return sum_matrix

In [97]:
start_time_train = time.time()

In [98]:
# 计算二分图邻接矩阵
bipartite_adjacency_matrix = create_bipartite_adjacency_matrix(data_model_train_matrix)

In [99]:
propagation_maxLength = propagation_matrix_withWalkLength(bipartite_adjacency_matrix, 5)

In [100]:
# 提取用户-商品传播矩阵和商品-用户传播矩阵
n_users = data_model_train_matrix.shape[0]
user_item_propagation = propagation_maxLength[:n_users, n_users:]
item_user_propagation = propagation_maxLength[n_users:, :n_users]

In [101]:
#计算 Random Walk Kernel
random_walk_kernel = np.dot(user_item_propagation, item_user_propagation)

In [102]:
def normalize_kernel(kernel_matrix):
    # 计算矩阵的最小值和最大值
    min_val = np.min(kernel_matrix)
    max_val = np.max(kernel_matrix)

    # 防止除数为零的情况
    if max_val == min_val:
        return np.zeros_like(kernel_matrix)

    # 将矩阵的值缩放到0和1之间
    normalized_kernel_matrix = (kernel_matrix - min_val) / (max_val - min_val)

    return normalized_kernel_matrix

In [103]:
normalized_kernel = normalize_kernel(random_walk_kernel)

In [104]:
normalized_kernel = pd.DataFrame(normalized_kernel,index=datasets_train,columns=datasets_train)

In [105]:
for i in datasets_train:
    for j in datasets_train:
        dataset_similarity.loc[i][j] = normalized_kernel.loc[i][j]

In [106]:
dataset_similarity

Unnamed: 0,48,14,60,237,250,246,51,1,74,248,...,50,149,85,128,229,205,71,109,59,76
48,0.761661,0.668796,0.73842,0.084446,0.113163,0.089436,0.690884,0.566702,0.77077,0.097483,...,0.84292,0.727789,0.808352,0.811829,0.777317,0.810462,0.705682,0.736847,0.656096,0.800161
14,0.668796,0.587112,0.64835,0.07304,0.098301,0.077428,0.606485,0.497292,0.676762,0.084507,...,0.740221,0.639015,0.709889,0.712872,0.682585,0.711747,0.619568,0.646966,0.575945,0.702687
60,0.73842,0.64835,0.71588,0.081592,0.109445,0.086432,0.669783,0.54933,0.747264,0.094236,...,0.817245,0.705565,0.783703,0.787089,0.753602,0.785749,0.684124,0.714355,0.636031,0.775759
237,0.084446,0.07304,0.081592,0.001275,0.004802,0.001888,0.075763,0.060502,0.085574,0.002876,...,0.094437,0.080284,0.090178,0.090618,0.086367,0.090437,0.077569,0.081399,0.071479,0.089171
250,0.113163,0.098301,0.109445,0.004802,0.009397,0.0056,0.101852,0.081966,0.114634,0.006888,...,0.126183,0.107739,0.12063,0.121207,0.115664,0.120967,0.104202,0.109193,0.096268,0.119318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,0.810462,0.711747,0.785749,0.090437,0.120967,0.09574,0.735115,0.603187,0.820051,0.104296,...,0.896737,0.774482,0.860139,0.863684,0.827139,0.862386,0.750978,0.784076,0.698254,0.851438
71,0.705682,0.619568,0.684124,0.077569,0.104202,0.082195,0.639953,0.524866,0.714047,0.089659,...,0.780944,0.674295,0.749018,0.752111,0.720231,0.750978,0.653791,0.682664,0.607798,0.741428
109,0.736847,0.646966,0.714355,0.081399,0.109193,0.086228,0.668356,0.548154,0.745674,0.094017,...,0.815509,0.704061,0.782035,0.785416,0.751997,0.784076,0.682664,0.712832,0.634673,0.774106
59,0.656096,0.575945,0.636031,0.071479,0.096268,0.075785,0.594922,0.487801,0.663884,0.082732,...,0.72615,0.626881,0.69643,0.699313,0.669636,0.698254,0.607798,0.634673,0.56499,0.689366


In [107]:
end_time_train = time.time()

In [108]:
end_time_train - start_time_train

5.496531963348389

Prediction

In [109]:
start_time_ref = time.time()

In [110]:
# 预测函数
def predict(ratings, similarity):
    mean_user_rating = ratings.fillna(0).mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis]).fillna(0)
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
     # 只替换NaN值
    df_nan = ratings.isnull()
    pred = pd.DataFrame(pred).where(df_nan, ratings)
    return pred

In [111]:
user_prediction = predict(data_model_train_matrix,dataset_similarity).sort_index(axis=0).sort_index(axis=1)

In [112]:
user_prediction

Unnamed: 0,289,290,291,292,293,294,295,296,297,298,...,741,742,743,744,745,746,747,748,749,750
0,0.007678,0.003846,0.007678,0.003846,0.007678,0.003846,0.007678,0.003846,0.359447,0.003846,...,0,0.007678,0,0.007678,0,0,0,0.007678,0,0.007678
1,0.266667,0.307692,0.307692,0.307692,0.307692,0.307692,0.354637,0.307692,0.359447,0.307692,...,0,0,0,0,0,0,0,0,0,0
2,0.37037,0.333333,0.714286,0.363636,0.714286,0.363636,0.714286,0.363636,0.714286,0.363636,...,0,0,0,0,0,0,0,0,0,0
3,0.434783,0.307692,0.41369,0.333333,0.666667,0.300144,0.666667,0.333333,0.666667,0.333333,...,0,0,0,0,0,0,0,0,0,0
4,0.305741,0.444444,0.833333,0.444444,0.833333,0.444444,0.833333,0.444444,0.833333,0.444444,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.016807,0.003384,0.013468,0.003384,0.436275,0.300144,0.354637,0.003384,0.359447,0.003384,...,0.023451,0,0.023451,0.023451,0.023451,0.023451,0.023451,0.023451,0.023451,0
285,0.305741,0.00349,0.010435,0.00349,0.436275,0.00349,0.354637,0.00349,0.359447,0.00349,...,0.017331,0,0,0,0.017331,0.017331,0,0.017331,0.017331,0
286,0.210219,0.003396,0.41369,0.304029,0.436275,0.300144,0.016892,0.288288,0.359447,0.297619,...,0,0.016892,0.016892,0,0.016892,0,0,0.013536,0.013536,0
287,0.244541,0.288618,0.41369,0.304029,0.020654,0.003472,0.354637,0.003472,0.359447,0.003472,...,0.024055,0.024055,0.017241,0.017241,0,0.017241,0.013817,0.013817,0,0.013817


In [113]:
end_time_ref = time.time()

In [114]:
end_time_ref - start_time_ref

43.03105163574219

In [115]:
mask = (data_model_test_matrix.fillna(0) != 0) & (user_prediction != 0)

# 只选择那些在预测评分和实际评分中都不是 0 的评分
prediction = user_prediction[mask].values.flatten()
prediction = pd.to_numeric(prediction, errors='coerce')
prediction = prediction[~np.isnan(prediction)]

In [116]:
actual = data_model_test_matrix.fillna(0)[mask].values.flatten()
actual = pd.to_numeric(actual, errors='coerce')
actual = actual[~np.isnan(actual)]

In [117]:
def calculate_rmse(prediction, actual):
    # 计算 RMSE
    return sqrt(mean_squared_error(prediction, actual))


In [118]:
user_rmse = calculate_rmse(prediction, actual)

In [119]:
user_rmse

0.4155274420318513

In [120]:
def ndcg(y_true, y_pred, k):
    """计算 NDCG @k
    y_true: 真实的 relevancy 分数（通常为 0 或 1）
    y_pred: 预测的 relevancy 分数
    k: 截断位置
    """
    # 计算 DCG @k
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    dcg = np.sum(gains / discounts)

    # 计算 IDCG @k
    ideal_order = np.argsort(y_true)[::-1]
    ideal_gains = 2 ** np.take(y_true, ideal_order[:k]) - 1
    ideal_discounts = np.log2(np.arange(len(ideal_gains)) + 2)
    idcg = np.sum(ideal_gains / ideal_discounts)

    # 防止0除问题
    if idcg == 0:
        return 0

    # 计算 NDCG @k
    ndcg = dcg / idcg
    return ndcg



In [121]:
ndcg(actual, prediction, 10)

0.5618638692322778