In [438]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import networkx as nx

Data Loading

In [439]:
ratings= pd.read_csv("./Data/rating.csv", low_memory=False)
train_data = pd.read_csv("./Data/train_data.csv",low_memory=False)
test_data = pd.read_csv("./Data/test_data.csv",low_memory=False)
# train_data, test_data = train_test_split(ratings, test_size=0.3, random_state=42)
# # 保存训练集为csv文件
# train_data.to_csv('./Data/train_data.csv',index=False)
# # 保存测试集为csv文件
# test_data.to_csv('./Data/test_data.csv',index=False)

In [440]:
datasets = ratings.dataset_id.unique()
models = ratings.model_id.unique()
datasets_train = train_data.dataset_id.unique()
model_train = train_data.model_id.unique()
datasets_test = test_data.dataset_id.unique()
model_test = test_data.model_id.unique()

Rating Matrix

In [441]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=model_train)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [442]:
for row in train_data.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]

In [443]:
for row in test_data.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]

Dataset Similarity Matrix

In [444]:
start_time_train = time.time()

In [445]:
dataset_similarity = pd.DataFrame(index=datasets_train,columns=datasets_train)

In [446]:
import pandas as pd
import numpy as np
import networkx as nx

# 构造图
def construct_graph(rating_matrix):
    # 初始化图
    G = nx.Graph()

    # 添加节点
    users = rating_matrix.index.tolist()
    G.add_nodes_from(users)

    # 添加边和权重
    for col in rating_matrix.columns:
        rated_users = rating_matrix[rating_matrix[col].notna()].index.tolist()
        for i in range(len(rated_users)):
            for j in range(i+1, len(rated_users)):
                user_i = rated_users[i]
                user_j = rated_users[j]
                if G.has_edge(user_i, user_j):
                    G[user_i][user_j]['weight'] += rating_matrix[col][user_i] * rating_matrix[col][user_j]
                else:
                    G.add_edge(user_i, user_j, weight=rating_matrix[col][user_i] * rating_matrix[col][user_j])

    return G


In [447]:
def generate_walks(G, walk_length, num_walks):
    walks = []

    nodes = list(G.nodes())
    for _ in range(num_walks):
        np.random.shuffle(nodes)
        for node in nodes:
            walk = [node]
            while len(walk) < walk_length:
                current_node = walk[-1]
                neighbors = list(G[current_node].keys())
                if neighbors:
                    # 下一步的概率与边的权重成比例
                    probabilities = [G[current_node][neighbor]['weight'] for neighbor in neighbors]
                    probabilities /= np.sum(probabilities)
                    next_node = np.random.choice(neighbors, p=probabilities)
                    walk.append(next_node)
                else:
                    break
            walks.append(walk)

    return walks


In [448]:
def calculate_similarity(walks, rating_matrix):
    # 初始化相似度矩阵
    similarity_matrix = pd.DataFrame(0, index=rating_matrix.index, columns=rating_matrix.index)

    # 计算用户在随机游走中被共同访问的频率
    for walk in walks:
        for user_i in walk:
            for user_j in walk:
                similarity_matrix[user_i][user_j] += 1

    # 归一化相似度矩阵
    similarity_matrix /= len(walks)

    return similarity_matrix


In [449]:
G = construct_graph(data_model_train_matrix)

In [450]:
walks = generate_walks(G,walk_length=16,num_walks=1)

In [451]:
dataset_similarity = calculate_similarity(walks,data_model_train_matrix)

In [452]:
dataset_similarity

Unnamed: 0,39,7,21,48,12,52,22,51,64,1,...,8,40,63,44,57,53,58,60,59,17
39,1.263889,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.902778,0.000000,1.013889,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.458333,0.000000,0.000000,0.361111,0.000000,0.0,0.625000,0.000000,0.347222,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
21,0.000000,0.000000,0.680556,0.000000,0.000000,0.708333,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.750000
48,0.000000,0.000000,0.000000,0.694444,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
12,0.000000,0.361111,0.000000,0.000000,0.375000,0.000000,0.0,0.486111,0.000000,0.375000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.069444,0.000000,...,0.0,0.513889,0.000000,0.000000,0.000000,0.847222,0.000000,0.000000,0.902778,0.000000
58,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.833333,0.000000,0.000000,1.402778,0.000000,0.000000,0.000000
60,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.347222,0.000000,0.000000
59,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.375000,0.000000,...,0.0,0.513889,0.000000,0.000000,0.000000,0.902778,0.000000,0.000000,1.430556,0.000000


In [453]:
# 计算非零元素的数量
non_zero_count = np.count_nonzero(dataset_similarity)

In [454]:
# 计算所有元素的数量
total_count = dataset_similarity.size

In [455]:
# 计算非零元素的占比
non_zero_ratio = non_zero_count / total_count

In [456]:
non_zero_ratio

0.07175925925925926

In [457]:
end_time_train = time.time()

In [458]:
end_time_train - start_time_train

1.1815569400787354

Prediction

In [459]:
start_time_ref = time.time()

In [460]:
# 预测函数
def predict(ratings, similarity):
    mean_user_rating = ratings.fillna(0).mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis]).fillna(0)
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
     # 只替换NaN值
    df_nan = ratings.isnull()
    pred = pd.DataFrame(pred).where(df_nan, ratings)
    return pred

In [461]:
user_prediction = predict(data_model_train_matrix,dataset_similarity).sort_index(axis=0).sort_index(axis=1)

  after removing the cwd from sys.path.
  """


In [462]:
end_time_ref = time.time()

In [463]:
end_time_ref - start_time_ref

0.2950732707977295

In [464]:
mask = (data_model_test_matrix.fillna(0) != 0) & (user_prediction.fillna(0) != 0)

# 只选择那些在预测评分和实际评分中都不是 0 的评分
prediction = user_prediction[mask].values.flatten()
prediction = pd.to_numeric(prediction, errors='coerce')
prediction = prediction[~np.isnan(prediction)]

In [465]:
actual = data_model_test_matrix.fillna(0)[mask].values.flatten()
actual = pd.to_numeric(actual, errors='coerce')
actual = actual[~np.isnan(actual)]

In [466]:
def calculate_rmse(prediction, actual):
    # 计算 RMSE
    return sqrt(mean_squared_error(prediction, actual))


In [467]:
user_rmse = calculate_rmse(prediction, actual)

In [468]:
user_rmse

0.3168788632610773

In [469]:
def ndcg(y_true, y_pred, k):
    """计算 NDCG @k
    y_true: 真实的 relevancy 分数（通常为 0 或 1）
    y_pred: 预测的 relevancy 分数
    k: 截断位置
    """
    # 计算 DCG @k
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    dcg = np.sum(gains / discounts)

    # 计算 IDCG @k
    ideal_order = np.argsort(y_true)[::-1]
    ideal_gains = 2 ** np.take(y_true, ideal_order[:k]) - 1
    ideal_discounts = np.log2(np.arange(len(ideal_gains)) + 2)
    idcg = np.sum(ideal_gains / ideal_discounts)

    # 防止0除问题
    if idcg == 0:
        return 0

    # 计算 NDCG @k
    ndcg = dcg / idcg
    return ndcg



In [471]:
ndcg(actual, prediction, 5)

0.945206940169708