In [778]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import networkx as nx

Data Loading

In [779]:
ratings= pd.read_csv("./Data/ratings.csv", low_memory=False)
train_data = pd.read_csv("./Data/train_data.csv",low_memory=False)
test_data = pd.read_csv("./Data/test_data.csv",low_memory=False)
# train_data, test_data = train_test_split(ratings, test_size=0.3, random_state=42)
# # 保存训练集为csv文件
# train_data.to_csv('./Data/train_data.csv',index=False)
# # 保存测试集为csv文件
# test_data.to_csv('./Data/test_data.csv',index=False)

In [748]:
datasets = ratings.dataset_id.unique()
models = ratings.model_id.unique()
datasets_train = train_data.dataset_id.unique()
model_train = train_data.model_id.unique()
datasets_test = test_data.dataset_id.unique()
model_test = test_data.model_id.unique()

Rating Matrix

In [749]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=model_train)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [750]:
for row in train_data.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]

In [751]:
for row in test_data.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]

Dataset Similarity Matrix

In [752]:
import networkx as nx
import pandas as pd
import random
import itertools

In [753]:
def create_graph(df):
    G = nx.Graph()
    for _, group in df.groupby('model_id'):
        users = group['dataset_id'].values
        ratings = group['F1_Score'].values
        for i in range(len(users)):
            for j in range(i + 1, len(users)):
                user1, user2 = users[i], users[j]
                rating = (ratings[i] + ratings[j])/2.0  # 使用评分的平均值作为边的权重
                if G.has_edge(user1, user2):
                    G[user1][user2]['sum_rating'] += rating
                    G[user1][user2]['count'] += 1
                else:
                    G.add_edge(user1, user2, sum_rating=rating, count=1)
    return G

In [754]:
def random_walk(G, num_walks, walk_length):
    walks = []
    for _ in range(num_walks):
        for node in G.nodes():
            walk = [node]
            for _ in range(walk_length):
                neighbors = list(G.neighbors(walk[-1]))
                if len(neighbors) > 0:
                    next_node = random.choices(
                        neighbors,
                        weights=[G[walk[-1]][neighbor]['sum_rating'] * G[walk[-1]][neighbor]['count'] for neighbor in neighbors],
                        k=1
                    )[0]
                    walk.append(next_node)
            walks.append(walk)
    return walks


In [755]:
def compute_similarity(walks, G, num_walks):
    user_to_index = {user: i for i, user in enumerate(G.nodes)}
    similarity = np.zeros((len(G), len(G)))
    interactions = np.zeros((len(G), len(G)))

    for walk in walks:
        for i in range(len(walk)):
            for j in range(i+1, len(walk)):
                user1, user2 = walk[i], walk[j]
                index1, index2 = user_to_index[user1], user_to_index[user2]
                # 检查用户1和用户2之间是否存在边
                if G.has_edge(user1, user2):
                    similarity[index1][index2] += G[user1][user2]['sum_rating'] / G[user1][user2]['count']
                    similarity[index2][index1] += G[user1][user2]['sum_rating'] / G[user1][user2]['count']
                    interactions[index1][index2] += 1
                    interactions[index2][index1] += 1

    # 在实际计算相似度之前，避免除以0
    interactions[interactions == 0] = 1
    similarity /= interactions

    # 把相同的用户相似度设为1
    np.fill_diagonal(similarity, 1)

    similarity_df = pd.DataFrame(similarity, index=G.nodes, columns=G.nodes)
    return similarity_df


In [756]:

# 创建图
G = create_graph(train_data)

# 进行随机游走
walks = random_walk(G, num_walks=100, walk_length=4)

# 计算相似性
similarity_df = compute_similarity(walks, G,num_walks=200)

In [757]:
similarity_df = pd.DataFrame(similarity_df,index=data_model_train_matrix.index,columns=data_model_train_matrix.index)

In [758]:
similarity_df

Unnamed: 0,48,14,60,237,250,246,51,1,74,248,...,50,149,85,128,229,205,71,109,59,76
48,1.000000,0.635264,0.636880,0.341744,0.361661,0.359926,0.542870,0.552943,0.575515,0.369860,...,0.591221,0.661203,0.694131,0.587014,0.656844,0.695585,0.644005,0.627996,0.631138,0.679075
14,0.635264,1.000000,0.635528,0.378529,0.384368,0.368701,0.583242,0.574539,0.614149,0.385136,...,0.611635,0.682211,0.000000,0.623094,0.661252,0.691358,0.658879,0.622222,0.640432,0.701114
60,0.636880,0.635528,1.000000,0.348588,0.373837,0.367432,0.554001,0.550004,0.582733,0.373396,...,0.596101,0.660887,0.690425,0.594583,0.653685,0.685016,0.646487,0.622676,0.620878,0.670870
237,0.341744,0.378529,0.348588,1.000000,0.066482,0.059305,0.276896,0.275333,0.281924,0.059511,...,0.308660,0.411174,0.410384,0.311459,0.387088,0.421793,0.388934,0.342566,0.369407,0.406976
250,0.361661,0.384368,0.373837,0.066482,1.000000,0.075872,0.288840,0.282753,0.298686,0.075939,...,0.319468,0.416367,0.432042,0.321879,0.395456,0.433982,0.385779,0.353086,0.372591,0.419774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,0.695585,0.691358,0.685016,0.421793,0.433982,0.436410,0.609613,0.630260,0.631856,0.437738,...,0.643257,0.716119,0.756004,0.647838,0.709163,1.000000,0.698446,0.679396,0.705742,0.738042
71,0.644005,0.658879,0.646487,0.388934,0.385779,0.386977,0.561437,0.576818,0.596447,0.382713,...,0.609979,0.679478,0.721446,0.608434,0.668721,0.698446,1.000000,0.635598,0.650454,0.704002
109,0.627996,0.622222,0.622676,0.342566,0.353086,0.359799,0.537026,0.553989,0.576227,0.364982,...,0.583429,0.648286,0.680536,0.584950,0.646207,0.679396,0.635598,1.000000,0.610169,0.664326
59,0.631138,0.640432,0.620878,0.369407,0.372591,0.365034,0.555804,0.557551,0.571115,0.370066,...,0.593869,0.670920,0.693471,0.592145,0.659606,0.705742,0.650454,0.610169,1.000000,0.693634


In [759]:
dataset_similarity = pd.DataFrame(index=datasets_train,columns=datasets_train)

In [760]:
for i in datasets_train:
    for j in datasets_train:
        dataset_similarity.loc[i][j] = similarity_df.loc[i][j]

In [761]:
dataset_similarity

Unnamed: 0,48,14,60,237,250,246,51,1,74,248,...,50,149,85,128,229,205,71,109,59,76
48,1.0,0.635264,0.63688,0.341744,0.361661,0.359926,0.54287,0.552943,0.575515,0.36986,...,0.591221,0.661203,0.694131,0.587014,0.656844,0.695585,0.644005,0.627996,0.631138,0.679075
14,0.635264,1.0,0.635528,0.378529,0.384368,0.368701,0.583242,0.574539,0.614149,0.385136,...,0.611635,0.682211,0.0,0.623094,0.661252,0.691358,0.658879,0.622222,0.640432,0.701114
60,0.63688,0.635528,1.0,0.348588,0.373837,0.367432,0.554001,0.550004,0.582733,0.373396,...,0.596101,0.660887,0.690425,0.594583,0.653685,0.685016,0.646487,0.622676,0.620878,0.67087
237,0.341744,0.378529,0.348588,1.0,0.066482,0.059305,0.276896,0.275333,0.281924,0.059511,...,0.30866,0.411174,0.410384,0.311459,0.387088,0.421793,0.388934,0.342566,0.369407,0.406976
250,0.361661,0.384368,0.373837,0.066482,1.0,0.075872,0.28884,0.282753,0.298686,0.075939,...,0.319468,0.416367,0.432042,0.321879,0.395456,0.433982,0.385779,0.353086,0.372591,0.419774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,0.695585,0.691358,0.685016,0.421793,0.433982,0.43641,0.609613,0.63026,0.631856,0.437738,...,0.643257,0.716119,0.756004,0.647838,0.709163,1.0,0.698446,0.679396,0.705742,0.738042
71,0.644005,0.658879,0.646487,0.388934,0.385779,0.386977,0.561437,0.576818,0.596447,0.382713,...,0.609979,0.679478,0.721446,0.608434,0.668721,0.698446,1.0,0.635598,0.650454,0.704002
109,0.627996,0.622222,0.622676,0.342566,0.353086,0.359799,0.537026,0.553989,0.576227,0.364982,...,0.583429,0.648286,0.680536,0.58495,0.646207,0.679396,0.635598,1.0,0.610169,0.664326
59,0.631138,0.640432,0.620878,0.369407,0.372591,0.365034,0.555804,0.557551,0.571115,0.370066,...,0.593869,0.67092,0.693471,0.592145,0.659606,0.705742,0.650454,0.610169,1.0,0.693634


In [762]:
start_time_train = time.time()

In [763]:
end_time_train = time.time()

In [764]:
end_time_train - start_time_train

0.01529073715209961

Prediction

In [765]:
start_time_ref = time.time()

In [766]:
# 预测函数
def predict(ratings, similarity):
    mean_user_rating = ratings.fillna(0).mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis]).fillna(0)
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
     # 只替换NaN值
    df_nan = ratings.isnull()
    pred = pd.DataFrame(pred).where(df_nan, ratings)
    return pred

In [767]:
user_prediction = predict(data_model_train_matrix,dataset_similarity).sort_index(axis=0).sort_index(axis=1)

  after removing the cwd from sys.path.
  """


In [768]:
end_time_ref = time.time()

In [769]:
end_time_ref - start_time_ref

4.550584554672241

In [770]:
mask = (data_model_test_matrix.fillna(0) != 0) & (user_prediction != 0)

In [771]:
# 只选择那些在预测评分和实际评分中都不是 0 的评分
prediction = user_prediction[mask].values.flatten()
prediction = pd.to_numeric(prediction, errors='coerce')
prediction = prediction[~np.isnan(prediction)]

In [772]:
actual = data_model_test_matrix.fillna(0)[mask].values.flatten()
actual = pd.to_numeric(actual, errors='coerce')
actual = actual[~np.isnan(actual)]

In [773]:
def calculate_rmse(prediction, actual):
    # 计算 RMSE
    return sqrt(mean_squared_error(prediction, actual))


In [774]:
user_rmse = calculate_rmse(prediction, actual)

In [775]:
user_rmse

0.36561230348957907

In [776]:
def ndcg(y_true, y_pred, k):
    """计算 NDCG @k
    y_true: 真实的 relevancy 分数（通常为 0 或 1）
    y_pred: 预测的 relevancy 分数
    k: 截断位置
    """
    # 计算 DCG @k
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    dcg = np.sum(gains / discounts)

    # 计算 IDCG @k
    ideal_order = np.argsort(y_true)[::-1]
    ideal_gains = 2 ** np.take(y_true, ideal_order[:k]) - 1
    ideal_discounts = np.log2(np.arange(len(ideal_gains)) + 2)
    idcg = np.sum(ideal_gains / ideal_discounts)

    # 防止0除问题
    if idcg == 0:
        return 0

    # 计算 NDCG @k
    ndcg = dcg / idcg
    return ndcg



In [777]:
ndcg(actual, prediction, 10)

0.8361291886983206