In [76]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances

Data Loading

In [77]:
# Training ratings and the ground-truth ratings used for evaluation.
ratings_train = pd.read_csv(
    "./Data/rate_train.csv",
    low_memory=False,
)
ratings_test = pd.read_csv(
    "./Data/Ground_truth/groundtruth_0.3.csv",
    low_memory=False,
)

In [78]:
# Distinct dataset / model identifiers appearing in each ratings file.
datasets_train = ratings_train["Node_Id"].unique()
model_train = ratings_train["Model_Id"].unique()
datasets_test = ratings_test["dataset"].unique()
model_test = ratings_test["model"].unique()

# Model metadata; its ids are used as the columns of the training grid.
meta_models = pd.read_csv(
    "./Data/model_v.csv",
    low_memory=False,
)
models = meta_models["model_id"].unique()

Rating Matrix

In [79]:
# Empty (all-NaN) dataset x model grids; later cells fill them from the ratings.
data_model_train_matrix = pd.DataFrame(
    index=datasets_train,
    columns=models,
)
data_model_test_matrix = pd.DataFrame(
    index=datasets_test,
    columns=model_test,
)

In [80]:
# Fill the training grid from the first three columns of ratings_train,
# read positionally as (row label, column label, value).
# A single .at[row, col] write replaces the chained `.loc[row][col] = value`,
# which assigns into an intermediate Series and is not guaranteed to reach the
# frame (it never does under pandas Copy-on-Write).
# NOTE(review): with .at, a label that is absent from the grid enlarges it —
# confirm every Model_Id in the ratings also appears in model_v.csv.
for row_label, col_label, value, *_ in ratings_train.itertuples(index=False):
    data_model_train_matrix.at[row_label, col_label] = value


In [81]:
# Fill the ground-truth grid from the first three columns of ratings_test,
# read positionally as (row label, column label, value).
# .at[row, col] is a single write into the frame; the previous chained
# `.loc[row][col] = value` assigned into an intermediate Series and is not
# guaranteed to update the frame (it never does under pandas Copy-on-Write).
for row_label, col_label, value, *_ in ratings_test.itertuples(index=False):
    data_model_test_matrix.at[row_label, col_label] = value

# Pairs without a ground-truth value become 0.
# NOTE(review): a later cell treats 0 as "no ground truth", so a genuine score
# of exactly 0 cannot be told apart from a missing one.
data_model_test_matrix = data_model_test_matrix.fillna(0)

Dataset Similarity Matrix

In [82]:
meta_datasets = pd.read_csv(
    "./Data/dataset_v.csv",
    low_memory=False,
)
datasets = meta_datasets["dataset_id"].unique()

# Keep only the five feature columns v1..v5.
meta_datasets = meta_datasets[["v1", "v2", "v3", "v4", "v5"]]

# Standardise each feature column.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(meta_datasets)

# Wrap the standardised values in a DataFrame that keeps the original index.
scaled_df = pd.DataFrame(
    scaled_data,
    index=meta_datasets.index,
    columns=meta_datasets.columns,
)

# Pairwise cosine similarity between rows, labelled by dataset id on both axes.
meta_dataset_similarity = pd.DataFrame(
    cosine_similarity(scaled_df.to_numpy()),
    index=datasets,
    columns=datasets,
)

KNN Similarity

In [83]:
# Show the v1..v5 feature columns kept from dataset_v.csv (unscaled values).
meta_datasets

Unnamed: 0,v1,v2,v3,v4,v5
0,0.008156,0.001922,0.059339,0.002252,-0.019061
1,-0.071854,0.041016,0.135472,-0.024235,-0.111402
2,-0.071854,0.041016,0.135472,-0.024235,-0.111402
3,-0.071854,0.041016,0.135472,-0.024235,-0.111402
4,-0.071854,0.041016,0.135472,-0.024235,-0.111402
...,...,...,...,...,...
284,0.040722,0.087765,0.104818,-0.007502,-0.050698
285,0.045504,0.019537,0.134035,-0.021794,0.050458
286,0.045504,0.019537,0.134035,-0.021794,0.050458
287,0.045504,0.019537,0.134035,-0.021794,0.050458


In [84]:
def cosine_similarity_func(ratings, user1, user2):
    """Cosine similarity of two users over the items both of them have rated.

    Parameters
    ----------
    ratings : pd.DataFrame
        Rating matrix indexed by user; NaN marks a missing rating.
    user1, user2 : label
        Row labels of the two users to compare.

    Returns
    -------
    float
        Cosine of the two co-rated rating vectors, or 0 when the users share
        no rated item or when either co-rated vector has zero norm.
    """
    # Ratings each user actually provided.
    u1_ratings = ratings.loc[user1].dropna()
    u2_ratings = ratings.loc[user2].dropna()

    common_items = np.intersect1d(u1_ratings.index, u2_ratings.index).tolist()
    if len(common_items) == 0:
        return 0

    # Explicit float arrays so object-dtype rating frames behave like numeric ones.
    u1_common = u1_ratings.loc[common_items].to_numpy(dtype=float)
    u2_common = u2_ratings.loc[common_items].to_numpy(dtype=float)

    # A zero norm would give 0/0 = NaN; report "no similarity" instead,
    # consistent with the no-overlap case above.
    denominator = np.linalg.norm(u1_common) * np.linalg.norm(u2_common)
    if denominator == 0:
        return 0
    return np.dot(u1_common, u2_common) / denominator

In [85]:
# Empty train-dataset x train-dataset grid that will hold the blended similarity.
dataset_similarity = pd.DataFrame(
    index=datasets_train,
    columns=datasets_train,
)

In [86]:
def create_bipartite_adjacency_matrix(rating_matrix):
    """Build the symmetric adjacency matrix of the user-item bipartite graph.

    The result is a square float array of side ``n_users + n_items`` laid out as
    ``[[0, R], [R.T, 0]]`` where ``R`` is ``rating_matrix``. Missing ratings
    stay NaN in the result.
    """
    ratings = np.asarray(rating_matrix, dtype=float)
    n_users, n_items = ratings.shape
    size = n_users + n_items

    users = slice(0, n_users)
    items = slice(n_users, size)

    adjacency = np.zeros((size, size))
    adjacency[users, items] = ratings
    adjacency[items, users] = ratings.T
    return adjacency

In [87]:
def propagation_matrix(adjacency, lambda_):
    """Return ``inv(I - lambda_ * adjacency)`` with NaN entries treated as 0.

    When numpy reports the matrix as singular, a message is printed and
    ``None`` is returned instead of raising.
    """
    identity = np.eye(adjacency.shape[0])
    # NaN entries are treated as 0.
    dense_adjacency = np.nan_to_num(adjacency)
    try:
        return np.linalg.inv(identity - lambda_ * dense_adjacency)
    except np.linalg.LinAlgError:
        print("矩阵不可逆，无法计算传播矩阵")
    return None

In [88]:
def propagation_matrix_withWalkLength(adjacency_matrix, max_walk_length):
    """Sum the adjacency powers ``A**0 + A**1 + ... + A**max_walk_length``.

    NaN entries of ``adjacency_matrix`` are replaced by 0 before the powers
    are accumulated.
    """
    adjacency = np.nan_to_num(adjacency_matrix)
    size = adjacency.shape[0]

    walk_power = np.eye(size)   # A**0
    accumulated = np.eye(size)  # running sum, starts at the identity

    steps_done = 0
    while steps_done < max_walk_length:
        walk_power = walk_power @ adjacency
        accumulated = accumulated + walk_power
        steps_done += 1

    return accumulated

In [89]:
# Wall-clock start of the timed training section.
start_time_train = time.time()

In [90]:
# Build the adjacency matrix of the dataset-model bipartite graph.
bipartite_adjacency_matrix = create_bipartite_adjacency_matrix(
    rating_matrix=data_model_train_matrix,
)

In [91]:
# Accumulate adjacency powers for walks of length 0 through 6.
propagation_maxLength = propagation_matrix_withWalkLength(
    adjacency_matrix=bipartite_adjacency_matrix,
    max_walk_length=6,
)

In [92]:
# Extract the user->item and item->user blocks of the propagation matrix.
n_users = len(data_model_train_matrix)
user_item_propagation = propagation_maxLength[:n_users][:, n_users:]
item_user_propagation = propagation_maxLength[n_users:][:, :n_users]

In [93]:
# Random walk kernel: product of the two off-diagonal propagation blocks.
random_walk_kernel = user_item_propagation @ item_user_propagation

In [94]:
def normalize_kernel(kernel_matrix):
    """Scale a kernel matrix so that ``K[i, j]`` becomes ``K[i, j] / sqrt(K[i, i] * K[j, j])``.

    Parameters
    ----------
    kernel_matrix : array-like
        Square kernel matrix; it is converted to a float ndarray.

    Returns
    -------
    np.ndarray
        Normalised kernel. Entries whose denominator is exactly 0 (a zero on
        the diagonal) are set to 0 rather than NaN/inf, so one degenerate row
        cannot contaminate similarities computed from the result.
    """
    kernel = np.asarray(kernel_matrix, dtype=float)
    diagonal_elements = np.diag(kernel)
    denominator = np.sqrt(np.outer(diagonal_elements, diagonal_elements))

    normalized_kernel_matrix = np.zeros_like(kernel)
    # Divide only where the denominator is non-zero; other cells keep the 0 fill.
    np.divide(kernel, denominator, out=normalized_kernel_matrix, where=denominator != 0)
    return normalized_kernel_matrix

In [95]:
# Normalise the random walk kernel by its diagonal.
normalized_kernel = normalize_kernel(
    kernel_matrix=random_walk_kernel,
)

In [96]:
# Label both axes of the normalised kernel with the training dataset ids.
normalized_kernel = pd.DataFrame(
    normalized_kernel,
    index=datasets_train,
    columns=datasets_train,
)

In [97]:
# Weight of the random-walk kernel in the blend; (1 - lambda_) goes to the
# meta-feature cosine similarity.
lambda_ = 0.5

# Both similarity sources restricted to (and ordered by) the training datasets.
kernel_sim = normalized_kernel.loc[datasets_train, datasets_train].to_numpy(dtype=float)
meta_sim = meta_dataset_similarity.loc[datasets_train, datasets_train].to_numpy(dtype=float)

# Vectorised blend. It replaces the element-wise loop whose chained
# `.loc[i][j] = ...` writes are not guaranteed to reach the frame (and never do
# under pandas Copy-on-Write), and produces a float matrix instead of object dtype.
blended = lambda_ * kernel_sim + (1 - lambda_) * meta_sim
# A pair gets similarity 0 as soon as either source reports exactly 0.
blended[(kernel_sim == 0) | (meta_sim == 0)] = 0

dataset_similarity = pd.DataFrame(blended, index=datasets_train, columns=datasets_train)


In [98]:
# Inspect the blended train-dataset similarity matrix.
dataset_similarity

Unnamed: 0,0,29,14,28,16,17,13,12,10,38,...,275,274,279,278,280,281,288,286,287,285
0,1.0,0.407055,0.407038,0.407037,0.407037,0.407053,0.407054,0.407054,0.407055,0.407054,...,0.375823,0.375834,0.37583,0.375839,0.328952,0.328948,0.816564,0.816564,0.816564,0.816564
29,0.407055,1.0,0.999994,0.999993,0.999993,1.0,1.0,1.0,1.0,1.0,...,0.293156,0.293173,0.293166,0.29318,0.348507,0.3485,0.522713,0.522714,0.522713,0.522713
14,0.407038,0.999994,1.0,1.0,1.0,0.999996,0.999995,0.999995,0.999994,0.999995,...,0.293115,0.293138,0.293128,0.293149,0.348476,0.348465,0.522696,0.522698,0.522696,0.522696
28,0.407037,0.999993,1.0,1.0,1.0,0.999995,0.999994,0.999994,0.999993,0.999994,...,0.293111,0.293135,0.293125,0.293146,0.348473,0.348462,0.522694,0.522696,0.522694,0.522694
16,0.407037,0.999993,1.0,1.0,1.0,0.999995,0.999994,0.999994,0.999994,0.999994,...,0.293112,0.293135,0.293126,0.293146,0.348473,0.348462,0.522695,0.522696,0.522694,0.522695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.328948,0.3485,0.348465,0.348462,0.348462,0.348495,0.348498,0.348498,0.348499,0.348498,...,0.971586,0.971588,0.971588,0.971587,1.0,1.0,0.533015,0.533014,0.533015,0.533015
288,0.816564,0.522713,0.522696,0.522694,0.522695,0.522711,0.522712,0.522712,0.522713,0.522712,...,0.48045,0.480461,0.480457,0.480465,0.533019,0.533015,1.0,1.0,1.0,1.0
286,0.816564,0.522714,0.522698,0.522696,0.522696,0.522712,0.522713,0.522713,0.522713,0.522713,...,0.480448,0.48046,0.480455,0.480464,0.533018,0.533014,1.0,1.0,1.0,1.0
287,0.816564,0.522713,0.522696,0.522694,0.522694,0.522711,0.522712,0.522712,0.522712,0.522712,...,0.480451,0.480461,0.480457,0.480466,0.53302,0.533015,1.0,1.0,1.0,1.0


Prediction

In [99]:
def predict_ratings(rating_matrix, user_similarity_matrix, k=5):
    """Fill the missing ratings with a K-nearest-neighbour weighted average.

    Parameters
    ----------
    rating_matrix : pd.DataFrame
        Rating matrix indexed by user; NaN marks a missing rating.
    user_similarity_matrix : pd.DataFrame
        User-by-user similarity, labelled with the same user ids.
    k : int, default 5
        Number of most similar other users consulted for each user.

    Returns
    -------
    pd.DataFrame
        Copy of ``rating_matrix`` in which every missing rating is replaced by
        the similarity-weighted mean of the neighbours' ratings for that item,
        or by the user's own mean rating when no neighbour rated the item (or
        the neighbour similarities sum to 0). The input frame is not modified.
    """
    prediction_matrix = rating_matrix.copy()

    for user in rating_matrix.index:
        missing_items = [
            item for item in rating_matrix.columns
            if pd.isna(rating_matrix.at[user, item])
        ]
        if not missing_items:
            continue

        # The neighbourhood depends only on the user, so rank once per user.
        # The user is removed by label: slicing off position 0 drops the wrong
        # entry whenever another user ties with the self-similarity.
        neighbours = (
            user_similarity_matrix.loc[user]
            .drop(labels=user, errors="ignore")
            .sort_values(ascending=False)
            .head(k)
        )
        # Fallback value when no neighbour has rated the item.
        user_mean = rating_matrix.loc[user].mean()

        for item in missing_items:
            weighted_sum = 0
            similarity_sum = 0
            for neighbour, similarity in neighbours.items():
                neighbour_rating = rating_matrix.at[neighbour, item]
                if not pd.isna(neighbour_rating):
                    weighted_sum += similarity * neighbour_rating
                    similarity_sum += similarity

            # .at writes straight into the frame; a chained `.loc[i][j] = ...`
            # may only update a temporary Series.
            if similarity_sum != 0:
                prediction_matrix.at[user, item] = weighted_sum / similarity_sum
            else:
                prediction_matrix.at[user, item] = user_mean

    return prediction_matrix

In [100]:
def predict(rating_matrix, similarity_matrix):
    """Predict ratings from mean-centred ratings weighted by row similarity.

    Parameters
    ----------
    rating_matrix : pd.DataFrame
        Rating matrix; NaN marks a missing rating.
    similarity_matrix : np.ndarray or pd.DataFrame
        Square row-by-row similarity. It is used positionally, so its rows and
        columns must be in the same order as ``rating_matrix.index``.

    Returns
    -------
    pd.DataFrame
        Predicted ratings with the index and columns of ``rating_matrix``:
        ``mean_i + sum_k(sim[i, k] * dev[k, j]) / sum_k(|sim[i, k]|)``. A row
        whose similarities sum to 0 in absolute value gets its own mean rating
        instead of NaN.

    Notes
    -----
    NOTE(review): the row mean ignores NaN, but the deviations are taken after
    filling NaN with 0, so a missing rating is treated as a rating of 0. This
    pulls predictions down on sparse matrices — confirm it is intended.
    """
    ratings = rating_matrix.astype(float)

    # Row means, ignoring NaN, as a column vector for broadcasting.
    mean_rating = ratings.mean(axis=1).to_numpy()[:, np.newaxis]

    # Replace NaN with 0, then subtract the row mean.
    deviations = ratings.fillna(0).to_numpy() - mean_rating

    # Work on a plain ndarray: on recent pandas a DataFrame's row sums can no
    # longer be indexed with [:, np.newaxis].
    similarity = np.asarray(similarity_matrix, dtype=float)
    weighted_deviation = similarity @ deviations
    total_weight = np.abs(similarity).sum(axis=1)[:, np.newaxis]

    # Rows with zero total similarity keep a 0 adjustment (i.e. predict the mean).
    adjustment = np.zeros_like(weighted_deviation)
    np.divide(weighted_deviation, total_weight, out=adjustment, where=total_weight != 0)

    predicted_ratings = mean_rating + adjustment
    return pd.DataFrame(predicted_ratings, index=rating_matrix.index, columns=rating_matrix.columns)

In [101]:
# Predict every training dataset/model pair, then order both axes by label.
model_prediction_train = predict(
    rating_matrix=data_model_train_matrix,
    similarity_matrix=dataset_similarity,
)
model_prediction_train = (
    pd.DataFrame(model_prediction_train, index=datasets_train, columns=models)
    .sort_index()
    .sort_index(axis=1)
)



In [102]:
# Inspect the predicted scores for the training datasets.
model_prediction_train

Unnamed: 0,289,290,291,292,293,294,295,296,297,298,...,741,742,743,744,745,746,747,748,749,750
0,-0.082269,-0.184046,-0.030986,-0.175057,-0.037754,-0.17521,-0.039279,-0.175727,-0.039234,-0.175727,...,-0.41069,-0.410893,-0.410769,-0.411444,-0.411241,-0.411444,-0.411647,-0.411798,-0.411647,-0.411798
1,0.305123,0.201733,0.376778,0.212056,0.371962,0.211863,0.370885,0.211384,0.370871,0.211384,...,-0.026367,-0.026512,-0.026417,-0.026999,-0.026854,-0.026999,-0.027308,-0.027412,-0.027308,-0.027412
2,0.51611,0.412714,0.587769,0.423037,0.582952,0.422844,0.581875,0.422365,0.581861,0.422366,...,0.184613,0.184468,0.184563,0.183981,0.184126,0.183981,0.183672,0.183568,0.183672,0.183568
3,0.521253,0.417857,0.592912,0.42818,0.588095,0.427988,0.587018,0.427509,0.587004,0.427509,...,0.189757,0.189611,0.189707,0.189124,0.18927,0.189124,0.188816,0.188712,0.188816,0.188712
4,0.595254,0.491858,0.666913,0.502181,0.662096,0.501988,0.661019,0.501509,0.661005,0.501509,...,0.263757,0.263612,0.263707,0.263125,0.26327,0.263125,0.262816,0.262712,0.262816,0.262712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.056794,-0.034994,0.086712,-0.028549,0.081906,-0.029207,0.080947,-0.031096,0.080312,-0.031096,...,-0.183321,-0.183529,-0.183399,-0.185332,-0.185124,-0.185332,-0.185858,-0.185966,-0.185858,-0.185966
285,-0.058225,-0.155677,-0.016742,-0.147935,-0.021772,-0.148252,-0.022854,-0.149097,-0.023093,-0.149097,...,-0.337706,-0.337974,-0.337814,-0.338853,-0.338585,-0.338853,-0.339283,-0.339464,-0.339283,-0.339464
286,-0.057151,-0.154603,-0.015668,-0.146861,-0.020697,-0.147179,-0.02178,-0.148023,-0.022019,-0.148023,...,-0.336632,-0.3369,-0.33674,-0.337779,-0.337511,-0.337779,-0.338209,-0.338391,-0.338209,-0.338391
287,-0.057359,-0.154811,-0.015876,-0.147069,-0.020906,-0.147386,-0.021988,-0.148231,-0.022227,-0.148231,...,-0.33684,-0.337108,-0.336948,-0.337987,-0.337719,-0.337987,-0.338417,-0.338598,-0.338417,-0.338598


In [103]:
# Empty test-dataset x test-model grid that will receive the test predictions.
model_prediction_test = pd.DataFrame(
    index=datasets_test,
    columns=model_test,
)

In [104]:
# Seconds elapsed since start_time_train was recorded.
end_time_train = time.time()
Training_time = end_time_train - start_time_train
Training_time

12.504194974899292

In [105]:
def find_sim_index(index):
    """Return the first column label holding the row maximum of ``meta_dataset_similarity``.

    ``index`` is the row label to look up in the module-level
    ``meta_dataset_similarity`` frame.
    """
    similarities = meta_dataset_similarity.loc[index]
    is_maximum = similarities.eq(similarities.max())
    return similarities.index[is_maximum][0]

In [139]:
def Find_Top_k(i,sim_matrix):
    """Return all column labels of row ``i`` ordered by descending similarity.

    Despite the name, the list is not truncated; callers slice it themselves.
    """
    ranked = sim_matrix.loc[i].sort_values(ascending=False)
    return ranked.index.values.tolist()

In [140]:
# Wall-clock start of the timed test-prediction section.
start_time = time.time()

In [141]:
# Number of most similar training datasets averaged for each test dataset.
N_NEIGHBOURS = 15

for dataset in datasets_test:
    # Rank the other datasets once per test dataset instead of once per model.
    # The dataset itself is excluded by label: dropping position 0 removes the
    # wrong entry whenever another dataset ties with the self-similarity.
    dataset_sim_list = [
        idx for idx in Find_Top_k(dataset, meta_dataset_similarity) if idx != dataset
    ]
    # Keep only indices that exist in model_prediction_train.
    valid_indices = [
        idx for idx in dataset_sim_list if idx in model_prediction_train.index
    ][:N_NEIGHBOURS]
    neighbour_predictions = model_prediction_train.loc[valid_indices]

    for model in model_test:
        # .at writes straight into the frame; the chained `.loc[d][m] = ...`
        # form may only update a temporary Series.
        model_prediction_test.at[dataset, model] = neighbour_predictions[model].mean()

In [142]:
# Neighbour list left over from the final iteration of the loop above.
valid_indices

[271, 270, 269, 281, 280, 266, 265, 264, 263, 262, 255, 256, 268, 276, 279]

In [143]:
# Wall-clock end of the timed test-prediction section.
end_time = time.time()

In [144]:
# Seconds elapsed between start_time and end_time.
end_time - start_time

9.567063570022583

In [145]:
# Blank out predictions for pairs whose ground-truth entry is 0 (the value used
# to fill missing ground truth). None is kept as the sentinel because the
# result-building cell tests `is not None`.
# .at reads and writes a single cell directly; the chained `.loc[i][j] = None`
# form may only update a temporary Series.
for i in datasets_test:
    for j in model_test:
        if data_model_test_matrix.at[i, j] == 0:
            model_prediction_test.at[i, j] = None

In [146]:
# Empty result table. The columns are given as a list: a set has no defined
# order (so the column order varied between runs) and newer pandas rejects it.
result = pd.DataFrame(columns=["dataset", "model", "f1_score", "groundtruth_f1_score"])

In [147]:
# Collect one record per (dataset, model) pair that still has a prediction,
# then build the table in one go. This replaces row-by-row `result.append`,
# which was removed in pandas 2.0 and copies the whole frame on every call;
# rebuilding from scratch also makes the cell safe to re-run.
records = []
for i in datasets_test:
    for j in model_test:
        f1_score = model_prediction_test.at[i, j]
        # None marks pairs that were blanked out for lack of ground truth.
        if f1_score is not None:
            records.append({
                'dataset': i,
                'model': j,
                'f1_score': f1_score,
                'groundtruth_f1_score': data_model_test_matrix.at[i, j],
            })

result = pd.DataFrame(records, columns=["dataset", "model", "f1_score", "groundtruth_f1_score"])

In [148]:
# Inspect predicted vs. ground-truth scores per (dataset, model) pair.
result

Unnamed: 0,f1_score,groundtruth_f1_score,dataset,model
0,0.376168,0.300000,93,463
1,0.371448,0.300000,93,465
2,0.593886,0.909091,93,363
3,0.540648,0.303030,93,367
4,0.528509,0.303030,93,369
...,...,...,...,...
8207,0.235397,0.908689,283,307
8208,0.555389,0.882246,283,317
8209,0.570036,0.885276,283,315
8210,0.048590,0.751981,283,303


In [149]:
# Write the evaluation table to disk without the index column.
# NOTE(review): the "@6@15" suffix presumably records the walk length (6) and
# neighbour count (15) used above — keep it in sync if either value changes.
result.to_csv("../Peak_Finding/Output/Dataset_RandomWalk/Full_Dataset_RandomWalk@6@15.csv",index=False)