In [171]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances

Data Loading

In [172]:
# Read the train and test rating tables from ./Data with identical parser settings.
ratings_train, ratings_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("./Data/rate_train.csv", "./Data/rate_test.csv")
)

In [173]:
# Distinct dataset / model identifiers that occur in each rating table.
datasets_train = ratings_train["dataset_id"].unique()
model_train = ratings_train["model_id"].unique()
datasets_test = ratings_test["dataset_id"].unique()
model_test = ratings_test["model_id"].unique()

# Model metadata table; `models` holds its distinct model_id values.
meta_models = pd.read_csv("./Data/models_v.csv", low_memory=False)
models = meta_models["model_id"].unique()

Rating Matrix

In [174]:
# Empty dataset x model rating grids; NaN marks "no rating recorded".
# Rows are dataset ids, columns are model ids (all models from the metadata
# table for training, only the models seen in the test ratings for testing).
# dtype=float keeps the grids numeric instead of the default object dtype, so
# later NaN checks, fillna() and arithmetic run on real float columns.
# NOTE(review): assumes the rating values are numeric — confirm against the CSVs.
data_model_train_matrix = pd.DataFrame(index=datasets_train, columns=models, dtype=float)
data_model_test_matrix = pd.DataFrame(index=datasets_test, columns=model_test, dtype=float)

In [175]:
# Copy every training rating into its (dataset, model) cell.
# itertuples() yields (index, first column, second column, third column, ...);
# positions 1-3 are used as dataset id, model id and rating.
# NOTE(review): confirm this matches the column order of rate_train.csv.
for row in ratings_train.itertuples():
    # Models missing from the metadata table have no column in the grid. The
    # previous chained write never reached the frame for them, so they are
    # skipped here as well instead of letting .at enlarge the frame.
    if row[2] in data_model_train_matrix.columns:
        # .at writes directly into the frame. The chained form
        # `.loc[r][c] = v` assigns to an intermediate Series and is silently
        # lost when pandas Copy-on-Write is active.
        data_model_train_matrix.at[row[1], row[2]] = row[3]


In [176]:
# Copy every test rating into its (dataset, model) cell.
# Both labels come from ratings_test itself, so the target cell always exists.
# NOTE(review): positions 1-3 are read as dataset id, model id, rating —
# confirm against the column order of rate_test.csv.
for row in ratings_test.itertuples():
    # .at writes directly into the frame; the chained `.loc[r][c] = v` form
    # assigns to an intermediate Series and is lost under Copy-on-Write.
    data_model_test_matrix.at[row[1], row[2]] = row[3]

# Unrated pairs become 0. A later cell treats `== 0` as "no ground truth".
# NOTE(review): a genuine rating of 0 would be indistinguishable from a
# missing one after this step.
data_model_test_matrix = data_model_test_matrix.fillna(0)

Dataset Similarity Matrix

In [177]:
# Load the dataset meta-feature table and remember its distinct dataset ids.
meta_datasets = pd.read_csv("./Data/dataset_v.csv", low_memory=False)
datasets = meta_datasets["dataset_id"].unique()

# Keep only the sixteen feature columns v1 ... v16.
meta_datasets = meta_datasets.loc[:, [f"v{k}" for k in range(1, 17)]]

# Standardise the features with StandardScaler.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(meta_datasets)

# Wrap the standardised values in a DataFrame, keeping the original index and column labels.
scaled_df = pd.DataFrame(
    scaled_data,
    index=meta_datasets.index,
    columns=meta_datasets.columns,
)

# Pairwise cosine similarity between the standardised rows, labelled by dataset id.
# NOTE(review): labelling with `datasets` assumes one metadata row per dataset id.
meta_dataset_similarity = pd.DataFrame(
    cosine_similarity(scaled_df.values.tolist()),
    index=datasets,
    columns=datasets,
)

KNN sim

In [178]:
# Display the meta-feature frame (already reduced to the columns v1..v16).
meta_datasets

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16
0,0.025034,0.003197,0.013935,-0.013116,0.012825,-0.020438,-0.014044,0.003616,0.006384,0.003538,-0.023130,0.006932,0.004364,-0.004096,-0.003078,0.004269
1,0.037312,0.012165,0.021228,-0.026170,0.013586,-0.025984,-0.016653,-0.002769,0.013702,0.008140,-0.036311,0.008131,0.014083,-0.005162,0.003845,0.009943
2,0.017211,0.004628,0.007155,-0.011390,0.008059,-0.013901,-0.005064,-0.003218,-0.001625,0.000959,-0.010429,-0.004005,0.008344,-0.008625,0.002013,0.000405
3,0.025696,0.008171,0.015039,-0.025142,0.009067,-0.032023,-0.015498,-0.004472,0.005749,0.008614,-0.029290,0.012309,0.011675,-0.003869,-0.002635,-0.003028
4,0.027573,0.010065,0.014708,-0.018399,0.013069,-0.028164,-0.011714,0.002462,0.007283,0.002274,-0.026866,0.006382,0.007245,-0.003877,-0.002586,0.005021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,0.019118,0.006924,0.013132,-0.017057,0.008796,-0.021800,-0.014749,0.006227,0.003878,0.005254,-0.030610,0.001404,0.006432,-0.007262,-0.002656,0.002973
62,0.037312,0.010336,0.014128,-0.018743,0.015076,-0.037564,-0.015761,0.014260,0.008174,0.004831,-0.034659,0.005804,0.005876,-0.009112,0.001246,0.013720
63,0.026009,0.010832,0.017085,-0.015846,0.015780,-0.027117,-0.016596,0.003597,0.005324,0.002113,-0.026415,0.007599,0.009518,-0.007011,-0.004881,0.006763
64,0.025716,0.011888,0.016567,-0.018135,0.021027,-0.036057,-0.018491,0.002428,0.002532,0.000715,-0.029196,0.006693,0.005483,-0.007920,-0.005947,0.005251


In [179]:
def cosine_similarity_func(ratings, user1, user2):
    """Cosine similarity of two rows of `ratings` over their co-rated columns.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Rating matrix; NaN marks a missing rating.
    user1, user2 : hashable
        Row labels of the two rows to compare.

    Returns
    -------
    float or int
        Cosine of the angle between the two rating vectors restricted to the
        columns both rows have rated. Returns 0 when the rows share no rated
        column, or when either restricted vector has zero norm (the cosine is
        undefined there and would otherwise come out as NaN).
    """
    # Keep only the items each row has actually rated.
    u1_ratings = ratings.loc[user1].dropna()
    u2_ratings = ratings.loc[user2].dropna()

    common_items = np.intersect1d(u1_ratings.index, u2_ratings.index).tolist()
    if len(common_items) == 0:
        return 0

    # Explicit float arrays: rows taken from an object-dtype frame would
    # otherwise reach numpy as object arrays.
    u1_common_ratings = u1_ratings.loc[common_items].to_numpy(dtype=float)
    u2_common_ratings = u2_ratings.loc[common_items].to_numpy(dtype=float)

    denominator = np.linalg.norm(u1_common_ratings) * np.linalg.norm(u2_common_ratings)
    if denominator == 0:
        # All-zero ratings on the shared items: avoid 0/0.
        return 0

    return np.dot(u1_common_ratings, u2_common_ratings) / denominator

In [180]:
# Empty train-dataset x train-dataset similarity grid, filled by a later cell.
# dtype=float keeps the similarities numeric instead of the default object
# dtype, so sorting and arithmetic on the rows stay on float data.
dataset_similarity = pd.DataFrame(index=datasets_train, columns=datasets_train, dtype=float)

In [181]:
def create_bipartite_adjacency_matrix(rating_matrix):
    """Build the square adjacency matrix of the row/column bipartite graph.

    For a rating matrix R with shape (n_users, n_items) the result has shape
    (n_users + n_items, n_users + n_items): R sits in the upper-right block,
    R.T in the lower-left block, and both diagonal blocks are zero.
    """
    user_count, item_count = rating_matrix.shape
    total = user_count + item_count

    adjacency = np.zeros((total, total))
    user_part = slice(0, user_count)
    item_part = slice(user_count, total)

    # Rows -> columns edges and their mirror image.
    adjacency[user_part, item_part] = rating_matrix
    adjacency[item_part, user_part] = rating_matrix.T
    return adjacency

In [182]:
def propagation_matrix(adjacency, lambda_):
    """Return inv(I - lambda_ * adjacency), treating NaN entries as 0.

    Returns None (after printing a message) when numpy reports that the
    matrix cannot be inverted.
    """
    size = adjacency.shape[0]
    # NaN entries are replaced by 0 before the inversion.
    cleaned = np.nan_to_num(adjacency)
    system = np.eye(size) - lambda_ * cleaned

    try:
        return np.linalg.inv(system)
    except np.linalg.LinAlgError:
        print("矩阵不可逆，无法计算传播矩阵")
        return None

In [183]:
def propagation_matrix_withWalkLength(adjacency_matrix, max_walk_length):
    """Sum the adjacency powers A^0 + A^1 + ... + A^max_walk_length.

    NaN entries of `adjacency_matrix` are treated as 0. With
    max_walk_length == 0 the result is the identity matrix.
    """
    adjacency = np.nan_to_num(adjacency_matrix)
    size = adjacency.shape[0]

    current_power = np.eye(size)  # A^0
    accumulated = np.eye(size)    # running sum, starts at A^0

    for _step in range(max_walk_length):
        current_power = current_power @ adjacency
        np.add(accumulated, current_power, out=accumulated)

    return accumulated

In [184]:
# Wall-clock start marker for the training-phase timing reported further down.
start_time_train = time.time()

In [185]:
# Compute the bipartite-graph adjacency matrix from the training rating grid
# (datasets on one side, models on the other).
bipartite_adjacency_matrix = create_bipartite_adjacency_matrix(data_model_train_matrix)

In [186]:
# Sum of adjacency powers up to walk length 6 (the second argument is max_walk_length).
# NOTE(review): the exported file name contains "@6" — presumably this value; keep them in sync.
propagation_maxLength = propagation_matrix_withWalkLength(bipartite_adjacency_matrix, 6)

In [187]:
# Extract the user-item and item-user propagation blocks.
# "Users" are the rows of the training rating grid (datasets) and "items" its columns (models).
n_users = data_model_train_matrix.shape[0]
# Upper-right block: first n_users rows, remaining columns.
user_item_propagation = propagation_maxLength[:n_users, n_users:]
# Lower-left block: remaining rows, first n_users columns.
item_user_propagation = propagation_maxLength[n_users:, :n_users]

In [188]:
# Compute the Random Walk Kernel: the product of the user-item block with the
# item-user block, giving one row and one column per "user" (training dataset).
random_walk_kernel = user_item_propagation @ item_user_propagation

In [189]:
def normalize_kernel(kernel_matrix):
    """Scale a kernel matrix so that every non-zero diagonal entry becomes 1.

    Each entry K[i, j] is divided by sqrt(K[i, i] * K[j, j]).

    Parameters
    ----------
    kernel_matrix : array-like, shape (n, n)
        Square kernel matrix.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        The normalised kernel. Entries whose scale sqrt(K[i, i] * K[j, j]) is
        not strictly positive (e.g. a row with a zero diagonal) are set to 0
        instead of producing NaN/inf from a division by zero.
    """
    kernel = np.asarray(kernel_matrix, dtype=float)
    diagonal_elements = np.diag(kernel)
    scale = np.sqrt(np.outer(diagonal_elements, diagonal_elements))

    # Divide only where the scale is usable; everything else keeps the 0
    # that `normalized_kernel_matrix` was initialised with.
    normalized_kernel_matrix = np.zeros_like(kernel)
    np.divide(kernel, scale, out=normalized_kernel_matrix, where=scale > 0)
    return normalized_kernel_matrix

In [190]:
# Normalise the random-walk kernel by its diagonal (see normalize_kernel).
normalized_kernel = normalize_kernel(random_walk_kernel)

In [191]:
# Label rows and columns with the training dataset ids so the kernel can be looked up by id.
normalized_kernel = pd.DataFrame(normalized_kernel,index=datasets_train,columns=datasets_train)

In [192]:
# Weight of the rating-graph kernel in the blend; (1 - lambda_) goes to the
# meta-feature similarity.
lambda_ = 0.5

# Align both similarity sources on the training datasets (rows and columns).
kernel_part = normalized_kernel.loc[datasets_train, datasets_train].astype(float)
meta_part = meta_dataset_similarity.loc[datasets_train, datasets_train].astype(float)

# A pair receives the blended score only when BOTH sources are non-zero;
# every other pair is set to 0. Computed for the whole matrix at once instead
# of cell-by-cell chained assignment (`.loc[i][j] = v`), which is slow and is
# silently lost under pandas Copy-on-Write.
# NOTE(review): where the kernel is 0 the meta-feature similarity is discarded
# as well. The displayed result below looks like an identity matrix, which
# suggests this is happening for every off-diagonal pair — confirm that the
# `and` rule is intended.
both_nonzero = (kernel_part != 0) & (meta_part != 0)
dataset_similarity = (lambda_ * kernel_part + (1 - lambda_) * meta_part).where(both_nonzero, 0.0)


In [193]:
# Display the blended dataset-to-dataset similarity matrix.
dataset_similarity

Unnamed: 0,0,2,4,5,6,7,8,9,10,11,...,55,56,57,58,59,60,61,62,64,65
0,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.0,0,0,0,0
61,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.0,0,0,0
62,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,0,0
64,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,0


Prediction

In [194]:
def predict_ratings(rating_matrix, user_similarity_matrix, k=5):
    """Fill the missing cells of a rating matrix with a K-nearest-neighbour estimate.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        Rating matrix; NaN marks an unrated cell. It is not modified.
    user_similarity_matrix : pandas.DataFrame
        Row-to-row similarity matrix, indexed by the row labels of
        `rating_matrix` on both axes.
    k : int, default 5
        Number of most similar rows used as neighbours.

    Returns
    -------
    pandas.DataFrame
        Copy of `rating_matrix` in which every missing cell holds the
        similarity-weighted average of the ratings its neighbours gave to
        that column. When no neighbour rated the column (or the weights sum
        to 0), the row's own mean rating is used instead.
    """
    # Start from a copy so that known ratings are carried over unchanged.
    prediction_matrix = rating_matrix.copy()

    for i in rating_matrix.index:
        user_row = rating_matrix.loc[i]
        missing_items = user_row.index[user_row.isna().to_numpy()]
        if len(missing_items) == 0:
            continue

        # The neighbours depend only on the row, so rank them once per row.
        # The row itself is removed by label: relying on it being sorted
        # first breaks when another row ties with the self-similarity.
        similarity_values = (
            user_similarity_matrix.loc[i]
            .drop(labels=i, errors="ignore")
            .dropna()
            .sort_values(ascending=False)
            .head(k)
        )

        # Fallback when no neighbour has rated an item: the row's mean rating.
        fallback_rating = user_row.mean()

        for j in missing_items:
            # Similarity-weighted average over the neighbours that rated j.
            weighted_sum = 0
            similarity_sum = 0
            for index, value in similarity_values.items():
                user_rating = rating_matrix.at[index, j]
                if not pd.isna(user_rating):
                    weighted_sum += value * user_rating
                    similarity_sum += value

            # .at writes straight into the frame; chained `.loc[i][j] = v`
            # assigns to a temporary Series and is lost under Copy-on-Write.
            if similarity_sum != 0:
                prediction_matrix.at[i, j] = weighted_sum / similarity_sum
            else:
                prediction_matrix.at[i, j] = fallback_rating

    return prediction_matrix

In [195]:
# Fill the missing training ratings with the KNN estimate (default k=5 neighbours).
model_prediction_train = predict_ratings(data_model_train_matrix,dataset_similarity)
# Re-wrap with the training dataset ids as rows and all model ids as columns, then sort both axes by label.
model_prediction_train = pd.DataFrame(model_prediction_train,index=datasets_train,columns=models).sort_index().sort_index(axis=1)

In [196]:
# Display the completed training prediction matrix.
# NOTE(review): every visible row in the rendered output is constant, which looks like the
# row-mean fallback of predict_ratings firing for every missing cell — confirm.
model_prediction_train

Unnamed: 0,66,67,68,69,70,71,72,73,74,75,...,988,989,990,991,992,993,994,995,996,997
0,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,...,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385,0.9385
2,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,...,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075,0.8075
4,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,...,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278,0.767278
5,0.48,0.48,0.48,0.48,0.48,0.48,0.48,0.48,0.48,0.48,...,0.48,0.48,0.48,0.48,0.48,0.48,0.48,0.48,0.48,0.48
6,0.818,0.818,0.818,0.818,0.818,0.818,0.818,0.818,0.818,0.818,...,0.818,0.818,0.818,0.818,0.818,0.818,0.818,0.818,0.818,0.818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.74,0.74,0.74,0.74,0.74,0.74,0.74,0.74,0.74,0.74,...,0.74,0.74,0.74,0.74,0.74,0.74,0.74,0.74,0.74,0.74
61,0.733,0.733,0.733,0.733,0.733,0.733,0.733,0.733,0.733,0.733,...,0.733,0.733,0.733,0.733,0.733,0.733,0.733,0.733,0.733,0.733
62,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,...,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75
64,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,...,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448,0.641448


In [197]:
# Empty test-dataset x test-model prediction grid, filled by a later cell.
# The default object dtype matters here: later cells store None in this frame and test
# cells with `is not None`, which would not work on a float frame (None would become NaN).
model_prediction_test = pd.DataFrame(index=datasets_test,columns=model_test)

In [198]:
# Elapsed wall-clock seconds since start_time_train was taken.
end_time_train = time.time()
Training_time = end_time_train - start_time_train
Training_time

25.815704584121704

In [211]:
def find_sim_index(index):
    """Return the column label with the largest value in row `index` of the
    module-level `meta_dataset_similarity` frame (the first such label on ties)."""
    similarities = meta_dataset_similarity.loc[index]
    is_peak = similarities == similarities.max()
    return similarities[is_peak].index[0]

In [234]:
def Find_Top_k(i, sim_matrix):
    """Return all column labels of `sim_matrix`, ordered by descending value in row `i`.

    Despite the name, the list is not truncated; callers slice it themselves.
    """
    ranked = sim_matrix.loc[i].sort_values(ascending=False)
    return ranked.index.values.tolist()

In [235]:
# Wall-clock start marker for the test-prediction timing.
start_time = time.time()

In [236]:
# Number of most similar training datasets averaged for each test dataset.
# NOTE(review): the exported file name contains "@15" — presumably this value; keep them in sync.
n_neighbours = 15

for dataset in datasets_test:
    # The ranking and neighbour selection depend only on the dataset, so they
    # are done once per dataset instead of once per (dataset, model) pair.
    ranked_datasets = Find_Top_k(dataset, meta_dataset_similarity)

    # Keep only datasets that have training predictions, excluding the test
    # dataset itself by label (dropping the first ranked entry assumed it
    # always sorts first, which a tie can break).
    valid_indices = [
        idx for idx in ranked_datasets
        if idx != dataset and idx in model_prediction_train.index
    ][:n_neighbours]
    neighbour_predictions = model_prediction_train.loc[valid_indices]

    for model in model_test:
        # Prediction = mean of the neighbours' training predictions for this model.
        # .at writes straight into the frame; chained `.loc[d][m] = v` is
        # lost under pandas Copy-on-Write.
        model_prediction_test.at[dataset, model] = neighbour_predictions[model].mean()

In [237]:
# Leftover loop variable: shows the neighbour list of the LAST test dataset processed only.
valid_indices

[64, 54, 51, 37, 39, 9, 4, 32, 40, 43, 62, 26, 13, 65, 44]

In [238]:
# Wall-clock end marker for the test-prediction timing.
end_time = time.time()

In [239]:
# Elapsed wall-clock seconds between the two test-phase markers.
end_time - start_time

0.12262892723083496

In [240]:
# Blank out predictions for (dataset, model) pairs without a ground-truth
# rating. Missing test ratings were filled with 0 earlier, so `== 0` is the
# "no ground truth" marker here.
for i in datasets_test:
    for j in model_test:
        if data_model_test_matrix.at[i, j] == 0:
            # Store None directly in the frame; the result-collection cell
            # filters with `is not None`. The chained form
            # `.loc[i][j] = None` assigns to a temporary Series and is lost
            # under pandas Copy-on-Write.
            model_prediction_test.at[i, j] = None

In [241]:
# Empty result table with a fixed column order. A list is used instead of a
# set: a set has no defined order (the exported CSV columns would come out in
# arbitrary order) and newer pandas versions reject sets as column labels.
result = pd.DataFrame(
    columns=["dataset", "model", "according_accuracy", "groundtruth_according_accuracy"]
)

In [242]:
# Collect one record per (dataset, model) pair that still has a prediction
# (pairs without ground truth were set to None beforehand).
records = []
for i in datasets_test:
    for j in model_test:
        according_accuracy = model_prediction_test.at[i, j]
        if according_accuracy is not None:
            records.append({
                'dataset': i,
                'model': j,
                'according_accuracy': according_accuracy,
                'groundtruth_according_accuracy': data_model_test_matrix.at[i, j],
            })

# Build the frame once from the collected records, keeping the column layout
# of the pre-created `result` frame. DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every call.
result = pd.DataFrame(records, columns=list(result.columns))

In [243]:
# Display predicted vs. ground-truth accuracy for the evaluated (dataset, model) pairs.
result

Unnamed: 0,dataset,according_accuracy,groundtruth_according_accuracy,model
0,1,0.834907,0.949,596
1,1,0.834907,0.947,708
2,1,0.834907,0.946,795
3,1,0.834907,0.925,597
4,1,0.846105,0.914,950
5,1,0.834907,0.905,841
6,1,0.834907,0.901,641
7,1,0.83781,0.883,868
8,1,0.83349,0.851,772
9,1,0.845078,0.508,527


In [244]:
# Write the result table to CSV without the row index.
# NOTE(review): the target directory is relative to the working directory and is not
# created here — presumably it must already exist; the "@6@15" suffix looks like the
# walk length and neighbour count used above, confirm before changing either.
result.to_csv("../Huggingface/Output/Dataset_RandomWalk/Full_Dataset_RandomWalk@6@15.csv",index=False)