In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances

Data Loading

In [2]:
ratings_train = pd.read_csv("./Data/rate_train.csv", low_memory=False)
ratings_test = pd.read_csv("./Data/Ground_truth/groundtruth_0.3.csv",low_memory=False)

In [3]:
datasets_train = ratings_train.Node_Id.unique()
model_train = ratings_train.Model_Id.unique()
datasets_test = ratings_test.dataset.unique()
model_test = ratings_test.model.unique()
meta_models = pd.read_csv("./Data/model_v.csv",low_memory=False)
models = meta_models.model_id.unique()

Rating Matrix

In [4]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=models)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [5]:
for row in ratings_train.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]


In [6]:
for row in ratings_test.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]
data_model_test_matrix = data_model_test_matrix.fillna(0)

Dataset Similarity Matrix

In [7]:
meta_datasets = pd.read_csv("./Data/dataset_v.csv",low_memory=False)
datasets = meta_datasets.dataset_id.unique()
meta_datasets = meta_datasets.loc[:,("v1","v2","v3","v4","v5")]
scaler = StandardScaler()

# 对dataframe的数据进行标准化
scaled_data = scaler.fit_transform(meta_datasets)
# 将标准化后的数据转换为dataframe，并保留原始索引
scaled_df = pd.DataFrame(scaled_data, index=meta_datasets.index, columns=meta_datasets.columns)
meta_dataset_similarity = cosine_similarity(scaled_df.values.tolist())
meta_dataset_similarity = pd.DataFrame(meta_dataset_similarity,index=datasets,columns=datasets)

KNN sim

In [8]:
def cosine_similarity_func(ratings, user1, user2):
    # 找到两个用户共同评分的物品，并将这些评分放入一个向量中
    u1_ratings = ratings.loc[user1].dropna()
    u2_ratings = ratings.loc[user2].dropna()

    common_items = np.intersect1d(u1_ratings.index, u2_ratings.index).tolist()
    u1_common_ratings = u1_ratings.loc[common_items]
    u2_common_ratings = u2_ratings.loc[common_items]

    # 计算两个向量之间的余弦相似度
    if len(common_items) == 0:
        return 0
    else:
        cos_sim = np.dot(u1_common_ratings, u2_common_ratings) / (np.linalg.norm(u1_common_ratings) * np.linalg.norm(u2_common_ratings))
        return cos_sim

In [9]:
dataset_similarity = pd.DataFrame(index=datasets_train,columns=datasets_train)

In [10]:
def create_bipartite_adjacency_matrix(rating_matrix):
    n_users, n_items = rating_matrix.shape
    adjacency_matrix = np.zeros((n_users + n_items, n_users + n_items))
    adjacency_matrix[:n_users, n_users:] = rating_matrix
    adjacency_matrix[n_users:, :n_users] = rating_matrix.T
    return adjacency_matrix

In [11]:
def propagation_matrix(adjacency, lambda_):
    n = adjacency.shape[0]
    I = np.eye(n)
    # 将 NaN 视为 0
    adjacency = np.nan_to_num(adjacency)
    try:
        P = np.linalg.inv(I - lambda_ * adjacency)
    except np.linalg.LinAlgError:
        print("矩阵不可逆，无法计算传播矩阵")
        return None
    return P

In [12]:
def propagation_matrix_withWalkLength(adjacency_matrix, max_walk_length):
    adjacency_matrix = np.nan_to_num(adjacency_matrix)
    propagation_matrix = np.eye(adjacency_matrix.shape[0])
    sum_matrix = np.eye(adjacency_matrix.shape[0])

    for _ in range(max_walk_length):
        propagation_matrix = propagation_matrix @ adjacency_matrix
        sum_matrix += propagation_matrix

    return sum_matrix

In [13]:
start_time_train = time.time()

In [14]:
# 计算二分图邻接矩阵
bipartite_adjacency_matrix = create_bipartite_adjacency_matrix(data_model_train_matrix)

In [15]:
propagation_maxLength = propagation_matrix_withWalkLength(bipartite_adjacency_matrix, 4)

In [16]:
# 提取用户-商品传播矩阵和商品-用户传播矩阵
n_users = data_model_train_matrix.shape[0]
user_item_propagation = propagation_maxLength[:n_users, n_users:]
item_user_propagation = propagation_maxLength[n_users:, :n_users]

In [17]:
# 计算 Random Walk Kernel
random_walk_kernel = np.dot(user_item_propagation, item_user_propagation)

In [18]:
def normalize_kernel(kernel_matrix):
    # 计算矩阵的最小值和最大值
    min_val = np.min(kernel_matrix)
    max_val = np.max(kernel_matrix)

    # 防止除数为零的情况
    if max_val == min_val:
        return np.zeros_like(kernel_matrix)

    # 将矩阵的值缩放到0和1之间
    normalized_kernel_matrix = (kernel_matrix - min_val) / (max_val - min_val)

    return normalized_kernel_matrix

In [19]:
normalized_kernel = normalize_kernel(random_walk_kernel)

In [20]:
normalized_kernel = pd.DataFrame(normalized_kernel,index=datasets_train,columns=datasets_train)

In [21]:
normalized_kernel

Unnamed: 0,0,29,14,28,16,17,13,12,10,38,...,275,274,279,278,280,281,288,286,287,285
0,0.000965,0.091849,0.086431,0.087606,0.090945,0.051502,0.068505,0.060708,0.081428,0.074573,...,0.034225,0.028086,0.019411,0.018828,0.026387,0.016304,0.001467,0.001592,0.001431,0.001310
29,0.091849,0.748755,0.709706,0.718234,0.742368,0.457248,0.580080,0.523737,0.673387,0.623937,...,0.331599,0.287484,0.224910,0.220795,0.275281,0.202552,0.095459,0.096363,0.095197,0.094327
14,0.086431,0.709706,0.673637,0.681788,0.704717,0.433241,0.549737,0.496288,0.638295,0.591340,...,0.313212,0.271575,0.212263,0.208463,0.260084,0.191132,0.089852,0.090715,0.089601,0.088778
28,0.087606,0.718234,0.681788,0.690040,0.713240,0.438515,0.556382,0.502303,0.645982,0.598475,...,0.317000,0.274892,0.214887,0.211049,0.263271,0.193514,0.091067,0.091941,0.090814,0.089981
16,0.090945,0.742368,0.704717,0.713240,0.737206,0.453425,0.575179,0.519317,0.667735,0.618660,...,0.327920,0.284419,0.222434,0.218469,0.272414,0.200355,0.094520,0.095423,0.094259,0.093399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.016304,0.202552,0.191132,0.193514,0.200355,0.119840,0.154695,0.138717,0.181179,0.167130,...,0.084877,0.072189,0.054378,0.053134,0.068663,0.047970,0.017331,0.017584,0.017259,0.017009
288,0.001467,0.095459,0.089852,0.091067,0.094520,0.053731,0.071316,0.063252,0.084680,0.077591,...,0.035866,0.029516,0.020544,0.019941,0.027759,0.017331,0.001986,0.002116,0.001949,0.001824
286,0.001592,0.096363,0.090715,0.091941,0.095423,0.054290,0.072021,0.063890,0.085496,0.078348,...,0.036269,0.029869,0.020823,0.020216,0.028098,0.017584,0.002116,0.002246,0.002078,0.001952
287,0.001431,0.095197,0.089601,0.090814,0.094259,0.053569,0.071113,0.063068,0.084445,0.077372,...,0.035751,0.029416,0.020465,0.019862,0.027662,0.017259,0.001949,0.002078,0.001913,0.001788


In [22]:
lambda_ = 0.5
for i in datasets_train:
    for j in datasets_train:
        if normalized_kernel.loc[i][j] != 0:
            dataset_similarity.loc[i][j] = normalized_kernel.loc[i][j]
        else:
            dataset_similarity.loc[i][j] = 0


In [23]:
dataset_similarity

Unnamed: 0,0,29,14,28,16,17,13,12,10,38,...,275,274,279,278,280,281,288,286,287,285
0,0.000965,0.091849,0.086431,0.087606,0.090945,0.051502,0.068505,0.060708,0.081428,0.074573,...,0.034225,0.028086,0.019411,0.018828,0.026387,0.016304,0.001467,0.001592,0.001431,0.00131
29,0.091849,0.748755,0.709706,0.718234,0.742368,0.457248,0.58008,0.523737,0.673387,0.623937,...,0.331599,0.287484,0.22491,0.220795,0.275281,0.202552,0.095459,0.096363,0.095197,0.094327
14,0.086431,0.709706,0.673637,0.681788,0.704717,0.433241,0.549737,0.496288,0.638295,0.59134,...,0.313212,0.271575,0.212263,0.208463,0.260084,0.191132,0.089852,0.090715,0.089601,0.088778
28,0.087606,0.718234,0.681788,0.69004,0.71324,0.438515,0.556382,0.502303,0.645982,0.598475,...,0.317,0.274892,0.214887,0.211049,0.263271,0.193514,0.091067,0.091941,0.090814,0.089981
16,0.090945,0.742368,0.704717,0.71324,0.737206,0.453425,0.575179,0.519317,0.667735,0.61866,...,0.32792,0.284419,0.222434,0.218469,0.272414,0.200355,0.09452,0.095423,0.094259,0.093399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.016304,0.202552,0.191132,0.193514,0.200355,0.11984,0.154695,0.138717,0.181179,0.16713,...,0.084877,0.072189,0.054378,0.053134,0.068663,0.04797,0.017331,0.017584,0.017259,0.017009
288,0.001467,0.095459,0.089852,0.091067,0.09452,0.053731,0.071316,0.063252,0.08468,0.077591,...,0.035866,0.029516,0.020544,0.019941,0.027759,0.017331,0.001986,0.002116,0.001949,0.001824
286,0.001592,0.096363,0.090715,0.091941,0.095423,0.05429,0.072021,0.06389,0.085496,0.078348,...,0.036269,0.029869,0.020823,0.020216,0.028098,0.017584,0.002116,0.002246,0.002078,0.001952
287,0.001431,0.095197,0.089601,0.090814,0.094259,0.053569,0.071113,0.063068,0.084445,0.077372,...,0.035751,0.029416,0.020465,0.019862,0.027662,0.017259,0.001949,0.002078,0.001913,0.001788


Prediction

In [24]:
def predict_ratings(rating_matrix, user_similarity_matrix, k=5):
    """
    输入：
    rating_matrix - 评分矩阵，DataFrame格式，其中NaN表示未评分
    user_similarity_matrix - 用户相似度矩阵，DataFrame格式
    k - 最近邻的数量，默认为5

    输出：
    prediction_matrix - 预测矩阵，DataFrame格式
    """

    # 初始化预测矩阵
    prediction_matrix = rating_matrix.copy()

    # 对于评分矩阵中的每个NaN值，使用K最近邻的方法预测评分
    for i in rating_matrix.index:
        for j in rating_matrix.columns:
            if np.isnan(rating_matrix.loc[i][j]):
                # 获取第i个用户的相似度值，并在相似度矩阵中找到K个最相似的用户
                similarity_values = user_similarity_matrix.loc[i].sort_values(ascending=False)[1:k+1]

                # 计算加权平均评分
                weighted_sum = 0
                similarity_sum = 0
                for index, value in similarity_values.items():
                    user_rating = rating_matrix.loc[index][j]
                    if not np.isnan(user_rating):
                        weighted_sum += value * user_rating
                        similarity_sum += value

                # 如果存在至少一个相似用户对该物品进行了评分，则计算预测评分
                if similarity_sum != 0:
                    prediction_matrix.loc[i][j] = weighted_sum / similarity_sum
                else:
                    # 如果没有相似用户评分，则使用当前用户的平均评分作为预测值
                    prediction_matrix.loc[i][j] = rating_matrix[j].mean()

    return prediction_matrix

In [25]:
def predict(rating_matrix, similarity_matrix):
    """
    根据评分矩阵和相似度矩阵预测评分。

    参数：
    rating_matrix (pd.DataFrame)：评分矩阵，包含NaN值
    similarity_matrix (numpy.array)：相似度矩阵

    返回：
    pd.DataFrame：预测评分矩阵
    """

    # 获取评分矩阵的均值（忽略NaN值）
    mean_rating = rating_matrix.mean(axis=1).values

    # 将评分矩阵中的NaN值替换为0
    rating_matrix_nan_to_zero = rating_matrix.fillna(0).values

    # 减去均值，得到归一化的评分矩阵
    normalized_rating_matrix = rating_matrix_nan_to_zero - mean_rating[:, np.newaxis]

    # 计算预测评分
    predicted_ratings = mean_rating[:, np.newaxis] + np.dot(similarity_matrix, normalized_rating_matrix) / np.abs(similarity_matrix).sum(axis=1)[:, np.newaxis]

    # 将预测评分数组转换为DataFrame
    predicted_ratings_df = pd.DataFrame(predicted_ratings, index=rating_matrix.index, columns=rating_matrix.columns)

    return predicted_ratings_df

In [26]:
model_prediction_train = predict(data_model_train_matrix,dataset_similarity)
model_prediction_train = pd.DataFrame(model_prediction_train,index=datasets_train,columns=models).sort_index().sort_index(axis=1)



In [27]:
model_prediction_train

Unnamed: 0,289,290,291,292,293,294,295,296,297,298,...,741,742,743,744,745,746,747,748,749,750
0,-0.13011,-0.237457,-0.048142,-0.225821,-0.056037,-0.225902,-0.057897,-0.226034,-0.057537,-0.226034,...,-0.531556,-0.531558,-0.531552,-0.531675,-0.531673,-0.531675,-0.531639,-0.531641,-0.531639,-0.531641
1,0.274531,0.167565,0.353542,0.178953,0.345768,0.178853,0.343946,0.17865,0.344264,0.17865,...,-0.119012,-0.119031,-0.119015,-0.119218,-0.1192,-0.119218,-0.119213,-0.119225,-0.119213,-0.119225
2,0.485845,0.378678,0.564846,0.390074,0.557062,0.389973,0.555237,0.389767,0.555554,0.389767,...,0.092371,0.092352,0.092368,0.092161,0.09218,0.092161,0.092165,0.092153,0.092165,0.092153
3,0.490984,0.383832,0.569976,0.395226,0.562193,0.395125,0.560369,0.39492,0.560685,0.39492,...,0.097532,0.097513,0.097529,0.097322,0.097341,0.097322,0.097326,0.097314,0.097326,0.097314
4,0.565081,0.457919,0.644032,0.469311,0.63625,0.46921,0.634426,0.469003,0.634742,0.469003,...,0.171741,0.171722,0.171738,0.17153,0.171549,0.17153,0.171534,0.171521,0.171534,0.171521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,-0.027985,-0.13455,0.051778,-0.123123,0.043999,-0.123217,0.042172,-0.123396,0.042502,-0.123396,...,-0.423613,-0.423626,-0.423613,-0.42379,-0.423777,-0.42379,-0.423774,-0.423782,-0.423774,-0.423782
285,-0.128121,-0.235442,-0.046255,-0.223815,-0.054145,-0.223897,-0.056004,-0.224031,-0.055646,-0.224031,...,-0.529295,-0.529298,-0.529292,-0.529417,-0.529414,-0.529417,-0.529382,-0.529384,-0.529382,-0.529384
286,-0.126907,-0.234229,-0.045111,-0.222607,-0.052999,-0.22269,-0.054857,-0.222825,-0.054499,-0.222825,...,-0.527895,-0.527898,-0.527891,-0.528018,-0.528015,-0.528018,-0.527984,-0.527986,-0.527984,-0.527986
287,-0.127202,-0.234509,-0.045374,-0.222885,-0.053262,-0.222967,-0.055121,-0.223102,-0.054763,-0.223102,...,-0.528278,-0.528281,-0.528274,-0.5284,-0.528398,-0.5284,-0.528366,-0.528368,-0.528366,-0.528368


In [28]:
model_prediction_test = pd.DataFrame(index=datasets_test,columns=model_test)

In [29]:
end_time_train = time.time()
Training_time = end_time_train - start_time_train
Training_time

8.288299560546875

In [30]:
def find_sim_index(index):
    row1 = meta_dataset_similarity.loc[index]
    row1_max_index = row1[row1 == row1.max()].index[0]
    return row1_max_index

In [64]:
def Find_Top_k(i,sim_matrix):
    row = sim_matrix.loc[i]
    row = row.sort_values(ascending=False)
    index_row = row.index
    index_row = index_row.values.tolist()
    return index_row

In [65]:
start_time = time.time()

In [66]:
for dataset in datasets_test:
    for model in model_test:
        dataset_sim_list = Find_Top_k(dataset,meta_dataset_similarity)[1:]
        # 仅保留存在于 model_prediction_train 的索引
        valid_indices = [idx for idx in dataset_sim_list if idx in model_prediction_train.index][:15]
        model_prediction_test.loc[dataset][model] = model_prediction_train.loc[valid_indices][model].mean()

In [67]:
valid_indices

[271, 270, 269, 281, 280, 266, 265, 264, 263, 262, 255, 256, 268, 276, 279]

In [68]:
end_time = time.time()

In [69]:
end_time - start_time

10.040815591812134

In [70]:
for i in datasets_test:
    for j in model_test:
        if data_model_test_matrix.loc[i][j] == 0:
            model_prediction_test.loc[i][j] = None

In [71]:
result = pd.DataFrame(columns={"dataset","model","f1_score","groundtruth_f1_score"})

In [72]:
for i in datasets_test:
    for j in model_test:
        if model_prediction_test.loc[i][j] is not None:
            f1_score = model_prediction_test.loc[i][j]
            groundtruth_f1_score = data_model_test_matrix.loc[i][j]
            result = result.append([{'dataset':i,'model':j,'f1_score':f1_score,'groundtruth_f1_score':groundtruth_f1_score}],ignore_index=True)

In [73]:
result

Unnamed: 0,dataset,groundtruth_f1_score,model,f1_score
0,93,0.300000,463,0.381069
1,93,0.300000,465,0.375416
2,93,0.909091,363,0.591702
3,93,0.303030,367,0.528832
4,93,0.303030,369,0.517794
...,...,...,...,...
8207,283,0.908689,307,-0.283689
8208,283,0.882246,317,0.433330
8209,283,0.885276,315,0.458599
8210,283,0.751981,303,-0.480643


In [74]:
result.to_csv("../Peak_Finding/Output/RandomWalk_only/Full_RandomWalk@4@15.csv",index=False)