In [19]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

Data Loading

In [20]:
ratings_train = pd.read_csv("./Data/ratings_train.csv", low_memory=False)
ratings_test = pd.read_csv("./Data/ratings_test.csv",low_memory=False)

In [21]:
datasets_train = ratings_train.dataset_id.unique()
model_train = ratings_train.model_id.unique()
datasets_test = ratings_test.dataset_id.unique()
model_test = ratings_test.model_id.unique()

Embedded Similarity Matrix

In [22]:
meta_datasets = pd.read_csv("./Data/datasets_v.csv",low_memory=False)
datasets = meta_datasets.data_id.unique()
meta_datasets = meta_datasets.loc[:,("v1","v2","v3","v4","v5","v6","v7","v8")]
meta_dataset_similarity = cosine_similarity(meta_datasets)
meta_dataset_similarity = pd.DataFrame(meta_dataset_similarity,index=datasets,columns=datasets)

In [23]:
meta_models= pd.read_csv("./Data/models_v.csv",low_memory=False)
models = meta_models.model_id.unique()
meta_models = meta_models.loc[:,("training_data","base_model","gpu_type","para_num","size(MB)","depth","flops")]
meta_model_similarity = cosine_similarity(meta_models)
meta_model_similarity = pd.DataFrame(meta_model_similarity,index=models,columns=models)

Rating Matrix

In [24]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=model_train)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [25]:
for row in ratings_train.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]

In [26]:
for row in ratings_test.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]
data_model_test_matrix = data_model_test_matrix.fillna(0)

Dataset Similarity Matrix

In [27]:
dataset_similarity = pd.DataFrame(cosine_similarity(data_model_train_matrix.fillna(0)),index=data_model_train_matrix.index,columns=data_model_train_matrix.index)

Prediction

In [28]:
def predict_specific_user_item_rating(ratings, similarity, user_id, item_id):
    # 求指定用户的评分平均值
    mean_rating = ratings.replace(0,np.nan).mean(axis=1)
    # 计算所有用户对特定商品的评分偏差
    item_ratings_diff = (ratings.loc[:, item_id] - mean_rating).fillna(0)

    # 利用用户相似度和评分差异计算预测的评分差异
    user_similarity = similarity.loc[user_id, :]
    # 计算预测的评分偏差，该操作实际上是一个加权平均，权重是用户之间的相似度
    pred_diff = user_similarity.dot(item_ratings_diff) / np.abs(user_similarity).sum()

    # 将预测的评分差异加上用户的平均评分，得到预测评分
    pred_rating = mean_rating.loc[user_id] + pred_diff
    # 返回用户对指定商品的预测评分
    return pred_rating

In [29]:
# for i in data_model_train_matrix.index:
#     for j in data_model_train_matrix.columns:
#         if pd.isna(data_model_train_matrix.loc[i][j]):
#             data_model_train_matrix.loc[i][j] = predict_specific_user_item_rating(data_model_train_matrix, dataset_similarity, i, j)

In [30]:
# data_model_train_matrix.to_csv("Experiment2_result_ratingOnly.csv")

In [31]:
data_model_train_matrix = pd.read_csv("Experiment2_result_ratingOnly.csv",low_memory=False,index_col=0)

In [33]:
data_model_train_matrix

Unnamed: 0,72,73,74,75,76,77,78,79,80,81,...,1612,1613,1614,1615,1616,1617,1618,1619,1620,1621
0,0.940536,0.958961,0.958403,0.999721,0.999442,0.997487,0.999442,0.999721,0.998883,0.999442,...,0.472778,0.472778,0.472778,0.472778,0.472778,0.472778,0.472778,0.472778,0.472778,0.472778
2,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,...,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397,0.253397
3,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,...,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417,0.251417
4,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,...,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325,0.502325
5,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,...,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131,0.260131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,...,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509,0.201509
68,0.376876,0.382171,0.356134,0.412180,0.347749,0.379523,0.339806,0.311121,0.337158,0.314651,...,0.346408,0.346408,0.346408,0.346408,0.346408,0.346408,0.346408,0.346408,0.346408,0.346408
69,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,...,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440,0.522440
70,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,...,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281,0.256281


Interaction Similarity Matrix

In [34]:
def Interaction_similarity(user_id_1, model_id_1, user_id_2, model_id_2, alpha,gamma):
    # 假设以下两个函数可以返回用户之间和商品之间的相似度
    rating_1 = data_model_train_matrix.loc[user_id_1][model_id_1]
    rating_2 = data_model_train_matrix.loc[user_id_2][model_id_2]
    user_similarity = meta_dataset_similarity.loc[user_id_1][user_id_2]
    model_similarity = meta_model_similarity.loc[model_id_1][model_id_2]

    # 计算混合相似度
    interaction_similarity = (alpha * user_similarity + (1-alpha) * model_similarity) * np.exp(-gamma * np.abs(rating_1 - rating_2) ** 2)
    print(interaction_similarity)
    return interaction_similarity

In [35]:
def Find_Top_k(i,sim_matrix):
    row = sim_matrix.loc[i]
    row = row.sort_values(ascending=False)
    index_row = row.index.values.tolist()
    return index_row

In [37]:
top_b1 = 1
top_b2 = 2
# 计算所有用户商品组合的相似度
inter_similarity_results = []
for i in data_model_test_matrix.index:
    most_similar_datasets = Find_Top_k(i,meta_dataset_similarity)[1:1+top_b1]
    for user_id_1 in most_similar_datasets:
        if user_id_1 not in data_model_train_matrix.index:
            continue
        for item_id_1 in data_model_test_matrix.columns:
            if item_id_1 not in data_model_train_matrix.columns:
                continue
            for user_id_2 in most_similar_datasets:
                if user_id_2 not in data_model_train_matrix.index:
                    continue
                for item_id_2 in data_model_test_matrix.columns:
                    if item_id_2 not in data_model_train_matrix.columns:
                        continue
                    similarity = Interaction_similarity(user_id_1, item_id_1,user_id_2, item_id_2, alpha=0.5,gamma=0.01)
                    inter_similarity_results.append((user_id_1, item_id_1, user_id_2, item_id_2, similarity))

In [20]:
inter_similarity_results

[]