In [161]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import datetime

Data Loading

In [162]:
# Training and test rating tables (later cells read their dataset_id / model_id columns).
# low_memory=False turns off chunked parsing so dtypes are inferred from the whole file.
ratings_train = pd.read_csv(
    "./Data/Delay_time_data/edge_train.csv",
    low_memory=False,
)
ratings_test = pd.read_csv(
    "./Data/rate_test.csv",
    low_memory=False,
)

In [163]:
# Distinct dataset / model identifiers occurring in each rating table.
datasets_train = ratings_train["dataset_id"].unique()
model_train = ratings_train["model_id"].unique()
datasets_test = ratings_test["dataset_id"].unique()
model_test = ratings_test["model_id"].unique()
# Model metadata table; its distinct model_id values become the rating-matrix columns.
meta_models = pd.read_csv("./Data/models_v.csv", low_memory=False)
models = meta_models["model_id"].unique()

Rating Matrix

In [164]:
# Empty (all-NaN) dataset x model grids: one row per dataset id, one column per known model id.
data_model_train_matrix = pd.DataFrame(columns=models, index=datasets_train)
data_model_test_matrix = pd.DataFrame(columns=models, index=datasets_test)

In [165]:
# Fill the training rating matrix from the long-format rating table.
# itertuples() yields (Index, col0, col1, col2, ...); positions 1-3 are taken to be
# dataset_id, model_id and the rating value -- TODO confirm the CSV column order.
for row in ratings_train.itertuples():
    dataset_id, model_id, rating = row[1], row[2], row[3]
    # The matrix only has columns for `models`; ratings of other models are skipped
    # instead of silently growing the frame.
    if model_id in data_model_train_matrix.columns:
        # Single-step .at write: chained `df.loc[r][c] = v` may assign into a temporary
        # copy and leave the frame untouched.
        data_model_train_matrix.at[dataset_id, model_id] = rating
# Unrated pairs become 0; infer_objects() converts the object-dtype grid to numeric columns.
data_model_train_matrix = data_model_train_matrix.fillna(0).infer_objects()

In [166]:
# Fill the test rating matrix from the long-format rating table.
# itertuples() yields (Index, col0, col1, col2, ...); positions 1-3 are taken to be
# dataset_id, model_id and the rating value -- TODO confirm the CSV column order.
for row in ratings_test.itertuples():
    dataset_id, model_id, rating = row[1], row[2], row[3]
    # The matrix only has columns for `models`; ratings of other models are skipped
    # instead of silently growing the frame.
    if model_id in data_model_test_matrix.columns:
        # Single-step .at write: chained `df.loc[r][c] = v` may assign into a temporary
        # copy and leave the frame untouched.
        data_model_test_matrix.at[dataset_id, model_id] = rating
# Pairs without a test rating become 0; infer_objects() restores numeric column dtypes.
data_model_test_matrix = data_model_test_matrix.fillna(0).infer_objects()

Dataset Similarity Matrix

In [167]:
# Dataset metadata: remember the ids for labelling, then keep only the feature columns v1..v16.
meta_datasets = pd.read_csv("./Data/dataset_v.csv", low_memory=False)
datasets = meta_datasets["dataset_id"].unique()
meta_datasets = meta_datasets.loc[:, [f"v{n}" for n in range(1, 17)]]

In [168]:
# Pairwise cosine similarity of the dataset feature vectors, labelled by dataset id on both axes.
meta_dataset_similarity = pd.DataFrame(
    cosine_similarity(meta_datasets.values.tolist()),
    index=datasets,
    columns=datasets,
)

In [169]:
# Reduce the model metadata to its feature columns v1..v16.
meta_models = meta_models.loc[:, [f"v{n}" for n in range(1, 17)]]

In [170]:
# Pairwise cosine similarity of the model feature vectors, labelled by model id on both axes.
meta_model_similarity = pd.DataFrame(
    cosine_similarity(meta_models.values.tolist()),
    index=models,
    columns=models,
)

In [171]:
# user_similarity = cosine_similarity(data_model_train_matrix)
# datasets_similarity = pd.DataFrame(user_similarity,index=datasets_train,columns=datasets_train)
def similarity(user1, user2):
    """Similarity of two datasets ("users") from the models both of them have rated.

    Reads the module-level ``data_model_train_matrix`` (a value of 0 is treated as
    "not rated") and ``meta_dataset_similarity``.

    Args:
        user1: Row label of the first dataset in ``data_model_train_matrix``.
        user2: Row label of the second dataset.

    Returns:
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance between the two rating
        vectors restricted to the models rated by both. When no model is rated by
        both, ``meta_dataset_similarity.loc[user1, user2]`` is returned instead.
    """
    ratings1 = data_model_train_matrix.loc[user1]
    ratings2 = data_model_train_matrix.loc[user2]
    # Models with a non-zero rating from both datasets.
    cols = data_model_train_matrix.columns[ratings1.values.nonzero()[0]]
    common_items = np.intersect1d(
        cols, data_model_train_matrix.columns[ratings2.values.nonzero()[0]]
    ).tolist()
    # No commonly rated model: fall back to the metadata-based similarity.
    if len(common_items) == 0:
        return meta_dataset_similarity.loc[user1, user2]
    # Sum of squared per-model rating differences. Summing the signed differences first
    # and squaring the total (the previous behaviour) lets opposite-sign differences
    # cancel, so clearly different rating vectors could score as identical.
    diff = (
        ratings1.loc[common_items].to_numpy(dtype=float)
        - ratings2.loc[common_items].to_numpy(dtype=float)
    )
    square_diff = np.sum(diff ** 2)
    # Map the distance into (0, 1]; identical ratings give 1.
    sim = 1 / (1 + np.sqrt(square_diff))
    return sim


# Pairwise similarity between all training datasets.
user_similarities = pd.DataFrame(index=datasets_train, columns=datasets_train)
for pos, i in enumerate(datasets_train):
    # Each value is mirrored to (j, i) anyway, so evaluate every unordered pair once
    # (later dataset first, matching the value the full double loop ended up storing).
    for j in datasets_train[:pos + 1]:
        pair_similarity = similarity(i, j)
        # Single-step .at writes: chained `df.loc[i][j] = v` may assign into a temporary copy.
        user_similarities.at[i, j] = pair_similarity
        user_similarities.at[j, i] = pair_similarity

Prediction

In [172]:
def predict(ratings, user_similarities, user_id, item_id, k=1):
    """Predict a rating as the similarity-weighted mean over the k nearest users.

    Args:
        ratings: DataFrame of ratings (rows: users, columns: items); 0 means "not rated".
        user_similarities: DataFrame of pairwise user similarities, labelled by user id.
        user_id: Row label of the user to predict for.
        item_id: Column label of the item to predict.
        k: Number of most similar other users to consult (default 1).

    Returns:
        The weighted average of the neighbours' non-zero ratings for ``item_id``, or 0
        when none of the k neighbours rated the item (or their weights sum to 0).
    """
    # Drop the target user by label instead of assuming it is ranked first: with tied
    # similarities a positional [1:] slice can discard a real neighbour and keep the
    # target itself.
    candidates = user_similarities.loc[user_id].drop(labels=user_id, errors="ignore")
    similar_users = candidates.sort_values(ascending=False).index.tolist()[:k]
    # Similarity-weighted average over the neighbours that actually rated the item.
    weighted_sum = 0
    sum_of_weights = 0
    for sim_user_id in similar_users:
        neighbour_rating = ratings.loc[sim_user_id, item_id]
        if neighbour_rating != 0:
            weight = user_similarities.loc[user_id, sim_user_id]
            weighted_sum += weight * neighbour_rating
            sum_of_weights += weight
    if sum_of_weights == 0:
        return 0
    return weighted_sum / sum_of_weights

In [173]:
# Empty (all-NaN) dataset x model frame that receives the training-set predictions.
model_prediction_train = pd.DataFrame(columns=models, index=datasets_train)

In [174]:
# Predict a score for every (dataset, model) pair that has no observed training rating.
for i in datasets_train:
    for j in models:
        if data_model_train_matrix.at[i, j] == 0:
            # Single-step .at write: chained `df.loc[i][j] = v` may assign into a temporary copy.
            model_prediction_train.at[i, j] = predict(data_model_train_matrix, user_similarities, i, j)

In [175]:
# train_prediction = predict(data_model_train_matrix,user_similarities)
# NOTE(review): this rebinds model_prediction_train to the observed training rating
# matrix (rows and columns sorted by label), so values written into the earlier
# model_prediction_train frame are no longer referenced -- confirm this is intended.
model_prediction_train = pd.DataFrame(data_model_train_matrix,index=datasets_train,columns=models).sort_index().sort_index(axis=1)
# Empty (all-NaN) dataset x model frame that later cells fill with test predictions.
model_prediction_test = pd.DataFrame(index=datasets_test,columns=models)

Metadata Based

In [176]:
def find_sim_index(index):
    """Return the label with the largest metadata similarity in row ``index``.

    Looks up row ``index`` of the module-level ``meta_dataset_similarity`` and returns
    the first label whose value equals the row maximum. The row is not filtered, so
    the entry for ``index`` itself takes part in the comparison.
    """
    similarities = meta_dataset_similarity.loc[index]
    peak = similarities.max()
    is_peak = similarities == peak
    return similarities[is_peak].index[0]

In [177]:
def Find_Top_k(i, sim_matrix):
    """Return all column labels of ``sim_matrix`` ordered by descending value in row ``i``.

    Despite the name, no truncation happens here; callers slice the returned list.
    """
    ranked = sim_matrix.loc[i].sort_values(ascending=False)
    return ranked.index.values.tolist()

In [178]:
# Wall-clock timestamp (seconds since the epoch) taken before the test-prediction loop.
start_time = time.time()

In [179]:
# For each test (dataset, model) pair, copy the training score of the most similar
# training dataset (by metadata similarity) whose score for that model exceeds 0.1.
for dataset in datasets_test:
    # The ranking depends only on the dataset, so build it once per dataset rather than
    # once per model. The dataset itself is excluded by label (not by assuming it is
    # ranked first), and only datasets present in the training set are kept.
    dataset_sim_list = [
        candidate
        for candidate in Find_Top_k(dataset, meta_dataset_similarity)
        if candidate != dataset and candidate in datasets_train
    ]
    for model in model_test:
        for sim_dataset in dataset_sim_list:
            train_score = model_prediction_train.at[sim_dataset, model]
            if train_score > 0.1:
                # Single-step .at write: chained `df.loc[r][c] = v` may assign into a
                # temporary copy and leave the frame untouched.
                model_prediction_test.at[dataset, model] = train_score
                break

In [180]:
# Wall-clock timestamp taken after the test-prediction loop.
end_time = time.time()

In [181]:
# Elapsed wall-clock seconds between the two timestamps (shown as the cell output).
end_time - start_time

0.1847519874572754

In [182]:
# Blank out (None) every prediction whose test ground truth is missing (stored as 0).
for i in datasets_test:
    for j in models:
        if data_model_test_matrix.at[i, j] == 0:
            # Single-step .at write: chained `df.loc[i][j] = None` may assign into a
            # temporary copy, leaving the prediction in place.
            model_prediction_test.at[i, j] = None

In [183]:
# Empty result table. The columns are given as a list (a set has no defined order),
# so the column order is the same on every run.
new_result = pd.DataFrame(
    columns=[
        "dataset_id",
        "model_id",
        "predict_according_accuracy",
        "groundtruth_according_accuracy",
    ]
)

In [184]:
# Collect one record per (dataset, model) pair whose prediction slot was not blanked to None.
# NaN (no prediction was written for the pair) is not None, so such pairs are kept.
result_records = []
for i in datasets_test:
    for j in models:
        predict_according_accuracy = model_prediction_test.at[i, j]
        if predict_according_accuracy is not None:
            groundtruth_according_accuracy = data_model_test_matrix.at[i, j]
            result_records.append({
                'dataset_id': i,
                'model_id': j,
                'predict_according_accuracy': predict_according_accuracy,
                'groundtruth_according_accuracy': groundtruth_according_accuracy,
            })
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call;
# build the new rows once and attach them in a single step.
if result_records:
    new_rows = pd.DataFrame(result_records, columns=new_result.columns)
    if new_result.empty:
        new_result = new_rows
    else:
        new_result = pd.concat([new_result, new_rows], ignore_index=True)

In [185]:
# Display the result table with missing values shown as 0. The filled frame is not
# assigned, so new_result itself still holds the NaN entries.
new_result.fillna(0)

Unnamed: 0,groundtruth_according_accuracy,model_id,dataset_id,predict_according_accuracy
0,0.501,318,1,0.938
1,0.508,527,1,0.9246
2,0.949,596,1,0.923
3,0.925,597,1,0.9303
4,0.901,641,1,0.86
5,0.947,708,1,0.9214
6,0.851,772,1,0.82
7,0.946,795,1,0.9319
8,0.905,841,1,0.9672
9,0.883,868,1,0.685


In [186]:
# new_result.to_csv("./Output/Dataset_only/r100.csv",index=False)