In [96]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import datetime

Data Loading

In [97]:
# Load the train and test rating tables from the local Data directory.
ratings_train, ratings_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("./Data/rate_train.csv", "./Data/rate_test.csv")
)

In [98]:
# Distinct dataset / model identifiers that occur in each rating split.
datasets_train = ratings_train["dataset_id"].unique()
model_train = ratings_train["model_id"].unique()
datasets_test = ratings_test["dataset_id"].unique()
model_test = ratings_test["model_id"].unique()

# Model metadata table; its distinct model ids are used as the column labels
# of the rating matrices built below.
meta_models = pd.read_csv("./Data/models_num.csv", low_memory=False)
models = meta_models["model_id"].unique()

Rating Matrix

In [99]:
# Empty (all-NaN) dataset x model grids, one per split. Both use the full model
# list from the metadata table as columns.
data_model_train_matrix, data_model_test_matrix = (
    pd.DataFrame(index=dataset_ids, columns=models)
    for dataset_ids in (datasets_train, datasets_test)
)

In [100]:
# Fill the training rating matrix from the long-format rating table.
# itertuples() yields (index, col1, col2, col3, ...); row[1]..row[3] are
# presumably (dataset_id, model_id, rating) -- confirm against the CSV header.
for row in ratings_train.itertuples():
    # Single-step label assignment. The chained form `.loc[r][c] = v` assigns
    # into a temporary Series and is not guaranteed to reach the frame.
    data_model_train_matrix.at[row[1], row[2]] = row[3]
# Pairs without a rating become 0, which later cells treat as "not rated".
# infer_objects() converts the object-dtype grid to numeric columns explicitly
# instead of relying on fillna's implicit downcasting.
data_model_train_matrix = data_model_train_matrix.fillna(0).infer_objects()

In [101]:
# Fill the test rating matrix from the long-format rating table.
# itertuples() yields (index, col1, col2, col3, ...); row[1]..row[3] are
# presumably (dataset_id, model_id, rating) -- confirm against the CSV header.
for row in ratings_test.itertuples():
    # Single-step label assignment. The chained form `.loc[r][c] = v` assigns
    # into a temporary Series and is not guaranteed to reach the frame.
    data_model_test_matrix.at[row[1], row[2]] = row[3]
# Pairs without a ground-truth rating become 0, which later cells treat as
# "no ground truth". infer_objects() converts the object-dtype grid to numeric
# columns explicitly instead of relying on fillna's implicit downcasting.
data_model_test_matrix = data_model_test_matrix.fillna(0).infer_objects()

Dataset Similarity Matrix

In [102]:
# Dataset meta-features: record the distinct dataset ids, then reduce the table
# to the eight feature columns v1..v8.
meta_datasets = pd.read_csv("./Data/datasets_v.csv", low_memory=False)
datasets = meta_datasets["data_id"].unique()
meta_datasets = meta_datasets.loc[
    :,
    ["v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"],
]

In [103]:
# Pairwise cosine similarity between the dataset meta-feature rows.
# Both axes are labelled with the dataset ids so that the label-based lookups
# performed on this frame later (by dataset id) resolve to the right row and
# column rather than to a position, matching how meta_model_similarity is
# labelled by model id.
# NOTE(review): this assumes one meta-feature row per data_id, in the same order
# as `datasets`; a duplicated data_id would make the lengths disagree and raise
# here -- confirm against datasets_v.csv.
meta_dataset_similarity = pd.DataFrame(
    cosine_similarity(meta_datasets.values),
    index=datasets,
    columns=datasets,
)

In [104]:
# Keep only the descriptor columns that feed the model-to-model similarity.
meta_models = meta_models.loc[
    :,
    ["training_data", "base_model", "gpu_type", "para_num", "size(MB)", "depth", "flops"],
]

In [105]:
# Model-to-model cosine similarity over the metadata columns, labelled by
# model id on both axes.
meta_model_similarity = pd.DataFrame(
    cosine_similarity(meta_models.values.tolist()),
    index=models,
    columns=models,
)

In [106]:
# user_similarity = cosine_similarity(data_model_train_matrix)
# datasets_similarity = pd.DataFrame(user_similarity,index=datasets_train,columns=datasets_train)
def similarity(user1, user2, ratings=None, meta_similarity=None):
    """Rating-based similarity between two datasets ("users").

    The score is ``1 / (1 + d)``, where ``d`` is the Euclidean distance between
    the two rating rows restricted to the models ("items") that both rows have
    rated. A stored value of 0 means "not rated".

    When the two rows share no rated model, the metadata-based similarity of
    the pair is returned instead (not 0).

    Parameters
    ----------
    user1, user2
        Row labels of ``ratings`` (and of ``meta_similarity`` for the fallback).
    ratings : pandas.DataFrame, optional
        Rating matrix, one row per dataset and one column per model. Defaults to
        the notebook-level ``data_model_train_matrix``.
    meta_similarity : pandas.DataFrame, optional
        Fallback similarity table looked up as ``[user1, user2]``. Defaults to
        the notebook-level ``meta_dataset_similarity``.

    Returns
    -------
    scalar
        The similarity score, or the fallback table entry.
    """
    if ratings is None:
        ratings = data_model_train_matrix
    first = ratings.loc[user1].to_numpy(dtype=float)
    second = ratings.loc[user2].to_numpy(dtype=float)

    # Models rated by both datasets.
    both_rated = (first != 0) & (second != 0)
    if not both_rated.any():
        # No overlap in rated models: fall back to metadata similarity.
        if meta_similarity is None:
            meta_similarity = meta_dataset_similarity
        return meta_similarity.loc[user1, user2]

    # Sum of squared per-model differences. The differences must be squared
    # individually; squaring their sum lets opposite-sign gaps cancel out.
    diff = first[both_rated] - second[both_rated]
    square_diff = np.sum(diff ** 2)
    return 1 / (1 + np.sqrt(square_diff))


# Dataset-to-dataset similarity table over the training datasets.
# Every value is written to both [i, j] and [j, i], so only pairs with j at or
# after i need to be computed.
user_similarities = pd.DataFrame(index=datasets_train, columns=datasets_train)
for position, i in enumerate(datasets_train):
    for j in datasets_train[position:]:
        pair_similarity = similarity(i, j)
        # Single-step label assignment; the chained form `.loc[i][j] = v` is
        # not guaranteed to write into the frame.
        user_similarities.at[i, j] = pair_similarity
        user_similarities.at[j, i] = pair_similarity

Prediction

In [107]:
# def predict(ratings, similarity):
#         mean_user_rating = ratings.mean(axis=1)
#         ratings_diff = ratings - np.array(mean_user_rating)[:,np.newaxis]
#         pred = np.array(mean_user_rating)[:,np.newaxis] + np.dot(similarity,ratings_diff) / np.array([np.abs(similarity).sum(axis = 1)]).T
#         return pred

def predict(ratings, user_similarities, user_id, item_id, k=10):
    """Predict the rating of ``item_id`` for ``user_id`` from its nearest neighbours.

    The ``k`` rows most similar to ``user_id`` (excluding ``user_id`` itself)
    are selected. Among those, the neighbours holding a non-zero rating for
    ``item_id`` contribute a similarity-weighted average.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Rating matrix; a value of 0 means "not rated".
    user_similarities : pandas.DataFrame
        Square similarity table labelled like the rows of ``ratings``.
    user_id
        Row label to predict for.
    item_id
        Column label of ``ratings`` to predict.
    k : int, optional
        Number of nearest neighbours considered (default 10).

    Returns
    -------
    scalar
        The weighted average, or 0 when no selected neighbour rated the item
        (or the contributing weights sum to 0).
    """
    # Remove the target row by label. Relying on it sorting first would discard
    # a genuine neighbour whenever another row ties with the self-similarity.
    neighbour_sims = user_similarities.loc[user_id].drop(labels=[user_id], errors="ignore")
    neighbour_sims = neighbour_sims.astype(float).sort_values(ascending=False).head(k)

    weights = neighbour_sims.to_numpy(dtype=float)
    neighbour_ratings = ratings.loc[neighbour_sims.index, item_id].to_numpy(dtype=float)

    # Only neighbours that actually rated the item take part in the average.
    rated = neighbour_ratings != 0
    sum_of_weights = weights[rated].sum()
    if sum_of_weights == 0:
        return 0
    weighted_sum = (weights[rated] * neighbour_ratings[rated]).sum()
    return weighted_sum / sum_of_weights

In [108]:
# Empty (all-NaN) grid that will receive the predicted scores for the training datasets.
model_prediction_train = pd.DataFrame(columns=models, index=datasets_train)

In [109]:
# Predict a score for every (dataset, model) pair with no training rating
# (stored as 0). Pairs that already have a rating are left untouched (NaN).
# NOTE(review): a later cell rebinds model_prediction_train from
# data_model_train_matrix, so the values written here are not what the
# metadata-based step reads -- confirm whether this pass is still needed.
for i in datasets_train:
    for j in models:
        if data_model_train_matrix.at[i, j] == 0:
            # Single-step label assignment; the chained form `.loc[i][j] = v`
            # is not guaranteed to write into the frame.
            model_prediction_train.at[i, j] = predict(data_model_train_matrix, user_similarities, i, j)

In [110]:
# train_prediction = predict(data_model_train_matrix,user_similarities)
# NOTE(review): this rebinds model_prediction_train to the training rating
# matrix itself (re-labelled and sorted on both axes). Anything an earlier cell
# stored under that name is discarded -- confirm this is intended.
model_prediction_train = (
    pd.DataFrame(data_model_train_matrix, index=datasets_train, columns=models)
    .sort_index(axis=0)
    .sort_index(axis=1)
)
# Empty (all-NaN) grid for the test-set predictions: test datasets x test models.
model_prediction_test = pd.DataFrame(columns=model_test, index=datasets_test)

Metadata Based

In [71]:
def find_sim_index(index):
    """Return the label holding the largest value in row ``index`` of ``meta_dataset_similarity``.

    When several entries tie for the maximum, the first one in column order is
    returned.

    NOTE(review): the row is searched in full, so if it contains the entry for
    ``index`` itself, that entry can be the one returned -- confirm intent.
    """
    similarities = meta_dataset_similarity.loc[index]
    is_max = (similarities == similarities.max()).to_numpy()
    return similarities.index[is_max][0]

In [72]:
def Find_Top_k(i,sim_matrix):
    """Return all column labels of row ``i`` of ``sim_matrix``, ordered by descending value.

    Despite the name, nothing is truncated here; callers slice the returned
    list themselves.
    """
    ranked = sim_matrix.loc[i].sort_values(ascending=False)
    return ranked.index.values.tolist()

In [73]:
# Wall-clock timestamp taken before the metadata-based prediction loop runs.
start_time = time.time()

In [74]:
# Metadata-based transfer: for each test dataset, walk the other datasets from
# most to least metadata-similar and copy, per model, the first positive score
# found in model_prediction_train.
train_dataset_ids = set(datasets_train.tolist())  # O(1) membership instead of scanning the array
for dataset in datasets_test:
    # The ranking depends only on the dataset, so build it once per dataset
    # rather than once per (dataset, model) pair. The dataset itself is removed
    # by label, and candidates without training scores are dropped up front.
    candidate_datasets = [
        sim_dataset
        for sim_dataset in Find_Top_k(dataset, meta_dataset_similarity)
        if sim_dataset != dataset and sim_dataset in train_dataset_ids
    ]
    # model_sim_list = Find_Top_k(model,meta_model_similarity)[:10]
    for model in model_test:
        for sim_dataset in candidate_datasets:
            score = model_prediction_train.at[sim_dataset, model]
            if score > 0:
                # Single-step label assignment; the chained form
                # `.loc[dataset][model] = v` is not guaranteed to write into the frame.
                model_prediction_test.at[dataset, model] = score
                break

KeyboardInterrupt: 

In [None]:
# Wall-clock timestamp taken when this cell runs, i.e. after the preceding cells have finished.
end_time = time.time()

In [None]:
# Elapsed seconds between the two timestamps (shown as the cell output).
end_time - start_time

In [None]:
# Blank out predictions for (dataset, model) pairs that have no ground-truth
# rating in the test matrix (stored as 0), so they are skipped when the result
# table is assembled.
for i in datasets_test:
    for j in model_test:
        if data_model_test_matrix.at[i, j] == 0:
            # Single-step label assignment; the chained form `.loc[i][j] = None`
            # is not guaranteed to write into the frame.
            model_prediction_test.at[i, j] = None

In [None]:
# Empty result table. The columns are given as a list (a set has no defined
# order) and use the same keys as the records appended to this table below, so
# no extra all-NaN columns appear.
new_result = pd.DataFrame(
    columns=["dataset", "model", "balanced_accuracy", "groundtruth_balanced_accuracy"]
)

In [None]:
# Collect one record per (test dataset, test model) pair whose prediction was
# not blanked out, then add them to new_result in a single concat. Growing the
# frame row by row is quadratic, and DataFrame.append no longer exists in
# pandas >= 2.0.
# NOTE(review): the `is not None` test lets NaN entries (pairs for which no
# prediction was ever written) through as rows -- confirm that is intended.
result_records = []
for i in datasets_test:
    for j in model_test:
        balanced_accuracy = model_prediction_test.at[i, j]
        if balanced_accuracy is not None:
            result_records.append({
                'dataset': i,
                'model': j,
                'balanced_accuracy': balanced_accuracy,
                'groundtruth_balanced_accuracy': data_model_test_matrix.at[i, j],
            })
new_result = pd.concat([new_result, pd.DataFrame(result_records)], ignore_index=True)

In [None]:
# Display the assembled result table as the cell output.
new_result

In [None]:
# new_result.to_csv("../Kaggle/Output/Dataset_only/Full2.csv",index=False)