In [29]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity

Data Loading

In [30]:
ratings= pd.read_csv("./Data/ratings.csv", low_memory=False)
train_data = pd.read_csv("./Data/train_data.csv",low_memory=False)
test_data = pd.read_csv("./Data/test_data.csv",low_memory=False)
# train_data, test_data = train_test_split(ratings, test_size=0.3, random_state=42)
# # 保存训练集为csv文件
# train_data.to_csv('./Data/train_data.csv',index=False)
# # 保存测试集为csv文件
# test_data.to_csv('./Data/test_data.csv',index=False)

In [31]:
datasets = ratings.dataset_id.unique()
models = ratings.model_id.unique()
datasets_train = train_data.dataset_id.unique()
model_train = train_data.model_id.unique()
datasets_test = test_data.dataset_id.unique()
model_test = test_data.model_id.unique()

Rating Matrix

In [32]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=model_train)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [33]:
for row in train_data.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]


In [34]:
for row in test_data.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]

Dataset Similarity Matrix

In [35]:
def cosine_similarity_func(ratings, user1, user2):
    # 找到两个用户共同评分的物品，并将这些评分放入一个向量中
    u1_ratings = ratings.loc[user1].dropna()
    u2_ratings = ratings.loc[user2].dropna()

    common_items = np.intersect1d(u1_ratings.index, u2_ratings.index).tolist()
    u1_common_ratings = u1_ratings.loc[common_items]
    u2_common_ratings = u2_ratings.loc[common_items]

    # 计算两个向量之间的余弦相似度
    if len(common_items) == 0:
        return 0
    else:
        cos_sim = np.dot(u1_common_ratings, u2_common_ratings) / (np.linalg.norm(u1_common_ratings) * np.linalg.norm(u2_common_ratings))
        return cos_sim

In [36]:
start_time_train = time.time()

In [37]:
# for i in datasets_train:
#     for j in datasets_train:
#         dataset_similarity.loc[i][j] = cosine_similarity_func(data_model_train_matrix,i,j)

In [38]:
dataset_similarity = cosine_similarity(data_model_train_matrix.fillna(0))

In [39]:
dataset_similarity

array([[1.        , 0.55368711, 0.72653293, ..., 0.7044731 , 0.585661  ,
        0.56642151],
       [0.55368711, 1.        , 0.53704519, ..., 0.51143755, 0.63528243,
        0.64702675],
       [0.72653293, 0.53704519, 1.        , ..., 0.66723055, 0.57026794,
        0.53255696],
       ...,
       [0.7044731 , 0.51143755, 0.66723055, ..., 1.        , 0.53211721,
        0.54312194],
       [0.585661  , 0.63528243, 0.57026794, ..., 0.53211721, 1.        ,
        0.6559295 ],
       [0.56642151, 0.64702675, 0.53255696, ..., 0.54312194, 0.6559295 ,
        1.        ]])

In [40]:
end_time_train = time.time()

In [41]:
end_time_train - start_time_train

0.09298229217529297

Prediction

In [42]:
start_time_ref = time.time()

In [43]:
def positive_similarity_ratio(similarity_matrix):
    # Count the number of positive similarities
    num_positive_similarities = (similarity_matrix > 0).sum().sum()

    # Count the total number of similarities
    total_similarities = similarity_matrix.size

    # Compute the ratio of positive similarities
    ratio = num_positive_similarities / total_similarities

    return ratio


In [44]:
positive_similarity_ratio(dataset_similarity)

1.0

In [45]:
# 预测函数
def predict(ratings, similarity):
    mean_user_rating = ratings.fillna(0).mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis]).fillna(0)
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
     # 只替换NaN值
    df_nan = ratings.isnull()
    pred = pd.DataFrame(pred).where(df_nan, ratings)
    return pred

In [46]:
user_prediction = predict(data_model_train_matrix,dataset_similarity).sort_index(axis=0).sort_index(axis=1)

  after removing the cwd from sys.path.
  """


In [47]:
end_time_ref = time.time()

In [48]:
end_time_ref - start_time_ref

0.20406532287597656

In [49]:
mask = (data_model_test_matrix.fillna(0) != 0) & (user_prediction != 0)

# 只选择那些在预测评分和实际评分中都不是 0 的评分
prediction = user_prediction[mask].values.flatten()
prediction = pd.to_numeric(prediction, errors='coerce')
prediction = prediction[~np.isnan(prediction)]

In [50]:
actual = data_model_test_matrix.fillna(0)[mask].values.flatten()
actual = pd.to_numeric(actual, errors='coerce')
actual = actual[~np.isnan(actual)]

In [51]:
def calculate_rmse(prediction, actual):
    # 计算 RMSE
    return sqrt(mean_squared_error(prediction, actual))


In [52]:
user_rmse = calculate_rmse(prediction,actual)

In [53]:
user_rmse

0.38786630013583917

In [54]:
def ndcg(y_true, y_pred, k):
    """计算 NDCG @k
    y_true: 真实的 relevancy 分数（通常为 0 或 1）
    y_pred: 预测的 relevancy 分数
    k: 截断位置
    """
    # 计算 DCG @k
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    dcg = np.sum(gains / discounts)

    # 计算 IDCG @k
    ideal_order = np.argsort(y_true)[::-1]
    ideal_gains = 2 ** np.take(y_true, ideal_order[:k]) - 1
    ideal_discounts = np.log2(np.arange(len(ideal_gains)) + 2)
    idcg = np.sum(ideal_gains / ideal_discounts)

    # 防止0除问题
    if idcg == 0:
        return 0

    # 计算 NDCG @k
    ndcg = dcg / idcg
    return ndcg

In [55]:
ndcg(actual, prediction, 5)

0.7410632251660123