In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import networkx as nx

Data Loading

In [2]:
ratings= pd.read_csv("./Data/rating.csv", low_memory=False)
train_data = pd.read_csv("./Data/train_data.csv",low_memory=False)
test_data = pd.read_csv("./Data/test_data.csv",low_memory=False)
# train_data, test_data = train_test_split(ratings, test_size=0.3, random_state=42)
# # 保存训练集为csv文件
# train_data.to_csv('./Data/train_data.csv',index=False)
# # 保存测试集为csv文件
# test_data.to_csv('./Data/test_data.csv',index=False)

In [3]:
datasets = ratings.dataset_id.unique()
models = ratings.model_id.unique()
datasets_train = train_data.dataset_id.unique()
model_train = train_data.model_id.unique()
datasets_test = test_data.dataset_id.unique()
model_test = test_data.model_id.unique()

Rating Matrix

In [4]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=model_train)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [5]:
for row in train_data.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]

In [6]:
for row in test_data.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]

Dataset Similarity Matrix

In [7]:
import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def create_graph_from_df(df):
    G = nx.Graph()

    for user in df.index:
        for item in df.columns:
            rating = df.loc[user, item]
            if not np.isnan(rating):
                G.add_edge(user, item, weight=rating)

    return G

In [9]:
def deepwalk(G, walk_length=6, num_walks=1, embed_size=32):
    walks = []
    for node in G.nodes():
        if G.degree(node) == 0:
            continue
        for _ in range(num_walks):
            walk = [node]
            while len(walk) < walk_length:
                cur = walk[-1]
                cur_nbrs = list(G.neighbors(cur))
                walk.append(np.random.choice(cur_nbrs))
            walks.append([str(node) for node in walk])
    model = Word2Vec(walks, vector_size=embed_size, window=5, min_count=0, sg=1, workers=4)
    return model

In [10]:
def generate_weighted_walks(G, walk_length=10, num_walks=1,embed_size=32):
    walks = []
    nodes = list(G.nodes())
    for _ in range(num_walks):
        np.random.shuffle(nodes)
        for node in nodes:
            walk = [node]
            while len(walk) < walk_length:
                cur = walk[-1]
                neighbors = list(G.neighbors(cur))
                if neighbors:
                    # 下一步的概率与边的权重成比例
                    weights = [G[cur][neighbor]['weight'] for neighbor in neighbors]
                    probabilities = weights / np.sum(weights)
                    next_node = np.random.choice(neighbors, p=probabilities)
                    walk.append(next_node)
                else:
                    break
            walks.append([str(node) for node in walk])
    model = Word2Vec(walks, vector_size=embed_size, window=5, min_count=0, sg=1, workers=4)
    return model


In [11]:
def get_similarity_matrix(model, user_nodes):
    embeddings = np.array([model.wv.get_vector(str(user)) for user in user_nodes])
    similarity_matrix = cosine_similarity(embeddings)

    return pd.DataFrame(similarity_matrix, index=user_nodes, columns=user_nodes)


In [12]:
# 创建图
G = create_graph_from_df(data_model_train_matrix)

# 执行DeepWalk算法
model = generate_weighted_walks(G)

# 获取用户相似性矩阵
user_nodes = data_model_train_matrix.index.tolist()
similarity_matrix = get_similarity_matrix(model, user_nodes)

In [13]:
similarity_matrix

Unnamed: 0,39,7,21,48,12,52,22,51,64,1,...,8,40,63,44,57,53,58,60,59,17
39,1.000000,0.575757,0.342748,0.415777,0.585654,0.343730,0.451688,0.588606,0.290335,0.599234,...,0.349655,0.322706,0.992548,0.307688,0.988845,0.295017,0.329401,0.418620,0.302627,0.344329
7,0.575757,1.000000,0.510261,0.507396,0.997298,0.510364,0.499789,0.994664,0.436113,0.996022,...,0.581077,0.475505,0.555854,0.507940,0.558419,0.436628,0.534906,0.464647,0.439537,0.501742
21,0.342748,0.510261,1.000000,0.377827,0.499698,0.997410,0.534858,0.535419,0.499209,0.519995,...,0.603365,0.540124,0.286549,0.434530,0.327520,0.504616,0.460865,0.386395,0.504764,0.996818
48,0.415777,0.507396,0.377827,1.000000,0.496650,0.373316,0.376800,0.528242,0.323336,0.519866,...,0.536524,0.376584,0.412346,0.427901,0.397013,0.329035,0.459331,0.363023,0.311735,0.362134
12,0.585654,0.997298,0.499698,0.496650,1.000000,0.497914,0.510867,0.994787,0.437246,0.995579,...,0.564882,0.476720,0.566346,0.498663,0.567517,0.438321,0.525469,0.464871,0.440838,0.489411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,0.295017,0.436628,0.504616,0.329035,0.438321,0.492106,0.511953,0.464935,0.995816,0.455139,...,0.423080,0.995851,0.253224,0.456673,0.253188,1.000000,0.469657,0.500018,0.996008,0.476158
58,0.329401,0.534906,0.460865,0.459331,0.525469,0.444948,0.382076,0.557533,0.472558,0.533957,...,0.539222,0.507945,0.271636,0.996607,0.297466,0.469657,1.000000,0.359691,0.461289,0.445004
60,0.418620,0.464647,0.386395,0.363023,0.464871,0.383565,0.485193,0.494883,0.498672,0.485615,...,0.459333,0.521264,0.406533,0.315015,0.378978,0.500018,0.359691,1.000000,0.509483,0.366982
59,0.302627,0.439537,0.504764,0.311735,0.440838,0.491974,0.500462,0.467600,0.998219,0.458050,...,0.421742,0.994340,0.262156,0.448970,0.260784,0.996008,0.461289,0.509483,1.000000,0.477365


In [14]:
dataset_similarity = pd.DataFrame(index=datasets_train,columns=datasets_train).sort_index(axis=0).sort_index(axis=1)

In [15]:
for i in datasets_train:
    for j in datasets_train:
        if similarity_matrix.loc[i][j] > 0:
            dataset_similarity.loc[i][j] = similarity_matrix.loc[i][j]

In [16]:
dataset_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0,1,0.478487,0.413325,0.47226,0.35455,0.57125,0.36967,0.483763,0.441031,0.507267,...,0.518065,0.470783,0.426622,0.250832,0.486941,0.366863,0.996846,0.494386,0.388155,0.504565
1,0.478487,1,0.553336,0.506456,0.514487,0.470384,0.531416,0.996022,0.58578,0.996424,...,0.996131,0.57793,0.455118,0.550646,0.549448,0.532851,0.50967,0.997526,0.32652,0.533363
2,0.413325,0.553336,1,0.381668,0.505063,0.439722,0.378829,0.549238,0.994567,0.573508,...,0.572155,0.280352,0.41223,0.473987,0.371235,0.365636,0.430439,0.552685,0.362104,0.430641
3,0.47226,0.506456,0.381668,1,0.541687,0.535075,0.530818,0.513383,0.443024,0.543571,...,0.526048,0.408557,0.422296,0.242159,0.30894,0.530057,0.490134,0.522133,0.581237,0.993826
4,0.35455,0.514487,0.505063,0.541687,1,0.376235,0.378355,0.501935,0.548092,0.523417,...,0.522477,0.410693,0.322906,0.45491,0.4618,0.368052,0.382142,0.525324,0.494494,0.597875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.366863,0.532851,0.365636,0.530057,0.368052,0.562792,0.997132,0.502211,0.418872,0.549299,...,0.536688,0.441422,0.508494,0.493007,0.640648,1,0.392618,0.532037,0.539253,0.548255
68,0.996846,0.50967,0.430439,0.490134,0.382142,0.599259,0.396158,0.515125,0.460397,0.53956,...,0.550059,0.492457,0.449006,0.26566,0.503971,0.392618,1,0.525382,0.409217,0.523284
69,0.494386,0.997526,0.552685,0.522133,0.525324,0.483907,0.530824,0.997132,0.585464,0.996845,...,0.99715,0.56924,0.459644,0.531396,0.543734,0.532037,0.525382,1,0.32546,0.548897
70,0.388155,0.32652,0.362104,0.581237,0.494494,0.562573,0.547389,0.296499,0.397985,0.350194,...,0.349191,0.528034,0.528816,0.384713,0.390574,0.539253,0.409217,0.32546,1,0.607244


In [17]:
def positive_similarity_ratio(similarity_matrix):
    # Count the number of positive similarities
    num_positive_similarities = (similarity_matrix > 0).sum().sum()

    # Count the total number of similarities
    total_similarities = similarity_matrix.size

    # Compute the ratio of positive similarities
    ratio = num_positive_similarities / total_similarities

    return ratio


In [18]:
positive_similarity_ratio(similarity_matrix)

1.0

In [19]:
start_time_train = time.time()

In [20]:
end_time_train = time.time()

In [21]:
end_time_train - start_time_train

0.05717921257019043

Prediction

In [22]:
start_time_ref = time.time()

In [23]:
# 预测函数
def predict(ratings, similarity):
    mean_user_rating = ratings.fillna(0).mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis]).fillna(0)
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
     # 只替换NaN值
    df_nan = ratings.isnull()
    pred = pd.DataFrame(pred).where(df_nan, ratings)
    return pred

In [24]:
user_prediction = predict(data_model_train_matrix,dataset_similarity.fillna(0)).sort_index(axis=0).sort_index(axis=1)

  after removing the cwd from sys.path.
  """


In [25]:
user_prediction

Unnamed: 0,72,73,74,75,76,77,78,79,80,81,...,1862,1863,1864,1865,1866,1867,1868,1869,1870,1871
0,0.0188494,0.958961,0.958403,0.0497574,0.999442,0.997487,0.999442,0.999721,0.0320443,0.999442,...,0.0143107,0.0184219,0.0189132,0.0160616,0.0131706,0.0218312,0.0164532,0.0241106,0.00969823,0.018713
1,0.0563963,0.0751751,0.0739159,0.0681205,0.0825219,0.0806837,0.0805824,0.0714793,0.0615975,0.0789845,...,0.0572268,0.0605315,0.0609964,0.0586555,0.0563958,0.0633989,0.0589624,0.0654226,0.0535545,0.0607896
2,0.0230081,0.0423262,0.0410591,0.035219,0.0499109,0.047973,0.0478795,0.0386229,0.0284832,0.0463665,...,0.0234469,0.0265246,0.0269715,0.0247095,0.0227122,0.0292283,0.0250326,0.0311304,0.0199767,0.0267342
3,0.0414652,0.0618619,0.0605051,0.0544413,0.0698977,0.0678662,0.0677733,0.0578779,0.0471929,0.0661879,...,0.0753692,0.173029,0.0566432,0.234606,0.0462954,0.173447,0.142658,0.145166,0.146002,0.230705
4,0.0657271,0.0820723,0.0809763,0.0761542,0.0885626,0.0869143,0.0868295,0.0788542,0.070389,0.0854856,...,0.0687899,0.0731391,0.0737061,0.0705746,0.0676345,0.0768271,0.0710372,0.0793469,0.0638541,0.0734245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.0191199,0.0346809,0.0336589,0.028947,0.0408145,0.039238,0.0391542,0.0316988,0.0235867,0.0378813,...,0.0215156,0.0253054,0.025748,0.0230838,0.0204795,0.0284878,0.0235023,0.0307394,0.017247,0.0255376
68,0.376876,0.382171,0.356134,0.0568032,0.347749,0.379523,0.339806,0.311121,0.337158,0.0842323,...,0.0232132,0.0272641,0.0277528,0.0249453,0.0221029,0.0306323,0.0253268,0.0328926,0.0186797,0.0275522
69,0.0245579,0.0438762,0.0425781,0.0366142,0.0514295,0.0495452,0.049442,0.0400635,0.0298876,0.0477988,...,0.0254164,0.0288183,0.0292846,0.0268857,0.0245586,0.0317534,0.0272046,0.0338244,0.0216401,0.0290746
70,0.0210839,0.039722,0.0384803,0.0332173,0.0471871,0.0452712,0.0451898,0.0360732,0.0264845,0.0437912,...,0.0244808,0.0293058,0.0298268,0.0265283,0.023,0.0332567,0.026978,0.0358155,0.0190102,0.0297001


In [26]:
end_time_ref = time.time()

In [27]:
end_time_ref - start_time_ref

1.4616684913635254

In [28]:
mask = (data_model_test_matrix.fillna(0) != 0) & (user_prediction != 0)

In [29]:
# 只选择那些在预测评分和实际评分中都不是 0 的评分
prediction = user_prediction[mask].values.flatten()
prediction = pd.to_numeric(prediction, errors='coerce')
prediction = prediction[~np.isnan(prediction)]

In [30]:
actual = data_model_test_matrix.fillna(0)[mask].values.flatten()
actual = pd.to_numeric(actual, errors='coerce')
actual = actual[~np.isnan(actual)]

In [31]:
def calculate_rmse(prediction, actual):
    # 计算 RMSE
    return sqrt(mean_squared_error(prediction, actual))


In [32]:
user_rmse = calculate_rmse(prediction, actual)

In [33]:
user_rmse

0.44089113702143873

In [34]:
def ndcg(y_true, y_pred, k):
    """计算 NDCG @k
    y_true: 真实的 relevancy 分数（通常为 0 或 1）
    y_pred: 预测的 relevancy 分数
    k: 截断位置
    """
    # 计算 DCG @k
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    dcg = np.sum(gains / discounts)

    # 计算 IDCG @k
    ideal_order = np.argsort(y_true)[::-1]
    ideal_gains = 2 ** np.take(y_true, ideal_order[:k]) - 1
    ideal_discounts = np.log2(np.arange(len(ideal_gains)) + 2)
    idcg = np.sum(ideal_gains / ideal_discounts)

    # 防止0除问题
    if idcg == 0:
        return 0

    # 计算 NDCG @k
    ndcg = dcg / idcg
    return ndcg



In [35]:
ndcg(actual, prediction,5)

0.9851116734123847