In [51]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import networkx as nx

Data Loading

In [52]:
ratings= pd.read_csv("./Data/ratings.csv", low_memory=False)
train_data = pd.read_csv("./Data/train_data.csv",low_memory=False)
test_data = pd.read_csv("./Data/test_data.csv",low_memory=False)
# train_data, test_data = train_test_split(ratings, test_size=0.3, random_state=42)
# # 保存训练集为csv文件
# train_data.to_csv('./Data/train_data.csv',index=False)
# # 保存测试集为csv文件
# test_data.to_csv('./Data/test_data.csv',index=False)

In [53]:
datasets = ratings.dataset_id.unique()
models = ratings.model_id.unique()
datasets_train = train_data.dataset_id.unique()
model_train = train_data.model_id.unique()
datasets_test = test_data.dataset_id.unique()
model_test = test_data.model_id.unique()

Rating Matrix

In [54]:
data_model_train_matrix = pd.DataFrame(index=datasets_train,columns=model_train)
data_model_test_matrix = pd.DataFrame(index=datasets_test,columns=model_test)

In [55]:
for row in train_data.itertuples():
    data_model_train_matrix.loc[row[1]][row[2]] = row[3]

In [56]:
for row in test_data.itertuples():
    data_model_test_matrix.loc[row[1]][row[2]] = row[3]

Dataset Similarity Matrix

In [57]:
import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [58]:
def create_graph_from_df(df):
    G = nx.Graph()

    for user in df.index:
        for item in df.columns:
            rating = df.loc[user, item]
            if not np.isnan(rating):
                G.add_edge(user, item, weight=rating)

    return G

In [59]:
def deepwalk(G, walk_length=1, num_walks=1, embed_size=32):
    walks = []
    for node in G.nodes():
        if G.degree(node) == 0:
            continue
        for _ in range(num_walks):
            walk = [node]
            while len(walk) < walk_length:
                cur = walk[-1]
                cur_nbrs = list(G.neighbors(cur))
                walk.append(np.random.choice(cur_nbrs))
            walks.append([str(node) for node in walk])

    model = Word2Vec(walks, vector_size=embed_size, window=5, min_count=0, sg=1, workers=4)
    return model

In [60]:
def generate_weighted_walks(G, walk_length=10, num_walks=1,embed_size=32):
    walks = []
    nodes = list(G.nodes())
    for _ in range(num_walks):
        np.random.shuffle(nodes)
        for node in nodes:
            walk = [node]
            while len(walk) < walk_length:
                cur = walk[-1]
                neighbors = list(G.neighbors(cur))
                if neighbors:
                    # The probability of the next step is proportional to the weight of the edge
                    weights = [G[cur][neighbor]['weight'] for neighbor in neighbors]
                    probabilities = weights / np.sum(weights)
                    next_node = np.random.choice(neighbors, p=probabilities)
                    walk.append(next_node)
                else:
                    break
            walks.append([str(node) for node in walk])
    model = Word2Vec(walks, vector_size=embed_size, window=5, min_count=0, sg=1, workers=4)
    return model


In [61]:
def get_similarity_matrix(model, user_nodes):
    embeddings = np.array([model.wv.get_vector(str(user)) for user in user_nodes])
    similarity_matrix = cosine_similarity(embeddings)

    return pd.DataFrame(similarity_matrix, index=user_nodes, columns=user_nodes)


In [62]:
# 创建图
G = create_graph_from_df(data_model_train_matrix)

# 执行DeepWalk算法
model = generate_weighted_walks(G)

# 获取用户相似性矩阵
user_nodes = data_model_train_matrix.index.tolist()
similarity_matrix = get_similarity_matrix(model, user_nodes)

In [63]:
similarity_matrix

Unnamed: 0,48,14,60,237,250,246,51,1,74,248,...,50,149,85,128,229,205,71,109,59,76
48,1.000000,0.993330,0.994590,0.900500,0.980520,0.986121,0.993900,0.992663,0.992556,0.992176,...,0.994697,0.993086,0.994355,0.995027,0.992860,0.993438,0.993336,0.994063,0.992770,0.992508
14,0.993330,1.000000,0.992685,0.898781,0.981719,0.985850,0.994083,0.994545,0.991624,0.989979,...,0.992623,0.996407,0.991163,0.994449,0.992729,0.994636,0.991232,0.992444,0.990439,0.992939
60,0.994590,0.992685,1.000000,0.900396,0.988415,0.985840,0.993139,0.994015,0.992701,0.993226,...,0.993758,0.993650,0.992452,0.994694,0.995147,0.995201,0.991272,0.992929,0.993154,0.993360
237,0.900500,0.898781,0.900396,1.000000,0.882757,0.887775,0.894688,0.907678,0.898911,0.895489,...,0.900657,0.890126,0.903924,0.901481,0.897331,0.902817,0.908691,0.888221,0.899857,0.889990
250,0.980520,0.981719,0.988415,0.882757,1.000000,0.976027,0.984558,0.983652,0.982994,0.982726,...,0.985228,0.982062,0.980744,0.981215,0.982576,0.983794,0.980530,0.986487,0.979403,0.982077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,0.993438,0.994636,0.995201,0.902817,0.983794,0.983902,0.994345,0.993483,0.993549,0.994162,...,0.992691,0.995050,0.993546,0.994230,0.994502,1.000000,0.993144,0.991506,0.991801,0.995462
71,0.993336,0.991232,0.991272,0.908691,0.980530,0.984502,0.995623,0.994401,0.993493,0.993803,...,0.991759,0.992317,0.995043,0.993818,0.990010,0.993144,1.000000,0.991420,0.990889,0.990662
109,0.994063,0.992444,0.992929,0.888221,0.986487,0.983813,0.992265,0.994038,0.992619,0.991258,...,0.993711,0.991912,0.990813,0.991917,0.992025,0.991506,0.991420,1.000000,0.990859,0.992227
59,0.992770,0.990439,0.993154,0.899857,0.979403,0.979481,0.994660,0.991557,0.989949,0.992279,...,0.989088,0.993107,0.991181,0.990629,0.992078,0.991801,0.990889,0.990859,1.000000,0.993730


In [64]:
dataset_similarity = pd.DataFrame(index=datasets_train,columns=datasets_train).sort_index(axis=0).sort_index(axis=1)

In [65]:
for i in datasets_train:
    for j in datasets_train:
        dataset_similarity.loc[i][j] = similarity_matrix.loc[i][j]

In [66]:
dataset_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,279,280,281,282,283,284,285,286,287,288
0,1,0.98951,0.988842,0.98698,0.98737,0.986735,0.990386,0.987238,0.986405,0.986196,...,0.975112,0.987875,0.986671,0.989368,0.983222,0.960405,0.934319,0.970912,0.975678,0.954783
1,0.98951,1,0.994801,0.994094,0.992265,0.993884,0.99511,0.991536,0.992832,0.993653,...,0.988778,0.995975,0.992429,0.991993,0.986599,0.971868,0.960584,0.978977,0.982699,0.95651
2,0.988842,0.994801,1,0.995117,0.991939,0.994841,0.996102,0.989963,0.991585,0.991039,...,0.985172,0.993351,0.992087,0.991796,0.988492,0.970975,0.955456,0.979991,0.976786,0.958329
3,0.98698,0.994094,0.995117,1,0.986893,0.991879,0.995973,0.991907,0.991556,0.990413,...,0.984915,0.991547,0.991474,0.990647,0.987274,0.977493,0.953694,0.971023,0.978768,0.953422
4,0.98737,0.992265,0.991939,0.986893,1,0.989978,0.992966,0.987197,0.991581,0.990458,...,0.982651,0.991513,0.993541,0.992034,0.98485,0.971011,0.955592,0.980192,0.982766,0.963604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.960405,0.971868,0.970975,0.977493,0.971011,0.969346,0.977726,0.96887,0.970288,0.971146,...,0.966969,0.972939,0.977286,0.969668,0.96739,1,0.965254,0.941535,0.964211,0.927476
285,0.934319,0.960584,0.955456,0.953694,0.955592,0.957763,0.956757,0.953577,0.953299,0.953425,...,0.949749,0.958181,0.965485,0.94841,0.96394,0.965254,1,0.938565,0.949816,0.912479
286,0.970912,0.978977,0.979991,0.971023,0.980192,0.973527,0.976881,0.97195,0.978863,0.976383,...,0.966015,0.973781,0.97373,0.977017,0.969028,0.941535,0.938565,1,0.959967,0.936275
287,0.975678,0.982699,0.976786,0.978768,0.982766,0.97919,0.983552,0.98426,0.984243,0.980684,...,0.971454,0.984769,0.984688,0.976728,0.980358,0.964211,0.949816,0.959967,1,0.946137


In [67]:
def positive_similarity_ratio(similarity_matrix):
    # Count the number of positive similarities
    num_positive_similarities = (similarity_matrix > 0).sum().sum()

    # Count the total number of similarities
    total_similarities = similarity_matrix.size

    # Compute the ratio of positive similarities
    ratio = num_positive_similarities / total_similarities

    return ratio


In [68]:
positive_similarity_ratio(similarity_matrix)

1.0

In [69]:
start_time_train = time.time()

In [70]:
end_time_train = time.time()

In [71]:
end_time_train - start_time_train

0.0927438735961914

Prediction

In [72]:
start_time_ref = time.time()

In [73]:
# 预测函数
def predict(ratings, similarity):
    mean_user_rating = ratings.fillna(0).mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis]).fillna(0)
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
     # 只替换NaN值
    df_nan = ratings.isnull()
    pred = pd.DataFrame(pred).where(df_nan, ratings)
    return pred

In [74]:
user_prediction = predict(data_model_train_matrix,dataset_similarity.fillna(0)).sort_index(axis=0).sort_index(axis=1)

  after removing the cwd from sys.path.
  """


In [75]:
user_prediction

Unnamed: 0,289,290,291,292,293,294,295,296,297,298,...,741,742,743,744,745,746,747,748,749,750
0,0.00767754,0.00384615,0.00767754,0.00384615,0.00767754,0.00384615,0.00767754,0.00384615,0.409715,0.00384615,...,0.290521,0.00767754,0.290843,0.00767754,0.290715,0.29022,0.290245,0.00767754,0.290105,0.00767754
1,0.266667,0.307692,0.307692,0.307692,0.307692,0.307692,0.349279,0.307692,0.345978,0.307692,...,0.226785,0.226758,0.227111,0.227126,0.226981,0.226486,0.226516,0.226355,0.226372,0.227422
2,0.37037,0.333333,0.714286,0.363636,0.714286,0.363636,0.714286,0.363636,0.714286,0.363636,...,0.27845,0.278423,0.278774,0.278791,0.278644,0.278151,0.278179,0.278021,0.278036,0.279086
3,0.434783,0.307692,0.168896,0.333333,0.666667,0.0623717,0.666667,0.333333,0.666667,0.333333,...,0.038773,0.0387466,0.039098,0.0391122,0.0389692,0.0384717,0.0385028,0.0383426,0.0383607,0.0394107
4,0.134568,0.444444,0.833333,0.444444,0.833333,0.444444,0.833333,0.444444,0.833333,0.444444,...,0.049469,0.0494415,0.049793,0.0498145,0.0496636,0.0491696,0.0491964,0.0490397,0.0490531,0.050104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.0168067,0.00338409,0.013468,0.00338409,0.38918,0.296876,0.395822,0.00338409,0.392507,0.00338409,...,0.0234506,0.273233,0.0234506,0.0234506,0.0234506,0.0234506,0.0234506,0.0234506,0.0234506,0.273895
285,0.325385,0.0034904,0.0104348,0.0034904,0.356164,0.0034904,0.362735,0.0034904,0.359363,0.0034904,...,0.017331,0.240213,0.240563,0.240577,0.017331,0.017331,0.239968,0.017331,0.017331,0.240873
286,0.210219,0.00339559,0.412674,0.303164,0.39847,0.306227,0.0168919,0.302647,0.40175,0.30271,...,0.282658,0.0168919,0.0168919,0.283004,0.0168919,0.282363,0.282388,0.0135364,0.0135364,0.283293
287,0.244541,0.232077,0.351939,0.242416,0.020654,0.00347222,0.34438,0.00347222,0.341086,0.00347222,...,0.024055,0.024055,0.0172414,0.0172414,0.222111,0.0172414,0.0138169,0.0138169,0.221501,0.0138169


In [76]:
end_time_ref = time.time()

In [77]:
end_time_ref - start_time_ref

0.6593940258026123

In [78]:
mask = (data_model_test_matrix.fillna(0) != 0) & (user_prediction != 0)

In [79]:
# 只选择那些在预测评分和实际评分中都不是 0 的评分
prediction = user_prediction[mask].values.flatten()
prediction = pd.to_numeric(prediction, errors='coerce')
prediction = prediction[~np.isnan(prediction)]

In [80]:
actual = data_model_test_matrix.fillna(0)[mask].values.flatten()
actual = pd.to_numeric(actual, errors='coerce')
actual = actual[~np.isnan(actual)]

In [81]:
def calculate_rmse(prediction, actual):
    # 计算 RMSE
    return sqrt(mean_squared_error(prediction, actual))


In [82]:
user_rmse = calculate_rmse(prediction, actual)

In [83]:
user_rmse

0.34743774996815474

In [84]:
def ndcg(y_true, y_pred, k):
    """计算 NDCG @k
    y_true: 真实的 relevancy 分数（通常为 0 或 1）
    y_pred: 预测的 relevancy 分数
    k: 截断位置
    """
    # 计算 DCG @k
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    dcg = np.sum(gains / discounts)

    # 计算 IDCG @k
    ideal_order = np.argsort(y_true)[::-1]
    ideal_gains = 2 ** np.take(y_true, ideal_order[:k]) - 1
    ideal_discounts = np.log2(np.arange(len(ideal_gains)) + 2)
    idcg = np.sum(ideal_gains / ideal_discounts)

    # 防止0除问题
    if idcg == 0:
        return 0

    # 计算 NDCG @k
    ndcg = dcg / idcg
    return ndcg



In [85]:
ndcg(actual, prediction,10)

0.7311252676813786