# 电影推荐算法比较实验

本实验实现并比较两种协同过滤电影推荐算法：基于用户的协同过滤（User-CF）和基于物品的协同过滤（Item-CF），并以 RMSE 比较两者的评分预测误差。

## 一、数据加载与预处理

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import kagglehub

# Fetch the dataset through kagglehub and report where it was stored.
# NOTE(review): "/data" does not look like an "owner/dataset" handle, and the
# returned path is only printed -- the rating files below are read from a
# relative "data/" directory instead. Confirm both are intentional.
path = kagglehub.dataset_download("/data")

print("Path to dataset files:", path)

# Read the tab-separated train / test rating files.
train_data = pd.read_csv('data/recommendation-ratings-train.txt', sep='\t')
test_data = pd.read_csv('data/recommendation-ratings-test.txt', sep='\t')

# Pivot the long rating tables into user x movie matrices; a (user, movie)
# pair without a rating is filled with 0.
# NOTE(review): the column names are spelled 'userld' / 'movield' (lowercase
# "l") -- verify against the header row of the rating files.
user_item_train = pd.pivot_table(
    train_data, values='rating', index='userld', columns='movield', fill_value=0
)
user_item_test = pd.pivot_table(
    test_data, values='rating', index='userld', columns='movield', fill_value=0
)


## 二、数据可视化分析

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Side-by-side histograms of the rating distribution in each data split.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One entry per subplot: (ratings table, panel title, bar colour).
panels = (
    (train_data, '训练集评分分布', 'skyblue'),
    (test_data, '测试集评分分布', 'lightgreen'),
)
for ax, (ratings_df, panel_title, bar_colour) in zip(axes, panels):
    ratings_df['rating'].hist(bins=10, ax=ax, color=bar_colour)
    ax.set_title(panel_title)
    ax.set_xlabel('评分')
    ax.set_ylabel('数量')

plt.tight_layout()
plt.show()


## 二、User-CF算法实现

In [None]:
def user_cf_predict(user_item, user_id, movie_id, k=50):
    """Predict the rating ``user_id`` would give ``movie_id`` with User-CF.

    The prediction is the target user's mean rating plus the
    similarity-weighted average of the neighbours' mean-centred ratings for
    the movie. Neighbours are the ``k`` users most cosine-similar to the
    target user *among those who actually rated the movie*.

    A stored rating of 0 is treated as "not rated": it is excluded from the
    per-user means and such users are never used as neighbours for the movie.

    Args:
        user_item: User x movie rating DataFrame (rows are users, columns are
            movies, 0 marks a missing rating).
        user_id: Row label of the target user.
        movie_id: Column label of the target movie.
        k: Maximum number of neighbours to use.

    Returns:
        The predicted rating as a float. Falls back to the target user's mean
        rating when the movie is unknown, nobody else rated it, or all
        neighbour similarities are zero; falls back to the global mean of the
        observed ratings when the user is unknown.
    """
    ratings = user_item.to_numpy(dtype=float)
    rated = ratings != 0

    # Per-user mean over observed ratings only; the 0 filler would otherwise
    # drag every mean towards zero.
    rated_counts = rated.sum(axis=1)
    rating_totals = ratings.sum(axis=1)
    user_means = np.divide(
        rating_totals,
        rated_counts,
        out=np.zeros(len(rating_totals)),
        where=rated_counts > 0,
    )

    try:
        user_pos = user_item.index.get_loc(user_id)
    except KeyError:
        # Cold-start user: no history to personalise with.
        n_observed = rated_counts.sum()
        return float(rating_totals.sum() / n_observed) if n_observed else 0.0
    baseline = float(user_means[user_pos])

    try:
        movie_pos = user_item.columns.get_loc(movie_id)
    except KeyError:
        # Cold-start movie: nobody in the matrix has rated it.
        return baseline

    # Only users who rated the movie can contribute a deviation.
    candidates = np.flatnonzero(rated[:, movie_pos])
    candidates = candidates[candidates != user_pos]
    if candidates.size == 0:
        return baseline

    # Cosine similarity between the target user and every other user. Only
    # this single row is needed, so the full user x user matrix is not built.
    norms = np.linalg.norm(ratings, axis=1)
    scale = norms * norms[user_pos]
    similarity = np.divide(
        ratings @ ratings[user_pos],
        scale,
        out=np.zeros(len(scale)),
        where=scale > 0,
    )

    neighbours = candidates[np.argsort(-similarity[candidates])[:k]]
    weights = similarity[neighbours]
    weight_sum = np.abs(weights).sum()
    if weight_sum == 0:
        # No neighbour shares any rated movie with the target user.
        return baseline

    deviations = ratings[neighbours, movie_pos] - user_means[neighbours]
    return float(baseline + weights @ deviations / weight_sum)


## 三、Item-CF算法实现

In [None]:
def item_cf_predict(user_item, user_id, movie_id, k=50):
    """Predict the rating ``user_id`` would give ``movie_id`` with Item-CF.

    The prediction is the similarity-weighted average of the user's own
    ratings for the ``k`` movies most cosine-similar to ``movie_id``, chosen
    *among the movies the user has actually rated*.

    A stored rating of 0 is treated as "not rated", so unrated movies never
    enter the weighted average.

    Args:
        user_item: User x movie rating DataFrame (rows are users, columns are
            movies, 0 marks a missing rating).
        user_id: Row label of the target user.
        movie_id: Column label of the target movie.
        k: Maximum number of neighbouring movies to use.

    Returns:
        The predicted rating as a float. Falls back to the user's mean
        observed rating when the movie is unknown, the user rated no other
        movie, or all similarities are zero; falls back to the global mean of
        the observed ratings when the user is unknown or has no ratings.
    """
    ratings = user_item.to_numpy(dtype=float)
    rated = ratings != 0

    n_observed = rated.sum()
    global_mean = float(ratings.sum() / n_observed) if n_observed else 0.0

    try:
        user_pos = user_item.index.get_loc(user_id)
    except KeyError:
        # Cold-start user: no ratings to average over.
        return global_mean

    user_row = ratings[user_pos]
    rated_items = np.flatnonzero(rated[user_pos])
    if rated_items.size == 0:
        return global_mean
    baseline = float(user_row[rated_items].mean())

    try:
        movie_pos = user_item.columns.get_loc(movie_id)
    except KeyError:
        # Cold-start movie: it has no rating column to compare against.
        return baseline

    # The target movie itself must not vote for its own rating.
    candidates = rated_items[rated_items != movie_pos]
    if candidates.size == 0:
        return baseline

    # Cosine similarity between the target movie and each candidate movie.
    # Only these columns are needed, so the full item x item matrix is not
    # built.
    target_column = ratings[:, movie_pos]
    candidate_columns = ratings[:, candidates]
    scale = np.linalg.norm(candidate_columns, axis=0) * np.linalg.norm(target_column)
    similarity = np.divide(
        target_column @ candidate_columns,
        scale,
        out=np.zeros(candidates.size),
        where=scale > 0,
    )

    top = np.argsort(-similarity)[:k]
    weights = similarity[top]
    weight_sum = np.abs(weights).sum()
    if weight_sum == 0:
        # No rated movie shares any rater with the target movie.
        return baseline

    return float(weights @ user_row[candidates[top]] / weight_sum)


## 四、RMSE评估与结果比较

In [None]:
def _predict_test_ratings(predict_fn):
    """Run ``predict_fn`` on every (user, movie) row of the test set."""
    return test_data.apply(
        lambda row: predict_fn(user_item_train, row['userld'], row['movield']),
        axis=1,
    )


# Score the held-out ratings with User-CF.
user_cf_preds = _predict_test_ratings(user_cf_predict)
user_cf_rmse = np.sqrt(mean_squared_error(test_data['rating'], user_cf_preds))

# Score the same ratings with Item-CF.
item_cf_preds = _predict_test_ratings(item_cf_predict)
item_cf_rmse = np.sqrt(mean_squared_error(test_data['rating'], item_cf_preds))

print(f'User-CF RMSE: {user_cf_rmse:.4f}')
print(f'Item-CF RMSE: {item_cf_rmse:.4f}')

# Bar chart comparing the RMSE of the two algorithms.
algorithms = ['User-CF', 'Item-CF']
rmse_values = [user_cf_rmse, item_cf_rmse]

plt.figure(figsize=(8, 5))
plt.bar(algorithms, rmse_values, color=['#4CAF50', '#2196F3'])
plt.title('推荐算法RMSE对比')
plt.xlabel('算法')
plt.ylabel('RMSE')
plt.ylim(0, max(rmse_values)*1.2)

# Print each RMSE value just above its bar.
for bar_pos, bar_height in enumerate(rmse_values):
    plt.text(bar_pos, bar_height+0.05, f'{bar_height:.4f}', ha='center')
plt.show()

# Top-N recommendation (using User-CF as the example)
def get_top_n_recommendations(user_id, n=10):
    """Return the ``n`` unrated movies with the highest predicted rating.

    Every movie the user has not rated in ``user_item_train`` (stored rating
    equal to 0) is scored with ``user_cf_predict``; the best ``n`` are
    returned.

    Args:
        user_id: Row label of the user in ``user_item_train``.
        n: Number of recommendations to return.

    Returns:
        A DataFrame with columns ``'movield'`` and ``'pred_rating'``, sorted
        by ``'pred_rating'`` in descending order and holding at most ``n``
        rows. It is empty when ``user_id`` does not appear in the training
        matrix, because there is no rating history to base a recommendation
        on.
    """
    if user_id not in user_item_train.index:
        # Cold-start user: `.loc` below would raise KeyError.
        return pd.DataFrame(columns=['movield', 'pred_rating'])

    user_ratings = user_item_train.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index

    # Predict a rating for each movie the user has not rated yet.
    predictions = [user_cf_predict(user_item_train, user_id, movie_id) for movie_id in unrated_movies]

    # Rank the candidates by predicted rating and keep the best n.
    recommendations = pd.DataFrame({'movield': unrated_movies, 'pred_rating': predictions})
    return recommendations.sort_values('pred_rating', ascending=False).head(n)

# Evaluate Top-N recommendations (recall / precision)
def evaluate_top_n(n=10):
    """Compute recall and precision of the Top-N recommendations.

    For every user in ``test_data`` the Top-N list from
    ``get_top_n_recommendations`` is compared with the movies that user rated
    4 or higher in the test set. Counts are pooled over all users before the
    ratios are taken.

    Args:
        n: Length of the recommendation list requested per user.

    Returns:
        A ``(recall, precision)`` tuple. Each value is 0 when its denominator
        is 0.
    """
    # Group the liked (rating >= 4) test movies per user once, instead of
    # re-filtering the whole test table for every user.
    liked = test_data[test_data['rating'] >= 4]
    liked_by_user = {
        uid: movies.tolist() for uid, movies in liked.groupby('userld')['movield']
    }

    relevant = 0
    retrieved = 0
    relevant_retrieved = 0

    for user_id in test_data['userld'].unique():
        if user_id in user_item_train.index:
            top_n = get_top_n_recommendations(user_id, n)['movield'].tolist()
        else:
            # Cold-start user: nothing can be recommended, but their liked
            # movies still count as relevant (they lower recall).
            top_n = []
        actual_rated = liked_by_user.get(user_id, [])

        relevant += len(actual_rated)
        # Count what was actually returned; a user with fewer than n unrated
        # movies would otherwise deflate precision.
        retrieved += len(top_n)
        relevant_retrieved += len(set(top_n) & set(actual_rated))

    recall = relevant_retrieved / relevant if relevant > 0 else 0
    precision = relevant_retrieved / retrieved if retrieved > 0 else 0
    return recall, precision

# Report Top-10 recall and precision on the test set; n is spelled out so it
# visibly matches the "Top-10" labels below.
recall, precision = evaluate_top_n(n=10)
print(f'Top-10推荐召回率: {recall:.4f}')
print(f'Top-10推荐精确率: {precision:.4f}')
