In [1]:
from surprise import Dataset, Reader, SVD, KNNBasic, NMF, SlopeOne
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import KNNBasic, SVD, NMF
import numpy as np

data = Dataset.load_builtin("ml-100k")

In [2]:
trainset, testset = train_test_split(data, test_size=0.25)

In [3]:
svd = SVD()
knn = KNNBasic()
nmf = NMF()
slop = SlopeOne()

# Обучение моделей
svd.fit(trainset)
knn.fit(trainset)
nmf.fit(trainset)
slop.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.slope_one.SlopeOne at 0x104bd28d0>

In [4]:
svd_predictions = svd.test(testset)
knn_predictions = knn.test(testset)
nmf_predictions = nmf.test(testset)
slop_predictions = slop.test(testset)

In [5]:
from surprise import Prediction
def average_predictions(predictions_list):
    aggregated_predictions = []

    num_predictions = len(predictions_list[0])

    for i in range(num_predictions):
        # Сбор предсказаний для каждого user-item пары
        pred_ratings = [pred[i].est for pred in predictions_list]
        avg_rating = np.mean(pred_ratings)

        # Создаем новый объект Prediction
        aggregated_predictions.append(Prediction(
            uid=predictions_list[0][i].uid,
            iid=predictions_list[0][i].iid,
            r_ui=predictions_list[0][i].r_ui,
            est=avg_rating,
            details={}  # Параметр details можно оставить пустым
        ))

    return aggregated_predictions

aggregated_predictions = average_predictions([svd_predictions, knn_predictions, nmf_predictions, slop_predictions])


In [6]:
print(aggregated_predictions[1])
print(svd_predictions[1])
print(knn_predictions[1])
print(nmf_predictions[1])
print(slop_predictions[1])

user: 472        item: 426        r_ui = 4.00   est = 3.66   {}
user: 472        item: 426        r_ui = 4.00   est = 3.92   {'was_impossible': False}
user: 472        item: 426        r_ui = 4.00   est = 2.99   {'actual_k': 22, 'was_impossible': False}
user: 472        item: 426        r_ui = 4.00   est = 3.89   {'was_impossible': False}
user: 472        item: 426        r_ui = 4.00   est = 3.84   {'was_impossible': False}


In [7]:
accuracy.rmse(aggregated_predictions)
accuracy.rmse(svd_predictions)
accuracy.rmse(knn_predictions)
accuracy.rmse(nmf_predictions)
accuracy.rmse(slop_predictions)

RMSE: 0.9409
RMSE: 0.9452
RMSE: 0.9890
RMSE: 0.9760
RMSE: 0.9534


0.9534177925198085

In [8]:
print(accuracy.fcp(aggregated_predictions))
print(accuracy.fcp(svd_predictions))
print(accuracy.fcp(knn_predictions))
print(accuracy.fcp(nmf_predictions))

FCP:  0.7037
0.7037294095286939
FCP:  0.6944
0.6944009390976165
FCP:  0.7048
0.7047575073975367
FCP:  0.6823
0.6823116663482497


In [6]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, KFold
from surprise import SVD, KNNBasic, NMF
from surprise import accuracy
from sklearn.linear_model import LinearRegression
import numpy as np

# Загрузка данных MovieLens
data = Dataset.load_builtin('ml-100k')

# Разделение данных на тренировочные и тестовые с помощью KFold
kf = KFold(n_splits=5)

# Инициализация моделей
svd = SVD()
knn = KNNBasic()
nmf = NMF()

# Создаем массивы для хранения признаков (предсказаний) и целевых значений
X_meta = []
y_meta = []

# Для хранения RMSE каждого алгоритма по fold'ам
rmse_svd = []
rmse_knn = []
rmse_nmf = []

# Кросс-валидация с использованием Surprise KFold
for trainset, testset in kf.split(data):
    # Обучение базовых моделей
    svd.fit(trainset)
    knn.fit(trainset)
    nmf.fit(trainset)

    # Получение предсказаний от каждой модели
    svd_preds = svd.test(testset)
    knn_preds = knn.test(testset)
    nmf_preds = nmf.test(testset)

    # Оценка RMSE каждой модели на текущем fold
    rmse_svd.append(accuracy.rmse(svd_preds, verbose=False))
    rmse_knn.append(accuracy.rmse(knn_preds, verbose=False))
    rmse_nmf.append(accuracy.rmse(nmf_preds, verbose=False))

    # Преобразуем предсказания в DataFrame
    def predictions_to_df(predictions):
        user_ids = [pred.uid for pred in predictions]
        item_ids = [pred.iid for pred in predictions]
        true_ratings = [pred.r_ui for pred in predictions]
        estimated_ratings = [pred.est for pred in predictions]
        return pd.DataFrame({'user_id': user_ids, 'item_id': item_ids, 'true_rating': true_ratings, 'est_rating': estimated_ratings})

    df_svd = predictions_to_df(svd_preds)
    df_knn = predictions_to_df(knn_preds)
    df_nmf = predictions_to_df(nmf_preds)

    # Собираем предсказания в единый DataFrame
    ensemble_df = pd.DataFrame({
        'user_id': df_svd['user_id'],
        'item_id': df_svd['item_id'],
        'true_rating': df_svd['true_rating'],
        'svd_pred': df_svd['est_rating'],
        'knn_pred': df_knn['est_rating'],
        'nmf_pred': df_nmf['est_rating']
    })

    # Добавляем предсказания в матрицу признаков X и целевые значения y
    X_meta.extend(ensemble_df[['svd_pred', 'knn_pred', 'nmf_pred']].values)
    y_meta.extend(ensemble_df['true_rating'].values)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [7]:
from pandas import DataFrame

X_meta = DataFrame(X_meta)
y_meta = DataFrame(y_meta)


In [None]:
from lightgbm import LGBMRegressor
# Преобразуем списки в массивы numpy
X_meta = np.array(X_meta)
y_meta = np.array(y_meta)

# Обучаем метамодель на предсказаниях базовых моделей
meta_model = LinearRegression()
meta_light = LGBMRegressor()

meta_model.fit(X_meta, y_meta)
meta_light.fit(X_meta, y_meta)

# Тестируем метамодель на тестовой выборке
y_pred_meta = meta_model.predict(X_meta)
y_pred_meta_light = meta_light.predict(X_meta)

# Оценка RMSE метамодели
final_rmse_meta = np.sqrt(((y_pred_meta - y_meta) ** 2).mean())
final_rmse_light = np.sqrt(((y_pred_meta_light - y_meta) ** 2).mean())

# Вывод результатов
print("Сравнение RMSE:")
print(f"Средний RMSE SVD: {np.mean(rmse_svd):.4f}")
print(f"Средний RMSE KNN: {np.mean(rmse_knn):.4f}")
print(f"Средний RMSE NMF: {np.mean(rmse_nmf):.4f}")
print(f"RMSE метамодели LogReg: {final_rmse_meta:.4f}")
print(f"RMSE метамодели LightGBM: {final_rmse_light:.4f}")


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000960 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 100000, number of used features: 3
[LightGBM] [Info] Start training from score 3.529860
