Для первой задачи мы используем данные [Jester Online Joke Recommender System](https://goldberg.berkeley.edu/jester-data/)

**Описание данных**

Файл `train_joke_df.csv` содержит:
- UID - id пользователей
- JID - id шуток, которые оценивали пользователи
- Rating - рейтинг шутки, который проставил пользователь


Рейтинг имеет значение от -10.00 до 10.00. Могут встречаться значения 99.00, но это обозначает Null (нет рейтинга от пользователя).

Метрика для оценки [RMSE](https://www.codecamp.ru/blog/how-to-interpret-rmse/)

Минимальный RMSE: `4.2238`



In [None]:
!pip install "scikit-surprise==1.1.3"
!pip install "xlrd==2.0.1"
!pip install optuna -q

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise==1.1.3
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095451 sha256=5718577cdda82fbaedf22b3c5e6d1e421ae2c7b227d06ee66e106547638ac1f9
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

### Import

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset, Reader, KNNWithMeans, accuracy, SVD, SVDpp, NMF, SlopeOne, KNNWithZScore
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split as tts
from surprise.model_selection import KFold
import optuna

from google.colab import drive
drive.mount('/content/drive')

np.random.seed(42)

Mounted at /content/drive


### Базовые функции для скоринга и получения рекомендаций

In [None]:
def get_num_user_ratings(uid):
    """Count the ratings a user has in the global ``trainset``.

    args:
      uid: raw user id
    returns:
      number of items rated by the user; 0 when the user is unknown
      to ``trainset``
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        num_ratings = len(trainset.ur[inner_uid])
    except ValueError:
        # The user was not seen during training (a new user who should get
        # the cold-start recommendations).
        num_ratings = 0
    return num_ratings
    
def get_num_item_ratings(iid):
    """Count the users who rated an item in the global ``trainset``.

    args:
      iid: raw id of the recommended item
    returns:
      number of users who rated the item; 0 when the item is unknown
      to ``trainset``
    """
    try:
        item_ratings = trainset.ir[trainset.to_inner_iid(iid)]
        return len(item_ratings)
    except ValueError:
        # The item was not present during training.
        return 0
    
# На основе Surprise FAQ построим рекомендации Топ-N
def get_top_n(predictions, n=5):
    """Return the N highest-estimated items for every user.

    Args:
        predictions(list of Prediction objects): Predictions produced by a
            Surprise algorithm.
        n(int): Number of recommendations to keep per user.

    Returns:
        A dict mapping each raw user id to a list of at most n tuples
        [(raw item id, rating estimation), ...], highest estimate first.
    """

    # Group the (item, estimate) pairs by user
    per_user = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Rank every user's pairs by estimate and keep the first n
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

### Загрузка и обработка данных

In [None]:
# Load the training ratings from Google Drive (mounted in the import cell).
df = pd.read_csv('/content/drive/MyDrive/Bootcamp_Магнит/train_joke_df.csv')

# Preview the first rows.
df.head(5)

Unnamed: 0,UID,JID,Rating
0,18029,6,-1.26
1,3298,64,-4.17
2,3366,58,0.92
3,12735,92,3.69
4,11365,38,-6.6


In [None]:
# Drop the 99.00 placeholders: per the dataset description they mean
# "no rating", and they lie outside the (-10, 10) scale used for training.
df = df[df['Rating'] != 99]

# Sort by user and joke, then rebuild the index.
df = df.sort_values(by=['UID', 'JID'])
df = df.reset_index(drop=True)

In [None]:
# Wrap the ratings table into the dataset object required by Surprise.

# The reader carries the lowest and highest possible rating.
reader = Reader(rating_scale=(-10, 10))

# Surprise expects the columns in exactly this order:
# user (raw) ids, item (raw) ids, ratings.
data = Dataset.load_from_df(
    df.loc[:, ['UID', 'JID', 'Rating']],
    reader,
)

In [None]:
# Hold out 20% of the ratings for validation; the seed keeps the split repeatable.
trainset, testset = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

# Trainset built from every available rating, used for the final fit.
trainset_data = data.build_full_trainset()

### Обучение модели

In [None]:
# Подберём гиперпараметры

def objective_SVD(trial):
    """Optuna objective: mean 3-fold cross-validated RMSE of an SVD model.

    Reads the notebook-level ``data`` (Surprise dataset).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean RMSE over the three folds (lower is better).
    """
    # Regularisation terms, sampled on a log scale.
    reg_pu = trial.suggest_float("reg_pu", 1e-6, 0.01, log=True)
    reg_qi = trial.suggest_float("reg_qi", 0.01, 1, log=True)
    reg_bu = trial.suggest_float("reg_bu", 1e-7, 0.01, log=True)
    reg_bi = trial.suggest_float("reg_bi", 1e-7, 0.01, log=True)

    # Learning rates, sampled on a log scale.
    lr_pu = trial.suggest_float("lr_pu", 0.0001, 0.01, log=True)
    lr_qi = trial.suggest_float("lr_qi", 0.0001, 0.01, log=True)
    lr_bu = trial.suggest_float("lr_bu", 0.0001, 0.01, log=True)
    lr_bi = trial.suggest_float("lr_bi", 0.0001, 0.01, log=True)

    n_factors = trial.suggest_int("n_factors", 100, 2000)
    n_epochs = trial.suggest_int("n_epochs", 10, 50)

    # Fixed seed: every trial is scored on the same three folds, so score
    # differences come from the hyperparameters and not from the split.
    kf = KFold(n_splits=3, random_state=42, shuffle=True)

    algo = SVD(reg_pu=reg_pu, reg_qi=reg_qi, reg_bu=reg_bu, reg_bi=reg_bi,
               lr_pu=lr_pu, lr_qi=lr_qi, lr_bu=lr_bu, lr_bi=lr_bi,
               n_factors=n_factors, n_epochs=n_epochs)

    # The loop variables are named fold_* so they are not mistaken for the
    # notebook-level trainset/testset.
    fold_rmse = []
    for fold_train, fold_test in kf.split(data):
        algo.fit(fold_train)
        fold_predictions = algo.test(fold_test)
        # verbose=False: Optuna already logs the mean score of each trial.
        fold_rmse.append(accuracy.rmse(fold_predictions, verbose=False))

    return float(np.mean(fold_rmse))


# Seed the sampler explicitly: it keeps its own random state, so without a
# seed the sequence of suggested hyperparameters changes on every run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_SVD, n_trials=40)

[32m[I 2023-04-26 09:18:06,993][0m A new study created in memory with name: no-name-a88de648-8763-4bab-9283-138289238824[0m


RMSE: 4.0683
RMSE: 4.0696
RMSE: 4.0702


[32m[I 2023-04-26 09:21:08,885][0m Trial 0 finished with value: 4.069347680962569 and parameters: {'reg_pu': 1.220855211403125e-06, 'reg_qi': 0.5478595913941109, 'reg_bu': 0.00780509079637674, 'reg_bi': 0.0012921586468483987, 'lr_pu': 0.009294438389589607, 'lr_qi': 0.00019683272170000586, 'lr_bu': 0.0005726084296876974, 'lr_bi': 0.00018818090671546834, 'n_factors': 640, 'n_epochs': 13}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1962
RMSE: 4.1943
RMSE: 4.1973


[32m[I 2023-04-26 09:36:44,688][0m Trial 1 finished with value: 4.195945833771931 and parameters: {'reg_pu': 0.0019942266906958377, 'reg_qi': 0.037036423070412106, 'reg_bu': 0.00019715290218143904, 'reg_bi': 5.490703496667441e-06, 'lr_pu': 0.00039099856905620763, 'lr_qi': 0.00016099608012160935, 'lr_bu': 0.0008818558544526172, 'lr_bi': 0.0036613617252923927, 'n_factors': 1839, 'n_epochs': 34}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1886
RMSE: 4.2028


[32m[I 2023-04-26 09:41:55,708][0m Trial 2 finished with value: 4.192363626829882 and parameters: {'reg_pu': 0.0007969842363138854, 'reg_qi': 0.016253200220686367, 'reg_bu': 8.121801535473412e-05, 'reg_bi': 0.008886165598068426, 'lr_pu': 0.005347088594248675, 'lr_qi': 0.00029168565734931915, 'lr_bu': 0.0007175303753531704, 'lr_bi': 0.006648389973054777, 'n_factors': 1297, 'n_epochs': 14}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1858
RMSE: 4.1214
RMSE: 4.1265
RMSE: 4.1197


[32m[I 2023-04-26 09:55:27,141][0m Trial 3 finished with value: 4.122528670571615 and parameters: {'reg_pu': 1.3616042593652704e-06, 'reg_qi': 0.014942486769864854, 'reg_bu': 1.4317384525713946e-07, 'reg_bi': 7.779775389798357e-06, 'lr_pu': 0.005403789992560395, 'lr_qi': 0.009579321229141523, 'lr_bu': 0.003659067690444993, 'lr_bi': 0.001913371054920376, 'n_factors': 1231, 'n_epochs': 43}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.3822
RMSE: 4.3835
RMSE: 4.3860


[32m[I 2023-04-26 10:00:08,627][0m Trial 4 finished with value: 4.38390104108916 and parameters: {'reg_pu': 0.0014653870776635616, 'reg_qi': 0.6922045165833495, 'reg_bu': 0.000143151742615965, 'reg_bi': 2.357427407019533e-07, 'lr_pu': 0.0016407177378766921, 'lr_qi': 0.00014468809264199878, 'lr_bu': 0.0020706090619958422, 'lr_bi': 0.004840749212596454, 'n_factors': 1347, 'n_epochs': 12}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.2786
RMSE: 4.2688
RMSE: 4.2711


[32m[I 2023-04-26 10:06:16,912][0m Trial 5 finished with value: 4.272835475998927 and parameters: {'reg_pu': 4.772605638738243e-05, 'reg_qi': 0.03895886091665082, 'reg_bu': 6.672822515634316e-07, 'reg_bi': 0.0007108299267626391, 'lr_pu': 0.00022676657072262727, 'lr_qi': 0.0007755557835238138, 'lr_bu': 0.002315620733778612, 'lr_bi': 0.003555804693465876, 'n_factors': 1719, 'n_epochs': 13}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1377
RMSE: 4.1372
RMSE: 4.1419


[32m[I 2023-04-26 10:20:46,037][0m Trial 6 finished with value: 4.138916609489358 and parameters: {'reg_pu': 1.504497854736823e-06, 'reg_qi': 0.0206356145130677, 'reg_bu': 0.00048164832496169074, 'reg_bi': 0.0023321945401795054, 'lr_pu': 0.002338693925194916, 'lr_qi': 0.0024975144767897294, 'lr_bu': 0.001553555699099695, 'lr_bi': 0.0040593800534453285, 'n_factors': 1497, 'n_epochs': 38}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1140
RMSE: 4.1141


[32m[I 2023-04-26 10:25:51,705][0m Trial 7 finished with value: 4.1175282350206395 and parameters: {'reg_pu': 9.74156899694228e-06, 'reg_qi': 0.08273200747392112, 'reg_bu': 0.002799068491688402, 'reg_bi': 0.001296552126160083, 'lr_pu': 0.0028651064301063603, 'lr_qi': 0.00022202296110747816, 'lr_bu': 0.0017955258643710954, 'lr_bi': 0.006954232868573051, 'n_factors': 383, 'n_epochs': 37}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1245
RMSE: 4.3280
RMSE: 4.3222
RMSE: 4.3317


[32m[I 2023-04-26 10:40:48,036][0m Trial 8 finished with value: 4.3272708179542025 and parameters: {'reg_pu': 0.004283072593711643, 'reg_qi': 0.7544230618458196, 'reg_bu': 0.00018769404887984025, 'reg_bi': 1.7023464049833718e-07, 'lr_pu': 0.0007240867510964138, 'lr_qi': 0.0007894454593676848, 'lr_bu': 0.0012832180463941164, 'lr_bi': 0.00012978851412077583, 'n_factors': 1571, 'n_epochs': 37}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1942
RMSE: 4.1937
RMSE: 4.1904


[32m[I 2023-04-26 10:47:13,633][0m Trial 9 finished with value: 4.19275445208685 and parameters: {'reg_pu': 1.3512497277892724e-05, 'reg_qi': 0.020682803612200013, 'reg_bu': 1.4023002760507664e-05, 'reg_bi': 0.0003762664206645498, 'lr_pu': 0.001861054061201455, 'lr_qi': 0.00034476709505164256, 'lr_bu': 0.0004497319846523098, 'lr_bi': 0.000527282261037241, 'n_factors': 1026, 'n_epochs': 22}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1079
RMSE: 4.1112
RMSE: 4.1040


[32m[I 2023-04-26 10:51:54,965][0m Trial 10 finished with value: 4.107732444302745 and parameters: {'reg_pu': 0.00023271930110466227, 'reg_qi': 0.31508744670316163, 'reg_bu': 0.007156188868231573, 'reg_bi': 9.429715865880254e-05, 'lr_pu': 0.008357825761103027, 'lr_qi': 0.00010193305929239357, 'lr_bu': 0.00015932563540421832, 'lr_bi': 0.00016216618841812932, 'n_factors': 563, 'n_epochs': 25}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1166
RMSE: 4.1119
RMSE: 4.1077


[32m[I 2023-04-26 10:56:28,106][0m Trial 11 finished with value: 4.112061290755882 and parameters: {'reg_pu': 0.00023347151703521343, 'reg_qi': 0.29832079745907897, 'reg_bu': 0.00915197452489221, 'reg_bi': 0.00010713633346185188, 'lr_pu': 0.009552037219055301, 'lr_qi': 0.00010025996894298352, 'lr_bu': 0.00012768897408282025, 'lr_bi': 0.00012648826758674702, 'n_factors': 581, 'n_epochs': 24}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1263
RMSE: 4.1199
RMSE: 4.1182


[32m[I 2023-04-26 11:02:00,505][0m Trial 12 finished with value: 4.121473961408932 and parameters: {'reg_pu': 0.00014833232933947547, 'reg_qi': 0.24928817906800307, 'reg_bu': 0.008901173044930571, 'reg_bi': 9.267078336052883e-05, 'lr_pu': 0.009741495222122214, 'lr_qi': 0.00010390582076046136, 'lr_bu': 0.0002639823128078807, 'lr_bi': 0.0003015924062223725, 'n_factors': 740, 'n_epochs': 24}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.6642
RMSE: 4.6720
RMSE: 4.6675


[32m[I 2023-04-26 11:05:21,018][0m Trial 13 finished with value: 4.667891083752379 and parameters: {'reg_pu': 0.0003647804987371024, 'reg_qi': 0.31963762407816115, 'reg_bu': 0.0008835286010255344, 'reg_bi': 0.00016823060912782444, 'lr_pu': 0.0001024025450759609, 'lr_qi': 0.0003953325188773842, 'lr_bu': 0.00013133369206649084, 'lr_bi': 0.000265562588975178, 'n_factors': 102, 'n_epochs': 50}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1586
RMSE: 4.1581


[32m[I 2023-04-26 11:10:41,745][0m Trial 14 finished with value: 4.160278596004262 and parameters: {'reg_pu': 6.158736311658782e-05, 'reg_qi': 0.975811210029159, 'reg_bu': 0.0018319135193855217, 'reg_bi': 3.941262979057673e-05, 'lr_pu': 0.00449209063976818, 'lr_qi': 0.00020582788841394506, 'lr_bu': 0.007393268626783481, 'lr_bi': 0.0009744590146383508, 'n_factors': 884, 'n_epochs': 20}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.1641
RMSE: 4.1388
RMSE: 4.1289
RMSE: 4.1300


[32m[I 2023-04-26 11:13:40,167][0m Trial 15 finished with value: 4.132584770003148 and parameters: {'reg_pu': 1.983331936141918e-05, 'reg_qi': 0.1458286204633049, 'reg_bu': 0.004379793101643873, 'reg_bi': 0.009720873778759803, 'lr_pu': 0.009189484079056233, 'lr_qi': 0.00010593453001661308, 'lr_bu': 0.000288633795788231, 'lr_bi': 0.00011400972458049842, 'n_factors': 413, 'n_epochs': 18}. Best is trial 0 with value: 4.069347680962569.[0m


RMSE: 4.0648
RMSE: 4.0597
RMSE: 4.0553


[32m[I 2023-04-26 11:19:22,231][0m Trial 16 finished with value: 4.059935038660574 and parameters: {'reg_pu': 0.006860932459696676, 'reg_qi': 0.47139271254371795, 'reg_bu': 0.0009185925246949851, 'reg_bi': 0.00043330790755112915, 'lr_pu': 0.0033674666058948802, 'lr_qi': 0.00047438563657017485, 'lr_bu': 0.0005581939382888749, 'lr_bi': 0.00022699462617129364, 'n_factors': 603, 'n_epochs': 30}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0747
RMSE: 4.0679
RMSE: 4.0660


[32m[I 2023-04-26 11:22:35,294][0m Trial 17 finished with value: 4.069526399002403 and parameters: {'reg_pu': 0.007982258272184519, 'reg_qi': 0.536949773958323, 'reg_bu': 0.0019247157217938827, 'reg_bi': 0.0024170064622656886, 'lr_pu': 0.0036738744493047615, 'lr_qi': 0.0005032362728669625, 'lr_bu': 0.0006304285839637224, 'lr_bi': 0.00026077337806543015, 'n_factors': 253, 'n_epochs': 29}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.1730
RMSE: 4.1678
RMSE: 4.1683


[32m[I 2023-04-26 11:29:40,826][0m Trial 18 finished with value: 4.169685945669799 and parameters: {'reg_pu': 0.009944799097605235, 'reg_qi': 0.470006773782788, 'reg_bu': 0.0010527846721787128, 'reg_bi': 0.00042104273836623507, 'lr_pu': 0.0013094871106525687, 'lr_qi': 0.0004919944576193361, 'lr_bu': 0.0004895961684844297, 'lr_bi': 0.000475575869804684, 'n_factors': 786, 'n_epochs': 31}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0890
RMSE: 4.0900
RMSE: 4.0972


[32m[I 2023-04-26 11:42:38,275][0m Trial 19 finished with value: 4.092055095610925 and parameters: {'reg_pu': 0.0006717778659573299, 'reg_qi': 0.9981234244336206, 'reg_bu': 0.0005435923099611964, 'reg_bi': 0.0028701691789307896, 'lr_pu': 0.0029337131325243245, 'lr_qi': 0.000250946474558989, 'lr_bu': 0.0009625422471838119, 'lr_bi': 0.00019281942651043252, 'n_factors': 1058, 'n_epochs': 46}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0814
RMSE: 4.0884
RMSE: 4.0795


[32m[I 2023-04-26 11:45:47,683][0m Trial 20 finished with value: 4.083095735361381 and parameters: {'reg_pu': 3.6011433966490665e-06, 'reg_qi': 0.1791270617665813, 'reg_bu': 3.525838261569544e-05, 'reg_bi': 0.0007949849054680316, 'lr_pu': 0.004897438235858445, 'lr_qi': 0.0013867752316819608, 'lr_bu': 0.0003407754586104081, 'lr_bi': 0.00037023438358311226, 'n_factors': 508, 'n_epochs': 17}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0703
RMSE: 4.0735
RMSE: 4.0755


[32m[I 2023-04-26 11:48:36,954][0m Trial 21 finished with value: 4.073086237984274 and parameters: {'reg_pu': 0.008479792022686339, 'reg_qi': 0.5055609826148751, 'reg_bu': 0.0021498605479624514, 'reg_bi': 0.005057918362328946, 'lr_pu': 0.0035239925660773756, 'lr_qi': 0.0005013992942403064, 'lr_bu': 0.0006652119327618975, 'lr_bi': 0.00022192180672981008, 'n_factors': 199, 'n_epochs': 28}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0648
RMSE: 4.0619
RMSE: 4.0724


[32m[I 2023-04-26 11:52:09,735][0m Trial 22 finished with value: 4.066366889563345 and parameters: {'reg_pu': 0.003391036974797534, 'reg_qi': 0.5179641629429279, 'reg_bu': 0.0021776414910895303, 'reg_bi': 0.001244757871822412, 'lr_pu': 0.006480589525971267, 'lr_qi': 0.0005362978701218122, 'lr_bu': 0.0005814337000109186, 'lr_bi': 0.00019382646995729378, 'n_factors': 302, 'n_epochs': 29}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0874
RMSE: 4.0825
RMSE: 4.0988


[32m[I 2023-04-26 11:54:45,456][0m Trial 23 finished with value: 4.089554510778698 and parameters: {'reg_pu': 0.0031385799199041423, 'reg_qi': 0.4210425698518158, 'reg_bu': 0.004058709576625114, 'reg_bi': 0.0013099453835175754, 'lr_pu': 0.006333127611732297, 'lr_qi': 0.0003189023983411477, 'lr_bu': 0.0004472303569877998, 'lr_bi': 0.0001783754015959838, 'n_factors': 687, 'n_epochs': 10}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0564
RMSE: 4.0630
RMSE: 4.0610


[32m[I 2023-04-26 11:58:44,032][0m Trial 24 finished with value: 4.060142164640057 and parameters: {'reg_pu': 0.0013180673282654862, 'reg_qi': 0.6415238195441398, 'reg_bu': 0.0004897255366887051, 'reg_bi': 0.0003044085840748599, 'lr_pu': 0.006069222664881346, 'lr_qi': 0.00023047021143624238, 'lr_bu': 0.0010835362831689897, 'lr_bi': 0.0004149185967572896, 'n_factors': 309, 'n_epochs': 33}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0686
RMSE: 4.0709
RMSE: 4.0606


[32m[I 2023-04-26 12:02:43,481][0m Trial 25 finished with value: 4.066682869640386 and parameters: {'reg_pu': 0.0035855941264400604, 'reg_qi': 0.6601151320884462, 'reg_bu': 0.0004874893142170764, 'reg_bi': 0.0002821668718548798, 'lr_pu': 0.00633858794769581, 'lr_qi': 0.0006712948139931583, 'lr_bu': 0.001108286186700535, 'lr_bi': 0.00010135099960302646, 'n_factors': 321, 'n_epochs': 32}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0991
RMSE: 4.0998
RMSE: 4.0959


[32m[I 2023-04-26 12:05:05,401][0m Trial 26 finished with value: 4.098277514920463 and parameters: {'reg_pu': 0.0013481302028524962, 'reg_qi': 0.3608217238128755, 'reg_bu': 0.0008083965311095959, 'reg_bi': 0.0003790082344666294, 'lr_pu': 0.00227547899833502, 'lr_qi': 0.0011793205865280871, 'lr_bu': 0.0008685717623759074, 'lr_bi': 0.0006398944595847253, 'n_factors': 138, 'n_epochs': 27}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0842
RMSE: 4.0722
RMSE: 4.0835


[32m[I 2023-04-26 12:10:37,160][0m Trial 27 finished with value: 4.079955652199207 and parameters: {'reg_pu': 0.004676132130383527, 'reg_qi': 0.22202279214870585, 'reg_bu': 0.00031991260473212976, 'reg_bi': 0.0006083418710579757, 'lr_pu': 0.0038579231809831257, 'lr_qi': 0.00039111889961355035, 'lr_bu': 0.0012302187814743127, 'lr_bi': 0.0003532173719886821, 'n_factors': 460, 'n_epochs': 34}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0727
RMSE: 4.0625
RMSE: 4.0677


[32m[I 2023-04-26 12:20:38,342][0m Trial 28 finished with value: 4.067624890444767 and parameters: {'reg_pu': 0.0024226553933977516, 'reg_qi': 0.4075003664038142, 'reg_bu': 0.0019098050325954885, 'reg_bi': 0.0002204048523656522, 'lr_pu': 0.0034575649331490357, 'lr_qi': 0.0002645967038701215, 'lr_bu': 0.00021766941036812466, 'lr_bi': 0.0007507508157748616, 'n_factors': 872, 'n_epochs': 41}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0636
RMSE: 4.0684
RMSE: 4.0687


[32m[I 2023-04-26 12:24:44,045][0m Trial 29 finished with value: 4.0669022815206235 and parameters: {'reg_pu': 0.0007921438376957107, 'reg_qi': 0.6647960752122282, 'reg_bu': 0.00408408515074921, 'reg_bi': 0.0014463350240161374, 'lr_pu': 0.007238020839424441, 'lr_qi': 0.00017810556916478753, 'lr_bu': 0.00035527547774395497, 'lr_bi': 0.00040852399082183614, 'n_factors': 313, 'n_epochs': 34}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0623
RMSE: 4.0570
RMSE: 4.0714


[32m[I 2023-04-26 12:30:33,898][0m Trial 30 finished with value: 4.063600980441148 and parameters: {'reg_pu': 0.0014017976729172265, 'reg_qi': 0.5744371915485587, 'reg_bu': 0.0010011788003054232, 'reg_bi': 0.00448689109038143, 'lr_pu': 0.006666617833037113, 'lr_qi': 0.00022377022647243223, 'lr_bu': 0.000577359192246206, 'lr_bi': 0.00024668222963762984, 'n_factors': 626, 'n_epochs': 30}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0663
RMSE: 4.0677
RMSE: 4.0705


[32m[I 2023-04-26 12:36:26,552][0m Trial 31 finished with value: 4.0681626543751515 and parameters: {'reg_pu': 0.0017554661375340276, 'reg_qi': 0.5370882664026599, 'reg_bu': 0.0003857907566165102, 'reg_bi': 0.0038811380546217634, 'lr_pu': 0.0067030266309959, 'lr_qi': 0.00021783850777769376, 'lr_bu': 0.0006114681757365961, 'lr_bi': 0.00023219059834706917, 'n_factors': 599, 'n_epochs': 31}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0734
RMSE: 4.0833
RMSE: 4.0718


[32m[I 2023-04-26 12:40:47,753][0m Trial 32 finished with value: 4.0761553828831465 and parameters: {'reg_pu': 0.005223053182924145, 'reg_qi': 0.4014919215307297, 'reg_bu': 0.001294970487460451, 'reg_bi': 0.0010218061585334827, 'lr_pu': 0.004768910711071791, 'lr_qi': 0.0001520747455797809, 'lr_bu': 0.00077208495533309, 'lr_bi': 0.00030889783544760046, 'n_factors': 462, 'n_epochs': 27}. Best is trial 16 with value: 4.059935038660574.[0m


RMSE: 4.0592
RMSE: 4.0608
RMSE: 4.0579


[32m[I 2023-04-26 12:44:22,745][0m Trial 33 finished with value: 4.059263639479627 and parameters: {'reg_pu': 0.0021912351750206205, 'reg_qi': 0.7915188201515515, 'reg_bu': 0.0009673018808533263, 'reg_bi': 0.006260490667591867, 'lr_pu': 0.006871714795534729, 'lr_qi': 0.0003386926164997961, 'lr_bu': 0.0005122341846165309, 'lr_bi': 0.00020594120210956946, 'n_factors': 259, 'n_epochs': 33}. Best is trial 33 with value: 4.059263639479627.[0m


RMSE: 4.0562
RMSE: 4.0596


[32m[I 2023-04-26 12:51:36,457][0m Trial 34 finished with value: 4.056740405924985 and parameters: {'reg_pu': 0.0011580526718134555, 'reg_qi': 0.817145278582794, 'reg_bu': 0.0002618106813715463, 'reg_bi': 0.00780263023185707, 'lr_pu': 0.005232350700168847, 'lr_qi': 0.0002801604175511963, 'lr_bu': 0.0009154923180480234, 'lr_bi': 0.0001553326131031038, 'n_factors': 688, 'n_epochs': 35}. Best is trial 34 with value: 4.056740405924985.[0m


RMSE: 4.0544
RMSE: 4.0582
RMSE: 4.0619
RMSE: 4.0629


[32m[I 2023-04-26 13:01:12,592][0m Trial 35 finished with value: 4.061011928255468 and parameters: {'reg_pu': 0.0005662216629940967, 'reg_qi': 0.8269376061992388, 'reg_bu': 0.00011631887768228272, 'reg_bi': 0.008014414191224095, 'lr_pu': 0.004006230423214899, 'lr_qi': 0.0003253770749366739, 'lr_bu': 0.0008707754584092834, 'lr_bi': 0.00014470026269708345, 'n_factors': 860, 'n_epochs': 40}. Best is trial 34 with value: 4.056740405924985.[0m


RMSE: 4.0542
RMSE: 4.0610
RMSE: 4.0587


[32m[I 2023-04-26 13:10:46,674][0m Trial 36 finished with value: 4.057922735960098 and parameters: {'reg_pu': 0.0011621159309778387, 'reg_qi': 0.7831590735500998, 'reg_bu': 0.00030941352065079095, 'reg_bi': 0.009732137030109026, 'lr_pu': 0.00517053155399731, 'lr_qi': 0.00030390337929455795, 'lr_bu': 0.0010044044100407663, 'lr_bi': 0.00015082711256712364, 'n_factors': 1030, 'n_epochs': 34}. Best is trial 34 with value: 4.056740405924985.[0m


RMSE: 4.0583
RMSE: 4.0580
RMSE: 4.0556


[32m[I 2023-04-26 13:21:09,277][0m Trial 37 finished with value: 4.057295637723957 and parameters: {'reg_pu': 0.0011448591457074886, 'reg_qi': 0.8440497048460534, 'reg_bu': 6.467071058997201e-05, 'reg_bi': 0.009931275642834672, 'lr_pu': 0.004968717128433608, 'lr_qi': 0.00029736607033096607, 'lr_bu': 0.0007493116095528266, 'lr_bi': 0.00017629806864810502, 'n_factors': 1072, 'n_epochs': 36}. Best is trial 34 with value: 4.056740405924985.[0m


RMSE: 4.0622
RMSE: 4.0572
RMSE: 4.0570


[32m[I 2023-04-26 13:32:43,295][0m Trial 38 finished with value: 4.058804900798624 and parameters: {'reg_pu': 0.0022192574554651193, 'reg_qi': 0.8788468775231728, 'reg_bu': 6.608676746683275e-05, 'reg_bi': 0.007097475449305056, 'lr_pu': 0.005231319116966904, 'lr_qi': 0.0001408221010353754, 'lr_bu': 0.0014584506966893265, 'lr_bi': 0.00016119034201014532, 'n_factors': 1177, 'n_epochs': 37}. Best is trial 34 with value: 4.056740405924985.[0m


RMSE: 4.0568
RMSE: 4.0497
RMSE: 4.0567


[32m[I 2023-04-26 13:46:49,912][0m Trial 39 finished with value: 4.054419398620368 and parameters: {'reg_pu': 0.0010398462453200708, 'reg_qi': 0.975481537099458, 'reg_bu': 6.405279534313364e-05, 'reg_bi': 0.008446937909973943, 'lr_pu': 0.005037130407846973, 'lr_qi': 0.0001532653071418146, 'lr_bu': 0.0016783726294656905, 'lr_bi': 0.00014842221255242954, 'n_factors': 1235, 'n_epochs': 44}. Best is trial 39 with value: 4.054419398620368.[0m


In [None]:
# Kaggle Score = 3.97333

# Hyperparameters of the best run (Trial 39 in the cell above).
params = {
    'reg_pu': 0.0010398462453200708,
    'reg_qi': 0.975481537099458,
    'reg_bu': 6.405279534313364e-05,
    'reg_bi': 0.008446937909973943,
    'lr_pu': 0.005037130407846973,
    'lr_qi': 0.0001532653071418146,
    'lr_bu': 0.0016783726294656905,
    'lr_bi': 0.00014842221255242954,
    'n_factors': 1235,
    'n_epochs': 44,
}

# Every key is an SVD keyword argument, so the dict is unpacked directly.
algo = SVD(**params)
algo.fit(trainset)

# Predict the hold-out part of the split and report the metric.
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 4.0176


4.017592166765312

In [None]:
# Refit the model (same best hyperparameters) on the full training data.
algo.fit(trainset_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff256539ac0>

### Тестирование и результаты

In [None]:
# Single-prediction check
uid = 1  # user id
iid = 1  # joke id

# Ask the fitted model for its estimate.
# -7.82 is the actual rating; it is passed as r_ui so the verbose output shows it next to the estimate.
# NOTE(review): algo was last fit on the full training data, so this rating may have been seen in training.
pred = algo.predict(uid, iid, r_ui=-7.82, verbose=True)

user: 1          item: 1          r_ui = -7.82   est = -7.49   {'was_impossible': False}


In [None]:
uid = 24983  # user id
iid = 62     # joke id

# Second spot check; r_ui is the rating to compare the estimate against.
pred = algo.predict(uid, iid, r_ui=-0.29, verbose=True)

user: 24983      item: 62         r_ui = -0.29   est = -0.05   {'was_impossible': False}


### Обзор рекомендаций

In [None]:
# Build a table for reviewing the predictions: for every prediction, how many
# ratings stand behind its user and its item, and how large the error is.
# get_num_user_ratings / get_num_item_ratings read the global `trainset`,
# so point it at the trainset the model was last fitted on.
trainset = algo.trainset

predictions_df = pd.DataFrame(
    predictions,
    columns=['uid', 'iid', 'rui', 'est', 'details'],
)

predictions_df['№ кол-во пользовательских рейтингов'] = predictions_df['uid'].apply(get_num_user_ratings)
predictions_df['№ кол-во рейтингов элементов'] = predictions_df['iid'].apply(get_num_item_ratings)
predictions_df['error'] = (predictions_df['est'] - predictions_df['rui']).abs()

# Ten smallest and ten largest absolute errors.
best_predictions = predictions_df.sort_values(by='error').head(10)
worst_predictions = predictions_df.sort_values(by='error').tail(10)

In [None]:
# Show the five predictions with the smallest absolute error.
best_predictions.head(5)

Unnamed: 0,uid,iid,rui,est,details,№ кол-во пользовательских рейтингов,№ кол-во рейтингов элементов,error
19327,2889,78,6.12,6.120016,{'was_impossible': False},73,7194,1.6e-05
19089,15536,26,0.0,-1.7e-05,{'was_impossible': False},56,18969,1.7e-05
254695,18062,81,-3.74,-3.740034,{'was_impossible': False},70,7437,3.4e-05
62939,21634,62,2.33,2.330067,{'was_impossible': False},59,19985,6.7e-05
156685,7692,47,4.76,4.759929,{'was_impossible': False},54,17852,7.1e-05


In [None]:
# Predict every (user, item) pair that has no rating in the trainset
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions)

# Print the recommended item ids for the first 10 users
for shown, (uid, user_ratings) in enumerate(top_n.items(), start=1):
    print(uid, [iid for (iid, _) in user_ratings])

    if shown == 10:
        break

19208 [54, 55, 29, 78, 89]
8671 [8, 83, 77, 32, 72]
6037 [78, 81, 89, 77, 87]
3233 [50, 36, 27, 31, 42]
3449 [100, 80, 6, 73, 53]
10032 [35, 94, 32, 53, 49]
5774 [89, 47, 32, 68, 29]
23392 [36, 48, 29, 53, 21]
3039 [62, 53, 94, 91, 96]
17395 [27, 88, 50, 35, 85]


### Для отправки на тестирование

In [None]:
# Load the test pairs (no ratings); the first CSV column becomes the index.
test = pd.read_csv('/content/drive/MyDrive/Bootcamp_Магнит/test_joke_df_nofactrating.csv', index_col=0)
test.head(5)

Unnamed: 0_level_0,UID,JID
InteractionID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11228,39
1,21724,85
2,16782,56
3,12105,42
4,14427,2


In [None]:
# Estimate a rating for every (user, joke) pair of the test set.
# The columns are read by name: integer keys on a label-indexed row Series
# (x[0], x[1]) rely on a positional fallback that pandas has deprecated.
test['Rating'] = [
    algo.predict(uid, jid, verbose=False).est
    for uid, jid in zip(test['UID'], test['JID'])
]

In [None]:
# Shape of the data set that has to be submitted for testing
test['Rating'].to_frame().head(5)

Unnamed: 0_level_0,Rating
InteractionID,Unnamed: 1_level_1
0,3.072118
1,-7.812274
2,-0.214448
3,7.111174
4,6.230477


In [None]:
# Write the submission file for Kaggle: the index plus the single Rating column
test[['Rating']].to_csv('/content/drive/MyDrive/Bootcamp_Магнит/Chernov_Klim_nn.csv')