<a href="https://colab.research.google.com/github/dlbkv/AI-Step/blob/master/module7/hw23_recomendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Завдання

* Створіть Reader
* Створіть датасет та розділіть його на тренувальні та тестові дані
* Виберіть метрики для поріняння якості моделей
* На основі метрик виберіть найкращу модель



In [8]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/HalyshAnton/IT-Step-Pyton-AI/main/module7/data/ratings.csv")

df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,172,94969,5.0,1396067836
1,172,98956,4.0,1396067879
2,176,73881,4.0,1499807147
3,221,1900,4.5,1288550866
4,333,33688,4.0,1412015122


In [9]:
df.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,17604.0,17604.0,17604.0,17604.0
mean,141251.609009,88913.605942,3.465945,1403256000.0
std,81731.882435,51285.238375,1.203144,151508300.0
min,172.0,139.0,0.5,828222400.0
25%,71256.0,55098.0,3.0,1395695000.0
50%,138487.0,89554.0,3.5,1456552000.0
75%,210264.5,134779.5,4.5,1494418000.0
max,283195.0,193843.0,5.0,1537934000.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17604 entries, 0 to 17603
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    17604 non-null  int64  
 1   movie_id   17604 non-null  int64  
 2   rating     17604 non-null  float64
 3   timestamp  17604 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 550.2 KB


In [11]:
try:
  from surprise import Dataset, Reader
except ModuleNotFoundError:
  !pip install -q surprise
  from surprise import Dataset, Reader

reader = Reader(rating_scale=(0, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[["user_id", "movie_id", "rating"]], reader)

In [12]:
from surprise.model_selection import train_test_split
from surprise import BaselineOnly, SVD, KNNBasic, CoClustering, accuracy

trainset, testset = train_test_split(data, train_size=0.8)

In [13]:
try:
  import optuna
except ModuleNotFoundError:
  !pip install -q optuna
  import optuna
import time

def objective(trial):

    start_time = time.time()
    print(f"Iteration: {trial.number}")


    algo_type = trial.suggest_categorical('algo_type', ['BaselineOnly','SVD','KNNBasic', 'CoClustering'])

    if algo_type == 'BaselineOnly':
        method_type = trial.suggest_categorical('method_type', ['sgd', 'als'])
        if method_type == 'sgd':
          params = {
              'method': 'sgd',
              'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1.0, log=True),
              'n_epochs': 20,
          }
        else:
          params = {
              'method': 'als',
              'reg_u': trial.suggest_float('reg_u', 1e-6, 1.0, log=True),
              'reg_i': trial.suggest_float('reg_i', 1e-6, 1.0, log=True),
              'n_epochs': 20,
          }

        algo = BaselineOnly(bsl_options=params)

    elif algo_type == 'SVD':
        params = {
            'n_factors': trial.suggest_int('n_factors', 10, 150),
            'reg_all': trial.suggest_float('reg_all', 1e-6, 1.0, log=True),
            'lr_all': trial.suggest_float('lr_all', 1e-6, 1.0, log=True),

        }

        algo = SVD(**params)

    elif algo_type == 'KNNBasic':

        params = {
            'k': 10,
            'min_k': 1,
        }

        sim_params = {
            'min_support': 1,
            'name': trial.suggest_categorical('name', ['cosine', 'pearson', 'msd']),
            'user_based': trial.suggest_categorical('user_based', [True, False]),
        }

        algo = KNNBasic(**params, sim_options=sim_params)

    elif algo_type == 'CoClustering':
      params = {
            'n_epochs': 20,
            'n_cltr_u': trial.suggest_int('n_cltr_u', 1, 20),
            'n_cltr_i': trial.suggest_int('n_cltr_i', 1, 20),

        }

      algo = CoClustering(**params)

    algo.fit(trainset)

    preds = algo.test(testset)
    metric = accuracy.mse(preds, verbose=False)

    end_time = time.time()
    iteration_time = end_time - start_time

    print(f"mae = {accuracy.mae(preds, verbose=False)}")
    print(f"mse = {accuracy.mse(preds, verbose=False)}")
    print(f"rmse= {accuracy.rmse(preds, verbose=False)}")
    print(f"fcp = {accuracy.fcp(preds, verbose=False)}")
    print()
    print(f"Iteration time: {iteration_time} seconds")

    return metric

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/380.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/380.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500)

Iteration: 0
Estimating biases using sgd...
mae = 0.848430971801441
mse = 1.2308413794424293
rmse= 1.1094329089415138
fcp = 0.610849933628867

Iteration time: 0.1317906379699707 seconds
Iteration: 1
Estimating biases using sgd...
mae = 0.9205296980397063
mse = 1.400130084250141
rmse= 1.1832709259717915
fcp = 0.6105230170408376

Iteration time: 0.09775972366333008 seconds
Iteration: 2
Estimating biases using als...
mae = 0.6622843590285994
mse = 0.8354184229060416
rmse= 0.9140122662776696
fcp = 0.6620363843118946

Iteration time: 0.2301619052886963 seconds
Iteration: 3
Estimating biases using sgd...
mae = 0.6908601369145082
mse = 0.8513104173159761
rmse= 0.9226648456053672
fcp = 0.6541071627379736

Iteration time: 0.10670137405395508 seconds
Iteration: 4
Computing the cosine similarity matrix...
Done computing similarity matrix.
mae = 0.7726448267576927
mse = 1.0614245004213367
rmse= 1.0302545803932817
fcp = 0.49477520579373563

Iteration time: 0.3265988826751709 seconds
Iteration: 5
Co

In [15]:
study.best_value

0.8121952613387892

In [16]:
best_params = study.best_params
best_params

{'algo_type': 'BaselineOnly',
 'method_type': 'als',
 'reg_u': 0.5654833953301198,
 'reg_i': 0.9980444259171931}