## Общая схема кросс-валидации

1. Из датафрейма берётся та часть пользователей, которая кликнула от 1 до 5 мероприятий. Такие пользователи составляют 95 % всей выборки
2. Из этой части случайным образом выбирается один пользователь с фиксированным числом кликнутых мероприятий от 3 до 5
3. Получаем список кликнутых пользователем мероприятий
4. Цикл по кликнутым мероприятиям:
4.1 Из общей выборки исключается строка с записью о клике выбранного пользователя на мероприятие
4.2 Обучение модели по данным с исключенным мероприятием
4.3 Получение списка из N мероприятий для данного пользователя
4.4 Проверяем, есть ли исключенное мероприятие в списке рекомендованных

In [4]:
import sys
from pathlib import Path

# Make the project packages importable without hard-coding an absolute path
# that only exists on one machine. The notebook is expected to run from the
# project's `evaluation/` directory (the data-loading cell reads
# '../data/...' relative to it), so the project root is the parent of the
# current working directory.
project_root = Path.cwd().parent
print(sys.path)
for extra_path in (project_root, project_root / 'pipeline'):
    # Skip entries that are already present so re-running the cell does not
    # keep growing sys.path.
    if str(extra_path) not in sys.path:
        sys.path.append(str(extra_path))

['/snap/pycharm-professional/271/plugins/python/helpers-pro/jupyter_debug', '/snap/pycharm-professional/271/plugins/python/helpers/pydev', '/media/maxim/5d25e004-74c4-4174-8cd3-d5dce3ff6981/maxim/python_projects/pushkin_card_recommender_system/evaluation', '/media/maxim/5d25e004-74c4-4174-8cd3-d5dce3ff6981/maxim/anaconda/envs/pushkin_card_recommender_system/lib/python38.zip', '/media/maxim/5d25e004-74c4-4174-8cd3-d5dce3ff6981/maxim/anaconda/envs/pushkin_card_recommender_system/lib/python3.8', '/media/maxim/5d25e004-74c4-4174-8cd3-d5dce3ff6981/maxim/anaconda/envs/pushkin_card_recommender_system/lib/python3.8/lib-dynload', '', '/home/maxim/.local/lib/python3.8/site-packages', '/media/maxim/5d25e004-74c4-4174-8cd3-d5dce3ff6981/maxim/anaconda/envs/pushkin_card_recommender_system/lib/python3.8/site-packages', '/media/maxim/5d25e004-74c4-4174-8cd3-d5dce3ff6981/maxim/python_projects/pushkin_card_recommender_system', '/media/maxim/5d25e004-74c4-4174-8cd3-d5dce3ff6981/maxim/python_projects/push

In [5]:
import pandas as pd
import numpy as np
from typing import Tuple
from tqdm import tqdm_notebook
from pipeline import data_management as dm
from sklearn.model_selection import ParameterGrid
from recommenders.implicit_models import UserItemRecommender, ALSRecommender, BPRRecommender

In [6]:
def split_df_into_diapasons(df: pd.DataFrame, step: int = 5, max_clicks: int = 70) -> pd.DataFrame:
    """Count rows per user and label each user with an activity bucket.

    The returned ``clicks_count`` column is the number of non-null
    ``clicks_count`` rows the user has in ``df`` (a row count, not a sum of
    the original values).

    Args:
        df: Frame with at least ``user_id`` and ``clicks_count`` columns.
        step: Width of one bucket. The default of 5 reproduces the original
            ``(0:5]``, ``(5:10]``, ... labels.
        max_clicks: Upper edge of the last bucket. If it is not a multiple of
            ``step``, the last edge is the largest multiple of ``step`` not
            exceeding it. Users with more rows than the last edge get NaN in
            ``diapason``.

    Returns:
        Frame with columns ``user_id``, ``clicks_count`` and ``diapason``,
        sorted by ``clicks_count`` in descending order.

    Raises:
        ValueError: If ``step`` is not positive or ``max_clicks`` is smaller
            than ``step`` (no bucket could be built).
    """
    if step <= 0:
        raise ValueError(f'step must be positive, got {step}')
    if max_clicks < step:
        raise ValueError(f'max_clicks ({max_clicks}) must be at least step ({step})')

    edges = list(range(0, max_clicks + 1, step))
    labels = [f'({low}:{high}]' for low, high in zip(edges, edges[1:])]

    activity = (
        df.groupby('user_id')['clicks_count']
        .count()
        .sort_values(ascending=False)
        .reset_index()
    )
    # pd.cut uses right-closed intervals by default, which matches the
    # '(low:high]' label format.
    activity['diapason'] = pd.cut(activity['clicks_count'], bins=edges, labels=labels)
    return activity

In [7]:
def evaluate(
        user_id: str,
        user_event_df: pd.DataFrame,
        recommender: "UserItemRecommender",
        number_of_recommended=10
) -> float:
    """Leave-one-out hit rate of ``recommender`` for a single user.

    For every row of ``user_event_df`` that belongs to ``user_id``, the
    matching (user, item) interaction is removed, the recommender is refit on
    the remaining rows, and the held-out item counts as a hit if it appears
    among the user's top ``number_of_recommended`` recommendations.

    Args:
        user_id: Value of the ``user_id`` column identifying the user.
        user_event_df: Interaction frame with ``user_id`` and ``item_id``
            columns.
        recommender: Object exposing ``fit(user_item_df=..., show_progress=...)``
            and ``get_user_recommendation(user_id, n, as_pd_dataframe=True)``
            whose result has an ``item_id`` column. It is refit once per
            held-out interaction.
        number_of_recommended: Size of the recommendation list to check.

    Returns:
        Share of held-out interactions that were recommended back, rounded to
        two decimals.

    Raises:
        ValueError: If ``user_id`` has no rows in ``user_event_df``.
    """
    # Computed once: the user mask does not change between iterations.
    user_mask = (user_event_df.user_id == user_id).to_numpy()
    clicked_by_user = user_event_df.loc[user_mask]
    if clicked_by_user.empty:
        raise ValueError(f'user {user_id!r} has no interactions to evaluate on')

    hits = 0
    for held_out_item in clicked_by_user.item_id:
        held_out_mask = user_mask & (user_event_df.item_id == held_out_item).to_numpy()
        # Filter positionally rather than dropping by index label, so a
        # non-unique index cannot remove unrelated rows.
        train_df = user_event_df.loc[~held_out_mask]
        recommender.fit(user_item_df=train_df, show_progress=False)
        recommended_to_user = recommender.get_user_recommendation(
            user_id, number_of_recommended, as_pd_dataframe=True
        )
        if held_out_item in list(recommended_to_user.item_id.values):
            hits += 1

    return round(hits / len(clicked_by_user), 2)

In [8]:
def cross_validation(
        user_activity_df: pd.DataFrame,
        user_event_df: pd.DataFrame,
        recommender: UserItemRecommender,
        parameters: dict,
        num_of_clicked: int = 5,
        num_of_recommended: int = 10,
        random_state=None
) -> Tuple[UserItemRecommender, dict]:
    """Grid-search recommender hyper-parameters on one randomly chosen user.

    One user from the ``'(0:5]'`` diapason with exactly ``num_of_clicked``
    rows is drawn at random, and every parameter combination is scored with
    ``evaluate`` (leave-one-out hit rate) on that single user.

    Args:
        user_activity_df: Frame with ``user_id``, ``clicks_count`` and
            ``diapason`` columns.
        user_event_df: Interaction frame passed through to ``evaluate``.
        recommender: Recommender *class* (not an instance); it is
            instantiated as ``recommender(**params)`` for every combination.
        parameters: Grid specification accepted by
            ``sklearn.model_selection.ParameterGrid``.
        num_of_clicked: Exact activity count the sampled user must have.
        num_of_recommended: Size of the recommendation list to check.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the choice of the user reproducible.

    Returns:
        ``(best_recommender, best_params)`` for the first combination that
        reached the highest score. If no combination scores above 0 the
        result is ``(None, {})``.

    Raises:
        ValueError: If no user matches the diapason / click-count filter.
    """
    # Imported locally: `tqdm.tqdm_notebook` is deprecated and emits a
    # warning asking to use `tqdm.notebook.tqdm` instead.
    from tqdm.notebook import tqdm

    users_in_interval = user_activity_df.loc[
        (user_activity_df.diapason == '(0:5]') &
        (user_activity_df.clicks_count == num_of_clicked)
        ]
    if users_in_interval.empty:
        raise ValueError(
            f"no users in diapason '(0:5]' with exactly {num_of_clicked} clicks"
        )

    user_from_interval = users_in_interval.sample(random_state=random_state).iloc[0]
    best_params = dict()
    best_recommender = None
    max_probability = 0
    for params in tqdm(ParameterGrid(parameters)):
        rec = recommender(**params)
        probability = evaluate(
            user_id=user_from_interval.user_id,
            user_event_df=user_event_df,
            recommender=rec,
            number_of_recommended=num_of_recommended
        )
        # Strict comparison: on ties the earliest combination is kept.
        if probability > max_probability:
            print(probability)
            max_probability = probability
            best_params = params
            best_recommender = rec

    print(max_probability, best_params)
    return best_recommender, best_params


In [9]:
# Load the Bashkortostan user-event click log. The CSV carries an
# 'Unnamed: 0' column (presumably an index written by an earlier export -
# TODO confirm) that is dropped right away.
bashkortostan_user_event = pd.read_csv(
    '../data/user_event_dfs/bashkortostan_user_event_df.csv'
).drop(columns='Unnamed: 0')
# Per-user activity counts with their diapason labels.
bashkortostan_activity = split_df_into_diapasons(bashkortostan_user_event)
bashkortostan_activity

Unnamed: 0,user_id,clicks_count,diapason
0,e4f8e204a900fa2b4ad6f3fbb48b5585,36,(35:40]
1,bdfae6b594bc1f28467b4f2db002746e,35,(30:35]
2,df35377e953a13a7ad537ba04737b826,30,(25:30]
3,4e3f977e66736abb763b3fcab57b122b,28,(25:30]
4,2b0c5f923468dc1b1d199ea46f503022,28,(25:30]
...,...,...,...
5870,33ca62f0ef71ecf58e3ae098c2957388,1,(0:5]
5871,33bf851266570c7d0572d531da46bae9,1,(0:5]
5872,8c90eb1e907a4799995f8cd255e2d488,1,(0:5]
5873,8cb19200880fb3ab173feb9df18b4c5e,1,(0:5]


In [10]:
# Pass the frame through dm.numerate_user_event_df, then switch to the
# generic item/rating column names that `evaluate` reads (`item_id`).
bashkortostan_user_event = dm.numerate_user_event_df(bashkortostan_user_event).rename(
    columns={'event_id': 'item_id', 'event_num': 'item_num', 'clicks_count': 'rating'},
)

In [11]:
# Display the frame to check the renamed columns.
bashkortostan_user_event

Unnamed: 0,user_id,item_id,event_name,rating,user_num,item_num
0,000b1ca3e2eefa7c8b24ed417548b301,855200,Спектакль «Щелкунчик»,2,0,47
1,000d99f08793606692008c9d805bd357,1439439,Выставка «От камня до искусственного интеллекта»,1,1,467
2,00127dd4421eddeb5f206ae28f3b540c,1245777,Балет «Анюта»,2,2,101
3,002445526f1c2e1f660cad308434c9e6,1260968,Экскурсия «Башкортостан в годы Великой Отечест...,3,3,132
4,002493182f55b3337eda7e5186aba306,1294104,Спектакль «Луна и листопад»,1,4,187
...,...,...,...,...,...,...
11955,ffef12590dcade483c5843400ffbfdf1,1266191,Выставки проекта «Современное изобразительное ...,1,5871,136
11956,fff5187ee2cc6677d5a559f6951eb5b6,1450579,Спектакль «Снежная корискуолева»,1,5872,500
11957,fff5a247a711d1589fa1c3282d2689a9,1359097,Мастер-класс «Северные амуры»,8,5873,316
11958,fffb3cc939336c91c2989420836ae3d7,1255556,Спектакль «Палата бизнес-класса»,1,5874,119


In [12]:
# Print column dtypes, non-null counts and memory usage of the frame.
bashkortostan_user_event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11960 entries, 0 to 11959
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   user_id     11960 non-null  category
 1   item_id     11960 non-null  category
 2   event_name  11960 non-null  object  
 3   rating      11960 non-null  int64   
 4   user_num    11960 non-null  int16   
 5   item_num    11960 non-null  int16   
dtypes: category(2), int16(2), int64(1), object(1)
memory usage: 476.1+ KB


In [13]:
# Grid-search ALS hyper-parameters on one random user with exactly 5 clicks.
# The grid has 81 * 11 * 1 * 1 * 5 = 4455 combinations, each refitting the
# model once per held-out click, so a full run is slow.
# NOTE(review): linspace(0, 1, 11, endpoint=False) steps by 1/11 (~0.0909),
# not 0.1 - confirm this is the intended regularization grid.
best_model, params = cross_validation(
    user_activity_df=bashkortostan_activity,
    user_event_df=bashkortostan_user_event,
    recommender=ALSRecommender,
    parameters=dict(
        factors=range(20, 101),
        regularization=list(np.linspace(0, 1, 11, endpoint=False)),
        iterations=[100],
        confidence=['alpha'],
        alpha_value=range(10, 15),
    ),
    num_of_clicked=5,
    num_of_recommended=10,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for params in tqdm_notebook(ParameterGrid(parameters)):


  0%|          | 0/4455 [00:00<?, ?it/s]

0.8


KeyboardInterrupt: 

In [13]:
# Grid-search BPR hyper-parameters on one random user with exactly 5 clicks
# (81 * 3 * 11 = 2673 combinations). The result is not assigned, only shown
# as the cell output.
# NOTE(review): linspace(0, 1, 11, endpoint=False) steps by 1/11 (~0.0909),
# not 0.1 - confirm this is the intended regularization grid.
cross_validation(
    user_activity_df=bashkortostan_activity,
    user_event_df=bashkortostan_user_event,
    recommender=BPRRecommender,
    parameters=dict(
        factors=range(20, 101),
        learning_rate=[0.01, 0.02, 0.03],
        regularization=list(np.linspace(0, 1, 11, endpoint=False)),
    ),
    num_of_clicked=5,
    num_of_recommended=10,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for params in tqdm_notebook(ParameterGrid(parameters)):


  0%|          | 0/2673 [00:00<?, ?it/s]

0.4
0.6
0.8
0.8 {'factors': 26, 'learning_rate': 0.03, 'regularization': 0.0}


(<recommenders.implicit_models.BPRRecommender at 0x7f5b563f4970>,
 {'factors': 26, 'learning_rate': 0.03, 'regularization': 0.0})

In [20]:
def test_best_model(
        best_model: UserItemRecommender,
        user_item_df: pd.DataFrame,
        user_activity_df: pd.DataFrame,
        num_of_clicked: int = 5,
        num_of_recommended: int = 10,
        sample_frac: float = 0.25,
        random_state=None
) -> float:
    """Average leave-one-out hit rate of ``best_model`` over a user sample.

    A random fraction of the users from the ``'(0:5]'`` diapason with exactly
    ``num_of_clicked`` rows is drawn, ``evaluate`` is run for each of them,
    and the arithmetic mean of the per-user scores is returned.

    Args:
        best_model: Recommender instance; ``evaluate`` refits it for every
            held-out interaction.
        user_item_df: Interaction frame passed through to ``evaluate``.
        user_activity_df: Frame with ``user_id``, ``clicks_count`` and
            ``diapason`` columns.
        num_of_clicked: Exact activity count a sampled user must have.
        num_of_recommended: Size of the recommendation list to check.
        sample_frac: Fraction of matching users to evaluate on.
        random_state: Optional seed forwarded to ``DataFrame.sample``.

    Returns:
        Mean of the per-user hit rates.

    Raises:
        ValueError: If the sample contains no users, in which case the mean
            is undefined.
    """
    users_in_interval = user_activity_df.loc[
        (user_activity_df.diapason == '(0:5]') &
        (user_activity_df.clicks_count == num_of_clicked)
        ].sample(frac=sample_frac, random_state=random_state)

    if users_in_interval.empty:
        raise ValueError(
            f"no users sampled from diapason '(0:5]' with exactly "
            f"{num_of_clicked} clicks (sample_frac={sample_frac})"
        )

    total_probability = 0.0
    for sampled_user_id in users_in_interval.user_id:
        total_probability += evaluate(
            user_id=sampled_user_id,
            user_event_df=user_item_df,
            recommender=best_model,
            number_of_recommended=num_of_recommended
        )

    return total_probability / len(users_in_interval)

In [21]:
# Score a fresh BPR model built with the parameters reported by the BPR grid
# search (factors=26, learning_rate=0.03, regularization=0.0). With the
# defaults this averages over a random 25% of users with exactly 5 clicks,
# so the number varies between runs.
test_best_model(
    best_model=BPRRecommender(factors=26, learning_rate=0.03, regularization=0.0),
    user_item_df=bashkortostan_user_event,
    user_activity_df=bashkortostan_activity
)

0.4882352941176471

In [23]:
# Score whatever `best_model` currently holds in the kernel.
# NOTE(review): the only visible assignment to `best_model` is the ALS grid
# search cell, whose shown run was interrupted - confirm which run produced
# the model evaluated here.
test_best_model(
    best_model=best_model,
    user_item_df=bashkortostan_user_event,
    user_activity_df=bashkortostan_activity
)

0.5176470588235295