# Рекомендательные системы. Библиотека LightFM

In [None]:
%%capture
%%bash
pip install lightfm

In [None]:
import datetime
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

from tqdm.notebook import tqdm
from zipfile import ZipFile
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

## Разбираем "механику работы" библиотеки на демо-примере

In [None]:
# Load the demo ratings, drop rows with missing values and renumber the index from zero
data = (
    pd.read_csv('/content/ratings.csv', sep=',')
    .dropna()
    .reset_index(drop=True)
)
# Cast the movie id and the rating to integers
data = data.astype({'movie_id': 'int', 'rating': 'int'})

In [None]:
data.head(35)

Unnamed: 0,user_id,movie_id,movie_title,rating
0,1,1,movie_1,10
1,1,3,movie_3,5
2,1,5,movie_5,8
3,1,6,movie_6,9
4,1,7,movie_7,10
5,2,1,movie_1,7
6,2,2,movie_2,10
7,2,3,movie_3,5
8,2,4,movie_4,10
9,2,5,movie_5,8


In [None]:
# Dense user x movie view of the ratings; pairs without a rating show up as NaN
matrix_user_movie = pd.pivot_table(
    data,
    values='rating',
    index='user_id',
    columns='movie_id',
)
print(matrix_user_movie)

movie_id     1     2    3     4    5    6     7
user_id                                        
1         10.0   NaN  5.0   NaN  8.0  9.0  10.0
2          7.0  10.0  5.0  10.0  8.0  9.0   9.0
3          7.0  10.0  5.0   5.0  8.0  9.0   NaN
4          NaN   NaN  NaN   3.0  4.0  5.0   5.0
5          NaN   4.0  NaN   3.0  4.0  5.0   5.0


In [None]:
# Map raw user ids, movie ids and movie titles to consecutive zero-based indices
# (indices follow the order in which the values first appear in `data`)
user_id_mapping = {
    raw_user: position for position, raw_user in enumerate(data['user_id'].unique())
}
movie_id_mapping = {
    raw_movie: position for position, raw_movie in enumerate(data['movie_id'].unique())
}
movie_title_mapping = {
    movie_title: position for position, movie_title in enumerate(data['movie_title'].unique())
}

In [None]:
movie_title_mapping

{'movie_1': 0,
 'movie_2': 5,
 'movie_3': 1,
 'movie_4': 6,
 'movie_5': 2,
 'movie_6': 3,
 'movie_7': 4}

In [None]:
print(movie_id_mapping)

{1: 0, 3: 1, 5: 2, 6: 3, 7: 4, 2: 5, 4: 6}


In [None]:
# Translate the raw ids of the training table into matrix indices
train_user_data, train_movie_data = (
    data[column].map(mapping)
    for column, mapping in (
        ('user_id', user_id_mapping),
        ('movie_id', movie_id_mapping),
    )
)

In [None]:
# Build the sparse (users x movies) rating matrix in COO format
shape = len(user_id_mapping), len(movie_id_mapping)
train_matrix = coo_matrix(
    (
        data['rating'].values,
        (train_user_data.astype(int), train_movie_data.astype(int)),
    ),
    shape=shape,
)

In [None]:
print(train_matrix)

  (0, 0)	10
  (0, 1)	5
  (0, 2)	8
  (0, 3)	9
  (0, 4)	10
  (1, 0)	7
  (1, 5)	10
  (1, 1)	5
  (1, 6)	10
  (1, 2)	8
  (1, 3)	9
  (1, 4)	9
  (2, 0)	7
  (2, 5)	10
  (2, 1)	5
  (2, 6)	5
  (2, 2)	8
  (2, 3)	9
  (3, 6)	3
  (3, 2)	4
  (3, 3)	5
  (3, 4)	5
  (4, 5)	4
  (4, 6)	3
  (4, 2)	4
  (4, 3)	5
  (4, 4)	5


In [None]:
# Create a LightFM model with the WARP loss and train it on the demo matrix
model = LightFM(loss='warp')
model.fit(interactions=train_matrix, epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f724d236c50>

In [None]:
 # Predict the movies the user is most likely to enjoy, best first.
# `user_ids` / `item_ids` given to predict() are internal indices (rows / columns of
# train_matrix), not the raw user_id / movie_id values from `data`.
user_id_current = 3
item_ids_current = np.arange(len(movie_id_mapping))
scores = model.predict(user_ids=user_id_current, item_ids=item_ids_current)
# np.argsort sorts ascending, so negate the scores to put the highest-scored item first
top_items = item_ids_current[np.argsort(-scores)]
print(top_items)
# Invert the title mapping once instead of scanning it for every recommended item.
# NOTE(review): this relies on movie_title_mapping and movie_id_mapping assigning the same
# index to a movie, which holds in the demo data because titles and ids are one-to-one.
index_to_title = {index: title for title, index in movie_title_mapping.items()}
top_movies = [index_to_title[i] for i in top_items if i in index_to_title]
print(top_movies)

[0 1 5 6 3 4 2]
['movie_1', 'movie_3', 'movie_2', 'movie_4', 'movie_6', 'movie_7', 'movie_5']


In [None]:
# Quality on the training matrix itself, averaged over users
k = 5
train_precision = precision_at_k(model, train_matrix, k=k).mean()
train_recall = recall_at_k(model, train_matrix, k=k).mean()
print('Train precision at k={}:\t{:.4f}'.format(k, train_precision))
print('Train recall at k={}:\t{:.4f}'.format(k, train_recall))

Train precision at k=5:	0.9600
Train recall at k=5:	0.9095


## Данные: датасет КИОН (Your Second RecSys, MTS, ODS AI)

In [None]:
PATH_TO_DATA = "/content/data_kion.zip"

In [None]:
# Read the three CSV tables straight from the archive without unpacking it to disk
with ZipFile(PATH_TO_DATA) as archive:

    with archive.open("data_kion/interactions_df.csv") as interactions_file:
        interactions = pd.read_csv(interactions_file)

    with archive.open("data_kion/items.csv") as items_file:
        items = pd.read_csv(items_file)

    with archive.open("data_kion/users.csv") as users_file:
        users = pd.read_csv(users_file)

### EDA

### interactions: взаимодействия пользователь - айтем

* дата last_watch_dt
* длительность просмотра total_dur
* % просмотра watched_pct

In [None]:
interactions.head(5)

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [None]:
# Parse the watch date and keep only its calendar-date part (datetime.date objects)
interactions['last_watch_dt'] = pd.to_datetime(interactions['last_watch_dt']).dt.date

print(f"Уникальных юзеров в interactions: {interactions['user_id'].nunique():_}")
print(f"Уникальных айтемов в interactions: {interactions['item_id'].nunique():_}")

Уникальных юзеров в interactions: 962_179
Уникальных айтемов в interactions: 15_706


In [None]:
# Date range covered by the interactions log
watch_dates = interactions['last_watch_dt']
min_date, max_date = watch_dates.min(), watch_dates.max()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")

min дата в interactions: 2021-03-13
max дата в interactions: 2021-08-22


### users: данные о пользователях

* age бин по возрасту
* income бин по доходу
* sex пол
* kids_flg флаг наличия детей

Все признаки - результат предсказания соцдем моделей

In [None]:
users.head(5)

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [None]:
f"Уникальных юзеров в users: {users.shape[0]:_}"

'Уникальных юзеров в users: 840_197'

### items: данные об айтемах

* content_type - тип контента
* title - название на русском
* title_orig - название оригинальное
* release_year - год выпуска
* countries - страны
* for_kids - флаг контент для детей
* age_rating - возрастной рейтинг
* studios - студии
* directors - режиссеры
* actors - актеры
* keywords - ключевые слова
* description - описание

In [None]:
f"Уникальных айтемов в items {items.shape[0]:_}"

'Уникальных айтемов в items 15_963'

### Train - Test

In [None]:
# Hold out the last 7 days (counted back from the latest watch date) as the global test set
train = interactions.loc[lambda df: df['last_watch_dt'] < max_date - pd.Timedelta(days=7)]
test = interactions.loc[lambda df: df['last_watch_dt'] >= max_date - pd.Timedelta(days=7)]

# Extra train-only filter to exclude accidental views: keep rows with total_dur >= 300
train = train.loc[lambda df: df['total_dur'] >= 300]

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (3832711, 5)
test: (490982, 5)


In [None]:
# Split train into lfm_train / lfm_pred as 60% / 40% using the 0.6 quantile of the watch date
lfm_date_threshold = train['last_watch_dt'].quantile(0.6, interpolation='nearest')
lfm_date_threshold

datetime.date(2021, 7, 6)

In [None]:
# Rows before the threshold train LightFM; rows from the threshold onwards form lfm_pred
lfm_train = train.loc[lambda df: df['last_watch_dt'] < lfm_date_threshold]
lfm_pred = train.loc[lambda df: df['last_watch_dt'] >= lfm_date_threshold]

print(f"lfm_train: {lfm_train.shape}")
print(f"lfm_pred: {lfm_pred.shape}")

lfm_train: (2286604, 5)
lfm_pred: (1546107, 5)


In [None]:
# Generate candidates only for warm users, i.e. users that have views in the training part
lfm_pred = lfm_pred.loc[
    lambda df: df['user_id'].isin(lfm_train['user_id'].unique())
]

In [None]:
lfm_train.head(3)

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
3,864613,7638,2021-07-05,14483,100.0


### Обучение LightFM

In [None]:
# Register every user id and item id seen in lfm_train with the LightFM Dataset helper
dataset = Dataset()
dataset.fit(
    users=lfm_train['user_id'].unique(),
    items=lfm_train['item_id'].unique(),
)

In [None]:
# Feed (user_id, item_id, total_dur) triples to the Dataset helper; it returns the
# interactions matrix and a matrix of the same layout holding total_dur as the weight
interaction_triples = map(
    tuple,
    lfm_train[['user_id', 'item_id', 'total_dur']].to_numpy(),
)
interactions_matrix, weights_matrix = dataset.build_interactions(interaction_triples)

weights_matrix_csr = weights_matrix.tocsr()

In [None]:
print(interactions_matrix)

  (0, 0)	1
  (1, 1)	1
  (2, 2)	1
  (3, 0)	1
  (4, 3)	1
  (5, 4)	1
  (6, 5)	1
  (7, 6)	1
  (8, 7)	1
  (9, 8)	1
  (10, 9)	1
  (11, 10)	1
  (12, 11)	1
  (13, 12)	1
  (14, 13)	1
  (15, 14)	1
  (16, 15)	1
  (17, 16)	1
  (18, 17)	1
  (19, 18)	1
  (20, 19)	1
  (21, 20)	1
  (22, 21)	1
  (23, 22)	1
  (24, 23)	1
  :	:
  (348746, 19)	1
  (354, 828)	1
  (266725, 236)	1
  (101970, 56)	1
  (19795, 1450)	1
  (44599, 55)	1
  (29792, 1724)	1
  (512780, 131)	1
  (19759, 56)	1
  (10287, 837)	1
  (91529, 976)	1
  (201467, 304)	1
  (387595, 979)	1
  (494563, 1087)	1
  (512781, 1152)	1
  (512782, 1829)	1
  (233457, 181)	1
  (49404, 740)	1
  (145426, 248)	1
  (44969, 6603)	1
  (125589, 186)	1
  (148167, 24)	1
  (109060, 83)	1
  (103910, 2498)	1
  (71500, 2170)	1


In [None]:
print(weights_matrix)

  (0, 0)	4250.0
  (1, 1)	8317.0
  (2, 2)	14483.0
  (3, 0)	6725.0
  (4, 3)	11286.0
  (5, 4)	26246.0
  (6, 5)	6598.0
  (7, 6)	18538.0
  (8, 7)	8422.0
  (9, 8)	6358.0
  (10, 9)	23673.0
  (11, 10)	407.0
  (12, 11)	8862.0
  (13, 12)	8413.0
  (14, 13)	67337.0
  (15, 14)	8535.0
  (16, 15)	746.0
  (17, 16)	36469.0
  (18, 17)	6394.0
  (19, 18)	1572.0
  (20, 19)	547.0
  (21, 20)	660.0
  (22, 21)	5194.0
  (23, 22)	565.0
  (24, 23)	12998.0
  :	:
  (348746, 19)	9846.0
  (354, 828)	20051.0
  (266725, 236)	6878.0
  (101970, 56)	6403.0
  (19795, 1450)	13122.0
  (44599, 55)	7301.0
  (29792, 1724)	118143.0
  (512780, 131)	7246.0
  (19759, 56)	647.0
  (10287, 837)	5937.0
  (91529, 976)	384.0
  (201467, 304)	7307.0
  (387595, 979)	8372.0
  (494563, 1087)	805.0
  (512781, 1152)	1594.0
  (512782, 1829)	8681.0
  (233457, 181)	397.0
  (49404, 740)	1709.0
  (145426, 248)	713.0
  (44969, 6603)	1830.0
  (125589, 186)	1133.0
  (148167, 24)	5752.0
  (109060, 83)	753.0
  (103910, 2498)	2308.0
  (71500, 2170)	6203.0

In [None]:
# dataset.mapping() returns a tuple; elements 0 and 2 are the user and item id maps
# (raw id -> internal index)
raw_maps = dataset.mapping()
lightfm_mapping = {
    'users_mapping': raw_maps[0],
    'items_mapping': raw_maps[2],
}

# Inverse maps: internal index -> raw id
lightfm_mapping['users_inv_mapping'] = dict(
    zip(lightfm_mapping['users_mapping'].values(), lightfm_mapping['users_mapping'].keys())
)
lightfm_mapping['items_inv_mapping'] = dict(
    zip(lightfm_mapping['items_mapping'].values(), lightfm_mapping['items_mapping'].keys())
)

print(f"users_mapping amount: {len(lightfm_mapping['users_mapping'])}")
print(f"items_mapping amount: {len(lightfm_mapping['items_mapping'])}")

users_mapping amount: 512783
items_mapping amount: 12796


In [None]:
# First-stage candidate generator: LightFM trained with the WARP ranking loss
lfm_model = LightFM(
    no_components=64,  # dimensionality of the latent embeddings
    learning_rate=0.1, 
    loss='warp', 
    max_sampled=5,  # cap on negative samples drawn per update during WARP fitting
    random_state=42  # fixed seed
)

In [None]:
num_epochs = 20

# Call fit_partial once per loop iteration so that tqdm can show per-epoch progress.
# NOTE(review): the duration-weights matrix is passed as the `interactions` argument and no
# `sample_weight` is given, so the total_dur values presumably act only as "positive"
# flags and do not weight the updates — confirm against the LightFM docs that this is intended.
for _ in tqdm(range(num_epochs)):
    lfm_model.fit_partial(
        weights_matrix_csr
    )

  0%|          | 0/20 [00:00<?, ?it/s]

### Генерируем предсказания LightFM как кандидатов для второго этапа

In [None]:
# One row per distinct user of lfm_pred; recommendations are attached to it later
candidates = pd.DataFrame(lfm_pred['user_id'].unique(), columns=['user_id'])
candidates.head(3)

Unnamed: 0,user_id
0,988709
1,646903
2,215229


In [None]:
# Factory of a per-user LightFM recommender that removes already-seen items
def generate_lightfm_recs_mapper(model, item_ids, known_items, 
                                 user_features, item_features, N, 
                                 user_mapping, item_inv_mapping, 
                                 num_threads=1):
    """Build a function mapping a raw user id to its top-N recommended raw item ids.

    Args:
        model: fitted model exposing ``predict(user_id, item_ids, user_features=...,
            item_features=..., num_threads=...)`` that returns one score per
            entry of ``item_ids``.
        item_ids: internal item indices to score.
        known_items: dict ``raw user id -> iterable of raw item ids`` to exclude from
            the recommendations; users missing from the dict are not filtered.
        user_features: forwarded to ``model.predict`` unchanged.
        item_features: forwarded to ``model.predict`` unchanged.
        N: number of recommendations to return per user.
        user_mapping: dict ``raw user id -> internal user index``.
        item_inv_mapping: dict ``internal item index -> raw item id``.
        num_threads: forwarded to ``model.predict``.

    Returns:
        A callable ``raw user id -> list`` of at most ``N`` raw item ids, best first.
        The callable raises ``KeyError`` for a user absent from ``user_mapping``.
    """
    # Positions in the score vector correspond to entries of item_ids, not to item
    # indices themselves, so keep an indexable copy to translate positions back.
    item_index = np.asarray(item_ids)

    def _recs_mapper(user):
        user_id = user_mapping[user]
        scores = np.asarray(
            model.predict(user_id, item_ids, user_features=user_features,
                          item_features=item_features, num_threads=num_threads)
        )

        # known_items is keyed by RAW user ids (its values are raw item ids as well),
        # so it must be looked up with `user`, not with the internal index `user_id`.
        seen = set(known_items.get(user, ()))
        # Over-fetch by the number of seen items so that N survive the filtering,
        # but never request more candidates than there are scores.
        total_N = min(N + len(seen), len(scores))
        if total_N <= 0:
            return []

        # kth = -1 ... -total_N puts the total_N highest scores into their sorted
        # places at the end of the array; reverse the slice to get best-first order.
        top_positions = np.argpartition(scores, -np.arange(1, total_N + 1))[-total_N:][::-1]

        final_recs = [item_inv_mapping[item_index[pos]] for pos in top_positions]
        if seen:
            final_recs = [item for item in final_recs if item not in seen]
        return final_recs[:N]
    return _recs_mapper

In [None]:
# Classic recommendation-quality metrics
def compute_metrics(df_true, df_pred, top_N, rank_col='rank'):
    """Compute Precision@k and Recall@k for k = 1..top_N, MAP@top_N and MRR.

    Args:
        df_true: DataFrame with ``user_id`` and ``item_id`` columns - the actual
            interactions.
        df_pred: DataFrame with ``user_id``, ``item_id`` and ``rank_col`` columns - the
            recommendations, rank 1 being the best position.
        top_N: largest cutoff ``k`` to report; also the cutoff applied to MAP.
        rank_col: name of the rank column in ``df_pred``.

    Returns:
        pd.Series indexed by metric name. Every metric is averaged over the users
        present in ``df_true``. MRR is computed over all ranks found in ``df_pred``.
    """
    result = {}
    pair_keys = ['user_id', 'item_id']
    # Left join: every true interaction gets the rank at which it was recommended,
    # or NaN when it was not recommended at all.
    test_recs = df_true.set_index(pair_keys).join(df_pred.set_index(pair_keys))
    # Within a user, hits come first ordered by rank; misses (NaN rank) sort last.
    test_recs = test_recs.sort_values(by=['user_id', rank_col])

    # Number of true interactions per user (NaN ranks are counted too)
    test_recs['users_item_count'] = test_recs.groupby(level='user_id')[rank_col].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs[rank_col]).fillna(0)
    # (ordinal number of the hit) / (its rank) == precision at the position of that hit
    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs[rank_col]

    users_count = test_recs.index.get_level_values('user_id').nunique()
    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs[rank_col] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count

    # MAP@top_N must only count hits ranked within the cutoff; without the mask, hits
    # at ranks > top_N leak in whenever df_pred holds more than top_N recommendations.
    within_cutoff = test_recs[rank_col] <= top_N
    ap_terms = (test_recs['cumulative_rank'] / test_recs['users_item_count']).where(within_cutoff)
    result[f'MAP@{top_N}'] = ap_terms.sum() / users_count
    result['MRR'] = test_recs.groupby(level='user_id')['reciprocal_rank'].max().mean()
    return pd.Series(result)

In [None]:
# Number of candidates to generate per user
top_N = 30

# Helper data: the internal indices of all items known to the model
all_cols = [*lightfm_mapping['items_mapping'].values()]

# No seen-item filtering at the candidate stage (known_items is empty)
mapper = generate_lightfm_recs_mapper(
    model=lfm_model,
    item_ids=all_cols,
    known_items={},
    user_features=None,
    item_features=None,
    N=top_N,
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=20,
)

In [None]:
# Generate predictions: a list of items per user, unrolled to one row per (user, item)
candidates = (
    candidates
    .assign(item_id=candidates['user_id'].map(mapper))
    .explode('item_id')
)
# Lists come back best-first, so the position within a user is the rank (1-based)
candidates['rank'] = candidates.groupby('user_id').cumcount().add(1)

candidates.head()

Unnamed: 0,user_id,item_id,rank
0,988709,10440,1
0,988709,15297,2
0,988709,13865,3
0,988709,4151,4
0,988709,9728,5


In [None]:
candidates.shape

(4893840, 3)

### Пропуск второго этапа с обучением CatBoost ...

### Метрики качества на глобальном test

In [None]:
# Keep only warm users, i.e. those present in the LightFM training part
test = test.loc[
    lambda df: df['user_id'].isin(lfm_train['user_id'].unique())
]

In [None]:
top_N = 20

# One row per distinct test user
lfm_prediction = pd.DataFrame(test['user_id'].unique(), columns=['user_id'])

# Items each user interacted with in lfm_train; they are excluded from the recommendations
known_items = lfm_train.groupby('user_id')['item_id'].apply(list).to_dict()

mapper = generate_lightfm_recs_mapper(
    model=lfm_model,
    item_ids=all_cols,
    known_items=known_items,
    user_features=None,
    item_features=None,
    N=top_N,
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=20,
)

# Unroll the per-user lists to one row per (user, item) and number them best-first
lfm_prediction = (
    lfm_prediction
    .assign(item_id=lfm_prediction['user_id'].map(mapper))
    .explode('item_id')
    .reset_index(drop=True)
)
lfm_prediction['rank'] = lfm_prediction.groupby('user_id').cumcount().add(1)

In [None]:
# Score the LightFM recommendations against the global test interactions
lfm_metrics = compute_metrics(
    df_true=test[['user_id', 'item_id']],
    df_pred=lfm_prediction,
    top_N=10,
)
lfm_metrics

Precision@1     0.025763
Recall@1        0.014394
Precision@2     0.027750
Recall@2        0.030863
Precision@3     0.026736
Recall@3        0.044355
Precision@4     0.025767
Recall@4        0.056663
Precision@5     0.024799
Recall@5        0.067901
Precision@6     0.023232
Recall@6        0.075653
Precision@7     0.022068
Recall@7        0.082984
Precision@8     0.021082
Recall@8        0.090258
Precision@9     0.020011
Recall@9        0.095365
Precision@10    0.019074
Recall@10       0.100459
MAP@10          0.040541
MRR             0.068076
dtype: float64