# Initialization

In [2]:
import logging

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import scipy
import sklearn.preprocessing

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [4]:
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

In [5]:
# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# количество пользователей, которые есть и в train, и в test
common_users = set(events_train["user_id"]).intersection(set(events_test["user_id"]))

print(len(users_train), len(users_test), len(common_users))

428220 123223 120858


# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

# === Знакомство: первые персональные рекомендации

# === Базовые подходы: коллаборативная фильтрация

In [6]:
# перекодируем идентификаторы пользователей: 
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов: 
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["item_id_enc"] = item_encoder.transfor

In [7]:
events_train

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_enc,item_id_enc
0,1229132,22034,2015-07-12,2015-07-17,True,5,False,2015-07-01,229132,2460
1,1229132,22318578,2015-06-07,2015-08-09,True,5,True,2015-06-01,229132,38691
2,1229132,22551730,2015-06-24,2015-07-11,True,4,True,2015-06-01,229132,38867
3,1229132,22816087,2015-09-27,2015-11-04,True,5,True,2015-09-01,229132,39109
5,1229132,17910054,2015-03-04,2015-07-28,True,3,False,2015-03-01,229132,35638
...,...,...,...,...,...,...,...,...,...,...
12914452,1364473,5297,2017-02-07,2017-02-26,True,5,False,2017-02-01,364473,597
12914453,1364473,4900,2016-12-22,2016-12-29,True,2,False,2016-12-01,364473,517
12914454,1364473,14836,2016-11-29,2017-01-15,True,3,False,2016-11-01,364473,1828
12914456,1297020,10210,2012-06-05,2013-01-17,True,5,False,2012-06-01,297020,1196


In [9]:
max(events_train['item_id_enc'])

43304

Вычислите размер матрицы user_item_matrix_train, как если бы она хранила все свои элементы, включая пропуски, и для каждого элемента использовался бы один байт:

In [10]:
len(set(events_train["user_id_enc"])) * len(set(events_train['item_id_enc'])) / 1024 ** 3

16.54028546065092

In [7]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8) 

sparse-формат numpy-матриц позволяет сильно уменьшить требование к размеру памяти:

In [12]:
import sys

sum([sys.getsizeof(i) for i in user_item_matrix_train.data])/1024**3 

0.26370687410235405

In [8]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train) 

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:58<00:00,  3.56s/it]


In [9]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
         user_id_enc, 
         user_item_matrix[user_id_enc], 
         filter_already_liked_items=not include_seen,
         N=n)
    recommendations = pd.DataFrame({"item_id_enc": recommendations[0], "score": recommendations[1]})
    recommendations["item_id"] = item_encoder.inverse_transform(recommendations["item_id_enc"])
    
    return recommendations

In [10]:
# получаем список всех возможных user_id (перекодированных)
user_ids_encoded = range(len(user_encoder.classes_))

# получаем рекомендации для всех пользователей
als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100) 

In [16]:
len(als_recommendations[0][0])

100

In [17]:
als_recommendations

(array([[    2,  1942,     3, ..., 28836, 30688, 10393],
        [31432, 29792, 36956, ...,   533, 32060, 34554],
        [35810, 33276, 37255, ..., 31562, 41459,  1043],
        ...,
        [20997, 20386, 23004, ...,  2293, 28200, 29560],
        [22844, 28025, 37138, ..., 37914,   422,  4112],
        [41809, 34434, 35669, ..., 33675, 28263, 22072]], dtype=int32),
 array([[0.99094146, 0.89661723, 0.8644041 , ..., 0.2261226 , 0.22548363,
         0.22546645],
        [0.674292  , 0.6229848 , 0.49019852, ..., 0.02235501, 0.02226192,
         0.02225844],
        [0.24119437, 0.22116913, 0.18066649, ..., 0.04201685, 0.04178948,
         0.04172034],
        ...,
        [0.23566297, 0.23407641, 0.22276123, ..., 0.02843785, 0.02830932,
         0.02820013],
        [0.05539129, 0.03866215, 0.03835723, ..., 0.01568658, 0.01557466,
         0.01546565],
        [0.47294533, 0.46393558, 0.4604288 , ..., 0.09494869, 0.09492695,
         0.09303415]], dtype=float32))

In [30]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"])

In [17]:
als_recommendations

Unnamed: 0,score,user_id,item_id
0,0.990941,1000000,3
1,0.896617,1000000,15881
2,0.864404,1000000,5
3,0.822254,1000000,6
4,0.774095,1000000,2
...,...,...,...
43058495,0.096082,1430584,13206900
43058496,0.096065,1430584,5060378
43058497,0.094949,1430584,16071764
43058498,0.094927,1430584,9969571


In [18]:
als_recommendations[als_recommendations['user_id'] == 1000000]

Unnamed: 0,score,user_id,item_id
0,0.990941,1000000,3
1,0.896617,1000000,15881
2,0.864404,1000000,5
3,0.822254,1000000,6
4,0.774095,1000000,2
...,...,...,...
95,0.229302,1000000,4588
96,0.226873,1000000,128029
97,0.226123,1000000,10626594
98,0.225484,1000000,12812550


In [19]:
als_recommendations = als_recommendations[["user_id", "item_id", "score"]]
als_recommendations.to_parquet("als_recommendations.parquet") 

In [20]:
als_recommendations = (
    als_recommendations
    .merge(events_test[["user_id", "item_id", "rating"]]
               .rename(columns={"rating": "rating_test"}), 
           on=["user_id", "item_id"], how="left")
) 

In [21]:
als_recommendations.head()

Unnamed: 0,user_id,item_id,score,rating_test
0,1000000,3,0.990941,
1,1000000,15881,0.896617,
2,1000000,5,0.864404,
3,1000000,6,0.822254,
4,1000000,2,0.774095,


In [22]:
als_recommendations[als_recommendations['rating_test'].notna()]

Unnamed: 0,user_id,item_id,score,rating_test
611,1000006,18774964,0.367940,4.0
615,1000006,29868610,0.289370,4.0
633,1000006,7445,0.229146,4.0
664,1000006,18812405,0.162832,3.0
687,1000006,24817626,0.136432,3.0
...,...,...,...,...
43056981,1430569,28763485,0.225379,5.0
43057389,1430573,1618,0.100608,4.0
43057861,1430578,17061,0.613498,5.0
43057873,1430578,830502,0.570653,5.0


In [23]:
import sklearn.metrics

def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    
    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k)

    return ndcg

In [27]:
als_recommendations

Unnamed: 0,user_id,item_id,score,rating_test
0,1000000,3,0.990941,
1,1000000,15881,0.896617,
2,1000000,5,0.864404,
3,1000000,6,0.822254,
4,1000000,2,0.774095,
...,...,...,...,...
43058495,1430584,13206900,0.096082,
43058496,1430584,5060378,0.096065,
43058497,1430584,16071764,0.094949,
43058498,1430584,9969571,0.094927,


In [30]:
als_recommendations[als_recommendations['user_id'] == 1000006].dropna()

Unnamed: 0,user_id,item_id,score,rating_test
611,1000006,18774964,0.36794,4.0
615,1000006,29868610,0.28937,4.0
633,1000006,7445,0.229146,4.0
664,1000006,18812405,0.162832,3.0
687,1000006,24817626,0.136432,3.0


In [32]:
als_recommendations['score'].max()

2.3500816822052

In [24]:
rating_test_idx = ~als_recommendations["rating_test"].isnull()
ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))

In [25]:
print(ndcg_at_5_scores.mean())

0.975946709792109


# === Базовые подходы: контентные рекомендации

In [6]:
items.head()

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,item_id_enc
3,6066819,Jennifer Weiner,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,"{'Womens Fiction-Chick Lit': 739, 'Fiction': 442}",368.0,3.49,51184,3282,Atria Books,2009.0,US,eng,Hardcover,False,0743294297,9780743294294,"{'Academic': None, 'Academic-Academia': None, ...","Womens Fiction-Chick Lit 739, Fiction 442",23133
6,378460,Michael Halberstam,The Wanting of Levine,,"{'Politics': 1, 'Humor': 1}",,4.38,12,4,Berkley Publishing Group,1979.0,US,,Paperback,False,0425040887,9780425040881,"{'Academic': None, 'Academic-Academia': None, ...","Politics 1user, Humor 1user",12687
15,89375,"Don Piper, Cecil Murphey",90 Minutes in Heaven: A True Story of Death an...,As he is driving home from a minister's confer...,"{'Christian': 395, 'Nonfiction': 392, 'Religio...",,3.91,68157,2885,,,US,,,False,0800759494,9780800759490,"{'Academic': None, 'Academic-Academia': None, ...","Christian 395, Nonfiction 392, Religion 142, S...",6460
16,89376,Randy Alcorn,Heaven,What is Heaven really going to be like? What w...,"{'Christian': 225, 'Religion-Theology': 154, '...",533.0,4.26,7345,566,,,US,eng,,False,0842379428,9780842379427,"{'Academic': None, 'Academic-Academia': None, ...","Christian 225, Religion-Theology 154, Nonficti...",6461
17,89377,Jennifer L. Holm,Penny from Heaven,It's 1953 and 11-year-old Penny dreams of a su...,"{'Historical-Historical Fiction': 284, 'Childr...",288.0,3.98,6949,615,Random House Books for Young Readers,2006.0,US,,Hardcover,False,037583687X,9780375836879,"{'Academic': None, 'Academic-Academia': None, ...","Historical-Historical Fiction 284, Childrens-M...",6462


In [47]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval) 

In [8]:
for k, v, in items.iterrows():
    print(k)
#     print(v)
    
    print(v['genre_and_votes_dict'])
    
    break

3
{'Academic': None, 'Academic-Academia': None, 'Academic-College': None, 'Academic-Grad School': None, 'Academic-Read For School': None, 'Academic-School': None, 'Academic-Students': None, 'Academic-Teachers': None, 'Action': None, 'Adolescence': None, 'Adult': None, 'Adult Fiction': None, 'Adult Fiction-Erotica': None, 'Adventure': None, 'Adventure-Maritime': None, 'Adventure-Pirates': None, 'Adventure-Survival': None, 'Aeroplanes': None, 'Africa-Eastern Africa': None, 'Africa-Western Africa': None, 'African Literature-Egyptian Literature': None, 'Alcohol-Beer': None, 'Alcohol-Booze': None, 'Alcohol-Cocktails': None, 'Alcohol-Wine': None, 'American History-American Civil War': None, 'American Revolution-American Revolutionary War': None, 'American-African American Literature': None, 'American-American Classics': None, 'American-American Fiction': None, 'American-Americana': None, 'American-Southern': None, 'Amish': None, 'Anarchism': None, 'Animals': None, 'Animals-Animal Fiction': N

In [9]:
items.iloc[2]

item_id                                                             89375
author                                           Don Piper, Cecil Murphey
title                   90 Minutes in Heaven: A True Story of Death an...
description             As he is driving home from a minister's confer...
genre_and_votes         {'Christian': 395, 'Nonfiction': 392, 'Religio...
num_pages                                                            <NA>
average_rating                                                       3.91
ratings_count                                                       68157
text_reviews_count                                                   2885
publisher                                                                
publication_year                                                     <NA>
country_code                                                           US
language_code                                                            
format                                

In [40]:
def get_genres(items):

    """ 
    извлекает список жанров по всем книгам, 
    подсчитывает долю голосов по каждому их них
    """
    
    genres_counter = {}
    
    for k, v, in items.iterrows():
        genre_and_votes = v['genre_and_votes_dict']
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            try:
                if votes == None:
                    genres_counter[genre] += 0
                else:
                    genres_counter[genre] += votes
            except KeyError:
                genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"
        
    
    return genres
   
genres = get_genres(items)

In [41]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
285,Fantasy,6850115.0,0.149502
308,Fiction,6406256.0,0.139815
132,Classics,3415071.0,0.074533
800,Young Adult,3297027.0,0.071957
640,Romance,2422690.0,0.052874
535,Nonfiction,1737798.0,0.037927
378,Historical-Historical Fiction,1531489.0,0.033424
512,Mystery,1371370.0,0.02993
665,Science Fiction,1218997.0,0.026604
296,Fantasy-Paranormal,857137.0,0.018707


In [48]:
def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr

In [49]:
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items) 

In [14]:
all_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210895 stored elements and shape (43312, 815)>

In [15]:
user_id = 1000010
user_events = events_train.query("user_id == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, user_items)
user_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

In [16]:
user_items

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,item_id_enc
784151,1618,Mark Haddon,The Curious Incident of the Dog in the Night-Time,'Haddon's portrayal of an emotionally dissocia...,"{'Fiction': 12127, 'Mystery': 3284, 'Young Adu...",226,3.85,878808,30982,,,US,eng,,False,1400032717,9781400032716,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 12127, Mystery 3284, Young Adult 2928,...",187
1535498,2612,Malcolm Gladwell,The Tipping Point: How Little Things Can Make ...,An alternate cover edition exist .\nThe tippi...,"{'Nonfiction': 7733, 'Business': 2805, 'Psycho...",301,3.92,506859,10389,Back Bay Books,2002.0,US,eng,Paperback,False,0316346624,9780316346627,"{'Academic': None, 'Academic-Academia': None, ...","Nonfiction 7733, Business 2805, Psychology 217...",283
1958610,5129,Aldous Huxley,Brave New World,"Far in the future, the World Controllers have ...","{'Classics': 17717, 'Fiction': 12564, 'Science...",268,3.97,1043619,15933,Harper Perennial,1998.0,US,eng,Paperback,False,0060929871,9780060929879,"{'Academic': None, 'Academic-Academia': None, ...","Classics 17717, Fiction 12564, Science Fiction...",561
1111343,5470,"George Orwell, Erich Fromm",1984,"The year 1984 has come and gone, but George Or...","{'Classics': 33441, 'Fiction': 21086, 'Science...",328,4.14,2023937,37995,New American Library,1950.0,US,eng,Mass Market Paperback,False,0451524934,9780451524935,"{'Academic': None, 'Academic-Academia': None, ...","Classics 33441, Fiction 21086, Science Fiction...",635
202992,6185,"Emily Brontë, Richard J. Dunn, David Timson, C...",Wuthering Heights,You can find the redesigned cover of this edit...,"{'Classics': 31567, 'Fiction': 9011, 'Romance'...",464,3.83,916820,17961,Norton,2002.0,US,eng,Paperback,False,0393978893,9780393978896,"{'Academic': None, 'Academic-Academia': None, ...","Classics 31567, Fiction 9011, Romance 4695, Go...",703
1909078,7144,"Fyodor Dostoyevsky, David McDuff, Fyodor Dosto...",Crime and Punishment,Through the story of the brilliant but conflic...,"{'Classics': 15812, 'Fiction': 8028, 'Cultural...",671,4.19,390293,8138,Penguin,2002.0,US,eng,Paperback,False,0143058142,9780143058144,"{'Academic': None, 'Academic-Academia': None, ...","Classics 15812, Fiction 8028, Cultural-Russia ...",840
794321,16299,Agatha Christie,And Then There Were None,"First, there were ten - a curious assortment o...","{'Mystery': 12703, 'Classics': 6623, 'Fiction'...",264,4.23,429352,12618,,,US,eng,,False,0312330871,9780312330873,"{'Academic': None, 'Academic-Academia': None, ...","Mystery 12703, Classics 6623, Fiction 5544, My...",1991
542046,16902,Henry David Thoreau,Walden,"Walden, or, Life in the Woods, is an American ...","{'Classics': 3880, 'Nonfiction': 2239, 'Philos...",352,3.79,110083,2908,Princeton University Press,2004.0,US,eng,Paperback,False,0691096120,9780691096124,"{'Academic': None, 'Academic-Academia': None, ...","Classics 3880, Nonfiction 2239, Philosophy 220...",2090
1696719,29044,Donna Tartt,The Secret History,Under the influence of their charismatic class...,"{'Fiction': 5570, 'Mystery': 2151, 'Contempora...",559,4.08,173787,11596,,,US,eng,Paperback,False,1400031702,9781400031702,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 5570, Mystery 2151, Contemporary 1287,...",3047
2237699,30289,"Plato, Desmond Lee, Maria Helena da Rocha Pere...",The Republic,Presented in the form of a dialogue between So...,"{'Philosophy': 7334, 'Classics': 3166, 'Nonfic...",416,3.91,113894,1606,Penguin Classics,2003.0,US,eng,Paperback,False,0140449140,9780140449143,"{'Academic': None, 'Academic-Academia': None, ...","Philosophy 7334, Classics 3166, Nonfiction 160...",3187


In [17]:
user_events.shape

(22, 2)

In [18]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0))

In [19]:
user_ratings

array([[1. ],
       [1. ],
       [0.8],
       [0.4],
       [0.6],
       [1. ],
       [0.8],
       [0.8],
       [0.8],
       [1. ],
       [0.4],
       [0.2],
       [0.8],
       [1. ],
       [0.6],
       [0.8],
       [0.8],
       [0.6],
       [0.8],
       [0.8],
       [0.8],
       [0.2]])

In [20]:
# выведем список жанров, которые предпочитает пользователь

user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres = user_genres[user_genres["score"] > 0].sort_values(by=["score"], ascending=False)

user_genres.head(5) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
308,Fiction,6406256.0,0.185241
132,Classics,3415071.0,0.103879
285,Fantasy,6850115.0,0.072447
535,Nonfiction,1737798.0,0.050865
665,Science Fiction,1218997.0,0.04092


In [25]:
user_genres.shape

(56, 3)

In [68]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()

# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
k = 5
top_k_indices = np.argsort(similarity_scores)[-k:]

In [69]:
top_k_indices

array([ 4460,  9476, 14087, 36093,  4471])

In [70]:
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
    display(selected_items[["author", "title", "genre_and_votes"]]) 

Unnamed: 0,author,title,genre_and_votes
80465,G.K. Chesterton,The Napoleon of Notting Hill,"{'Fiction': 166, 'Classics': 88, 'Fantasy': 44, 'Humor': 22, 'Literature': 20}"
1168335,Ray Bradbury,"Dandelion Wine (Green Town, #1)","{'Fiction': 1438, 'Classics': 914, 'Science Fiction': 529, 'Fantasy': 456, 'Young Adult': 212}"
393210,"G.K. Chesterton, Jonathan Lethem",The Man Who Was Thursday: A Nightmare,"{'Fiction': 1257, 'Classics': 929, 'Mystery': 469, 'Fantasy': 293, 'Philosophy': 156, 'Literatur..."
2244467,Samuel Butler,"Erewhon (Erewhon , #1)","{'Fiction': 162, 'Classics': 139, 'Science Fiction': 60, 'Fantasy': 55}"
39408,"Paulo Coelho, Alan R. Clarke, James Noel Smith",The Alchemist,"{'Fiction': 14023, 'Classics': 5787, 'Fantasy': 3289, 'Philosophy': 2759}"


In [71]:
genres = get_genres(selected_items)

In [72]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
308,Fiction,16880.0,0.510247
132,Classics,7769.0,0.234841
285,Fantasy,4093.0,0.123723
567,Philosophy,2915.0,0.088114
665,Science Fiction,589.0,0.017804
512,Mystery,469.0,0.014177
800,Young Adult,212.0,0.006408
440,Literature,155.0,0.004685
537,Northern Africa-Algeria,0.0,0.0
538,Northern Africa-Egypt,0.0,0.0


# === Базовые подходы: валидация

In [123]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [31]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
    events_train,
    events_test, 
    als_recommendations, 
    top_k=10) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


In [32]:
events_recs_for_binary_metrics

Unnamed: 0,user_id,item_id,gt,score,pr,tp,fp,fn
0,1196635,18467802,True,,False,False,False,True
1,1188739,10799,True,,False,False,False,True
2,1001879,13206828,True,,False,False,False,True
3,1001879,13206900,True,,False,False,False,True
4,1001879,13206760,True,,False,False,False,True
...,...,...,...,...,...,...,...,...
1640188,1430584,7896527,False,0.439412,True,False,True,False
1640189,1430584,18006496,False,0.428401,True,False,True,False
1640190,1430584,38447,False,0.379718,True,False,True,False
1640191,1430584,28260587,False,0.379196,True,False,True,False


In [125]:
def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum())
    recall = recall.fillna(0).mean()

    return precision, recall 

In [24]:
# k = 5
compute_cls_metrics(events_recs_for_binary_metrics)

(0.007581376853347184, 0.014121568795222568)

In [34]:
# k = 10
compute_cls_metrics(events_recs_for_binary_metrics)

(0.008732947582837622, 0.03130238527136974)

# === Двухстадийный подход: метрики

In [18]:
als_recommendations

(array([[    2,  1942,     3, ..., 28836, 30688, 10393],
        [31432, 29792, 36956, ...,   533, 32060, 34554],
        [35810, 33276, 37255, ..., 31562, 41459,  1043],
        ...,
        [20997, 20386, 23004, ...,  2293, 28200, 29560],
        [22844, 28025, 37138, ..., 37914,   422,  4112],
        [41809, 34434, 35669, ..., 33675, 28263, 22072]], dtype=int32),
 array([[0.99094146, 0.89661723, 0.8644041 , ..., 0.2261226 , 0.22548363,
         0.22546645],
        [0.674292  , 0.6229848 , 0.49019852, ..., 0.02235501, 0.02226192,
         0.02225844],
        [0.24119437, 0.22116913, 0.18066649, ..., 0.04201685, 0.04178948,
         0.04172034],
        ...,
        [0.23566297, 0.23407641, 0.22276123, ..., 0.02843785, 0.02830932,
         0.02820013],
        [0.05539129, 0.03866215, 0.03835723, ..., 0.01568658, 0.01557466,
         0.01546565],
        [0.47294533, 0.46393558, 0.4604288 , ..., 0.09494869, 0.09492695,
         0.09303415]], dtype=float32))

In [17]:
len(als_recommendations)

2

In [15]:
len(als_recommendations[0])

430585

In [19]:
len(als_recommendations[0][0])

100

In [16]:
len(als_recommendations[1])

430585

In [35]:
# расчёт покрытия по объектам
cov_items = len(set(als_recommendations['item_id'])) / len(items['item_id'])
print(f"{cov_items:.2f}") 

0.09


In [36]:
als_recommendations.head()

Unnamed: 0,score,user_id,item_id
0,0.990941,1000000,3
1,0.896617,1000000,15881
2,0.864404,1000000,5
3,0.822254,1000000,6
4,0.774095,1000000,2


In [40]:
# разметим каждую рекомендацию признаком read
events_train["read"] = True
als_recommendations = als_recommendations.merge(events_train[["user_id", "item_id", "read"]], 
                                                on=["user_id", "item_id"], 
                                                how="left")
als_recommendations["read"] = als_recommendations["read"].fillna(False).astype("bool")

# проставим ранги
als_recommendations = als_recommendations.sort_values(by='score', ascending=False)
als_recommendations["rank"] = als_recommendations.groupby("user_id").cumcount() + 1

# посчитаем novelty по пользователям
novelty_5 = (1-als_recommendations.query("rank <= 5").groupby("user_id")["read"].mean())

# посчитаем средний novelty
print(novelty_5.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["read"] = True


In [44]:
print(novelty_5.mean())

0.607333279143491


# === Двухстадийный подход: модель

In [8]:
# задаём точку разбиения
split_date_for_labels = pd.to_datetime("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels

events_labels = events_test[split_date_for_labels_idx].copy()
events_test_2 = events_test[~split_date_for_labels_idx].copy() 

In [9]:
events_labels['user_id'].nunique()

99849

In [10]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations = pd.read_parquet("candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("candidates/training/content_recommendations.parquet")

candidates = pd.merge(
    als_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer")

In [13]:
candidates.shape

(82993094, 4)

In [14]:
candidates.head()

Unnamed: 0,user_id,item_id,als_score,cnt_score
0,1000000,3,0.972557,0.920225
1,1000000,15881,0.890201,0.90574
2,1000000,5,0.86585,0.918026
3,1000000,6,0.834282,0.916345
4,1000000,2,0.792929,0.925806


In [15]:
events_labels

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_enc,item_id_enc
84,1196635,18467802,2017-09-01,2017-09-22,True,1,False,2017-09-01,196635,36588
257,1188739,10799,2017-08-06,2017-10-14,True,3,False,2017-08-01,188739,1262
273,1001879,18658071,2017-09-11,2017-09-15,True,2,True,2017-09-01,1879,36848
274,1001879,25785357,2017-08-23,2017-08-26,True,4,False,2017-08-01,1879,40875
275,1001879,22557272,2017-08-15,2017-08-21,True,3,False,2017-08-01,1879,38878
...,...,...,...,...,...,...,...,...,...,...
12913921,1145655,25781157,2017-08-04,2017-08-14,True,3,False,2017-08-01,145655,40870
12914084,1166047,92057,2017-08-20,2017-09-17,True,5,False,2017-08-01,166047,6577
12914085,1166047,168668,2017-08-03,2017-08-19,True,2,False,2017-08-01,166047,9075
12914150,1155073,25300956,2017-08-12,2017-10-14,True,5,False,2017-08-01,155073,40537


In [11]:
# добавляем таргет к кандидатам со значением:
# — 1 для тех item_id, которые пользователь прочитал
# — 0, для всех остальных 

events_labels["target"] = 1
candidates = candidates.merge(events_labels[["user_id", "item_id", "target"]], 
                              on=["user_id", "item_id"], how="left")
candidates["target"] = candidates["target"].fillna(0).astype("int")

# в кандидатах оставляем только тех пользователей, у которых есть хотя бы один положительный таргет
candidates_to_sample = candidates.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

# для каждого пользователя оставляем только 4 негативных примера
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query("target == 1"),
    candidates_to_sample.query("target == 0") \
        .groupby("user_id") \
        .apply(lambda x: x.sample(negatives_per_user, random_state=0))
    ])

In [17]:
candidates_for_train.shape

(213708, 5)

In [18]:
candidates_for_train.head()

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
615,1000006,29868610,0.286715,,1
632,1000006,7445,0.230529,,1
649,1000006,18812405,0.178382,,1
1998,1000019,37415,0.043595,,1
2302,1000023,7260188,0.598791,,1


In [62]:
candidates_for_train[(candidates_for_train['als_score'].isna())
                    & (candidates_for_train['cnt_score'].isna())]

Unnamed: 0,user_id,item_id,als_score,cnt_score,target


In [63]:
candidates[(candidates['als_score'].isna())
                    & (candidates['cnt_score'].isna())]

Unnamed: 0,user_id,item_id,als_score,cnt_score,target


In [66]:
events_labels.groupby('user_id')['item_id'].count().mean()

2.5414876463459826

In [67]:
candidates.groupby('user_id')['item_id'].count().mean()

192.74497253736197

### Обучение модели
Выше мы подготовили все нужные артефакты для обучения модели ранжирования. Выполните код, чтобы обучить модель.

In [19]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score']
target = 'target'

# Create the Pool object
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# тренируем модель
cb_model.fit(train_data) 

0:	learn: 0.6526057	total: 77.9ms	remaining: 1m 17s
100:	learn: 0.5118959	total: 1.76s	remaining: 15.7s
200:	learn: 0.5111710	total: 4.54s	remaining: 18s
300:	learn: 0.5105208	total: 7.21s	remaining: 16.8s
400:	learn: 0.5100174	total: 9.05s	remaining: 13.5s
500:	learn: 0.5095747	total: 11.8s	remaining: 11.8s
600:	learn: 0.5091600	total: 13.8s	remaining: 9.17s
700:	learn: 0.5087803	total: 15.7s	remaining: 6.71s
800:	learn: 0.5084220	total: 18.7s	remaining: 4.64s
900:	learn: 0.5080930	total: 21s	remaining: 2.31s
999:	learn: 0.5078081	total: 22.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fdba6074580>

In [21]:
cb_model.save_model('model.cb')

### Подготовка кандидатов для рекомендаций
Представим, что натренированная модель используется только некоторое время спустя, когда уже появились новые рекомендации (кандидаты) от базовых генераторов, обученных на объединении событий из events_train и events_label. Иными словами, когда события из events_label уже стали частью тренировочного набора данных. Эти новые рекомендации были заранее подготовлены и сохранены в файлах als_recommendations.parquet и content_recommendations.parquet в директории candidates/inference. Используем их для составления нового списка кандидатов candidates_to_rank, который понадобится готовой ранжирующей модели. 

In [15]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")

candidates_to_rank = pd.merge(
    als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer")

# оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id"].drop_duplicates())]
print(len(candidates_to_rank))

14517152


### Ранжирование кандидатов для рекомендаций
Применим обученную ранжирующую модель к кандидатам для рекомендаций. Таргет уже не нужен, поскольку мы будем применять модель в режиме инференса.

Задание 5 из 6

Дополните код для того, чтобы вызвать модель и получить для каждого пользователя топ-100 рекомендаций — значение rank нужно выставить не более ста.

In [11]:
cb_model = CatBoostClassifier().load_model('model.cb')

In [27]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставляем rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [29]:
final_recommendations

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank
347,1000003,49628,0.446143,0.906649,0.583617,1
300,1000003,7260188,1.129979,,0.509005,2
301,1000003,6148028,1.123475,,0.509005,3
302,1000003,2767052,1.112699,,0.509005,4
320,1000003,43641,0.617602,,0.477032,5
...,...,...,...,...,...,...
43058095,1430580,23705512,0.016477,,0.232760,96
43058096,1430580,6314763,0.016404,,0.232760,97
43058097,1430580,11710373,0.016035,,0.221228,98
43058098,1430580,7445,0.015793,,0.221228,99


### Валидация

Для валидации применимы всё те же метрики из предыдущих уроков. Считать их можно, как мы уже упоминали, как для базовых генераторов, так и для ранжирующей модели.
Подобным же образом при валидации можно визуально просматривать как рекомендации от базовых моделей, так и финальные. Такой ручной просмотр по пользователям может подсказать, какой смысловой вклад вносят те или иные кандидатогенераторы в итоговые рекомендации.

Задание 6 из 6

Посчитайте метрики recall и precision.

Используйте полученные рекомендации final_recommendations, отложенную тестовую выборку events_test_2, созданные в уроке «Валидация» предыдущей темы.
А также функции process_events_recs_for_binary_metrics и compute_cls_metrics.

In [36]:
events_inference = pd.concat([events_train, events_labels])

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")

Common users: 75194
precision: 0.006, recall: 0.015


In [38]:
final_recommendations

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank
347,1000003,49628,0.446143,0.906649,0.583617,1
300,1000003,7260188,1.129979,,0.509005,2
301,1000003,6148028,1.123475,,0.509005,3
302,1000003,2767052,1.112699,,0.509005,4
320,1000003,43641,0.617602,,0.477032,5
...,...,...,...,...,...,...
43058095,1430580,23705512,0.016477,,0.232760,96
43058096,1430580,6314763,0.016404,,0.232760,97
43058097,1430580,11710373,0.016035,,0.221228,98
43058098,1430580,7445,0.015793,,0.221228,99


# === Двухстадийный подход: построение признаков

### Признаки объектов

In [12]:
items["age"] = 2018 - items["publication_year"]
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")

In [18]:
items.head()

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,...,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,item_id_enc,age
3,6066819,Jennifer Weiner,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,"{'Womens Fiction-Chick Lit': 739, 'Fiction': 442}",368.0,3.49,51184,3282,Atria Books,...,US,eng,Hardcover,False,0743294297,9780743294294,"{'Academic': None, 'Academic-Academia': None, ...","Womens Fiction-Chick Lit 739, Fiction 442",23133,9.0
6,378460,Michael Halberstam,The Wanting of Levine,,"{'Politics': 1, 'Humor': 1}",,4.38,12,4,Berkley Publishing Group,...,US,,Paperback,False,0425040887,9780425040881,"{'Academic': None, 'Academic-Academia': None, ...","Politics 1user, Humor 1user",12687,39.0
15,89375,"Don Piper, Cecil Murphey",90 Minutes in Heaven: A True Story of Death an...,As he is driving home from a minister's confer...,"{'Christian': 395, 'Nonfiction': 392, 'Religio...",,3.91,68157,2885,,...,US,,,False,0800759494,9780800759490,"{'Academic': None, 'Academic-Academia': None, ...","Christian 395, Nonfiction 392, Religion 142, S...",6460,
16,89376,Randy Alcorn,Heaven,What is Heaven really going to be like? What w...,"{'Christian': 225, 'Religion-Theology': 154, '...",533.0,4.26,7345,566,,...,US,eng,,False,0842379428,9780842379427,"{'Academic': None, 'Academic-Academia': None, ...","Christian 225, Religion-Theology 154, Nonficti...",6461,
17,89377,Jennifer L. Holm,Penny from Heaven,It's 1953 and 11-year-old Penny dreams of a su...,"{'Historical-Historical Fiction': 284, 'Childr...",288.0,3.98,6949,615,Random House Books for Young Readers,...,US,,Hardcover,False,037583687X,9780375836879,"{'Academic': None, 'Academic-Academia': None, ...","Historical-Historical Fiction 284, Childrens-M...",6462,12.0


In [13]:
candidates_for_train

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
615,1000006,29868610,0.286715,,1
632,1000006,7445,0.230529,,1
649,1000006,18812405,0.178382,,1
1998,1000019,37415,0.043595,,1
2302,1000023,7260188,0.598791,,1
...,...,...,...,...,...
"(1430579, 82992597)",1430579,15698462,,0.900922,0
"(1430584, 43058418)",1430584,18774964,0.222126,,0
"(1430584, 82993064)",1430584,8393104,,0.795215,0
"(1430584, 82993001)",1430584,24929,,0.847833,0


In [16]:
candidates_to_rank

Unnamed: 0,user_id,item_id,als_score,cnt_score
300,1000003,7260188,1.129979,
301,1000003,6148028,1.123475,
302,1000003,2767052,1.112699,
303,1000003,9361589,1.060634,
304,1000003,9969571,0.903286,
...,...,...,...,...
83152908,1430500,31202835,,0.902576
83152909,1430500,31423133,,0.902539
83152910,1430500,32041114,,0.901478
83152911,1430500,32078335,,0.901235


In [20]:
candidates_for_train = candidates_for_train.merge(items[['item_id', 'age', 'average_rating']], how='left')
candidates_to_rank = candidates_to_rank.merge(items[['item_id', 'age', 'average_rating']], how='left')

In [21]:
candidates_to_rank['age'].describe()

count    1.263090e+07
mean     9.922216e+00
std      9.044382e+00
min      0.000000e+00
25%      4.000000e+00
50%      7.000000e+00
75%      1.300000e+01
max      2.005000e+03
Name: age, dtype: float64

### Признаки пользователей

In [22]:
events_train.head()

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_enc,item_id_enc
0,1229132,22034,2015-07-12,2015-07-17,True,5,False,2015-07-01,229132,2460
1,1229132,22318578,2015-06-07,2015-08-09,True,5,True,2015-06-01,229132,38691
2,1229132,22551730,2015-06-24,2015-07-11,True,4,True,2015-06-01,229132,38867
3,1229132,22816087,2015-09-27,2015-11-04,True,5,True,2015-09-01,229132,39109
5,1229132,17910054,2015-03-04,2015-07-28,True,3,False,2015-03-01,229132,35638


In [30]:
def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=("item_id", "count"),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features
    
user_features_for_train = get_user_features(events_train)
candidates_for_train = candidates_for_train.merge(user_features_for_train, on="user_id", how="left")
  
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test["user_id"].drop_duplicates())]

user_features_for_ranking = get_user_features(events_inference)
candidates_to_rank = candidates_to_rank.merge(user_features_for_ranking, on="user_id", how="left")

In [38]:
candidates_for_train['books_read'].describe()

count    213222.000000
mean         51.867289
std          61.665885
min           1.000000
25%          13.000000
50%          32.000000
75%          67.000000
max        1280.000000
Name: books_read, dtype: float64

In [42]:
genres

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Academic,152.0,3.317352e-06
1,Academic-Academia,123.0,2.684436e-06
2,Academic-College,19712.0,4.302082e-04
3,Academic-Grad School,31.0,6.765653e-07
4,Academic-Read For School,10199.0,2.225900e-04
...,...,...,...
810,k,767.0,1.673953e-05
811,nd Grade,104.0,2.269767e-06
812,st Century,225.0,4.910554e-06
813,st Grade,32.0,6.983900e-07


In [109]:
# определяем индексы топ-10 жанров и всех остальных
genres_top_k = 10
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))

genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"
genre_columns = list(genres_top_columns) + [genres_others_column]

# составляем таблицу принадлежности книг к жанрам
item_genres = (
    pd.concat([
        # топ жанров
        pd.DataFrame(all_items_genres_csr[:, genres_top_idx].toarray(), columns=genres_top_columns),
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)

# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items = items.merge(item_genres, on="item_id_enc", how="left")

def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres
    
user_genres_for_train = get_user_genres(events_train, items, genre_columns)
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

user_genres_for_ranking = get_user_genres(events_inference, items, genre_columns)
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left")

In [110]:
candidates_for_train

Unnamed: 0,user_id,item_id,als_score,cnt_score,target,age,average_rating,reading_years,books_read,rating_avg,...,genre_308,genre_132,genre_800,genre_640,genre_535,genre_378,genre_512,genre_665,genre_296,genre_others
0,1000006,29868610,0.286715,,1,,3.90,1.820671,17.0,4.294118,...,0.246138,0.105182,0.057684,0.010375,0.078927,0.004294,0.021665,0.008603,0.000000,0.286282
1,1000006,7445,0.230529,,1,12.0,4.24,1.820671,17.0,4.294118,...,0.246138,0.105182,0.057684,0.010375,0.078927,0.004294,0.021665,0.008603,0.000000,0.286282
2,1000006,18812405,0.178382,,1,4.0,3.81,1.820671,17.0,4.294118,...,0.246138,0.105182,0.057684,0.010375,0.078927,0.004294,0.021665,0.008603,0.000000,0.286282
3,1000019,37415,0.043595,,1,12.0,3.87,0.276523,6.0,4.166667,...,0.158224,0.000000,0.000000,0.000000,0.195082,0.000000,0.000000,0.082617,0.000000,0.514445
4,1000023,7260188,0.598791,,1,8.0,4.03,0.005476,2.0,3.500000,...,0.170366,0.000000,0.159612,0.019622,0.000000,0.000000,0.000000,0.077326,0.000000,0.254950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213703,1430579,15698462,,0.900922,0,6.0,3.60,6.499658,264.0,3.928030,...,0.069304,0.007392,0.271108,0.080489,0.017241,0.007644,0.027871,0.021434,0.020003,0.300587
213704,1430584,18774964,0.222126,,0,4.0,4.35,2.899384,11.0,3.454545,...,0.224713,0.017886,0.061621,0.047617,0.084946,0.020774,0.041339,0.002553,0.001796,0.339348
213705,1430584,8393104,,0.795215,0,8.0,3.64,2.899384,11.0,3.454545,...,0.224713,0.017886,0.061621,0.047617,0.084946,0.020774,0.041339,0.002553,0.001796,0.339348
213706,1430584,24929,,0.847833,0,16.0,2.81,2.899384,11.0,3.454545,...,0.224713,0.017886,0.061621,0.047617,0.084946,0.020774,0.041339,0.002553,0.001796,0.339348


In [112]:
romance_id = genres[genres['name'] == 'Romance'].index[0]
romance_col = f"genre_{romance_id}"

In [113]:
romance_col

'genre_640'

In [116]:
genres_top_idx

Index([285, 308, 132, 800, 640, 535, 378, 512, 665, 296], dtype='int64', name='genre_id')

In [115]:
candidates_for_train[romance_col].describe()

count    213222.000000
mean          0.063086
std           0.075779
min           0.000000
25%           0.009309
50%           0.038489
75%           0.088928
max           0.635226
Name: genre_640, dtype: float64

### Обучение и получение рекомендаций

Вы добавили в candidates_for_train и candidates_to_rank различные признаки. Обучите новую ранжирующую модель, которая их будет учитывать.

#### Задание 4 из 6
Обучите модель, выполнив код ниже:

In [117]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score', 
    'age', 'average_rating', 'reading_years', 'books_read', 
    'rating_avg', 'rating_std', 
    'books_per_year'] + genre_columns
target = 'target'

# создаём Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# тренируем модель
cb_model.fit(train_data)

0:	learn: 0.6485043	total: 83ms	remaining: 1m 22s
100:	learn: 0.4665268	total: 2.66s	remaining: 23.7s
200:	learn: 0.4578610	total: 5.76s	remaining: 22.9s
300:	learn: 0.4518428	total: 8.88s	remaining: 20.6s
400:	learn: 0.4471687	total: 11.8s	remaining: 17.6s
500:	learn: 0.4429055	total: 15.1s	remaining: 15s
600:	learn: 0.4390647	total: 18.1s	remaining: 12s
700:	learn: 0.4355651	total: 21.5s	remaining: 9.19s
800:	learn: 0.4321506	total: 24.2s	remaining: 6s
900:	learn: 0.4288350	total: 26.7s	remaining: 2.93s
999:	learn: 0.4257325	total: 29.2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fdcee0093f0>

In [118]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = cb_model.predict_proba(inference_data)[:, 1]

# для каждого пользователя проставим rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [120]:
final_recommendations['user_id'].nunique()

75194

In [121]:
final_recommendations.to_parquet("final_recommendations_feat.parquet") 

Итак, вы получили рекомендации, которые уже должны учитывать не только оценки от базовых генераторов als_score и cnt_score, но и информацию, заложенную в признаках. Посмотрим, помогло ли это повысить качество рекомендаций по метрике recall, по которой вы уже оценивали результаты работы модели в прошлом уроке после внедрения двухстадийного подхода. Напомним, что тогда получилось значение 0.016. 
#### Задание 5 из 6
Используя отложенную тестовую выборку events_test_2, посчитайте метрики recall и precision для полученных рекомендаций.

In [126]:
# для экономии ресурсов оставим события только тех пользователей, 
# для которых следует оценить рекомендации
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test_2["user_id"].drop_duplicates())]

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")

Common users: 75194
precision: 0.011, recall: 0.030


### Проверка важности признаков
Любопытно понять, какие признаки вносят наибольший вклад в ранжирование. Алгоритм CatBoost позволяет получить такую информацию (англ. feature importance), которая генерируется во время тренировки модели. Для этого используйте метод get_feature_importance(). 
#### Задание 6
Выполните код для получения информации о важности признаков. Выведите список признаков feature_importance в порядке убывания их важности.

In [1]:
feature_importance = pd.DataFrame(cb_model.get_feature_importance(), 
    index=features, 
    columns=["fi"])
feature_importance = feature_importance.sort_values(by="fi", ascending=False)

print(feature_importance )