# Initialization

In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [5]:
items = pd.read_parquet("goodsread/items.par")
events = pd.read_parquet("goodsread/events.par")

events.rename(columns={'book_id':'item_id'}, inplace=True)

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

# === Знакомство: первые персональные рекомендации

In [6]:
events[['user_id', 'item_id', 'rating']]

Unnamed: 0,user_id,item_id,rating
0,8842281e1d1347389f2ab93d60773d4d,22034,5
1,8842281e1d1347389f2ab93d60773d4d,22318578,5
2,8842281e1d1347389f2ab93d60773d4d,22551730,4
3,8842281e1d1347389f2ab93d60773d4d,22816087,5
4,8842281e1d1347389f2ab93d60773d4d,17910054,3
...,...,...,...
11836277,d890e8079d8896e0cc6c4f178634850a,14836,3
11836278,d0f6d1a4edcab80a6010cfcfeda4999f,14935,5
11836279,b0883ebf8e31731f1c5d91e678c26666,10210,5
11836280,b0883ebf8e31731f1c5d91e678c26666,99561,4


In [7]:
user_counts = events['user_id'].value_counts()
uids = user_counts[user_counts >= 2].index
filtered_events = events[events['user_id'].isin(uids)]
filtered_events.sort_values(by='user_id', inplace=True)

cols_to_rename = {'book_id': 'item_id'}
filtered_events.rename(columns=cols_to_rename, inplace=True)
items.rename(columns=cols_to_rename, inplace=True)

unique_users = list(filtered_events['user_id'].unique())
ids_mapping = {user:idx for idx, user in enumerate(unique_users, start=1_000_000)}

filtered_events['user_id'] = filtered_events['user_id'].map(ids_mapping)
filtered_events.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_events.sort_values(by='user_id', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_events.rename(columns=cols_to_rename, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_events['user_id'] = filtered_events['user_id'].map(ids_mapping)


In [None]:
train_test_global_time_split_date = pd.to_datetime('2017-08-01').date()

train_test_global_time_split_idx = filtered_events['started_at'] < train_test_global_time_split_date
events_train = filtered_events[train_test_global_time_split_idx]
events_test = filtered_events[~train_test_global_time_split_idx]

In [9]:
print('Sparsity of U-I-matrix is: ', \
      (filtered_events.user_id.nunique()*filtered_events.item_id.nunique() - \
       len(filtered_events))/(filtered_events.user_id.nunique()*filtered_events.item_id.nunique()))

Sparsity of U-I-matrix is:  0.9993451160571009


In [8]:
from surprise import Dataset, Reader
from surprise import SVD 

reader = Reader(rating_scale=(1,5))
surprise_train_set = Dataset.load_from_df(events_train[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

In [9]:
svd_model = SVD(n_factors=100, random_state=0)
svd_model.fit(surprise_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcc7b1b8ee0>

In [13]:
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))
svd_predictions = svd_model.test(surprise_test_set)

In [14]:
from surprise import accuracy 

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)

print(rmse, mae)

RMSE: 0.8263
MAE:  0.6460
0.826346375350908 0.6460143973270805


In [15]:
from surprise import NormalPredictor

np.random.seed(0)
random_model = NormalPredictor()

random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set) 

rmse = accuracy.rmse(random_predictions)
mae = accuracy.mae(random_predictions)

print(rmse, mae)

RMSE: 1.2590
MAE:  0.9982
1.2590325375790072 0.9982060614713011


# === Факультативное задание ===

In [37]:
n_interactions = filtered_events['item_id'].value_counts()
used_ids = set(n_interactions[n_interactions > 3].index)

In [38]:
interacted_events = filtered_events[filtered_events['item_id'].isin(used_ids)].copy().reset_index(drop=True)
interacted_events.head(2)

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month
0,1000000,11012,2015-12-05,2015-12-11,True,4,False,2015-12-01
1,1000000,4671,2014-06-05,2014-06-30,True,5,False,2014-06-01


In [42]:
train_test_global_time_split_date = pd.to_datetime('2017-08-01').date()
time_split_idx = interacted_events['started_at'] < train_test_global_time_split_date

interacted_events_train = interacted_events[time_split_idx]
interacted_events_test = interacted_events[~time_split_idx]

In [44]:
from surprise import Dataset, Reader 
from surprise import SVD 

reader = Reader(rating_scale=(1,5))
surprise_interacted_train_set = Dataset.load_from_df(interacted_events_train[['user_id',
                                                                              'item_id',
                                                                              'rating']], reader)
surprise_interacted_train_set = surprise_interacted_train_set.build_full_trainset()

In [46]:
svd_model_new = SVD(n_factors=100, random_state=0)
svd_model_new.fit(surprise_interacted_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcb819a2260>

In [None]:
surprise_interacted_test_set = list(interacted_events_test[['user_id', 
                                                            'item_id', 
                                                            'rating']].itertuples(index=False))
interacted_svd_predictions = svd_model.test(surprise_interacted_test_set)

In [None]:
from surprise import accuracy 

rmse = accuracy.rmse(interacted_svd_predictions)
mae = accuracy.mae(interacted_svd_predictions)

print(rmse, mae)

RMSE: 0.8262
MAE:  0.6459
0.8262391648389367 0.6459146378103414


# === Продолжаем ===

In [None]:
svd_model.predict(events_test[['user_id', 'item_id']])

In [54]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=5):
    
    all_items = set(events['item_id'].unique())
    if include_seen:
        items_to_predict = list(all_items)
    else:
        seen_items = set(events[events['user_id'] == user_id]['item_id'].unique())
        items_to_predict = list(all_items - seen_items)
    
    predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]

    return pd.DataFrame([(pred.iid, pred.est) for pred in recommendations], columns=['book_id', 'score'])

In [59]:
get_recommendations_svd(1296647, items, events_train, svd_model, include_seen=False)

Unnamed: 0,book_id,score
0,24812,5.0
1,8471387,5.0
2,481749,5.0
3,30688013,4.9969
4,1108124,4.979711


In [None]:
user_id = events_train['user_id'].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], left_on='book_id', right_on="item_id")
display(user_recommendations)

user_id: 1420211
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
16,Julie Buxbaum,Tell Me Three Things,2017-06-27,2017-06-27,5,"{'Young Adult': 1367, 'Contemporary': 810, 'Ro..."
17,Rick Yancey,"The 5th Wave (The 5th Wave, #1)",2015-09-28,2015-10-03,5,"{'Young Adult': 5331, 'Science Fiction': 3197,..."
18,Lauren Kate,"Rapture (Fallen, #4)",2013-01-08,2013-04-15,3,"{'Young Adult': 1287, 'Fantasy': 1066, 'Parano..."
19,Lauren Kate,"Passion (Fallen, #3)",2013-01-07,2013-01-08,4,"{'Young Adult': 1781, 'Fantasy': 1396, 'Parano..."
20,Nicholas Sparks,The Best of Me,2013-01-04,2013-01-07,4,"{'Romance': 1540, 'Fiction': 676, 'Womens Fict..."
21,Marie Lu,"Legend (Legend, #1)",2013-01-02,2013-01-04,3,"{'Young Adult': 6453, 'Science Fiction-Dystopi..."
22,Beth Revis,"Across the Universe (Across the Universe, #1)",2012-12-22,2012-12-30,3,"{'Young Adult': 3060, 'Science Fiction': 2387,..."
23,Lauren Kate,"Torment (Fallen, #2)",2013-01-04,2013-01-07,3,"{'Young Adult': 2344, 'Fantasy': 1874, 'Parano..."
24,Kathryn Stockett,The Help,2011-12-14,2011-12-26,3,"{'Fiction': 12016, 'Historical-Historical Fict..."
25,Amanda Hocking,"Hollowland (The Hollows, #1)",2011-12-13,2011-12-14,4,"{'Horror-Zombies': 569, 'Young Adult': 381, 'S..."


Рекомендации


Unnamed: 0,book_id,score,item_id,author,title,genre_and_votes
0,24812,5.0,24812,Bill Watterson,The Complete Calvin and Hobbes,"{'Sequential Art-Comics': 867, 'Humor': 378, '..."
1,481749,4.997968,481749,James E. Talmage,Jesus the Christ,"{'Religion': 451, 'Christianity-Lds': 256, 'No..."
2,9810317,4.963374,9810317,"Margaret Mitchell, Pat Conroy",Gone with the Wind,"{'Classics': 13093, 'Historical-Historical Fic..."
3,30688013,4.952521,30688013,Robin Hobb,"Assassin's Fate (The Fitz and the Fool, #3)","{'Fantasy': 1657, 'Fiction': 172, 'Fantasy-Epi..."
4,29844341,4.951126,29844341,"John Lewis, Andrew Aydin, Nate Powell",March,"{'Sequential Art-Graphic Novels': 48, 'Nonfict..."


# === Базовые подходы: коллаборативная фильтрация

In [1]:
import warnings
warnings.filterwarnings('ignore')

import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

In [2]:
items = pd.read_parquet("goodsread/items.par")
events = pd.read_parquet("goodsread/events.par")

events.rename(columns={'book_id':'item_id'}, inplace=True)

user_counts = events['user_id'].value_counts()
uids = user_counts[user_counts >= 2].index
filtered_events = events[events['user_id'].isin(uids)]
filtered_events.sort_values(by='user_id', inplace=True)

cols_to_rename = {'book_id': 'item_id'}
filtered_events.rename(columns=cols_to_rename, inplace=True)
items.rename(columns=cols_to_rename, inplace=True)

unique_users = list(filtered_events['user_id'].unique())
ids_mapping = {user:idx for idx, user in enumerate(unique_users, start=1_000_000)}

filtered_events['user_id'] = filtered_events['user_id'].map(ids_mapping)
filtered_events.reset_index(drop=True, inplace=True)

In [3]:
train_test_global_time_split_date = pd.to_datetime('2017-08-01').date()

train_test_global_time_split_idx = filtered_events['started_at'] < train_test_global_time_split_date
events_train = filtered_events[train_test_global_time_split_idx]
events_test = filtered_events[~train_test_global_time_split_idx]

In [4]:
import scipy
import sklearn.preprocessing 

user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(filtered_events['user_id'])

events_train['user_id_enc'] = user_encoder.transform(events_train['user_id'])
events_test['user_id_enc'] = user_encoder.transform(events_test['user_id'])

item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items['item_id'])
items['item_id_enc'] = item_encoder.transform(items['item_id'])
events_train['item_id_enc'] = item_encoder.transform(events_train['item_id'])
events_test['item_id_enc'] = item_encoder.transform(events_test['item_id'])

In [5]:
np.max(events_train['item_id_enc'])

43304

In [26]:
user_item_matrix_train = scipy.sparse.csr_matrix(
    (
        events_train['rating'],
        (events_train['user_id_enc'], events_train['item_id_enc'])
    ),
    dtype=np.int8
)

In [27]:
import sys
sum([sys.getsizeof(i) for i in user_item_matrix_train.data])/1024**3

0.26370687410235405

In [28]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50,
                                    iterations=50, 
                                    regularization=0.05,
                                    random_state=0)
als_model.fit(user_item_matrix_train)

100%|██████████| 50/50 [03:31<00:00,  4.23s/it]


In [29]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
        user_id_enc,
        user_item_matrix[user_id_enc],
        filter_already_liked_items = not include_seen,
        N=n)
    recommendations = pd.DataFrame({'item_id_enc':recommendations[0],
                                    'score':recommendations[1]})
    recommendations['item_id'] = item_encoder.inverse_transform(recommendations["item_id_enc"])

    return recommendations

In [30]:
user_item_matrix_test = scipy.sparse.csr_matrix(
    (
        events_test['rating'],
        (events_test['user_id_enc'], events_test['item_id_enc'])
    ),
    dtype=np.int8
)

In [31]:
random_user = events_test.sample(1)['user_id'].iat[0]

reki = get_recommendations_als(user_item_matrix=user_item_matrix_test,
                               model=als_model,
                               user_id=random_user,
                               user_encoder=user_encoder,
                               item_encoder=item_encoder,
                               n=5)

In [32]:
history = filtered_events[filtered_events['user_id'] == random_user]
print('История прочтения пользователя: ')
history.merge(items[['item_id', 'author', 'title', 'genre_and_votes']], on='item_id').head(3)

История прочтения пользователя: 


Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,author,title,genre_and_votes
0,1187393,27405006,2017-05-24,2017-05-28,True,3,False,2017-05-01,Elan Mastai,All Our Wrong Todays,"{'Science Fiction': 749, 'Fiction': 553, 'Scie..."
1,1187393,65110,2017-01-26,2017-01-26,True,4,False,2017-01-01,"Lemony Snicket, Brett Helquist",The Penultimate Peril (A Series of Unfortunate...,"{'Fiction': 964, 'Young Adult': 791, 'Children..."
2,1187393,27071490,2017-01-25,2017-02-05,True,5,False,2017-01-01,Yaa Gyasi,Homegoing,"{'Historical-Historical Fiction': 3664, 'Ficti..."


In [33]:
print('Рекомендации пользователю: ')
reki.merge(items[['item_id', 'author', 'title', 'genre_and_votes']], on='item_id')

Рекомендации пользователю: 


Unnamed: 0,item_id_enc,score,item_id,author,title,genre_and_votes
0,35810,0.459273,18007564,Andy Weir,The Martian,"{'Science Fiction': 11966, 'Fiction': 8430}"
1,37497,0.41423,20170404,Emily St. John Mandel,Station Eleven,"{'Fiction': 5860, 'Science Fiction': 3274, 'Sc..."
2,38569,0.343319,22055262,V.E. Schwab,"A Darker Shade of Magic (Shades of Magic, #1)","{'Fantasy': 14539, 'Young Adult': 2101, 'Ficti..."
3,35259,0.328718,17675462,Maggie Stiefvater,"The Raven Boys (The Raven Cycle, #1)","{'Fantasy': 9481, 'Young Adult': 7067, 'Fantas..."
4,36086,0.283066,18143977,Anthony Doerr,All the Light We Cannot See,"{'Historical-Historical Fiction': 13679, 'Fict..."


# === Базовые подходы: контентные рекомендации

In [5]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43312 entries, 0 to 43311
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   item_id               43312 non-null  int64  
 1   author                43312 non-null  object 
 2   title                 43312 non-null  object 
 3   description           43312 non-null  object 
 4   genre_and_votes       43312 non-null  object 
 5   num_pages             37001 non-null  Int64  
 6   average_rating        43312 non-null  float64
 7   ratings_count         43312 non-null  Int64  
 8   text_reviews_count    43312 non-null  int64  
 9   publisher             43312 non-null  object 
 10  publication_year      35891 non-null  Int64  
 11  country_code          43312 non-null  object 
 12  language_code         43312 non-null  object 
 13  format                43312 non-null  object 
 14  is_ebook              43312 non-null  bool   
 15  isbn               

In [6]:
items['genre_and_votes'] = items['genre_and_votes'].apply(eval)

In [7]:
def get_genres(items):
    genres_counter = {}

    for idx, row in items.iterrows():
        genre_and_votes = row['genre_and_votes']
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            try:
                genres_counter[genre] += votes
            except KeyError:
                genres_counter[genre] = 0
    
    genres = pd.Series(genres_counter, name='votes')
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = 'genre_id'

    return genres

genres = get_genres(items)

In [8]:
genres['score'] = genres['votes']/genres['votes'].sum()
genres.sort_values(by='score', ascending=False).head(10)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,Fantasy,6850060,0.149651
1,Fiction,6406256,0.139955
38,Classics,3414934,0.074605
18,Young Adult,3296951,0.072027
34,Romance,2422614,0.052926
5,Nonfiction,1737406,0.037957
16,Historical-Historical Fiction,1531205,0.033452
20,Mystery,1371196,0.029956
24,Science Fiction,1218917,0.026629
33,Fantasy-Paranormal,857012,0.018723


In [9]:
import scipy

def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict() # словарь id:название жанра
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr

In [10]:
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [12]:
user_id = 1000010
user_events = events_train.query("user_id == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, user_items)
user_items_genres_csr 

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

In [13]:
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0)) 

In [14]:
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0)) 

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)
similarity_scores = similarity_scores.flatten()

k = 5
top_k_indices = (-similarity_scores).argsort()[:k]

In [24]:
top_k_indices

array([ 4471,  1120, 14087,  4460,  1988])

In [25]:
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]])

Unnamed: 0,author,title,genre_and_votes
10541,Ray Bradbury,"Farewell Summer (Green Town, #3)","{'Fiction': 170, 'Fantasy': 72, 'Science Fiction': 72, 'Classics': 52}"
25073,John Fowles,The Magus,"{'Fiction': 1204, 'Classics': 421, 'Fantasy': 228, 'Mystery': 203, 'Literature': 167}"
1555,G.K. Chesterton,The Napoleon of Notting Hill,"{'Fiction': 166, 'Classics': 88, 'Fantasy': 44, 'Humor': 22, 'Literature': 20}"
21241,Ray Bradbury,"Dandelion Wine (Green Town, #1)","{'Fiction': 1438, 'Classics': 914, 'Science Fiction': 529, 'Fantasy': 456, 'Young Adult': 212}"
41250,Samuel Butler,"Erewhon (Erewhon , #1)","{'Fiction': 162, 'Classics': 139, 'Science Fiction': 60, 'Fantasy': 55}"


# === Базовые подходы: валидация

In [36]:
user_ids_encoded = range(len(user_encoder.classes_))

In [38]:
als_recommendations = als_model.recommend(user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100) 

In [53]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"])

In [25]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [61]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(events_train,
                                                                        events_test, 
                                                                        als_recommendations, 
                                                                        top_k=5)

Common users: 123223


In [26]:
def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper['tp'].sum()/(groupper['tp'].sum()+groupper['fn'].sum())
    recall = recall.fillna(0).mean()

    return precision, recall

In [64]:
compute_cls_metrics(events_recs_for_binary_metrics)

(0.007581376853347184, 0.014121568795222568)

### === Факультативное задание ===

In [None]:
events_recs_for_binary_metrics_at10 = process_events_recs_for_binary_metrics(events_train,
                                                                        events_test, 
                                                                        als_recommendations, 
                                                                        top_k=10)

compute_cls_metrics(events_recs_for_binary_metrics_at10)

Common users: 123223


(0.008732947582837622, 0.03130238527136974)

In [66]:
events_recs_for_binary_metrics_at100 = process_events_recs_for_binary_metrics(events_train,
                                                                        events_test, 
                                                                        als_recommendations, 
                                                                        top_k=100)

compute_cls_metrics(events_recs_for_binary_metrics_at100)

Common users: 123223


(0.0065323843762933875, 0.20423433248344025)

# === Двухстадийный подход: метрики

# === Двухстадийный подход: модель

In [12]:
split_date_for_labels = pd.to_datetime('2017-09-15').date()
split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels

events_labels = events_test[split_date_for_labels_idx].copy()
events_test_2 = events_test[~split_date_for_labels_idx].copy()

In [13]:
als_recommendations = pd.read_parquet("candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("candidates/training/content_recommendations.parquet")

In [14]:
candidates = pd.merge(
    als_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=['user_id', 'item_id'],
    how="outer")

In [15]:
candidates.head()

Unnamed: 0,user_id,item_id,als_score,cnt_score
0,1000000,3,0.972557,0.920225
1,1000000,15881,0.890201,0.90574
2,1000000,5,0.86585,0.918026
3,1000000,6,0.834282,0.916345
4,1000000,2,0.792929,0.925806


In [16]:
events_labels["target"] = 1
candidates = candidates.merge(events_labels[["user_id", "item_id", "target"]], on=['user_id', 'item_id'], how='left')

In [17]:
events_labels

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_enc,item_id_enc,target
288,1000006,29868610,2017-08-30,2017-09-16,True,4,False,2017-08-01,6,42017,1
291,1000006,7445,2017-08-26,2017-08-30,True,4,False,2017-08-01,6,868,1
296,1000006,18812405,2017-08-05,2017-08-19,True,3,False,2017-08-01,6,37138,1
302,1000007,168642,2017-09-05,2017-09-17,True,5,True,2017-09-01,7,9074,1
303,1000007,28257707,2017-08-16,2017-09-06,True,4,True,2017-08-01,7,41610,1
...,...,...,...,...,...,...,...,...,...,...,...
11750633,1430578,216378,2017-08-27,2017-08-28,True,4,False,2017-08-01,430578,10066,1
11750634,1430578,6723348,2017-08-02,2017-08-03,True,4,False,2017-08-01,430578,24681,1
11750800,1430579,27272506,2017-09-03,2017-10-07,True,3,True,2017-09-01,430579,41386,1
11751080,1430584,23395680,2017-08-09,2017-08-15,True,4,True,2017-08-01,430584,39645,1


In [18]:
candidates["target"] = candidates["target"].fillna(0).astype("int")

In [19]:
candidates

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
0,1000000,3,0.972557,0.920225,0
1,1000000,15881,0.890201,0.905740,0
2,1000000,5,0.865850,0.918026,0
3,1000000,6,0.834282,0.916345,0
4,1000000,2,0.792929,0.925806,0
...,...,...,...,...,...
82993089,1430584,31327371,,0.786363,0
82993090,1430584,32841355,,0.784905,0
82993091,1430584,33828743,,0.784706,0
82993092,1430584,34037113,,0.784556,0


In [20]:
candidates['user_id'].nunique()

430585

In [21]:
candidates_to_sample = candidates.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

In [22]:
candidates_to_sample['user_id'].nunique()

38804

In [23]:
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query('target == 1'),
    candidates_to_sample.query("target == 0") \
        .groupby("user_id") \
        .apply(lambda x: x.sample(negatives_per_user, random_state=0))
    ]) 

In [24]:
candidates_for_train

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
615,1000006,29868610,0.286715,,1
632,1000006,7445,0.230529,,1
649,1000006,18812405,0.178382,,1
1998,1000019,37415,0.043595,,1
2302,1000023,7260188,0.598791,,1
...,...,...,...,...,...
"(1430579, 82992597)",1430579,15698462,,0.900922,0
"(1430584, 43058418)",1430584,18774964,0.222126,,0
"(1430584, 82993064)",1430584,8393104,,0.795215,0
"(1430584, 82993001)",1430584,24929,,0.847833,0


In [25]:
del als_recommendations
del content_recommendations
del candidates
del candidates_to_sample

In [26]:
from catboost import CatBoostClassifier, Pool

features = ['als_score', 'cnt_score']
target = 'target'

train_data = Pool(
    data=candidates_for_train[features],
    label=candidates_for_train[target]
)

cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

cb_model.fit(train_data)

0:	learn: 0.6526057	total: 113ms	remaining: 1m 53s
100:	learn: 0.5118959	total: 2.57s	remaining: 22.9s
200:	learn: 0.5111710	total: 4.52s	remaining: 18s
300:	learn: 0.5105208	total: 6.55s	remaining: 15.2s
400:	learn: 0.5100174	total: 8.54s	remaining: 12.8s
500:	learn: 0.5095747	total: 10.6s	remaining: 10.5s
600:	learn: 0.5091600	total: 12.5s	remaining: 8.32s
700:	learn: 0.5087803	total: 14.5s	remaining: 6.2s
800:	learn: 0.5084220	total: 16.5s	remaining: 4.1s
900:	learn: 0.5080930	total: 18.6s	remaining: 2.05s
999:	learn: 0.5078081	total: 21s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fc5231b8700>

In [27]:
als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")

candidates_to_rank = pd.merge(als_recommendations_2.rename(columns={"score": "als_score"}),
                              content_recommendations_2.rename(columns={"score": "cnt_score"}),
                              on=['user_id', 'item_id'],
                              how='outer')

candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test["user_id"].drop_duplicates())]
print(len(candidates_to_rank)) 

23830721


In [22]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

In [23]:
candidates_to_rank["cb_score"] = predictions[:, 1]
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])

candidates_to_rank['rank'] = candidates_to_rank.groupby('user_id').cumcount() + 1
max_recommendations_per_user = 100

final_recommendations = candidates_to_rank.query('rank <= 100').copy().reset_index(drop=True)

In [27]:
events_inference = pd.concat([events_train, events_labels])

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

Common users: 75194


In [28]:
cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}") 

precision: 0.006, recall: 0.015


# === Двухстадийный подход: построение признаков

In [28]:
items['age'] = 2018-items['publication_year']
invalid_age_idx = items['age']<0
items.loc[invalid_age_idx, 'age'] = np.nan 
items['age'] = items['age'].astype('float')

candidates_for_train = candidates_for_train.merge(items[['item_id', 'age']], on=['item_id'], how='left').copy()
candidates_to_rank = candidates_to_rank.merge(items[['item_id', 'age']], on=['item_id'], how='left').copy()

In [29]:
candidates_to_rank['age'].median()

7.0

In [30]:
def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=('item_id', 'nunique'),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features
    
user_features_for_train = get_user_features(events_train)
candidates_for_train = candidates_for_train.merge(user_features_for_train, on="user_id", how="left")
  
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test["user_id"].drop_duplicates())]

user_features_for_ranking = get_user_features(events_inference)
candidates_to_rank = candidates_to_rank.merge(user_features_for_ranking, on="user_id", how="left")


In [31]:
candidates_for_train['books_read'].median()

32.0

In [33]:
genres_top_k = 10
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))

genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"
genre_columns = genres_top_columns + [genres_others_column]

item_genres = (
    pd.concat([
        # топ жанров
        pd.DataFrame(all_items_genres_csr[:, genres_top_idx].toarray(), columns=genres_top_columns),
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)

items = items.merge(item_genres, on="item_id_enc", how="left")

def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres

user_genres_for_train = get_user_genres(events_train, items, genre_columns)
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

user_genres_for_ranking = get_user_genres(events_inference, items, genre_columns)
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left")


In [32]:
del als_recommendations_2
del content_recommendations_2

In [34]:
%whos DataFrame

Variable                    Type         Data/Info
--------------------------------------------------
candidates_for_train        DataFrame            user_id   item_id<...>213708 rows x 22 columns]
candidates_to_rank          DataFrame              user_id   item_<...>830721 rows x 21 columns]
events                      DataFrame                             <...>1836282 rows x 8 columns]
events_inference            DataFrame              user_id   item_<...>488800 rows x 11 columns]
events_labels               DataFrame              user_id   item_<...>253765 rows x 11 columns]
events_test                 DataFrame              user_id   item_<...>424962 rows x 10 columns]
events_test_2               DataFrame              user_id   item_<...>171197 rows x 10 columns]
events_train                DataFrame              user_id   item_<...>326124 rows x 10 columns]
filtered_events             DataFrame              user_id   item_<...>1751086 rows x 8 columns]
genres                   

In [35]:
del events
del events_inference
del events_labels
del events_test
del events_test_2
del events_train
del filtered_events
del user_features_for_ranking
del user_features_for_train
del user_genres_for_ranking
del user_genres_for_train

In [36]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score', 
    'age', 'reading_years', 'books_read', 
    'rating_avg', 'rating_std', 
    'books_per_year'] + genre_columns
target = 'target'

# создаём Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# тренируем модель
cb_model.fit(train_data)

0:	learn: 0.6463996	total: 30.4ms	remaining: 30.4s
100:	learn: 0.4710013	total: 2.73s	remaining: 24.3s
200:	learn: 0.4644069	total: 5.35s	remaining: 21.3s
300:	learn: 0.4595792	total: 7.96s	remaining: 18.5s
400:	learn: 0.4556840	total: 10.6s	remaining: 15.8s
500:	learn: 0.4522164	total: 13.2s	remaining: 13.1s
600:	learn: 0.4489011	total: 15.8s	remaining: 10.5s
700:	learn: 0.4458227	total: 18.4s	remaining: 7.85s
800:	learn: 0.4428398	total: 21.8s	remaining: 5.41s
900:	learn: 0.4401834	total: 26.1s	remaining: 2.87s
999:	learn: 0.4376329	total: 30s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fc556259270>

In [37]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставим rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank['rank'] = candidates_to_rank.groupby('user_id').cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [39]:
final_recommendations['user_id'].nunique()

123223

In [40]:
feature_importance = pd.DataFrame(cb_model.get_feature_importance(), 
    index=features, 
    columns=["fi"])

In [41]:
feature_importance

Unnamed: 0,fi
als_score,31.810044
cnt_score,4.123602
age,21.707668
reading_years,3.017702
books_read,8.889603
rating_avg,1.851159
rating_std,1.196695
books_per_year,2.40852
genre_25,2.570518
genre_1,2.899258
