In [2]:
import warnings
warnings.filterwarnings('ignore')

import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

In [3]:
items = pd.read_parquet("goodsread/items.par")
events = pd.read_parquet("goodsread/events.par")

events.rename(columns={'book_id':'item_id'}, inplace=True)

user_counts = events['user_id'].value_counts()
uids = user_counts[user_counts >= 2].index
filtered_events = events[events['user_id'].isin(uids)]
filtered_events.sort_values(by='user_id', inplace=True)

cols_to_rename = {'book_id': 'item_id'}
filtered_events.rename(columns=cols_to_rename, inplace=True)
items.rename(columns=cols_to_rename, inplace=True)

unique_users = list(filtered_events['user_id'].unique())
ids_mapping = {user:idx for idx, user in enumerate(unique_users, start=1_000_000)}

filtered_events['user_id'] = filtered_events['user_id'].map(ids_mapping)
filtered_events.reset_index(drop=True, inplace=True)

KeyboardInterrupt: 

In [None]:
train_test_global_time_split_date = pd.to_datetime('2017-08-01').date()

train_test_global_time_split_idx = filtered_events['started_at'] < train_test_global_time_split_date
events_train = filtered_events[train_test_global_time_split_idx]
events_test = filtered_events[~train_test_global_time_split_idx]

In [None]:
import scipy
import sklearn.preprocessing 

user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(filtered_events['user_id'])

events_train['user_id_enc'] = user_encoder.transform(events_train['user_id'])
events_test['user_id_enc'] = user_encoder.transform(events_test['user_id'])

item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items['item_id'])
items['item_id_enc'] = item_encoder.transform(items['item_id'])
events_train['item_id_enc'] = item_encoder.transform(events_train['item_id'])
events_test['item_id_enc'] = item_encoder.transform(events_test['item_id'])

In [None]:
user_item_matrix_train = scipy.sparse.csr_matrix(
    (
        events_train['rating'],
        (events_train['user_id_enc'], events_train['item_id_enc'])
    ),
    dtype=np.int8
)

In [None]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50,
                                    iterations=50, 
                                    regularization=0.05,
                                    random_state=0)
als_model.fit(user_item_matrix_train)

100%|██████████| 50/50 [03:35<00:00,  4.32s/it]


In [None]:
train_item_ids_enc = events_train['item_id_enc'].unique()
max_similar_items = 10

similar_items = als_model.similar_items(train_item_ids_enc, N=max_similar_items+1)
sim_item_item_ids_enc = similar_items[0]
sim_item_scores = similar_items[1]

similar_items = pd.DataFrame({
    "item_id_enc": train_item_ids_enc,
    "sim_item_id_enc": sim_item_item_ids_enc.tolist(), 
    "score": sim_item_scores.tolist()})

similar_items_exploded = similar_items.explode(['sim_item_id_enc', 'score'],
                                               ignore_index=True)

# приводим типы данных
similar_items_exploded["sim_item_id_enc"] = similar_items_exploded["sim_item_id_enc"].astype('int')
similar_items_exploded["score"] = similar_items_exploded["score"].astype("float")

similar_items_exploded["item_id_1"] = item_encoder.inverse_transform(similar_items_exploded["item_id_enc"])
similar_items_exploded["item_id_2"] = item_encoder.inverse_transform(similar_items_exploded["sim_item_id_enc"])
similar_items_exploded = similar_items_exploded.drop(columns=["item_id_enc", "sim_item_id_enc"])

similar_items_exploded = similar_items_exploded.query("item_id_1 != item_id_2") 


In [None]:
similar_items

Unnamed: 0,item_id_enc,sim_item_id_enc,score
0,1303,"[1303, 881, 517, 2304, 1297, 6379, 4967, 8398,...","[1.0, 0.986731767654419, 0.9646497368812561, 0..."
1,493,"[493, 36575, 36896, 38973, 17560, 32880, 17937...","[1.0000001192092896, 0.8846721053123474, 0.872..."
2,3,"[3, 1942, 4, 1, 0, 2, 8250, 26800, 23781, 1844...","[0.9999998211860657, 0.9975799322128296, 0.996..."
3,1,"[1, 0, 4, 8250, 3, 1942, 2, 26800, 23781, 1844...","[1.0, 0.9990151524543762, 0.9974552989006042, ..."
4,1808,"[1808, 2083, 298, 1464, 20264, 2078, 2080, 330...","[0.9999999403953552, 0.9485806822776794, 0.939..."
...,...,...,...
41469,40191,"[40191, 39510, 38222, 40435, 42399, 36779, 417...","[1.0000001192092896, 0.8624750375747681, 0.648..."
41470,41396,"[41396, 19916, 31198, 27269, 25546, 2253, 1382...","[1.0, 0.7108330726623535, 0.6838828921318054, ..."
41471,29936,"[29936, 30900, 27947, 29853, 30339, 26630, 251...","[0.9999999403953552, 0.6215892434120178, 0.618..."
41472,5160,"[5160, 38103, 37237, 21429, 18291, 11609, 4041...","[1.0, 0.6781921982765198, 0.6698132753372192, ..."


In [4]:
# similar_items.to_parquet("similar_items.parquet")
similar_items = pd.read_parquet('similar_items.parquet')

In [5]:
# similar_items_exploded.to_parquet("similar_items_exploded.parquet")
similar_items_exploded = pd.read_parquet('similar_items_exploded.parquet')

In [6]:
def print_sim_items(item_id, similar_items):

    item_columns_to_use = ["item_id", "author", "title", "genre_and_votes", "average_rating", "ratings_count"]
    
    item_id_1 = items.query("item_id == @item_id")[item_columns_to_use]
    display(item_id_1)
    
    si = similar_items.query("item_id_1 == @item_id")
    si = si.merge(items[item_columns_to_use].set_index("item_id"), left_on="item_id_2", right_index=True)
    display(si)

In [7]:
print_sim_items(17245, similar_items_exploded)

Unnamed: 0,item_id,author,title,genre_and_votes,average_rating,ratings_count
19338,17245,"Bram Stoker, Nina Auerbach, David J. Skal",Dracula,"{'Classics': 19603, 'Horror': 10601, 'Fiction'...",3.98,636895


Unnamed: 0,score,item_id_1,item_id_2,author,title,genre_and_votes,average_rating,ratings_count
24278,0.928823,17245,480204,"Gaston Leroux, Alexander Teixeira de Mattos",The Phantom of the Opera,"{'Classics': 7010, 'Fiction': 2103, 'Horror': ...",3.97,144859
24279,0.900337,17245,51496,"Robert Louis Stevenson, Vladimir Nabokov, Merv...",The Strange Case of Dr. Jekyll and Mr. Hyde,"{'Classics': 12342, 'Fiction': 4037, 'Horror':...",3.79,229898
24280,0.898938,17245,93261,Washington Irving,The Legend of Sleepy Hollow,"{'Classics': 2594, 'Horror': 1182, 'Fiction': ...",3.74,26776
24281,0.897706,17245,295,Robert Louis Stevenson,Treasure Island,"{'Classics': 11249, 'Fiction': 4405, 'Adventur...",3.82,274424
24282,0.89647,17245,2623,"Charles Dickens, Marisa Sestino",Great Expectations,"{'Classics': 19645, 'Fiction': 6662, 'Literatu...",3.75,468462
24283,0.895993,17245,18254,"Charles Dickens, Philip Horne, Gerald Dickens",Oliver Twist,"{'Classics': 11450, 'Fiction': 3656, 'Historic...",3.85,235560
24284,0.886899,17245,7190,Alexandre Dumas,"The Three Musketeers (The D'Artagnan Romances,...","{'Classics': 9823, 'Fiction': 3256, 'Historica...",4.06,198892
24285,0.881911,17245,24213,"Lewis Carroll, John Tenniel, Martin Gardner",Alice's Adventures in Wonderland & Through the...,"{'Classics': 11568, 'Fantasy': 6184, 'Fiction'...",4.06,344482
24286,0.878392,17245,2932,"Daniel Defoe, Virginia Woolf",Robinson Crusoe,"{'Classics': 7725, 'Fiction': 3305, 'Adventure...",3.66,181415
24287,0.870232,17245,1953,"Charles Dickens, Richard Maxwell",A Tale of Two Cities,"{'Classics': 20021, 'Fiction': 6969, 'Historic...",3.82,646983


In [8]:
i2i = similar_items_exploded.loc[similar_items_exploded['item_id_1'] == 17245]
i2i = i2i[['item_id_2', 'score']].to_dict(orient="list")
i2i

{'item_id_2': [480204,
  51496,
  93261,
  295,
  2623,
  18254,
  7190,
  24213,
  2932,
  1953],
 'score': [0.9288230538368225,
  0.9003370404243469,
  0.8989384174346924,
  0.8977058529853821,
  0.8964701294898987,
  0.8959931135177612,
  0.8868994116783142,
  0.8819113373756409,
  0.8783923387527466,
  0.870232105255127]}

In [16]:
i2i['item_id_2']

[480204, 51496, 93261, 295, 2623, 18254, 7190, 24213, 2932, 1953]

In [20]:
recs_blended = []

min_length = 3
# чередуем элементы из списков, пока позволяет минимальная длина
for i in range(min_length):
    recs_blended.append(i2i['item_id_2'][i])

# добавляем оставшиеся элементы в конец
# if len(recs_offline) < len(recs_online):
    # rec_online[min_lengt:]

recs_blended.extend(i2i['item_id_2'][3:])

In [21]:
recs_blended

[480204, 51496, 93261, 295, 2623, 18254, 7190, 24213, 2932, 1953]

In [14]:
recs_blended.append(i2i['item_id_2'][3:])

In [15]:
recs_blended

[480204, 51496, 93261, [295, 2623, 18254, 7190, 24213, 2932, 1953]]