# <center>Разбор кейса ML-инженера</center>

## Обучим и протестируем модель

In [1]:
!pip install lightfm
! pip install pandas
! pip install numpy
! pip install scipy
! pip install pickle


Collecting lightfm
  Using cached lightfm-1.17-cp311-cp311-macosx_10_9_universal2.whl
Collecting numpy (from lightfm)
  Using cached numpy-2.3.2-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scipy>=0.17.0 (from lightfm)
  Using cached scipy-1.16.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting scikit-learn (from lightfm)
  Using cached scikit_learn-1.7.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn->lightfm)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->lightfm)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scipy-1.16.1-cp311-cp311-macosx_14_0_arm64.whl (20.9 MB)
Using cached numpy-2.3.2-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
Using cached scikit_learn-1.7.1-cp311-cp311-macosx_12_0_arm64.whl (8.7 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached threadpoolctl-3.6.0-

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
import pickle



In [7]:
# Load the four raw tables from the local data/ directory.
# The paths are read in the same order as the names on the left-hand side.
ratings, books, tags, book_tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ratings.csv',
        'data/books.csv',
        'data/tags.csv',
        'data/book_tags.csv',
    )
)

In [8]:
# Preview the first rows of the books table.
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [9]:
# Lookup table from books.goodreads_book_id to books.book_id.
mapper = {
    goodreads_id: internal_id
    for goodreads_id, internal_id in zip(books.goodreads_book_id, books.book_id)
}

In [10]:
# Replace the raw tag list with the cleaned one and keep only the
# (book, tag) pairs whose tag survived the cleaning.
tags = pd.read_csv('data/tags_cleaned.csv')
# .copy() detaches the filtered frame from the original one, so adding a
# column below does not raise pandas' SettingWithCopyWarning.
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)].copy()
# Translate goodreads_book_id to books.book_id; an id missing from `mapper`
# raises KeyError, exactly as before.
book_tags['id'] = book_tags.goodreads_book_id.apply(lambda x: mapper[x])

In [11]:
# Preview the filtered (book, tag) pairs together with the new `id` column.
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27


In [12]:
# Sparse matrix of rating values with user_id as the row index and book_id as the column index.
ratings_coo = sparse.coo_matrix((ratings.rating,(ratings.user_id, ratings.book_id)))
# Sparse indicator matrix: a 1 at (book id, tag_id) for every row of book_tags.
# NOTE(review): despite the name, this holds book/tag features, not ratings.
feature_ratings  = sparse.coo_matrix(([1]*len(book_tags), (book_tags.id, book_tags.tag_id)))

Объявим вспомогательные константы для обучения модели:

In [13]:
# Number of CPU threads. Set to 1 because lightfm on macOS is installed without OpenMP
NUM_THREADS = 1

# Dimensionality of the latent (embedding) vectors
NUM_COMPONENTS = 60

# Number of training epochs
NUM_EPOCHS = 10 

# Seed for the random number generator
RANDOM_STATE = 42

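На этапе создания модели мы используем библиотеку LightFM, чтобы выполнить матричную факторизацию наших рейтингов книг (модель обучается стохастическим градиентным спуском с функцией потерь WARP, а не методом ALS) и получить два набора векторов: эмбеддинги пользователей и эмбеддинги книг.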

In [14]:
# Split the interaction matrix into train and test parts (20% goes to test).
# The seed comes from the RANDOM_STATE constant defined with the other
# training constants, instead of a repeated literal.
train, test = random_train_test_split(
    ratings_coo,
    test_percentage=0.2,
    random_state=RANDOM_STATE,
)

# Build the model
model = LightFM(
    learning_rate=0.05,  # learning rate
    loss='warp',  # loss function
    no_components=NUM_COMPONENTS,  # dimensionality of the latent vectors
    random_state=RANDOM_STATE,  # seed of the model's random number generator
)

# Train the model
model = model.fit(
    train,  # training interactions
    epochs=NUM_EPOCHS,  # number of training epochs
    num_threads=NUM_THREADS,  # number of CPU threads
    item_features=feature_ratings,  # item features (book x tag indicator matrix)
)

Протестируем модель

In [15]:
# Evaluate the model on the held-out interactions.
# Both metrics take the same arguments, so they are computed by one expression;
# precision is evaluated first and recall second, and each per-user result is averaged.
# NOTE(review): train_interactions is not passed, so items already seen in
# train are presumably not excluded from the top-k lists; confirm this is intended.
precision_score, recall_score = (
    metric(
        model,  # trained model
        test,  # test interactions
        num_threads=NUM_THREADS,  # number of CPU threads
        k=10,  # number of recommendations considered
        item_features=feature_ratings,  # item features
    ).mean()
    for metric in (precision_at_k, recall_at_k)
)

print(recall_score, precision_score)

0.04008034798209188 0.08673394


Сохраним модель

In [16]:
# Serialize the trained model to model.pkl using the highest available pickle protocol.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)

## Извлечём эмбеддинги из модели и посмотрим, что получилось

In [17]:
# Restore the model from model.pkl.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

In [18]:
# Extract the item biases and item embeddings from the model,
# using the same item feature matrix that was passed to fit().
item_biases, item_embeddings = model.get_item_representations(features=feature_ratings)

print(item_biases.shape, item_embeddings.shape)

(10001,) (10001, 60)


In [None]:
!pip install nmslib-metabrainz

In [19]:
import nmslib

In [20]:
# Initialise the search graph (HNSW method, cosine similarity space)
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
 
# Add the book embeddings to the graph and build the index
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************


In [21]:
# Helper for nearest-neighbour search in the index
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Query ``index`` for the ``n`` nearest neighbours of one book.

    Args:
        book_id: Row of the embedding matrix used as the query vector.
        index: Object exposing ``knnQuery(vector, k=...)``, e.g. the NMSLIB
            index built from the same embeddings.
        n: Number of neighbours to request.
        embeddings: Embedding matrix indexed by ``book_id``. When omitted,
            the notebook-level ``item_embeddings`` is used, which keeps
            existing two- and three-argument calls working unchanged.

    Returns:
        The value returned by ``index.knnQuery`` for the query vector.
    """
    if embeddings is None:
        # Fall back to the global produced earlier in the notebook.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

Найдем id книги 1984

In [22]:
# Keep only the books whose title contains the substring "1984".
# The vectorised .str accessor replaces the per-row lambda; regex=False makes
# it a plain substring match and na=False treats a missing title as "no match"
# instead of raising.
books[books['title'].str.contains('1984', regex=False, na=False)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
12,13,5470,5470,153313,995,451524934,9780452000000.0,"George Orwell, Erich Fromm, Celâl Üster",1949.0,Nineteen Eighty-Four,...,1956832,2053394,45518,41845,86425,324874,692021,908229,https://images.gr-assets.com/books/1348990566m...,https://images.gr-assets.com/books/1348990566s...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...
9795,9796,201145,201145,2563528,25,64440508,9780064000000.0,"Else Holmelund Minarik, Maurice Sendak",1968.0,A Kiss for Little Bear,...,11063,11604,126,87,284,1898,3053,6282,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


Теперь найдем все похожие книги и посмотрим на них

In [23]:
# Call the nearest-neighbour helper for book id 846 and print its raw result
print(nearest_books_nms(846, nms_idx))

(array([846,  14,  55, 809,  13,  48, 289, 375, 173, 903], dtype=int32), array([0.        , 0.03544855, 0.04098177, 0.05688703, 0.06425363,
       0.0703209 , 0.08217251, 0.08821321, 0.08975214, 0.09086442],
      dtype=float32))


In [24]:
# Keep the first element of the result: the identifiers of the recommended books
nbm = nearest_books_nms(846, nms_idx)[0]
nbm

array([846,  14,  55, 809,  13,  48, 289, 375, 173, 903], dtype=int32)

In [25]:
# Show the authors and titles of the recommended books
books[books.book_id.isin(nbm)][['authors', 'title']]

Unnamed: 0,authors,title
12,"George Orwell, Erich Fromm, Celâl Üster",1984
13,George Orwell,Animal Farm
47,Ray Bradbury,Fahrenheit 451
54,Aldous Huxley,Brave New World
172,Anthony Burgess,A Clockwork Orange
288,Richard Adams,"Watership Down (Watership Down, #1)"
374,Jack London,The Call of the Wild
808,"Aldous Huxley, Christopher Hitchens",Brave New World / Brave New World Revisited
845,"George Orwell, Christopher Hitchens",Animal Farm / 1984
902,Ayn Rand,Anthem


Сохраним эмбеддинги

In [26]:
# Serialize the item embedding matrix to item_embeddings.pkl using the highest available pickle protocol.
with open('item_embeddings.pkl', 'wb') as file:
    pickle.dump(item_embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
! pip install streamlit

Collecting streamlit
  Using cached streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting altair!=5.4.0,!=5.4.1,<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Using cached cachetools-6.2.0-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting pillow<12,>=7.1.0 (from streamlit)
  Downloading pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Using cached protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Using cached 

In [31]:
# Look up the book_id of the book with this exact original title.
val_index = books[books['original_title'] == "Harry Potter and the Philosopher's Stone"]['book_id']
# Calling int() directly on a Series is deprecated in pandas; .item() extracts
# the single scalar explicitly and raises if there is not exactly one match.
int(val_index.item())

  int(val_index)


2