In [1]:
import numpy as np
import pandas as pd

import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

import pickle
import nmslib

In [4]:
ratings = pd.read_csv('data/ratings.csv')
books = pd.read_csv('data/books.csv')
tags = pd.read_csv('data/tags.csv')
book_tags = pd.read_csv('data/book_tags.csv')

In [5]:
book_tags

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716
...,...,...,...
999907,33288638,21303,7
999908,33288638,17271,7
999909,33288638,1126,7
999910,33288638,11478,7


In [6]:
tags = pd.read_csv('data/tags_cleaned.csv')

In [7]:
mapper = dict(zip(books.goodreads_book_id, books.book_id))

In [8]:
book_tags = book_tags[book_tags.tag_id.isin(tags['tag_id'])]

In [9]:
book_tags['id'] = book_tags.goodreads_book_id.apply(lambda x: mapper[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_tags['id'] = book_tags.goodreads_book_id.apply(lambda x: mapper[x])


In [10]:
book_tags.head(2)

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27


In [11]:
ratings_coo = sparse.coo_matrix((ratings.rating, (ratings.user_id, ratings.book_id)))
feature_ratings = sparse.coo_matrix(([1]*len(book_tags), (book_tags.id, book_tags.tag_id)))

In [12]:
NUM_THREADS = 6
NUM_COMPONENTS = 60
NUM_EPOCHS = 10

In [None]:
model = LightFM(learning_rate=0.5, loss='warp', no_components=NUM_COMPONENTS)

train, test = random_train_test_split(ratings_coo, test_percentage=0.2)

model = model.fit(train, num_threads=NUM_THREADS, epochs=NUM_EPOCHS, item_features=feature_ratings)

In [None]:
with open('data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
precision = precision_at_k(
    model,
    test,
    num_threads=NUM_THREADS,
    k=10,
    item_features = feature_ratings
).mean()

recall = recall_at_k(
    model,
    test,
    num_threads=NUM_THREADS,
    k=10,
    item_features=feature_ratings
).mean()

print("Precision@10:", precision)
print('Recall@10:', recall)

Precision@10: 0.0008778825
Recall@10: 0.00042474162076338727


---

In [13]:
with open('data/model.pkl', 'rb') as f:
    model = pickle.load(f)

In [15]:
item_biases, item_embeddings = model.get_item_representations(feature_ratings)

print(item_biases.shape, item_embeddings.shape)

(10001,) (10001, 60)


In [16]:
item_embeddings

array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [-5.7747469e+05,  2.1199612e+06,  4.7877495e+06, ...,
        -5.0883395e+04,  1.1285118e+06, -7.5446206e+05],
       [-2.1278424e+04, -3.1200450e+05, -5.7104325e+05, ...,
        -5.0429847e+05, -4.9167662e+05, -2.0486008e+05],
       ...,
       [-5.3642139e+03,  2.6454766e+03, -8.3422002e+03, ...,
        -2.0232338e+04,  4.7402871e+04,  5.3812473e+04],
       [ 5.3685912e+05, -7.2853281e+05, -1.6888165e+06, ...,
        -7.3488425e+05, -1.1025914e+06, -4.1590344e+05],
       [ 4.1061795e+08, -9.9290362e+08, -1.4298042e+09, ...,
        -1.3912230e+10, -2.4606632e+10, -7.0237798e+08]],
      shape=(10001, 60), dtype=float32)

---

In [17]:
nms_index = nmslib.init(method='hnsw', space='cosinesimil')

nms_index.addDataPointBatch(item_embeddings)
nms_index.createIndex(print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************


In [30]:
def nearest_books_nms(book_id, index, n=10):
    nn = index.knnQuery(vector=item_embeddings[book_id], k=n)
    return nn

In [28]:
books[books['title'].apply(lambda x: x.lower().find('1984')) >= 0]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
12,13,5470,5470,153313,995,451524934,9780452000000.0,"George Orwell, Erich Fromm, Celâl Üster",1949.0,Nineteen Eighty-Four,...,1956832,2053394,45518,41845,86425,324874,692021,908229,https://images.gr-assets.com/books/1348990566m...,https://images.gr-assets.com/books/1348990566s...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...
9795,9796,201145,201145,2563528,25,64440508,9780064000000.0,"Else Holmelund Minarik, Maurice Sendak",1968.0,A Kiss for Little Bear,...,11063,11604,126,87,284,1898,3053,6282,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


In [32]:
nbm = nearest_books_nms(846, nms_index)[0]
nbm

array([ 846,   14, 6128, 1949, 6607, 4359,  809, 8558,   93, 4330],
      dtype=int32)

In [33]:
books[books.book_id.isin(nbm)][['authors', 'title']]

Unnamed: 0,authors,title
13,George Orwell,Animal Farm
92,Frances Hodgson Burnett,The Secret Garden
808,"Aldous Huxley, Christopher Hitchens",Brave New World / Brave New World Revisited
845,"George Orwell, Christopher Hitchens",Animal Farm / 1984
1948,E.M. Forster,Howards End
4329,"Plato, G.M.A. Grube, John M. Cooper",The Trial and Death of Socrates
4358,D.H. Lawrence,"Women in Love (Brangwen Family, #2)"
6127,E.M. Forster,Maurice
6606,"D.H. Lawrence, Daphne Merkin",The Rainbow
8557,"Charles Bernard Nordhoff, James Norman Hall","Mutiny on the Bounty (The Bounty Trilogy, #1)"


In [None]:
with open('data/item_embeddings.pkl', 'wb') as file:
    pickle.dump(item_embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)