Importing libraries

In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

Loading book and movie data

In [2]:
# Metadata tables for the two catalogues, read from CSV files located
# relative to the notebook's working directory.
book_df, movie_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../database/book_data/cleaned_books_5000.csv",
        "../database/movie_data/movies_metadata_trimmed.csv",
    )
)

# Embedding tables. The first CSV column is dropped and every remaining
# column is kept (presumably the first column is a saved index -- TODO confirm).
book_embeddings, movie_embeddings = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in (
        "../database/book_data/book_embeddings.csv",
        "../database/movie_data/movie_embeddings.csv",
    )
)

Simple search feature for finding specific books/movies

In [16]:
from time import sleep

# Sentence-embedding model used to encode the free-text search term.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Single definition of the prompt so the wording cannot drift between calls.
SEARCH_PROMPT = "Enter a search term or type 'q' to quit"

# Interactive search: embed each term, score it against every book and movie
# embedding with cosine similarity, and print the five best matches of each
# kind. Entering exactly "q" ends the loop.
while True:
    term = input(SEARCH_PROMPT)
    if term == "q":
        break

    # encode() receives a one-element list; the result is passed to
    # cosine_similarity as the single query row.
    embedding = model.encode([term])

    # cosine_similarity returns one row per query; [0] takes the row of
    # scores for this term. The scores are stored in a "SIM" column
    # (overwritten on every search).
    book_df["SIM"] = cosine_similarity(embedding, book_embeddings)[0]
    movie_df["SIM"] = cosine_similarity(embedding, movie_embeddings)[0]

    print("\nBooks")
    # The book table's identifier column is "Id" (capital I), unlike the
    # movie table's "id"; selecting "id" here raised a KeyError.
    print(book_df.sort_values(by="SIM", ascending=False).head(5)[["SIM", "Name", "Id"]])

    print("\nMovies")
    print(movie_df.sort_values(by="SIM", ascending=False).head(5)[["SIM", "original_title", "id"]])

    # Short pause before prompting again (presumably so the printed tables
    # are displayed before the next input box appears -- TODO confirm).
    sleep(1)


Books
           SIM                                               Name       Id
1131  0.532182  Harry Potter and the Sorcerer's Stone (Harry P...   862267
128   0.478531                       The Tales of Beedle the Bard  4020390
2561  0.476873                                       Travel Light   827276
3419  0.463216     Horrible Harry in Room 2B (Horrible Harry, #1)   632544
2591  0.458413                White Night (The Dresden Files, #9)  1266051

Movies
           SIM                            original_title     id
1176  0.564943  Harry Potter and the Philosopher's Stone    671
2109  0.560692       Harry Potter and the Goblet of Fire    674
2678  0.542600    Harry Potter and the Half-Blood Prince    767
1323  0.515909   Harry Potter and the Chamber of Secrets    672
4795  0.508693       Wizards of Waverly Place: The Movie  26736

Books
           SIM                                              Name       Id
458   0.521877                         The Hobbit: Graphic Novel   659

Using averaged embeddings of liked books and movies, find similar books and movies

In [15]:
# Positional row indices of the liked items in the embedding tables.
# In the printed results, rows 458 and 1131 of the book table are
# "The Hobbit: Graphic Novel" and "Harry Potter and the Sorcerer's Stone",
# and row 749 of the movie table is "Goldfinger".
liked_books = book_embeddings.iloc[[458, 1131]]
# Alternative, larger set of liked movies kept for experimentation:
# liked_movies = movie_embeddings.iloc[[1176, 2109, 2678, 1323, 543, 1192, 1567]]
liked_movies = movie_embeddings.iloc[[749]]

# Stack the liked rows, average them column-wise into one vector, then
# scale that vector to unit length.
liked_items = pd.concat([liked_books, liked_movies])
mean = liked_items.mean(axis=0)
mean = mean / np.linalg.norm(mean)

# cosine_similarity expects 2-D input, so wrap the vector as a single row;
# [0] then picks that row's scores against every catalogue entry.
profile = np.array([mean])
book_df["SIM"] = cosine_similarity(profile, book_embeddings)[0]
movie_df["SIM"] = cosine_similarity(profile, movie_embeddings)[0]

# Ten highest-scoring entries of each kind, best first.
top_books = book_df.sort_values(by="SIM", ascending=False).head(10)
top_movies = movie_df.sort_values(by="SIM", ascending=False).head(10)

print("\nBooks")
print(top_books[["SIM", "Name", "Id"]])

print("\nMovies")
print(top_movies[["SIM", "original_title", "id"]])



Books
           SIM                                               Name       Id
458   0.752721                          The Hobbit: Graphic Novel   659469
1417  0.485606                                     The Gunslinger   995103
4537  0.483916   The History of the Hobbit, Part One: Mr. Baggins  1081560
738   0.469367                         SilverFin (Young Bond, #1)   603911
4270  0.445061  Special Assignments (Erast Fandorin Mysteries,...  1150067
3149  0.444946                    Hurricane Gold (Young Bond, #4)  1660794
3746  0.441369  Who the Hell Is Pansy O'Hara?: The Fascinating...  2657515
836   0.440751  The Inkheart Trilogy: Inkheart, Inkspell, Inkd...  3334563
866   0.440159                 Touchstone (Harris Stuyvesant, #1)  1272835
3394  0.438445  The Book of Three (The Chronicles of Prydain, #1)  1110566

Movies
           SIM                                     original_title      id
749   0.752721                                         Goldfinger     658
1192  0.5247