Importing libraries

In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


Loading book and movie data

In [7]:
book_df = pd.read_csv("../database/book_data/cleaned_books_5000.csv")
movie_df = pd.read_csv("../database/movie_data/movies_metadata_trimmed.csv")

book_embeddings = pd.read_csv("../database/book_data/book_embeddings.csv").iloc[:, 1:]
movie_embeddings = pd.read_csv("../database/movie_data/movie_embeddings.csv").iloc[:, 1:]

Simple search feature for finding specific books/movies

In [5]:
from time import sleep
from sklearn.metrics.pairwise import cosine_similarity
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

term = input("Enter a search term or type 'q' to quit")
while term != "q":
    embedding = model.encode([term])

    book_df["SIM"] = cosine_similarity(embedding, book_embeddings)[0]
    movie_df["SIM"] = cosine_similarity(embedding, movie_embeddings)[0]

    print("\nBooks")
    print(book_df.sort_values(by="SIM", ascending=False).head(5)[["SIM", "Name", "Id"]])

    print("\nMovies")
    print(movie_df.sort_values(by="SIM", ascending=False).head(5)[["SIM", "original_title", "id"]])

    sleep(1)

    term = input("Enter a search term or type 'q' to quit")


Books
           SIM                                               Name       Id
1131  0.642823  Harry Potter and the Sorcerer's Stone (Harry P...   862267
2561  0.592328                                       Travel Light   827276
128   0.535020                       The Tales of Beedle the Bard  4020390
843   0.505694                Stoneheart (Stoneheart Trilogy, #1)   792775
532   0.475095  Harry, a History: The True Story of a Boy Wiza...  3130430

Movies
           SIM                            original_title     id
1176  0.608890  Harry Potter and the Philosopher's Stone    671
2109  0.539277       Harry Potter and the Goblet of Fire    674
2678  0.532389    Harry Potter and the Half-Blood Prince    767
4795  0.524641       Wizards of Waverly Place: The Movie  26736
1323  0.498691   Harry Potter and the Chamber of Secrets    672

Books
           SIM                                               Name       Id
2561  0.514451                                       Travel Light   8

Using averaged embeddings of liked books and movies, find similar books and movies

In [8]:
liked_books = book_embeddings.iloc[[458, 1131]]
liked_movies = movie_embeddings.iloc[[1176, 2109, 2678, 1323, 543, 1192, 1567]]

mean = pd.concat([liked_books, liked_movies]).mean(axis=0)
mean /= np.linalg.norm(mean)

book_df["SIM"] = cosine_similarity(np.array([mean]), book_embeddings)[0]
movie_df["SIM"] = cosine_similarity(np.array([mean]), movie_embeddings)[0]

print("\nBooks")
print(book_df.sort_values(by="SIM", ascending=False).head(10)[["SIM", "Name", "Id"]])

print("\nMovies")
print(movie_df.sort_values(by="SIM", ascending=False).head(10)[["SIM", "original_title", "id"]])



Books
           SIM                                               Name       Id
1131  0.680084  Harry Potter and the Sorcerer's Stone (Harry P...   862267
458   0.631172                          The Hobbit: Graphic Novel   659469
2561  0.556936                                       Travel Light   827276
532   0.491451  Harry, a History: The True Story of a Boy Wiza...  3130430
4537  0.490123   The History of the Hobbit, Part One: Mr. Baggins  1081560
558   0.489926  The Castle in the Attic (The Castle in the Att...   816752
2805  0.489656           Wizard's First Rule (Sword of Truth, #1)   914886
128   0.475612                       The Tales of Beedle the Bard  4020390
827   0.474852                  Backup (The Dresden Files, #10.4)  2575572
843   0.469630                Stoneheart (Stoneheart Trilogy, #1)   792775

Movies
           SIM                                     original_title    id
1176  0.693123           Harry Potter and the Philosopher's Stone   671
2109  0.664314  