Importing libraries

In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


Loading book and movie data

In [5]:
def _embedding_matrix(csv_path):
    """Read an embeddings CSV and return all but its first column as a NumPy array.

    NOTE(review): the first column is presumably a saved index/id column
    rather than an embedding dimension -- confirm against the script that
    wrote these files.
    """
    table = pd.read_csv(csv_path)
    return table.iloc[:, 1:].to_numpy()


# Metadata tables: one row per book / movie.
book_df = pd.read_csv("../database/book_data/cleaned_books_5000.csv")
movie_df = pd.read_csv("../database/movie_data/movies_metadata_trimmed.csv")

# Embedding matrices as plain ndarrays.
# NOTE(review): row i is assumed to describe row i of the matching
# metadata table above -- confirm.
book_embeddings = _embedding_matrix("../database/book_data/book_embeddings.csv")
movie_embeddings = _embedding_matrix("../database/movie_data/movie_embeddings.csv")

Simple search feature for finding specific books/movies

In [19]:
from time import sleep
from sklearn.metrics.pairwise import cosine_similarity

# Sentence encoder that turns each typed query into a vector.
# NOTE(review): presumably the same model that produced book_embeddings and
# movie_embeddings -- confirm, otherwise the similarity scores are not comparable.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Interactive search: keep prompting until the user types exactly "q".
while True:
    term = input("Enter a search term or type 'q' to quit")
    if term == "q":
        break

    # encode() receives a one-item list, so row 0 of each similarity matrix
    # holds the scores for this query against every stored embedding.
    embedding = model.encode([term])

    book_scores = cosine_similarity(embedding, book_embeddings)
    book_df["SIM"] = book_scores[0]
    movie_scores = cosine_similarity(embedding, movie_embeddings)
    movie_df["SIM"] = movie_scores[0]

    # Five best-scoring books, highest similarity first.
    print("\nBooks")
    top_books = book_df.sort_values(by="SIM", ascending=False).head(5)
    print(top_books[["SIM", "Name", "Id"]])

    # Five best-scoring movies, highest similarity first.
    print("\nMovies")
    top_movies = movie_df.sort_values(by="SIM", ascending=False).head(5)
    print(top_movies[["SIM", "original_title", "id"]])

    # Short pause before prompting again.
    # NOTE(review): presumably lets the printed tables render before the
    # input box reappears -- confirm before removing.
    sleep(1)

Books
           SIM                                               Name       Id
51    0.478531                       The Tales of Beedle the Bard  4020390
1439  0.476873                                       Travel Light   827276
273   0.453585  Harry, a History: The True Story of a Boy Wiza...  3130430
2392  0.442772                         So You Want to Be a Wizard  1062958
4901  0.431439          Harry Potter and the Order of the Phoenix   793401
Movies
           SIM                            original_title     id
1176  0.564943  Harry Potter and the Philosopher's Stone    671
2109  0.560692       Harry Potter and the Goblet of Fire    674
2678  0.542600    Harry Potter and the Half-Blood Prince    767
1323  0.515909   Harry Potter and the Chamber of Secrets    672
4795  0.508693       Wizards of Waverly Place: The Movie  26736
Books
           SIM                                              Name       Id
4925  0.545460      The Annotated Hobbit (Middle-earth Universe)   764872

Using averaged embeddings of liked books and movies, find similar books and movies

In [34]:
# Recommend titles similar to a set of "liked" books and movies by averaging
# their embeddings into one taste vector and ranking every title against it.

# 0-based row positions of the liked titles in book_df / movie_df.
# Per the printed results in this notebook, book row 4901 is a Harry Potter
# title, book rows 232 and 2642 are Hobbit titles, and movie row 1192 is
# The Lord of the Rings: The Fellowship of the Ring.
LIKED_BOOK_ROWS = [4901, 232, 2642]
LIKED_MOVIE_ROWS = [1176, 2109, 2678, 1323, 543, 1192, 1567]

# The embeddings are loaded with .to_numpy(), so they are plain ndarrays:
# they have no .iloc and pd.concat() rejects them. Use NumPy integer-array
# indexing instead. np.asarray() is a no-op for ndarrays and also keeps this
# cell working if the embeddings are ever left as DataFrames.
liked_books = np.asarray(book_embeddings)[LIKED_BOOK_ROWS]
liked_movies = np.asarray(movie_embeddings)[LIKED_MOVIE_ROWS]

# Stack all liked vectors and average over rows, giving a single vector with
# one value per embedding dimension. Every liked title has equal weight.
mean = np.concatenate([liked_books, liked_movies]).mean(axis=0)

# cosine_similarity expects 2-D input, so reshape the vector to (1, dim);
# row 0 of the result holds the score of every title against the taste vector.
taste_vector = mean.reshape(1, -1)
book_df["SIM"] = cosine_similarity(taste_vector, book_embeddings)[0]
movie_df["SIM"] = cosine_similarity(taste_vector, movie_embeddings)[0]

# NOTE: liked titles are not filtered out, so they appear in their own
# recommendations (rows 232, 2642 and 4901 are in the printed book list).
print("\nBooks")
print(book_df.sort_values(by="SIM", ascending=False).head(10)[["SIM", "Name", "Id"]])

print("\nMovies")
print(movie_df.sort_values(by="SIM", ascending=False).head(10)[["SIM", "original_title", "id"]])



Books
           SIM                                               Name       Id
232   0.679716                          The Hobbit: Graphic Novel   659469
4925  0.624075       The Annotated Hobbit (Middle-earth Universe)   764872
2642  0.616307   The History of the Hobbit, Part One: Mr. Baggins  1081560
1439  0.558035                                       Travel Light   827276
4901  0.528967          Harry Potter and the Order of the Phoenix   793401
4626  0.526584                The Hobbit: or There and Back Again   837611
51    0.498081                       The Tales of Beedle the Bard  4020390
4965  0.489549       Unfinished Tales of Númenor and Middle-Earth   797114
3002  0.465080  Secret Speakers and the Search for Selador's Gate  2989344
273   0.456814  Harry, a History: The True Story of a Boy Wiza...  3130430

Movies
           SIM                                     original_title    id
1192  0.678760  The Lord of the Rings: The Fellowship of the Ring   120
2678  0.642901  