Import Dependencies

In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Read CSV, discarding Base64 column (Ler CSV e descartar a coluna de image base64)

In [5]:
# Load the processed catalogue and leave out the base64 image column.
book_df = pd.read_csv("books_processed.csv").drop(columns='image_base64')

# Keep only the rows whose category is something other than "Default".
has_real_category = book_df['category'] != "Default"
book_df = book_df[has_real_category]

book_df.head()

Unnamed: 0,id,title,price,rating,category,image,product_page,availability,stock
0,book_53731879,It's Only the Himalayas,45.17,2,Travel,https://books.toscrape.com/media/cache/6d/41/6...,https://books.toscrape.com/catalogue/its-only-...,1,19
1,book_0cf218f4,Full Moon over Noah’s Ark: An Odyssey to Mount...,49.43,4,Travel,https://books.toscrape.com/media/cache/fe/8a/f...,https://books.toscrape.com/catalogue/full-moon...,1,15
2,book_88fd956f,See America: A Celebration of Our National Par...,48.87,3,Travel,https://books.toscrape.com/media/cache/c7/1a/c...,https://books.toscrape.com/catalogue/see-ameri...,1,14
3,book_912edfab,Vagabonding: An Uncommon Guide to the Art of L...,36.94,2,Travel,https://books.toscrape.com/media/cache/ca/30/c...,https://books.toscrape.com/catalogue/vagabondi...,1,8
4,book_db7ac9d0,Under the Tuscan Sun,37.33,3,Travel,https://books.toscrape.com/media/cache/45/21/4...,https://books.toscrape.com/catalogue/under-the...,1,7


## Create embeddings column (title + category)
Note: the future plan is to use each book's description to find similar books.

In [6]:
# Name of the sentence-transformers checkpoint used for every embedding below.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
def get_embedding_local(text):
  """Embed ``text`` with the notebook-level ``model``.

  ``text`` is passed straight to ``model.encode``; whatever that call returns
  is wrapped in ``np.array`` and handed back to the caller.
  """
  encoded = model.encode(text)
  return np.array(encoded)

In [8]:
# Build the "<title> <category>" string for every book and encode them all in
# ONE call. The encoder accepts a list of sentences and batches them
# internally, which avoids paying the per-call overhead of model.encode()
# once per row as the previous row-wise DataFrame.apply did. Vectors may
# differ from the per-row results only by floating-point rounding.
embedding_texts = (book_df['title'] + " " + book_df['category']).tolist()
embedding_matrix = get_embedding_local(embedding_texts)

# Store one vector per book in an object column, aligned on the current index
# (the index is no longer contiguous after the category filter).
book_df['embeded'] = pd.Series(list(embedding_matrix), index=book_df.index)

## User input (query)

In [9]:
# Query: the title the user is looking at, and the category it belongs to.
user_book, user_category = "It's Only the Himalayas", "Travel"

## Embed user input and find the user's book


In [10]:
# Embed the query the same way the catalogue rows are embedded: "<title> <category>".
query_embeded = get_embedding_local(f"{user_book} {user_category}")

# Look up the queried book so its rating can serve as the recommendation floor.
book = book_df[book_df['title'] == user_book]
if book.empty:
    # Fail here with a clear message instead of a confusing TypeError later,
    # when the rating is converted to an int.
    raise ValueError(f"Book {user_book!r} was not found in the catalogue")

# Extract a plain scalar. Leaving this as a Series made the later
# int(book_rating) rely on the deprecated "int() on a single-element Series"
# conversion. If several rows share the title, the first match is used.
book_rating = int(book['rating'].iloc[0])

In [11]:
# Preview the first five rows of the catalogue frame.
book_df.head(5)

Unnamed: 0,id,title,price,rating,category,image,product_page,availability,stock,embeded
0,book_53731879,It's Only the Himalayas,45.17,2,Travel,https://books.toscrape.com/media/cache/6d/41/6...,https://books.toscrape.com/catalogue/its-only-...,1,19,"[0.050423995, 0.023099473, -0.002262194, 0.052..."
1,book_0cf218f4,Full Moon over Noah’s Ark: An Odyssey to Mount...,49.43,4,Travel,https://books.toscrape.com/media/cache/fe/8a/f...,https://books.toscrape.com/catalogue/full-moon...,1,15,"[0.013169509, 0.094770394, 0.052897062, 0.0187..."
2,book_88fd956f,See America: A Celebration of Our National Par...,48.87,3,Travel,https://books.toscrape.com/media/cache/c7/1a/c...,https://books.toscrape.com/catalogue/see-ameri...,1,14,"[0.09734262, 0.03331223, 0.050616845, 0.008834..."
3,book_912edfab,Vagabonding: An Uncommon Guide to the Art of L...,36.94,2,Travel,https://books.toscrape.com/media/cache/ca/30/c...,https://books.toscrape.com/catalogue/vagabondi...,1,8,"[0.094461605, 0.066960275, 0.016748967, 0.0317..."
4,book_db7ac9d0,Under the Tuscan Sun,37.33,3,Travel,https://books.toscrape.com/media/cache/45/21/4...,https://books.toscrape.com/catalogue/under-the...,1,7,"[0.0268396, 0.050499517, -0.01997984, 0.084243..."


## Creating Similarity Column based on user input (title + category)

In [12]:

# Stack the per-book vectors into one (n_books, n_features) matrix and compare
# the query against all of them in a single cosine_similarity call, instead of
# invoking it once per row through Series.apply.
embedding_matrix = np.vstack(book_df['embeded'].tolist())

# The result has shape (1, n_books); row 0 holds the query-vs-book scores in
# the same order as the rows of book_df.
book_df['similarity'] = cosine_similarity([query_embeded], embedding_matrix)[0]

## Get the titles and categories of the top 5 most similar books

In [30]:
# Never recommend the book the user asked about.
is_other_book = book_df['title'] != user_book
books_without_queried_book = book_df[is_other_book]

# Candidates must be rated at least as high as the queried book and be available.
min_rating = int(book_rating)
rated_well_enough = books_without_queried_book['rating'] >= min_rating
is_available = books_without_queried_book['availability'] >= 1
recommendations = books_without_queried_book[rated_well_enough & is_available]

# Five best matches: highest similarity first, category (descending) as tie-breaker.
recommendations.sort_values(by=["similarity", "category"], ascending=False).head(5)

  (books_without_queried_book['rating'] >= int(book_rating)) &


Unnamed: 0,id,title,price,rating,category,image,product_page,availability,stock,embeded,similarity
3,book_912edfab,Vagabonding: An Uncommon Guide to the Art of L...,36.94,2,Travel,https://books.toscrape.com/media/cache/ca/30/c...,https://books.toscrape.com/catalogue/vagabondi...,1,8,"[0.094461605, 0.066960275, 0.016748967, 0.0317...",0.379761
270,book_d0bfb25f,Siddhartha,34.22,5,Fiction,https://books.toscrape.com/media/cache/6b/5f/6...,https://books.toscrape.com/catalogue/siddharth...,1,5,"[-0.04955276, 0.07337528, -0.063123815, 0.0150...",0.35274
10,book_2a876564,"1,000 Places to See Before You Die",26.08,5,Travel,https://books.toscrape.com/media/cache/9e/10/9...,https://books.toscrape.com/catalogue/1000-plac...,1,1,"[0.091180936, -0.040251765, -0.019071136, 0.02...",0.324368
4,book_db7ac9d0,Under the Tuscan Sun,37.33,3,Travel,https://books.toscrape.com/media/cache/45/21/4...,https://books.toscrape.com/catalogue/under-the...,1,7,"[0.0268396, 0.050499517, -0.01997984, 0.084243...",0.315442
9,book_fb1bd656,Neither Here nor There: Travels in Europe,38.95,3,Travel,https://books.toscrape.com/media/cache/c9/9a/c...,https://books.toscrape.com/catalogue/neither-h...,1,3,"[0.085500024, 0.006141511, -0.08639133, -0.019...",0.308573
