# ABOUT
- this notebook quantizes book summaries
- background:
    - previously, i tried to quantize the genres and review keywords but results were subpar
- approach:
    - given a query, return book documents that have at least 1 matching quantized id
- steps:
    - quantize all book document summaries
    - given a query, quantize the query
    - retrieve all documents with at least 1 matching product quantization id

In [1]:
from pymongo import MongoClient
import certifi
ca = certifi.where()
client = MongoClient("mongodb+srv://tanchingfhen:978775!Mj@dataproducts.hcjk1ct.mongodb.net/?retryWrites=true&w=majority", tlsCAFile=ca)
db = client["DP"] 
book_collection = db["books"]

In [2]:
book_collection.find_one({})

{'_id': '0439095026',
 'ISBN': '0439095026',
 'URL': 'https://www.goodreads.com/book/show/2587531-tell-me-this-isn-t-happening',
 'Review': ["I loved this book. I read it simply to have a few laughs but ended up taking away some cool information about how to deal with embarrassment. Along with outrageously funny stories by kids and teens, this book gives tips on how to find the humor in embarrassing situations. I enjoyed nearly every story in this book. There were so many stories I could relate to. There were also many which gave me a round of laughs. Reading some of these stories made me realize my embarrassing moments aren't so bad. This truly was a magnificent book. It was very unique and enjoyable.",
  '',
  "This book Tell Me This Isn't Happening is a book that got into me. This book tell us the most sillyest to the most sad storys. These books are embarrsing to people. So if you like to learn and hear about these funny,embarrsing storys, you have to read this book.! ",
  '',
  ''

In [10]:
import nanopq
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

"""
Quantize the book collection
"""
def quantize_book_collection(collection, embedding_model, num_subspaces):
    # get texts for each document
    text_documents = []
    _ids = []
    for doc in collection.find({}):
        text_documents.append(doc["Summary"][0])
        _ids.append(doc["_id"])
    # apply product quantization
    pq, quantized_matrix = get_product_quantization(text_documents, embedding_model, num_subspaces = num_subspaces)
    # update each document with their quantized ids
    for _id,quantized_vector in tqdm(zip(_ids,quantized_matrix)):
        update_quantized_ids(collection, _id ,quantized_vector.tolist())
    return pq

"""
wrapped - embed and quantize a list of text documents 
"""
def get_product_quantization(text_documents, embedder, num_subspaces):
    embeddings = embedder.encode(text_documents)
    pq = nanopq.PQ(M=num_subspaces, Ks=256).fit(vecs=embeddings, iter=20, seed=123)
    quantized_matrix = pq.encode(embeddings)
    return pq, quantized_matrix


"""
wrapped - update document with its quantized vector 
"""    
def update_quantized_ids(collection, _id, quantized_vector):
    update_dict = {"quantized_vector":[{f"subspace{i}":ID} for i,ID in enumerate(quantized_vector)]}
    collection.update_one({'_id': _id}, {"$set":update_dict})
    
    
"""
Given a query, embed and quantize it
"""
def quantize_query(query, pq_model, embedding_model):
    query_embedding = embedding_model.encode(query)
    product_quantization_model.verbose = False
    quantized_query = product_quantization_model.encode(np.expand_dims(query_embedding,0))[0].tolist()
    return quantized_query

"""
query mongo by an expression
"""
def query_by_expression(collection, expressions):
    return collection.find({"$or":expressions})
"""
expression used to filter mongo
"""
def _get_expression_quantized_vector(quantized_vector):
    return [{f"quantized_vector.subspace{i}":ID} for i,ID in enumerate(quantized_vector)]

In [11]:
product_quantization_model = quantize_book_collection(book_collection,embedding_model,4)



M: 4, Ks: 256, code_dtype: <class 'numpy.uint8'>
iter: 20, seed: 123
Training the subspace: 0 / 4
Training the subspace: 1 / 4
Training the subspace: 2 / 4
Training the subspace: 3 / 4
Encoding the subspace: 0 / 4
Encoding the subspace: 1 / 4
Encoding the subspace: 2 / 4
Encoding the subspace: 3 / 4


939it [00:36, 25.74it/s]


### query by quantized vector

In [26]:
query = "horrifying war and crime"
quantized_query = quantize_query(query, product_quantization_model, embedding_model)

In [27]:
quantized_query

[69, 221, 124, 220]

In [40]:
results_cursor = query_by_expression(book_collection,_get_expression_quantized_vector([69, 221, 124, 220]))

In [38]:
count = 0
for i in results_cursor:
    count+=1

In [39]:
count

29

In [46]:
product_quantization_model

<nanopq.pq.PQ at 0x20281532fa0>

In [47]:
import pickle
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CZ4125 - Developing Data Products\Assignments\Team Assignment - book recommendation\code\models\pq_model.pkl"
with open(path, "wb") as f:
    pickle.dump(product_quantization_model, f)

### outdated

In [None]:
from PriorityQueue import PriorityQueuePlus
from collections import defaultdict


"""
Given items and their distances to the query, compute the score
Formula: sum of inversed_normalized_distances that are above the specified quantile
"""
def compute_item_scores(items, distances, keywords, quantile = 0.90):
    scores = defaultdict(lambda :0)
    matched_keywords = defaultdict(list)
    distances = np.array(list(set(distances)))
    inversed_normalized_distances = distances.min()/distances
    threshold = np.quantile(inversed_normalized_distances, quantile)
    for item, score, keyword in zip(items, inversed_normalized_distances, keywords):
        if score >= threshold:
            scores[item] += score 
            matched_keywords[item].append(keyword)
    return list(scores.items()),dict(matched_keywords)

"""
Given a set of scored items, return the topk using PriorityQueuePlus
item_scores = [(item1, 0.3)...., (item20,0.5)]
"""
def get_topk_items(item_scores, topk = 100):
    queue = PriorityQueuePlus(topk = topk, max_size = 1000000)
    for item, score in item_scores:
        queue.push(item, priority = score)
    return queue.get_topk()

In [None]:
"""
Given a query, get distances to text documents
"""
def get_pq_distances(pq, quantized_matrix, embedder, query):
    query_embedding = embedder.encode(query)
    distances = pq.dtable(query=query_embedding).adist(codes=quantized_matrix)
    return distances

In [54]:
"""
semanticSearch
"""
query = "Werewolves"
distances = get_pq_distances(pq, quantized_matrix, bi_encoder, query)
item_scores, matched_keywords = compute_item_scores(isbn_values,distances, text_documents)
topk_items = get_topk_items(item_scores)
for isbn, scores in topk_items:
    doc = book_collection.find_one({"ISBN":isbn})
    print(matched_keywords[doc["ISBN"]])
    print()

['Fantasy', 'Mythology', 'Classics', 'Nonfiction']

['Short Stories', 'Cultural', 'Canada', 'Literature']

['Fiction', 'Womens Fiction', 'Chick Lit', 'American']

['Mystery', 'Thriller', 'Mystery']

['Fiction', 'Fantasy', 'Classics']

['Historical', 'Historical Fiction']

['Historical', 'Historical Fiction']

['World War II']

['Science Fiction', 'Time Travel']

['Legal Thriller']

['Historical Fiction']

['Childrens']

