# ABOUT
- this notebook experiments with similarity search using quantization
- background:
    - earlier we figured that storing embeddings and retrieving the top n most similar is too slow
- steps:
    - embed all documents
    - apply product quantization
    - precompute distances between quantized centroids (dtable)
    - given a new query retrieve k most similar documents
- insights:
    - quality of outcomes from product quantization is unsatisfactory
    - using keywords extracted from rake is very noisy
    - most likely source of issue was that i have many duplicate keywords, so the k means clustering in PQ was not done properly

In [None]:
from pymongo import MongoClient
import certifi
ca = certifi.where()
client = MongoClient("mongodb+srv://tanchingfhen:978775!Mj@dataproducts.hcjk1ct.mongodb.net/?retryWrites=true&w=majority", tlsCAFile=ca)
db = client["DP"] 
book_collection = db["books"]

In [2]:
import nanopq
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

bi_encoder = SentenceTransformer('whaleloops/phrase-bert')

"""
Gets the texts for each document to be embedded
"""
def get_texts_for_embeddings(document):
    isbn = document['ISBN']
#     texts = document["keywords"] + document["Genre"]
    texts = document["Genre"][:5]

    return [isbn]*len(texts), texts

"""
Apply Embedding Model Followed by Product Quantization
"""
def get_product_quantization(text_documents, embedder, num_subspaces = 32):
    embeddings = embedder.encode(text_documents)
    pq = nanopq.PQ(M=num_subspaces, Ks=256).fit(vecs=embeddings, iter=20, seed=123)
    quantized_matrix = pq.encode(embeddings)
    return pq, quantized_matrix

"""
Given a query, get distances to text documents
"""
def get_pq_distances(pq, quantized_matrix, embedder, query):
    query_embedding = embedder.encode(query)
    distances = pq.dtable(query=query_embedding).adist(codes=quantized_matrix)
    return distances


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from PriorityQueue import PriorityQueuePlus
from collections import defaultdict


"""
Given items and their distances to the query, compute the score
Formula: sum of inversed_normalized_distances that are above the specified quantile
"""
def compute_item_scores(items, distances, keywords, quantile = 0.90):
    scores = defaultdict(lambda :0)
    matched_keywords = defaultdict(list)
    distances = np.array(list(set(distances)))
    inversed_normalized_distances = distances.min()/distances
    threshold = np.quantile(inversed_normalized_distances, quantile)
    for item, score, keyword in zip(items, inversed_normalized_distances, keywords):
        if score >= threshold:
            scores[item] += score 
            matched_keywords[item].append(keyword)
    return list(scores.items()),dict(matched_keywords)

"""
Given a set of scored items, return the topk using PriorityQueuePlus
item_scores = [(item1, 0.3)...., (item20,0.5)]
"""
def get_topk_items(item_scores, topk = 100):
    queue = PriorityQueuePlus(topk = topk, max_size = 1000000)
    for item, score in item_scores:
        queue.push(item, priority = score)
    return queue.get_topk()



In [None]:
isbn_values, text_documents = [], []
for doc in book_collection.find({}):
    isbns, texts = get_texts_for_embeddings(doc)
    text_documents.extend(texts)
    isbn_values.extend(isbns)
    
    
pq, quantized_matrix = get_product_quantization(text_documents, bi_encoder)

In [54]:
"""
semanticSearch
"""
query = "Werewolves"
distances = get_pq_distances(pq, quantized_matrix, bi_encoder, query)
item_scores, matched_keywords = compute_item_scores(isbn_values,distances, text_documents)
topk_items = get_topk_items(item_scores)
for isbn, scores in topk_items:
    doc = book_collection.find_one({"ISBN":isbn})
    print(matched_keywords[doc["ISBN"]])
    print()

['Fantasy', 'Mythology', 'Classics', 'Nonfiction']

['Short Stories', 'Cultural', 'Canada', 'Literature']

['Fiction', 'Womens Fiction', 'Chick Lit', 'American']

['Mystery', 'Thriller', 'Mystery']

['Fiction', 'Fantasy', 'Classics']

['Historical', 'Historical Fiction']

['Historical', 'Historical Fiction']

['World War II']

['Science Fiction', 'Time Travel']

['Legal Thriller']

['Historical Fiction']

['Childrens']

