# ABOUT:
- this code tries to break down a query into its keywords before getting the top k genres
- background:
    - previously, we compared each query with each genre to obtain the top k genres
    - however, a single query may have multiple keywords. Long queries like this are quite noisy, so their subsequent embeddings will be suboptimal
    - instead, we could apply keyword extraction to queries too
- approach:
    - apply keyword extraction to query using Rake
    - for each keyword, extract the top genres
- advantage:
    - by shortening the query, the embeddings are less noisy, and retrieval of similar genres is more accurate
    - more importantly, breaking down the query can avoid the problem of dominating genres
        - i.e. given the query "keywordA and keywordB", keywordA may have more matching genres than keywordB. As a result, the matched genres might only relate to keywordA, i.e. genres similar to keywordB might not appear in the top k.
        - by breaking the query into multiple parts, we can balance the preferences that the user wants
    - furthermore, this enables us to narrow down the candidate pool more drastically; the size of that pool is a major performance bottleneck.
        - now, given the query "fantasy and war", we can restrict candidates to ones that contain BOTH a "fantasy" genre and a "war" genre, whereas before this wasn't possible
        - The number of candidates returned from genre filtering is the major performance bottleneck because looping through a Mongo cursor with many items takes considerable time

### connect mongo

In [6]:
from pymongo import MongoClient
import certifi
import os

# Credentials must never be committed with the notebook: the full connection
# string (including user and password) is read from the environment instead.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string"
    )

# Use certifi's CA bundle for the TLS handshake with the cluster.
ca = certifi.where()
client = MongoClient(mongo_uri, tlsCAFile=ca)
db = client["DP"]
book_collection = db["books"]
genre_collection = db["genre"]
full_genre_collection = db["full_genre"]

In [58]:
from sentence_transformers import SentenceTransformer
from numpy import dot
import numpy as np
from rake_nltk import Rake


# Encoder used to turn query keywords into embedding vectors.
embedding_model = SentenceTransformer("whaleloops/phrase-bert")
# RAKE extractor used to split a query into keyword phrases.
keyword_model = Rake()


"""
FAST - Given query, extract keywords. For each keyword, extract top k genres. Utilize matrix multiplication.
"""
def search_genre_by_query(collection, query, keyword_model, topk = 10, return_scores = False):
    """
    Return the top-k best-matching genres for each keyword extracted from a query.

    The query is split into keyword phrases, every phrase is embedded with the
    module-level ``embedding_model``, and all phrases are scored against all
    genre embeddings in one matrix product.

    Args:
        collection: Mongo collection; the document returned by ``find_one({})``
            must hold a "genre" list and an "embedding" list aligned by index.
        query: free-text user query.
        keyword_model: object exposing ``extract_keywords_from_text`` and
            ``get_ranked_phrases`` (e.g. ``rake_nltk.Rake``).
        topk: number of genres to keep per keyword.
        return_scores: when true, return ``(genre, score)`` pairs instead of
            bare genre names.

    Returns:
        A list with one entry per keyword, in the order given by
        ``get_ranked_phrases``; each entry is that keyword's ``topk`` genres
        sorted by descending score.

    Raises:
        ValueError: if ``collection`` contains no document.
    """
    # extract query keywords
    keyword_model.extract_keywords_from_text(query)
    keywords = keyword_model.get_ranked_phrases()
    if not keywords:
        # Fall back to the whole query, wrapped in a list: encoding a bare
        # string yields a single 1-D vector, which would make the score
        # matrix 1-D and break the per-keyword loop below.
        keywords = [query]

    # get document containing all genres
    document = collection.find_one({})
    if document is None:
        raise ValueError("genre collection is empty: no genre/embedding document found")
    genres = document["genre"]

    # one row of scores per keyword, one column per genre
    query_embeddings = embedding_model.encode(keywords)
    scores = np.dot(query_embeddings, np.array(document["embedding"]).T)

    # keep the top k genres for each keyword
    matched_genres = []
    for keyword_scores in scores:
        ranked = sorted(
            zip(genres, keyword_scores), key=lambda pair: pair[1], reverse=True
        )[:topk]
        if return_scores:
            matched_genres.append(ranked)
        else:
            matched_genres.append([genre for genre, _ in ranked])
    return matched_genres

In [59]:
# Example query containing two distinct keywords.
query = "fantasy and war"

In [60]:
# Demo: the 3 best-matching genre names for each keyword extracted from the query.
search_genre_by_query(full_genre_collection, query, keyword_model, topk = 3, return_scores = False )

[['War', 'Civil War', 'Combat'],
 ['Fantasy', 'Fantasy Romance', 'Heroic Fantasy']]

### retrieve books by genre
- returned books must have at least one matching genre for each distinct keyword
    - e.g at least one genre from ['War', 'Civil War', 'Combat'] and at least one from ['Fantasy', 'Fantasy Romance', 'Heroic Fantasy']

In [71]:
def _genre_expression(desired_genres):
    return {
        "$and":[
            {"$or":[{"Genre":g1} for g1 in g0]} for g0 in desired_genres
        ]
    }

In [73]:
# Build the filter from two genre groups: a book must match at least one genre in each.
temp = _genre_expression([['War', 'Civil War', 'Combat'],
 ['Fantasy', 'Fantasy Romance', 'Heroic Fantasy']])

In [75]:
# Print the first 3 matching books, projecting only the Genre field.
for doc in book_collection.find(temp, {"Genre":1}).limit(3):
    print(doc)

{'_id': '0441783589', 'Genre': ['Science Fiction', 'Fiction', 'Classics', 'War', 'Military Fiction', 'War', 'Science Fiction Fantasy', 'Space', 'Space Opera', 'Fantasy', 'Audiobook', 'Space']}
{'_id': '0312156960', 'Genre': ['Historical', 'Historical Fiction', 'Fantasy', 'Fiction', 'Historical', 'Mythology', 'Arthurian', 'Fantasy', 'Mythology', 'Historical', 'Medieval', 'Adventure', 'War', 'Historical Fiction', 'Historical Fantasy']}
{'_id': '0061050474', 'Genre': ['Fantasy', 'Fiction', 'Humor', 'Humor', 'Comedy', 'Science Fiction Fantasy', 'Audiobook', 'Novels', 'War', 'Fantasy', 'Comic Fantasy', 'Fantasy', 'High Fantasy']}


In [37]:
from PriorityQueue import PriorityQueuePlus
"""
SLOW - looping through 400+ genres  is still slow!
"""
def search_genre_by_query(genre_collection, query, topk = 10):
    """
    Rank genres against the whole query, scoring one genre document at a time.

    The query is encoded with the module-level ``embedding_model``. Every
    document in ``genre_collection`` is pushed into a ``PriorityQueuePlus``
    with its 'genre' as the value and the dot product of the query embedding
    with its 'embedding' as the priority. The queue's ``get_topk()`` result
    is returned.
    """
    encoded_query = embedding_model.encode(query)
    ranking = PriorityQueuePlus(topk=topk, max_size=100)
    for genre_doc in genre_collection.find({}):
        similarity = dot(encoded_query, genre_doc['embedding'])
        ranking.push(value=genre_doc['genre'], priority=similarity)
    return ranking.get_topk()