# ABOUT:
- this notebook adds embeddings to mongo
- insights:
    - it turns out that storing each sentence embedding as a document in mongo is too slow
        - instead i'll try storing multiple embeddings from the same isbn together in the same document
    - it turns out that storing multiple embeddings for each document is still slow
        - reduce embedding size
        - set a fixed n number of embeddings - but the question is how to choose the most informative embeddings?
        - assign 5 keywords document 
        - generate embeddings for those 5 keywords , plus 5 genres of the document
    - storing and retrieving similar embeddings from Mongo is too slow:
        - Apply quantization and store the quantized vectors instead

### connect mongo

In [1]:
from pymongo import MongoClient
import certifi
ca = certifi.where()
client = MongoClient("mongodb+srv://tanchingfhen:978775!Mj@dataproducts.hcjk1ct.mongodb.net/?retryWrites=true&w=majority", tlsCAFile=ca)
db = client["DP"] 
book_collection = db["books"] 
embedding_collection = db["embeddings"] 

### upload embedding documents

In [44]:
from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2')
bi_encoder = SentenceTransformer('whaleloops/phrase-bert')

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

"""
Generate embeddings for 5 keywords and 5 Genres and upload to mongo
"""
def generate_embedding_document(collection, model, document):
    isbn = document["ISBN"]
    texts = get_texts_for_embeddings(document)
    embeddings = model.encode(texts)
    embedding_document = make_embedding_document(isbn, embeddings)
    collection.insert_one(embedding_document)

"""
wrapped inside generate_embedding_document()
"""
def get_texts_for_embeddings(document):
    return document["keywords"] + document["Genre"][:5]
def make_embedding_document(isbn, embeddings):
    return {"ISBN":isbn, "embeddings":embeddings.tolist()}

In [53]:
for doc in tqdm(book_collection.find({})):
    generate_embedding_document(embedding_collection, bi_encoder, doc)

939it [00:59, 15.86it/s]


In [4]:
embedding_collection.find_one({})

{'_id': ObjectId('6336e25bac9342389007f4f5'),
 'ISBN': '0060973129',
 'embeddings': [[-0.4937303960323334,
   0.5650601983070374,
   -1.449040174484253,
   -1.0878932476043701,
   0.3112991154193878,
   0.9196992516517639,
   0.20283356308937073,
   -0.6742953658103943,
   0.18079955875873566,
   -0.35602304339408875,
   0.4837126135826111,
   0.5027600526809692,
   0.424049437046051,
   -0.08767085522413254,
   -0.720018208026886,
   1.067531704902649,
   0.24383559823036194,
   0.1733180731534958,
   -0.5480228662490845,
   -0.3431459963321686,
   -0.19687417149543762,
   -0.9498364329338074,
   0.16250984370708466,
   0.9579512476921082,
   -0.4582337737083435,
   0.6739566922187805,
   1.052681565284729,
   0.45858263969421387,
   -0.28380087018013,
   0.7713908553123474,
   -0.11039017140865326,
   -0.5131649374961853,
   0.4874800145626068,
   0.11478269845247269,
   -1.4941834211349487,
   -0.3628491461277008,
   -0.39622583985328674,
   -0.055457036942243576,
   -0.253120034933

In [46]:
query = "war and history and biography"
query_embedding = bi_encoder.encode(query)

In [50]:
query_embedding = np.expand_dims(query_embedding, axis=0)

### delete documents

In [48]:
# x = embedding_collection.delete_many({})
print(x.deleted_count, " documents deleted.")

939  documents deleted.


### count_documents

In [3]:
embedding_collection.count_documents({})

940

In [2]:
import nanopq

In [29]:
def get_embedding_matrix(collection, embedding_dimension = 768):
    embedding_matrix = np.array([]).reshape(0,embedding_dimension)
    for doc in collection.find({}, {"_id": 0, "embeddings": 1, "ISBN":1}):
        embedding_matrix = np.concatenate((embedding_matrix,np.array(doc["embeddings"])), axis= 0)
    return embedding_matrix.astype(np.float32)

In [31]:
embedding_matrix = get_embedding_matrix(embedding_collection)

In [33]:
embedding_matrix.shape

(8939, 768)

In [10]:
import nanopq
import numpy as np

In [40]:
pq = nanopq.PQ(M=16, Ks=256).fit(vecs=embedding_matrix, iter=20, seed=123)

M: 16, Ks: 256, code_dtype: <class 'numpy.uint8'>
iter: 20, seed: 123
Training the subspace: 0 / 16




Training the subspace: 1 / 16
Training the subspace: 2 / 16
Training the subspace: 3 / 16
Training the subspace: 4 / 16
Training the subspace: 5 / 16
Training the subspace: 6 / 16
Training the subspace: 7 / 16
Training the subspace: 8 / 16
Training the subspace: 9 / 16
Training the subspace: 10 / 16
Training the subspace: 11 / 16
Training the subspace: 12 / 16
Training the subspace: 13 / 16
Training the subspace: 14 / 16
Training the subspace: 15 / 16


In [63]:
quantized_matrix = pq.encode(vecs=embedding_matrix)

Encoding the subspace: 0 / 16
Encoding the subspace: 1 / 16
Encoding the subspace: 2 / 16
Encoding the subspace: 3 / 16
Encoding the subspace: 4 / 16
Encoding the subspace: 5 / 16
Encoding the subspace: 6 / 16
Encoding the subspace: 7 / 16
Encoding the subspace: 8 / 16
Encoding the subspace: 9 / 16
Encoding the subspace: 10 / 16
Encoding the subspace: 11 / 16
Encoding the subspace: 12 / 16
Encoding the subspace: 13 / 16
Encoding the subspace: 14 / 16
Encoding the subspace: 15 / 16


In [65]:
quantized_matrix.shape

(8939, 16)

In [58]:
dt = pq.dtable(query=embedding_matrix[0]) 

In [60]:
dists = dt.adist(codes=query_code)

In [70]:
dists = pq.dtable(query=embedding_matrix[8938]).adist(codes=quantized_matrix)

In [73]:
quantized_matrix.shape

(8939, 16)

In [54]:
dists = dt.adist(codes=query_code)

NameError: name 'dt' is not defined

In [92]:
sum(np.sum(quantized_matrix[0] == quantized_matrix,axis=1)>0)

343

In [93]:
for i,n in enumerate(np.sum(quantized_matrix[0] == quantized_matrix,axis=1)):
    if n>0:
        print(i,n)


0 16
3 2
10 16
13 2
30 2
40 1
43 1
71 2
111 1
122 1
142 1
175 1
276 1
296 1
309 2
321 1
420 1
421 1
451 1
480 1
502 2
558 1
566 2
600 1
610 1
642 1
649 1
650 2
651 5
664 1
665 1
666 1
677 1
693 1
764 2
801 1
803 1
881 1
947 1
968 1
990 1
992 1
1057 1
1069 1
1113 1
1135 1
1202 1
1231 1
1260 1
1307 1
1308 1
1310 2
1358 1
1360 1
1376 1
1389 1
1433 1
1468 1
1559 1
1660 1
1698 1
1699 1
1717 1
1727 1
1836 1
1906 1
1907 1
2010 2
2016 1
2090 1
2120 2
2163 1
2172 1
2178 1
2183 1
2191 1
2194 2
2233 1
2251 3
2341 1
2420 2
2422 1
2440 1
2552 1
2593 2
2611 1
2663 1
2717 1
2726 1
2767 3
2769 1
2848 1
2849 2
2869 1
2876 1
2879 1
2891 1
2950 2
2952 1
3014 2
3017 1
3031 2
3032 1
3071 1
3080 1
3082 1
3092 1
3093 1
3112 3
3136 1
3138 1
3190 1
3273 1
3305 1
3331 1
3333 2
3360 1
3363 1
3381 2
3389 2
3409 1
3425 1
3445 1
3447 1
3454 4
3483 1
3485 1
3498 1
3509 1
3520 1
3653 2
3654 1
3694 1
3733 1
3765 1
3766 1
3770 1
3786 1
3788 1
3806 1
3816 1
3905 1
3955 1
3962 1
3963 1
3967 1
4020 1
4029 1
4093 1
4097 1


In [52]:
dists = pq.dtable(query=query).adist(codes=X_code)  # (10000,)


AttributeError: 'str' object has no attribute 'dtype'

## outdated

In [10]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [17]:
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
# turn reviews and summaries into a list of sentences
def get_sentences(document):
    sentences = []
    texts = document["Review"]+document["Summary"]
    for text in texts:
        if len(text)>=10:
            sentences.extend(sent_tokenize(text))
    return sentences
# prepare embeddings as mongo documents
def make_embedding_document(isbn, embeddings):
    return {"ISBN":isbn, "embeddings":embeddings.tolist()}

In [27]:
for doc in tqdm(book_collection.find({})):
    # get sentences of each book document
    sentences = get_sentences(doc)
    # convert each sentence into embeddings
    embeddings = model.encode(sentences)
    # convert embeddings into mongo documents
    embedding_document = make_embedding_document(doc["ISBN"], embeddings)
    # upload to mongo
    embedding_collection.insert_one(embedding_document)

939it [02:21,  6.62it/s]


In [None]:
for doc in tqdm(book_collection.find({})):
    # get sentences of each book document
    sentences = get_sentences(doc)
    # convert each sentence into embeddings
    embeddings = model.encode(sentences)
    # convert embeddings into mongo documents
    embedding_document = make_embedding_document(doc["ISBN"], embeddings)
    # upload to mongo
    embedding_collection.insert_one(embedding_document)