In [1]:
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('titles.pkl', 'rb') as f:
    titles = pickle.load(f)

with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

titles = np.array(titles)
embeddings = np.array(embeddings)

In [3]:
# save numpy array as to pickle file
with open('titles_np.pkl', 'wb') as f:
    pickle.dump(titles, f)

with open('embeddings_np.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

In [4]:
# load numpy array from pickle file
with open('titles_np.pkl', 'rb') as f:
    titles2 = pickle.load(f)

with open('embeddings_np.pkl', 'rb') as f:
    embeddings2 = pickle.load(f)

In [5]:
print(type(titles))
print(type(titles2))

print(type(embeddings))
print(type(embeddings2))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [6]:
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

In [7]:
# save the model to disk as a pickle file
filename = 'model.sav'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename , 'rb'))

In [8]:
sample = 'What is the best way to learn Python?'
#print(model.encode([sample]))
#print(loaded_model.encode([sample]))
print('---------------------' * 3)
print(len(model.encode([sample])[0]))
print(len(loaded_model.encode([sample])[0]))

---------------------------------------------------------------
384
384


----

In [9]:
nlist = 50
m = 8
bits = 8
dimensions = embeddings.shape[1]

quantizer = faiss.IndexFlatIP(dimensions)
index = faiss.IndexIVFPQ(quantizer, dimensions, nlist, m, bits)
print(index.is_trained)

False


In [10]:
index.train(embeddings)
index.add(embeddings)
print(index.ntotal)
print(index.is_trained)

24469
True


In [11]:
# save the index to disk as a pickle file
index_filename = 'index.sav'
pickle.dump(index, open(index_filename, 'wb'))

# load the index from disk
loaded_index = pickle.load(open(index_filename , 'rb'))

In [12]:
%%time
k = 5
D, I = index.search(model.encode([str(input("Enter Query: "))]), k)
print(D)

[[1.1561999 1.2435786 1.2590462 1.2758831 1.2789359]]
CPU times: user 98.7 ms, sys: 32.8 ms, total: 131 ms
Wall time: 5.31 s


In [13]:
print(I)

[[18425  6877   300  1681 11051]]


In [14]:
[f'{i}: {titles[i]}' for i in I[0]]

['18425: Real Computational Universality: The Word Problem for a Class of Groups with Infinite Presentation.',
 '6877: A Cooperative Approach to the Development of Expert Knowledge Bases Applied to Define Standard of Care in Glaucoma.',
 '300: Lifelong Forgetting: A Critical Ingredient of Lifelong Learning, and Its Implementation in the OpenCog Integrative AI Framework.',
 '1681: Combining EM Training and the MDL Principle for an Automatic Verb Classification Incorporating Selectional Preferences.',
 '11051: Group Sparsity and Geometry Constrained Dictionary Learning for Action Recognition from Depth Maps.']

In [15]:
# sample function to get the top k similar documents, with default k=5
def get_similar_documents(query, index, model, titles, k=5):
    D, I = index.search(model.encode([query]), k)
    return [titles[i] for i in I[0]]

get_similar_documents('What is the best way to learn Python?', loaded_index, loaded_model, titles)

['Learning Where You Are Going and from Whence You Came: h- and g-Cost Learning in Real-Time Heuristic Search.',
 'Learning by Reading: A Prototype System, Performance Baseline and Lessons Learned.',
 'Integrating learning objects into an open learning environment: evaluation of learning processes in an informatics learning lab.',
 'Typing Tutor: Individualized Tutoring in Text Entry for Older Adults Based on Input Stumble Detection.',
 'Analogy Tutor: A Tutoring System for Promoting Conceptual Learning via Comparison.']

In [16]:
def get_similar_documents(query, index, model, titles, k=5):
    D, I = index.search(model.encode([query]), k)
    for i in I:
        print(titles[i])

In [17]:
get_similar_documents('What is the best way to learn Python?', index, model, titles)

['Learning Where You Are Going and from Whence You Came: h- and g-Cost Learning in Real-Time Heuristic Search.'
 'Learning by Reading: A Prototype System, Performance Baseline and Lessons Learned.'
 'Integrating learning objects into an open learning environment: evaluation of learning processes in an informatics learning lab.'
 'Typing Tutor: Individualized Tutoring in Text Entry for Older Adults Based on Input Stumble Detection.'
 'Analogy Tutor: A Tutoring System for Promoting Conceptual Learning via Comparison.']


----

In [None]:
%ls

In [20]:
import pickle


# load numpy arrays from pickle file
with open('titles_np.pkl', 'rb') as f:
    titles = pickle.load(f)

'''with open('embeddings_np.pkl', 'rb') as f:
    embeddings = pickle.load(f)'''

# load the model
model = pickle.load(open('model.sav' , 'rb'))

# load the index from disk
index = pickle.load(open('index.sav' , 'rb'))

def get_embeddings(query):
    xq = model.encode([query])
    return xq


def get_similar_documents(query, index, model, titles, k=5):
    xq = get_embeddings(str(query))
    D, I = index.search(xq, k)
    for i in I:
        print(titles[i])

get_similar_documents('What is the best way to learn Python?', index, model, titles)

['Learning Where You Are Going and from Whence You Came: h- and g-Cost Learning in Real-Time Heuristic Search.'
 'Learning by Reading: A Prototype System, Performance Baseline and Lessons Learned.'
 'Integrating learning objects into an open learning environment: evaluation of learning processes in an informatics learning lab.'
 'Typing Tutor: Individualized Tutoring in Text Entry for Older Adults Based on Input Stumble Detection.'
 'Analogy Tutor: A Tutoring System for Promoting Conceptual Learning via Comparison.']


In [34]:
%%time
def get_embeddings(query):
    xq = model.encode([query])
    return xq


get_embeddings('What is the best way to learn Python?')

CPU times: user 55 ms, sys: 3.07 ms, total: 58.1 ms
Wall time: 10.5 ms


array([[-1.30599383e-02,  7.69109130e-02, -4.65829298e-02,
         5.90131655e-02, -1.74251534e-02, -7.36337155e-02,
        -1.36674028e-02,  8.72032419e-02, -5.11746816e-02,
        -1.14701046e-02, -2.70780846e-02,  3.07676792e-02,
         3.35227349e-03,  5.62819839e-02,  1.10859163e-02,
        -4.18557227e-02, -6.67836964e-02,  5.97557090e-02,
         1.02283970e-01, -9.56052393e-02, -8.14230144e-02,
        -9.38170217e-03,  6.73061376e-03,  3.69466469e-02,
         2.30291598e-02,  3.90121825e-02,  1.69627238e-02,
         2.60770451e-02, -1.36164725e-02,  2.50635557e-02,
        -2.20350865e-02,  2.16441844e-02,  2.23011486e-02,
        -2.23823488e-02, -2.62794122e-02,  4.24289927e-02,
        -1.42241188e-03, -7.57117420e-02,  3.03730555e-02,
         8.28997418e-03, -1.30727030e-02,  5.49051464e-02,
        -3.21871415e-02, -3.58182304e-02,  9.51474011e-02,
         1.93813872e-02,  2.89439969e-02,  5.97866662e-02,
         9.53251794e-02, -5.92043288e-02, -7.78268203e-0

----

In [36]:
from sentence_transformers import SentenceTransformer


model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
#xq = model.encode('How to get started with Python?')

print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)
