In [1]:
import os
import pickle

from pyspark import SparkConf
from aips.data_loaders.outdoors import load_dataframe
from aips import get_engine
from sentence_transformers import SentenceTransformer, SimilarityFunction
from sentence_transformers.util import cos_sim
from pyspark.sql import SparkSession
from aips import get_engine
import time
import numpy
import math

engine = get_engine()
#Recommended for making ALS run faster, if you have enough memory / cores allocated to docker
conf = SparkConf()
conf.set("spark.driver.memory", "8g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.dynamicAllocation.executorMemoryOverhead", "8g")
spark = SparkSession.builder.appName("AIPS-ch13").config(conf=conf).getOrCreate()

#https://github.com/facebookresearch/faiss/wiki/Pre--and-post-processing
#https://github.com/facebookresearch/faiss/wiki

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

#outdoors_collection = engine.create_collection("outdoors")

Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-squad2-guesses.csv
._roberta-base-squad2-outdoors
roberta-base-squad2-outdoors/
roberta-base-squad2-outdoors/._tokenizer_config.json
roberta-base-squad2-outdoors/tokenizer_config.json
roberta-base-squad2-outdoors/._special_tokens_map.json
roberta-base-squad2-outdoors/special_tokens_map.json
roberta-base-squad2-outdoors/._config.json
roberta-base-

In [28]:
#model.stop_multi_process_pool(pool)
#pool = model.start_multi_process_pool()
#embeddings = model.encode(texts, convert_to_tensor=False).tolist()

def get_embeddings(texts, cache_name, model, ignore_cache=False):
    cache_file_name = f"data/embeddings/{cache_name}.pickle"
    if ignore_cache or not os.path.isfile(cache_file_name):        
        embeddings = model.encode(texts, normalize_embeddings=True)
        os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
        with open(cache_file_name, "wb") as fd:
            pickle.dump(embeddings, fd)
    else:
        with open(cache_file_name, "rb") as fd:
            embeddings = pickle.load(fd)
    return embeddings

## Boilerplate code for Quantization listings
### Generating embeddings and benchmark data

In [27]:
import faiss
from aips.data_loaders.outdoors import load_dataframe

def display_results(scores, ids, data):
    results = generate_search_results(scores, ids, data)
    display(results)
    return results

def get_outdoors_data():
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    outdoors_data = list(outdoors_dataframe.rdd.map(lambda r: r.asDict()).collect())
    return outdoors_data

def calculate_outdoors_embeddings(model):
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    return numpy.array(get_embeddings(post_texts, "outdoors_mrl_normed", model))

def display_statistics(full_search_results, quantized_search_results):
    index_name = quantized_search_results["index_name"]
    full_search_time = full_search_results["time_taken"]
    time_taken = quantized_search_results["time_taken"]
    time_imp = round((full_search_time - time_taken) * 100 / full_search_time, 2)
    quantized_size = quantized_search_results["size"]
    improvement_ms = f"({time_imp}% improvement)"
    improvement_size = f"({round((full_search_results['size'] - quantized_size) * 100 / full_search_results['size'], 2)}% improvement)"
    print(f"{index_name} search took: {time_taken:.3f} ms {improvement_ms}")
    print(f"{index_name} index size: {round(quantized_size / 1000000, 2)} MB {improvement_size}")
    calculate_recall(full_search_results["results"], quantized_search_results["results"])

def calculate_recall(scored_full_results, scored_quantized_results):
    recall = []
    for i in range(len(scored_full_results)):
        full_ids = [r["id"] for r in scored_full_results[i]]
        quantized_ids = [r["id"] for r in scored_quantized_results[i]]
        recall.append((len(set(full_ids).intersection(set(quantized_ids))) /
                       len(set(quantized_ids))))
    print("Recall: " + str(round(sum(recall) / len(recall), 4)))

def generate_search_results(faiss_scores, faiss_ids):
    outdoors_data = get_outdoors_data()
    faiss_results = []
    for i in range(len(faiss_scores)):
        results = []
        for j, id in enumerate(faiss_ids[i]):
            id = int(id)
            result = {"score": faiss_scores[i][j],
                      "title": outdoors_data[id]["title"],
                      "body": outdoors_data[id]["body"],
                      "id": id}
            results.append(result)
        faiss_results.append(results)
    return faiss_results

In [5]:
#This will generate and cache the embeddings. Takes 2-3 hours typically
embeddings = calculate_outdoors_embeddings() 
print(embeddings.shape) #     => (18456, 1024)

outdoors_data = get_outdoors_data() 
print(len(outdoors_data)) #   => 18456

(18456, 1024)
18456


## Listing 13.21
### int8 quantization

In [41]:
from sentence_transformers.quantization import quantize_embeddings

def get_test_queries():
    return ["tent poles", "hiking trails", "mountain forests",
            "white water", "best waterfalls", "mountain biking",
            "snowboarding slopes", "bungee jumping", "public parks"]

def index_embeddings(embeddings, name, print_shape=True):
    if print_shape:
        print(f"{name} embeddings shape:", embeddings.shape)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def time_and_execute_search(index, index_name, query_embeddings, k=25):
    start_time = time.time()
    faiss_scores, faiss_ids = index.search(query_embeddings, k=k)
    time_taken = ((time.time() - start_time) * 1000)
    
    return {"results": generate_search_results(faiss_scores, faiss_ids),
            "size": os.path.getsize(index_name), 
            "time_taken": time_taken, "index_name": index_name}

def execute_full_search(embeddings, query_embeddings, k=25,
                        index_name="full_out_embs"):      
    full_index = index_embeddings(embeddings, index_name, print_shape=False)
    return time_and_execute_search(full_index, index_name, query_embeddings, k=k)

def evaluate_search(index, index_name, query_embeddings, quantized_query_embeddings,
                    k=25, log=False):
    embeddings = calculate_outdoors_embeddings(model)
    full_results = execute_full_search(embeddings, query_embeddings, k=k)
    quantized_results = time_and_execute_search(index, index_name,
                                                quantized_query_embeddings, k=k)
    display_statistics(full_results, quantized_results)

def index_int_embeddings(embeddings, name):
    embeddings = quantize_embeddings(embeddings, precision="int8")
    print("Int8 embeddings shape:", embeddings.shape)
    index = faiss.IndexScalarQuantizer(embeddings.shape[1],
                                       faiss.ScalarQuantizer.QT_8bit)
    index.train(embeddings)
    index.add(embeddings)
    faiss.write_index(index, name)
    return index
    
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1",
                            similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
                            truncate_dim=1024)
embeddings = calculate_outdoors_embeddings(model)
queries = get_test_queries()
query_embeddings = model.encode(queries, convert_to_numpy=True,
                                normalize_embeddings=True)
quantized_embeddings = quantize_embeddings(query_embeddings,
                                           calibration_embeddings=embeddings,
                                           precision="int8")
int8_index_name = "int8_out_embs"
int8_index = index_int_embeddings(embeddings, int8_index_name)
evaluate_search(int8_index, int8_index_name, query_embeddings, quantized_embeddings)

Int8 embeddings shape: (18456, 1024)
int8_out_embs search took: 8.121 ms (-44.82% improvement)
int8_out_embs index size: 18.91 MB (74.99% improvement)
Recall: 0.9289


TypeError: index_int_reranked_embeddings() takes 1 positional argument but 2 were given

## Listing 13.22
### Binary Quantization

In [34]:
def index_binary_embeddings(embeddings):
    binary_embeddings = numpy.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
    print("Binary embeddings shape:", binary_embeddings.shape)
    index = faiss.IndexBinaryFlat(binary_embeddings.shape[1] * 8)
    index.add(binary_embeddings)
    faiss.write_index_binary(index, "binary_out_embs")
    return index

embeddings = calculate_outdoors_embeddings(model)
queries = get_test_queries()
query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)

binary_index = index_binary_embeddings(embeddings)

quantized_queries = numpy.packbits(query_embeddings > 0).reshape(query_embeddings.shape[0], -1)
evaluate_search(binary_index, "binary_out_embs", query_embeddings, quantized_queries)

#quantized_queries = numpy.zeros_like(query_embeddings, dtype=numpy.int8)
#quantized_queries[query_embeddings > 0] = 1

Binary embeddings shape: (18456, 128)
binary_out_embs search took: 0.901 ms (81.91% improvement)
binary_out_embs index size: 2.36 MB (96.87% improvement)
Recall: 0.6044


## Listing 13.23
### Matroyoshka Learned Representations

In [40]:
queries = get_test_queries()
embeddings = calculate_outdoors_embeddings(model)
query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
for slice in [512, 256, 128]:    
    scaled_embeddings = numpy.array(list(map(lambda e: e[:slice], embeddings)))
    quantized_queries = numpy.array(list(map(lambda qe: qe[:slice], query_embeddings)))
    index_name = f"mrl_out_embs_{slice}"
    index = index_embeddings(scaled_embeddings, index_name)
    evaluate_search(index, index_name, query_embeddings, quantized_queries)

mrl_out_embs_512 embeddings shape: (18456, 512)
mrl_out_embs_512 search took: 2.872 ms (48.19% improvement)
mrl_out_embs_512 index size: 37.8 MB (50.0% improvement)
Recall: 0.7022


NameError: name 'BONSAI_SOLR_HOSTNAME' is not defined

## Listing 13.24
### Product quantizationQ

In [9]:
def index_pq_embeddings(embeddings, name="pq_out_embs"):    
    dimensions = embeddings.shape[1]
    sub_vectors = 8
    subquantizer_bits = 8
    #faiss::IndexIVFPQ, IndexIVFPQR
    index = faiss.IndexPQ(dimensions, sub_vectors, subquantizer_bits)
    index.train(embeddings)
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def index_pq_reranked_embeddings(embeddings, name="pq_out_embs"):    
    dimensions = embeddings.shape[1]
    sub_vectors = 8
    subquantizer_bits = 8
    #faiss::IndexIVFPQ, IndexIVFPQR
    index = faiss.IndexPQ(dimensions, sub_vectors, subquantizer_bits)
    rereanking_pq_index = faiss.IndexRefineFlat(index)
    rereanking_pq_index.train(embeddings)
    rereanking_pq_index.add(embeddings)
    faiss.write_index(rereanking_pq_index, name)
    return index

embeddings, outdoors_data = calculate_outdoors_embeddings()
query_embeddings = model.encode(["Hiking trails"], convert_to_numpy=True, normalize_embeddings=True)
scored_full_results, full_search_time, full_index_size = \
    execute_full_search(embeddings, query_embeddings, outdoors_data)
embeddings, outdoors_data = calculate_outdoors_embeddings()
index = index_pq_reranked_embeddings(embeddings)
scored_results, _, _ = execute_search(index, "pq_out_embs", query_embeddings, outdoors_data,
                                      full_search_time=full_search_time,
                                      full_index_size=full_index_size)
calculate_recall(scored_full_results, scored_results)

full_out_embs search took: 4.253 ms 
full_out_embs index size: 75.6 MB 
pq_out_embs search took: 0.538 ms (87.35% improvement)
pq_out_embs index size: 76.79 MB (-1.58% improvement)
Recall: 0.28


## Listing 13.25
### Quantization and reranking

In [39]:
from pyspark.sql.functions import col, udf, monotonically_increasing_id
from pyspark.sql.types import Row, ArrayType, FloatType, StructField, StructType, StringType, ByteType
import faiss
from aips.data_loaders.outdoors import load_dataframe

def calculate_outdoors_embeddingsss():
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    embeddings = get_embeddings(post_texts, "outdoors_mrl_normed", model)
    outdoors_data = list(outdoors_dataframe.rdd.map(lambda r: r.asDict()).collect())
    print(f"embeddings type {(type(embeddings))}")
    embeddings = numpy.array(embeddings)
    quantized_embeddings = numpy.zeros_like(embeddings, dtype=numpy.int8)
    quantized_embeddings[embeddings > 0] = 1
    for i in range(len(outdoors_data)):
        #embs = [float(e) for e in embeddings[i]] numpy.packbits(embeddings > 0) \
            #.reshape(embeddings, -1).tolist()
        #[float(e) for e in embeddings[i]]
        outdoors_data[i]["text_embedding"] = embeddings[i].tolist()
        outdoors_data[i]["binary_text_embedding"] = quantized_embeddings[i].tolist()
    return outdoors_data

def build_engine_quantization_index():
    outdoors_data = calculate_outdoors_embeddingsss()
    schema = StructType([StructField("title", StringType()),
                         StructField("body", StringType()),
                         StructField("text_embedding", ArrayType(FloatType())),
                         StructField("binary_text_embedding", ArrayType(ByteType()))])
    outdoors_dataframe = spark.createDataFrame(
        [Row(title=x["title"], body=x["body"],
             text_embedding=x["text_embedding"],
             binary_text_embedding=x["binary_text_embedding"])
             for x in outdoors_data], schema=schema)
    #embeddings = list(embeddings)
    #outdoors_data = load_dataframe("data/outdoors/posts.csv")
    #quantized_embeddings = [quantize(e) for e in normalized_embeddings]
    embeddings_collection = engine.create_collection("outdoors_quantization")
    embeddings_collection.write(outdoors_dataframe)

def search_request(query_vector, query_field,
                   rerank_vector=None, rerank_query_field=None,
                   quantization_size=None, k=1000, limit=25):
    request = {"query": query_vector,
               "query_fields": [query_field],
               "return_fields": ["title", "body", "score"],
               "limit": limit,
               "k": k,
               "log": True}
    if rerank_vector and rerank_query_field:
        request["rerank_query"] = {"query": rerank_vector,
                                   "query_fields": [rerank_query_field],
                                   "k": k}
    return request

def engine_rankings(query, log=False):
    collection = engine.get_collection("outdoors_quantization")
    query_embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    quantized_queries = numpy.zeros_like(query_embedding, dtype=numpy.int8)
    quantized_queries[query_embedding > 0] = 1

    full_start_time = time.time()
    full_request = search_request(query_embedding[0].tolist(), "text_embedding")
    full_results = collection.search(**full_request)
    full_time_taken = ((time.time() - full_start_time) * 1000)
    display(full_results)

    binary_start_time = time.time()
    binary_request = search_request(quantized_queries[0].tolist(), "binary_text_embedding",
                                    query_embedding[0].tolist(), "text_embedding",
                                    "BINARY")
    binary_time_taken = ((time.time() - binary_start_time) * 1000)
    binary_results = collection.search(**binary_request)
    display(binary_results)

    #calculate_recall()
    print(full_time_taken, binary_time_taken) 

#build_engine_quantization_index()
collection = engine.get_collection("outdoors_quantization")
engine_rankings("hiking trails")

{
  "query": "{!knn f=text_embedding topK=1000}[-0.000763779622502625, 0.013330797664821148, -0.015633903443813324, -0.01676012948155403, 0.0006817926187068224, 0.05022873729467392, -0.009578105062246323, 0.019668355584144592, 0.05052463710308075, 0.06134713441133499, 0.05279262363910675, 0.04455990344285965, 0.025179274380207062, -0.06187020614743233, -0.014002079144120216, 0.017646629363298416, -0.041832707822322845, -0.004604197572916746, -0.02410692162811756, 0.035086240619421005, -0.0048105488531291485, 0.03712352737784386, -0.018056849017739296, -0.042941026389598846, -0.013890030793845654, -0.01205825712531805, 0.00593447545543313, -0.0018199461046606302, 0.03489598259329796, 0.07389743626117706, 0.02978200651705265, -0.008492175489664078, 0.014568432234227657, -0.02370239607989788, -0.011161082424223423, 0.003897890215739608, 0.0346270352602005, 0.01360263116657734, 0.00519433943554759, 0.01757347397506237, 0.023156126961112022, -0.04113055020570755, 0.036492813378572464, -0.00

{'docs': [{'body': "I found this with a quick google, but since I think you want more: Find a local chapter of the Appalachian Trail Club. Here near DC, it's the PATC. They typically do walks on various nearby trails (in addition to maintennance). Second, go to your local outdoors store. Around here, there's a book called Circuit Hikes in Shenandoah park. I'm sure any local park near you would have a similar book.",
   'score': 0.7697265200000001},
  {'body': 'I recently found this post on glocals.com where someone shared their travel guide through Google maps ( https://drive.google.com/open?id=17gleDabcsxBwTrxR2wxUwc17f8QAj8GH&usp=sharing ). It has some suggestions for hikes.',
   'score': 0.7627206},
  {'body': "Quick tips: Go to http://www.i-needtoknow.com/milford/maps/index.html They have several links for hikers, including maps. One you might check out is the Department of Conservation's online maps at: http://www.doc.govt.nz/parks-and-recreation/tracks-and-walks/ Also, Google say

{
  "query": "{!knn f=binary_text_embedding topK=1000}[0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 

{'docs': [{'body': "I found this with a quick google, but since I think you want more: Find a local chapter of the Appalachian Trail Club. Here near DC, it's the PATC. They typically do walks on various nearby trails (in addition to maintennance). Second, go to your local outdoors store. Around here, there's a book called Circuit Hikes in Shenandoah park. I'm sure any local park near you would have a similar book.",
   'score': 1.7697496},
  {'body': 'I recently found this post on glocals.com where someone shared their travel guide through Google maps ( https://drive.google.com/open?id=17gleDabcsxBwTrxR2wxUwc17f8QAj8GH&usp=sharing ). It has some suggestions for hikes.',
   'score': 1.7627432},
  {'body': "Quick tips: Go to http://www.i-needtoknow.com/milford/maps/index.html They have several links for hikers, including maps. One you might check out is the Department of Conservation's online maps at: http://www.doc.govt.nz/parks-and-recreation/tracks-and-walks/ Also, Google says there's

12.425661087036133 0.04553794860839844
