In [50]:
## Install dependencies
!pip install pinecone-client==2.2.4
!pip install sentence-transformers==2.2.2
!pip install torch==2.0.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
## Import dependencies
import os
import pinecone
import torch
import torch.nn.functional as F
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [52]:
## Initialize configuration
# Credentials are taken from the process environment when present so a real API key
# never has to be saved inside the notebook. The literals are only fallbacks:
# either export PINECONE_API_KEY / PINECONE_ENVIRONMENT before starting the kernel,
# or replace the fallback values with your own.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "insert yours here")
PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")
# Name of the Pinecone index that holds the knowledge-base embeddings.
PINECONE_INDEX = "cml-default"
# Hugging Face repo id of the model used to embed both documents and questions.
EMBEDDING_MODEL_REPO = "sentence-transformers/all-mpnet-base-v2"

In [53]:
# Configure the Pinecone client with the credentials from the configuration cell,
# then fetch the names of the indexes that already exist for this account.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV,
)
indexes = pinecone.list_indexes()

In [54]:
# Create the index on first use; an existing index is reused as-is.
if PINECONE_INDEX not in indexes:
    # metric="cosine": get_embeddings() L2-normalises its vectors and the query cell
    # reports the match score as a relevancy score (higher = more similar), which is
    # the cosine convention. With "euclidean" the score would be a distance instead.
    # dimension=768 must equal the length of the vectors upserted into the index.
    pinecone.create_index(PINECONE_INDEX, dimension=768, metric="cosine")

# Show the index configuration (metric, dimension, readiness) for a quick sanity check.
index_description = pinecone.describe_index(PINECONE_INDEX)
print(index_description)

# Handle used by the later cells for upserts and queries.
collection = pinecone.Index(PINECONE_INDEX)
print("Successfully loaded " + PINECONE_INDEX)

IndexDescription(name='cml-default', metric='cosine', replicas=1, dimension=768.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
Successfully loaded cml-default


In [55]:
# Load the encoder and its matching tokenizer from the repo named by EMBEDDING_MODEL_REPO.
model = AutoModel.from_pretrained(
    EMBEDDING_MODEL_REPO
)
tokenizer = AutoTokenizer.from_pretrained(
    EMBEDDING_MODEL_REPO
)

# Mean pooling: average the token embeddings, counting only unmasked positions
def mean_pooling(model_output, attention_mask):
    """Collapse per-token embeddings into one vector per sequence.

    Args:
        model_output: Indexable model result whose element 0 holds the token embeddings.
        attention_mask: Mask with one entry per token; positions set to 0 are
            excluded from the average.

    Returns:
        Tensor with the masked mean over the token axis (dimension 1).
    """
    per_token = model_output[0]
    # Broadcast the mask across the embedding axis so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_sum = (per_token * weights).sum(dim=1)
    # Clamp the divisor so a fully masked row yields zeros instead of dividing by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

# Create embeddings using chosen embedding-model
def get_embeddings(sentence):
    """Embed one piece of text as an L2-normalised vector.

    Args:
        sentence: The text to embed (a single string).

    Returns:
        list: The mean-pooled, L2-normalised embedding of ``sentence`` as a flat
        list of floats.
    """
    # truncation=True cuts the input at the tokenizer's maximum length, so only the
    # leading part of a long document influences its embedding (semantic search sees
    # just that part, while context loading still returns the ENTIRE file).
    # NOTE(review): an earlier comment put that limit at 256 tokens; the effective
    # limit is whatever the tokenizer for EMBEDDING_MODEL_REPO reports - confirm.
    #
    # padding=True pads only up to the longest sequence in the batch. The batch holds
    # a single sentence, so nothing is padded and the model no longer has to process
    # a full maximum-length sequence of pad tokens on every call. Padded positions
    # were masked out of the pooling anyway, so the embedding is unchanged apart from
    # floating-point noise.
    encoded_input = tokenizer([sentence], padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Average the token embeddings, ignoring masked positions.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Scale to unit length.
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # The batch has exactly one row; return it as a plain Python list.
    return sentence_embeddings[0].tolist()

In [56]:
## Use documents in /data directory and insert embeddings into Vector DB for each doc
def insert_embedding(pinecone_index, id_path, text):
    """Embed ``text`` and upsert it into ``pinecone_index`` as a single vector.

    Args:
        pinecone_index: Index object whose ``upsert`` method receives the vector.
        id_path: Path of the source document; stored in the vector metadata
            under the key ``file_path``.
        text: Full document text. It is embedded with ``get_embeddings`` and its
            first 512 characters are used as the vector ID.
    """
    print("Upserting vectors...")
    # NOTE(review): the ID is a prefix of the text rather than id_path, so two
    # documents sharing their first 512 characters would map to the same ID -
    # confirm this is intended.
    vector_id = text[:512]
    record = (vector_id, get_embeddings(text), {"file_path": id_path})
    pinecone_index.upsert(vectors=[record])
    print("Success")

# Create an embedding for every .txt document under ./data (searched recursively)
# and insert it into the Pinecone Vector DB
doc_dir = './data'
for file in Path(doc_dir).glob('**/*.txt'):
    print("Generating embeddings for: %s" % file.name)
    # Read the whole file up front with an explicit encoding: decoding no longer
    # depends on the platform default, and the file handle is closed before the
    # (slow) embedding and network upsert start.
    text = file.read_text(encoding="utf-8")
    # A blank file has nothing to search for, and an empty one would produce an
    # empty vector ID (insert_embedding uses text[:512] as the ID).
    if not text.strip():
        print("Skipping empty file: %s" % file.name)
        continue
    # The absolute path is stored as metadata so the document can be re-read at query time.
    insert_embedding(collection, os.path.abspath(file), text)
print('Finished loading Knowledge Base embeddings into Pinecone')


Generating embeddings for: ml-product-overview-2.txt
Upserting vectors...
Success
Generating embeddings for: ml-product-overview-4.txt
Upserting vectors...
Success
Generating embeddings for: ml-product-overview.txt
Upserting vectors...
Success
Generating embeddings for: ml-product-overview-3.txt
Upserting vectors...
Success
Generating embeddings for: ml-product-overview-5.txt
Upserting vectors...
Success
Generating embeddings for: ml-architecture-overview-spark-on-kubernetes.txt
Upserting vectors...
Success
Generating embeddings for: ml-architecture-overview1.txt
Upserting vectors...
Success
Generating embeddings for: ml-architecture-overview-cml-1.txt
Upserting vectors...
Success
Generating embeddings for: ml-architecture-overview-cml-2.txt
Upserting vectors...
Success
Generating embeddings for: ml-architecture-overview-runtimes.txt
Upserting vectors...
Success
Generating embeddings for: ml-architecture-overview-provisioning.txt
Upserting vectors...
Success
Generating embeddings for: 

In [57]:
## Setup function to convert user question into an embedding returning most relevant response (semantic search)
def get_response_from_pinecone_vectordb(index, question):
    """Run a semantic search for ``question`` and return the best-matching document.

    Args:
        index: Pinecone index object to query.
        question: The user's question as plain text.

    Returns:
        tuple: ``(response, sources, score)`` where ``response`` is the full text of
        the top-ranked document, ``sources`` is the ``file_path`` stored in its
        metadata, and ``score`` is the score Pinecone reported for that match
        (its meaning depends on the metric the index was created with).

    Raises:
        IndexError: If the query returns no matches (for example, an empty index).
    """
    # Embed the question with the same function used when the documents were
    # inserted, so both sides of the search share one embedding pipeline. This also
    # avoids constructing a new SentenceTransformer (a full model load) on every call.
    query_vector = get_embeddings(question)
    results = index.query(vector=query_vector, top_k=5, include_metadata=True)

    matches = results['matches']
    if not matches:
        raise IndexError(
            "Pinecone returned no matches for the question; has the index been populated?"
        )

    # Matches arrive ranked; keep the path and score of each one.
    matching_files = [match['metadata']['file_path'] for match in matches]
    scores = [match['score'] for match in matches]

    # Only the top-ranked document is returned. matching_files holds the other
    # results, so this can be widened if more context is wanted.
    response = load_context_chunk_from_data(matching_files[0])
    sources = matching_files[0]
    score = scores[0]
    return response, sources, score
  
# Return the Knowledge Base doc stored at the given file path
def load_context_chunk_from_data(id_path):
    """Read and return the full text of a knowledge-base document.

    Args:
        id_path: Path of the document to read. Callers pass the ``file_path`` value
            kept in the vector metadata, which the ingestion cell stores as an
            absolute path.

    Returns:
        str: The entire content of the file.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    with open(id_path, "r", encoding="utf-8") as f:
        return f.read()

In [58]:
## Write a query and trigger a semantic search (show document, location, and score)
## Note that if you're running this script for the first time, you may have to run this block twice
# Question to search for (replace this with your own from content in your knowledge base)
USER_QUESTION = "What is Iceberg"
response, source, score = get_response_from_pinecone_vectordb(collection, USER_QUESTION)

# Show where the best match came from, how it scored, and then the document text.
print(f"Source Path: {source}")
print(f"Pinecone relevancy score: {score}")
print(response)

Source Path: /home/cdsw/data/iceberg/iceberg-snippet.txt
Pinecone relevancy score: 0.393630356
Apache Iceberg is an open table format for huge analytic datasets. Iceberg adds tables to compute engines including Spark, Trino, PrestoDB, Flink, Hive and Impala using a high-performance table format that works just like a SQL table.

User experience
Iceberg avoids unpleasant surprises. Schema evolution works and won't inadvertently un-delete data. Users don't need to know about partitioning to get fast queries.

Schema evolution supports add, drop, update, or rename, and has no side-effects
Hidden partitioning prevents user mistakes that cause silently incorrect results or extremely slow queries
Partition layout evolution can update the layout of a table as data volume or query patterns change
Time travel enables reproducible queries that use exactly the same table snapshot, or lets users easily examine changes
Version rollback allows users to quickly correct problems by resetting tables to