### Feel free to connect with me on LinkedIn -> Leonard Püttmann

In [None]:
# installing libraries
!pip install embedders
!pip install qdrant-client
!pip install openai

## Exploring and embedding our data

All the data is taken from https://docs.kern.ai/


In [1]:
import json

# You can find our documentation at docs.kern.ai, or load your own data here!
# JSON is UTF-8 by specification, so do not rely on the platform's default encoding.
with open("documentation.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The article texts live under the "content" key of the loaded object.
documentation = data["content"]

In [None]:
# Number of whitespace-separated words in every article.
len_words = [len(article.split()) for article in documentation]

# How many articles do we have?
print(f"-> The data contains {len(documentation)} articles total. \n")

# Mean article length in words, rounded to two decimals.
avg_length = round(sum(len_words) / len(len_words), 2)
print(f"-> On average, a text is {avg_length} words long. \n")

# Extremes of the word-count distribution.
max_length, min_length = max(len_words), min(len_words)
print(f"-> Longest articles is {max_length} words and the shortest is {min_length} long.")

In [None]:
import random

# Let's take a look at a few random samples of the data.
# random.sample raises ValueError when asked for more items than exist,
# so cap the sample size at the number of available articles.
sample_size = min(3, len(documentation))
for sample_text in random.sample(documentation, sample_size):
    print(sample_text)
    print("\n ----------------------------------- \n")

-
-
-
-

### Embeddings

Word embeddings are a type of representation for words in natural language processing (NLP). They are typically real-valued vectors that encode the meaning of a word in such a way that words that are closer in the vector space are expected to be similar in meaning. Word embeddings can be obtained using language modeling and feature learning techniques, where words or phrases from the vocabulary are mapped to vectors of real numbers.

Sentence embeddings, on the other hand, refer to a numeric representation of a sentence in the form of vectors of real numbers which encodes meaningful semantic information. State-of-the-art embeddings are based on the learned hidden layer representation of dedicated sentence transformer models.

So word and sentence embeddings provide an efficient, dense representation for words and sentences, where similar words or sentences have similar encodings. These representations can be used to improve performance in various NLP tasks such as syntactic parsing and sentiment analysis.

In [None]:
from embedders.classification.contextual import TransformerSentenceEmbedder

# Two small example sentences to embed.
raw_texts = [
    "Capybaras are very cute animals.",
    "I prefer pears to apples.",
]

# Wrap a transformer checkpoint from HuggingFace in a sentence embedder.
embedder = TransformerSentenceEmbedder("distilbert-base-uncased")

# Create the embeddings for the example texts.
embeddings = embedder.fit_transform(raw_texts)

For general purposes I recommend these models:
- https://huggingface.co/distilbert-base-uncased (great for testing and prototyping)
- https://huggingface.co/intfloat/multilingual-e5-small (great for information retrieval)

In [None]:
# Show each embedding followed by its length, one per line.
for vector in embeddings:
    print(vector, len(vector), sep="\n")

### Let's talk about searching - Symmetric and asymmetric similarity search

Symmetric similarity is useful when we have a full document and want to find relevant, similar documents of the same length, type, etc. The BERT models are still amazing open-source models for this type of task: https://huggingface.co/distilbert-base-uncased

In [None]:
# Checkpoint name for symmetric search (comparing documents of similar kind and length).
symetric_embedder_model = "distilbert-base-uncased"
# Sentence embedder built from that checkpoint.
symetric_embedder = TransformerSentenceEmbedder(symetric_embedder_model)

### Asymmetric similarity search

The model used for asymmetric search has been fine-tuned on the MS MARCO dataset: https://huggingface.co/datasets/ms_marco

In a nutshell, the model is really good at finding relevant information and documents when the search query is very short. The MS MARCO dataset is a question-answer dataset built from real Bing queries.

In [5]:
# Checkpoint name for asymmetric search (short query against longer documents).
# Alternative suggested by the author: "cross-encoder/ms-marco-MiniLM-L-6-v2"
asymetric_embedder_model = "intfloat/multilingual-e5-small" # or "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Sentence embedder built from that checkpoint; used for the vector search below.
asymetric_embedder = TransformerSentenceEmbedder(asymetric_embedder_model)

More cool models here:
- https://huggingface.co/spaces/mteb/leaderboard

-
-
-
-

## Set up the Qdrant vector database

In [6]:
from qdrant_client import models, QdrantClient

# ":memory:" keeps the whole vector DB inside this process, so nothing is
# persisted once the kernel stops.
qdrant = QdrantClient(":memory:") # load the vector DB in memory (not recommended for prod)

In [None]:
# Let's see the embedding size reported by the asymmetric embedder's underlying model
# before the setup.
# NOTE: the collection below is created with size=64, matching the PCA reducer's
# n_components=64, not necessarily the value shown here.
asymetric_embedder.model.get_sentence_embedding_dimension()

In [None]:
# Describe the vectors the collection will hold: `size` must equal the length
# of the embeddings we upload, `distance` selects cosine similarity.
webinar_vector_params = models.VectorParams(size=64, distance=models.Distance.COSINE)

# (Re)create the "webinar" collection that stores our data.
qdrant.recreate_collection(
    collection_name="webinar",
    vectors_config=webinar_vector_params,
)

In [9]:
# Reduce the dimensionality of our embeddings with a PCA reducer.
from embedders.classification.reduce import PCASentenceReducer
import numpy as np

# Wrap the asymmetric embedder so that its output is reduced to 64 components.
reducer = PCASentenceReducer(asymetric_embedder, n_components=64)

# Fit on the documentation, embed it, and materialise the result as a NumPy array.
embeddings = np.array(reducer.fit_transform(documentation))

# One integer id per embedding: 0 .. len(embeddings) - 1.
indices = [*range(len(embeddings))]

In [None]:
# Sanity check: length of the first reduced vector. It has to match the
# size=64 the "webinar" collection was created with.
print(len(embeddings[0]))

In [12]:
# Upload every article to the "webinar" collection: one record per document,
# carrying its id, its reduced embedding and the original text as payload.
qdrant.upload_records(
    collection_name="webinar",
    records=[
        models.Record(
            id=idx,
            # .tolist() yields native Python floats; list(vec) would hand over
            # NumPy scalar objects instead.
            vector=vec.tolist(),
            payload={"content": doc},
        )
        for idx, vec, doc in zip(indices, embeddings, documentation)
    ],
)

In [None]:
# Embed the search question with the same fitted reducer that produced the
# stored vectors; transform() takes a list of texts, hence the one-element list.
query_vector = reducer.transform(["What is attribute calculation in Kern AI refinery?"])

In [14]:
# Search the "webinar" collection with the embedded question.
# query_vector holds one entry per input text, so [0] is the question's vector.
hits = qdrant.search(
    collection_name="webinar",
    query_vector=query_vector[0],
    limit=5  # Return 5 closest points
)

In [None]:
# Show every search hit followed by the article text stored in its payload.
for match in hits:
    print(match)
    print(match.payload["content"])
    print("\n ------------------- \n")

-
-
-
-

## Feed the results into GPT-3.5 / ChatGPT!

In [17]:
import os

import openai

# Authenticate with OpenAI: the key is read from the OPENAI_API_KEY environment
# variable (None if it is not set) rather than being hard-coded in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

### GPT's response without any context

In [18]:
def standard_response(prompt):
    """Send a prompt to gpt-3.5-turbo without any extra context.

    Args:
        prompt: The text sent as the single user message.

    Returns:
        The message content of the first choice in the API response.
    """
    # Request settings, collected in one place before the call.
    request = dict(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2048,
        n=1,
        temperature=0.0,
        top_p=1,
        frequency_penalty=0.0,
        presence_penalty=0.6,
    )
    completion = openai.ChatCompletion.create(**request)

    # Only one completion is requested (n=1), so take the first choice.
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

### "Vanilla GPT" response

In [None]:
# Ask the plain model, without any retrieved context.
# The question itself names the tool, so the model even gets a hint.
query = "What is attribute calculation in Kern AI refinery?" # Note that we are even giving the model a hint about the tool we are using!
print(standard_response(query))

-
-
-
-

### Enriched GPT model

In [20]:
def get_context(user_prompt, limit=5):
    """Retrieve knowledge-base text relevant to a user prompt.

    The prompt is embedded with the notebook-level `reducer`, the "webinar"
    collection in `qdrant` is searched with that vector, and the "content"
    payloads of the hits are joined into one string.

    Args:
        user_prompt: The user's question as a plain string.
        limit: Maximum number of hits to request from the search. Defaults
            to 5, the value that was previously hard-coded.

    Returns:
        The "content" payload of every hit, in the order returned by the
        search, separated by single spaces.
    """
    # transform() takes a list of texts; we pass one, so take the first result.
    prompt_vector = reducer.transform([user_prompt])[0]
    matches = qdrant.search(
        collection_name="webinar",
        query_vector=prompt_vector,
        limit=limit,
    )
    return " ".join(match.payload["content"] for match in matches)

def enriched_response(user_prompt):
    """Answer a user prompt with gpt-3.5-turbo, using retrieved context.

    Context for the prompt is fetched with `get_context`, placed next to the
    user's message in a single user turn, and sent together with a system
    message that describes the assistant's role.

    Args:
        user_prompt: The user's question as a plain string.

    Returns:
        The message content of the first choice in the API response.
    """
    # NOTE: line breaks and leading spaces inside both prompts are part of the
    # text sent to the model, so they are spelled out explicitly here.
    system_message = (
        "You are a friendly and helpful assistant for the company Kern AI and their software tools. Their tools are called \n"
        "      refinery, gates, workflow and bricks. Your job is to provide answers to questions about these products. For that will be provided with some \n"
        "      context about these tools, but you may not use all of the context at all times.\n"
        "      "
    )

    # Look up the passages that will accompany the question.
    context = get_context(user_prompt)

    prompt = (
        "\n"
        "      User message:\n"
        f"      {user_prompt}\n"
        "\n"
        "      Context from our knowledge base:\n"
        f"      {context}\n"
        "      "
    )

    # System turn sets the role; the user turn carries question plus context.
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=2048,
        n=1,
        temperature=0.0,
        top_p=1,
        frequency_penalty=0.0,
        presence_penalty=0.6,
    )

    # Only one completion is requested (n=1), so take the first choice.
    return completion["choices"][0]["message"]["content"]

In [None]:
# Same question as in the vanilla run, now answered with retrieved context.
query = "What is attribute calculation in Kern AI refinery?"
er1 = enriched_response(query)
# Start a new line after every period to make the answer easier to read.
print(er1.replace(".", ".\n"))

In [None]:
# Second example: a how-to request, first answered without any retrieved context.
query = "I want to create a labeling function, but I don't know how."
sr2 = standard_response(query)
# Start a new line after every period to make the answer easier to read.
print(sr2.replace(".", ".\n"))

In [None]:
# The same how-to request, this time answered with context from the vector search.
query = "I want to create a labeling function, but I don't know how."
er = enriched_response(query)
# Start a new line after every period to make the answer easier to read.
print(er.replace(".", ".\n"))