# Introduction to Embeddings with the OpenAI API

# 1. What are Embeddings?

## The wonderful world of embeddings!

### Creating embeddings

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Describe the embedding request: which model to use and the text to embed
embedding_request = {
  "model": "text-embedding-ada-002",
  "input": "This can contain any text.",
}
response = client.embeddings.create(**embedding_request)

# Dump the response object into a plain dictionary and display it
response_dict = response.model_dump()
print(response_dict)

### Digging into the embeddings response

In [None]:
# Look up the usage section of response_dict and print its total_tokens entry
usage = response_dict['usage']
print(usage['total_tokens'])

""" 
11 
{'data': [{'embedding': [-0.010771304368972778, -0.01712021790444851, 0.02336982451379299, -0.03116859309375286, 
-0.014141061343252659, 0.03503487631678581, -0.011731254868209362, -0.007620019372552633, -0.003634570399299264, 
-0.04260855168104172, 0.010532972402870655, 0.012532317079603672, 0.02554129809141159, -0.0031694911886006594, 
-0.004415771458297968, 0.0038000792264938354, 0.015359205193817616, -0.002889781491830945, 0.0037802180740982294, 
-0.01655086688697338, -0.028811750933527946, 0.023303620517253876, 0.021913347765803337, -0.011029497720301151, 
-0.008176128380000591, 0.000987259205430746, 0.007639880292117596 ..... 
""" 

In [None]:
# Print the embedding stored in the first entry of response_dict's data list
first_result = response_dict['data'][0]
print(first_result['embedding'])

"""
    [-0.010771304368972778, -0.01712021790444851, 0.02336982451379299, -0.03116859309375286, -0.014141061343252659, 
    0.03503487631678581, -0.011731254868209362, -0.007620019372552633, -0.003634570399299264
""" 

## Investigating the vector space

### Embedding product descriptions

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Collect the short description of every product
product_descriptions = [item['short_description'] for item in products]

# Embed all of the descriptions in a single request
response = client.embeddings.create(
  input=product_descriptions,
  model="text-embedding-ada-002",
)
response_dict = response.model_dump()

# Attach each returned embedding to the product at the same position
embedding_records = response_dict['data']
for i, product in enumerate(products):
    product['embedding'] = embedding_records[i]['embedding']

# Show the first product, which now carries an 'embedding' entry
first_product = products[0]
print(first_product.items())

### Visualizing the embedded descriptions

In [None]:
# Create categories and embeddings lists using list comprehensions
categories = [product['category'] for product in products]
embeddings = [product['embedding'] for product in products]

# Reduce the number of embeddings dimensions to two using t-SNE.
# t-SNE is stochastic, so random_state is fixed to make the 2D layout
# reproducible from one run of the notebook to the next.
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
embeddings_2d = tsne.fit_transform(np.array(embeddings))

# Create a scatter plot from embeddings_2d
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])

# Label every point with the category of the product it represents
for i, category in enumerate(categories):
    plt.annotate(category, (embeddings_2d[i, 0], embeddings_2d[i, 1]))

plt.show()

## Text similarity

### More repeatable embeddings

In [None]:
# Set your API key; create_embeddings() below sends its requests through this client
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Define a create_embeddings function
def create_embeddings(texts):
  """Embed texts with the text-embedding-ada-002 model.

  texts is forwarded unchanged as the request's input; this notebook calls
  the function with both a single value and a list of texts.

  Returns a list holding the 'embedding' value of every entry in the
  response's 'data' list, in the order the response provides them.
  """
  api_response = client.embeddings.create(input=texts, model="text-embedding-ada-002")
  records = api_response.model_dump()['data']

  vectors = []
  for record in records:
    vectors.append(record['embedding'])
  return vectors

# Embed short_description and print the first embedding returned
short_description_embeddings = create_embeddings(short_description)
print(short_description_embeddings[0])

# Embed list_of_descriptions and print every embedding returned
list_of_embeddings = create_embeddings(list_of_descriptions)
print(list_of_embeddings)

### Finding the most similar product

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Embed the search text and keep the first embedding returned
search_text = "soap"
search_embedding = create_embeddings(search_text)[0]

# Cosine distance between the search embedding and each product's embedding,
# in the same order as products
distances = [
  distance.cosine(search_embedding, item["embedding"])
  for item in products
]

# The smallest distance marks the most similar product; print its short_description
min_dist_ind = np.argmin(distances)
closest_product = products[min_dist_ind]
print(closest_product['short_description'])

# 2. Embeddings for AI Applications

## Semantic search and enriched embeddings

### Enriching embeddings

In [None]:
# Set your API key; create_embeddings(), called at the end of this cell, uses this client
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Define a function to combine the relevant features into a single string
def create_product_text(product):
  """Combine a product's title, description, category and features into one string.

  product is indexed by the keys 'title', 'short_description', 'category'
  and 'features'; the features are joined with '; '. The result has one
  labelled line per field and no trailing newline.
  """
  lines = [
    f"Title: {product['title']}",
    f"Description: {product['short_description']}",
    f"Category: {product['category']}",
    f"Features: {'; '.join(product['features'])}",
  ]
  return "\n".join(lines)

# Build the combined text for every product, in the same order as products
product_texts = list(map(create_product_text, products))

# Embed the combined texts
product_embeddings = create_embeddings(product_texts)

### Sorting by similarity

In [None]:
def find_n_closest(query_vector, embeddings, n=3):
  """Return the n entries of embeddings closest to query_vector.

  Closeness is measured with cosine distance, where a smaller value means
  closer. Each result is a dict with the keys "distance" and "index";
  "index" is the position of the embedding within embeddings. Results are
  ordered from the smallest distance to the largest.
  """
  ranked = [
    {"distance": distance.cosine(query_vector, candidate), "index": position}
    for position, candidate in enumerate(embeddings)
  ]
  # list.sort is stable, so entries with equal distances keep their original order
  ranked.sort(key=lambda entry: entry["distance"])
  return ranked[:n]

### Semantic search for products

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Embed the query text and keep the first embedding returned
query_text = "computer"
query_embeddings = create_embeddings(query_text)
query_vector = query_embeddings[0]

# Retrieve the five entries of product_embeddings nearest to the query
hits = find_n_closest(query_vector, product_embeddings, n=5)

print(f'Search results for "{query_text}"')
for hit in hits:
  # Map the hit back to its product through the stored index
  product = products[hit['index']]
  print(product["title"])

## Recommendation systems

### Product recommendation system

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Build the combined text for last_product and for every product in products
last_product_text = create_product_text(last_product)
product_texts = list(map(create_product_text, products))

# Embed last_product_text (keeping the first embedding returned) and product_texts
last_product_embeddings = create_embeddings(last_product_text)[0]
product_embeddings = create_embeddings(product_texts)

# find_n_closest defaults to n=3, so this yields the three smallest cosine distances
hits = find_n_closest(last_product_embeddings, product_embeddings)

# Print the title of the product behind each hit
for hit in hits:
  product = products[hit['index']]
  print(product['title'])

### Adding user history to the recommendation engine

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Build and embed the text for each product in user_history, then average
# the embeddings element-wise (axis=0) into a single vector
history_texts = list(map(create_product_text, user_history))
history_embeddings = create_embeddings(history_texts)
mean_history_embeddings = np.mean(history_embeddings, axis=0)

# Keep only the products that do not appear in user_history
products_filtered = list(
  filter(lambda candidate: candidate not in user_history, products)
)

# Build and embed the text for each remaining product
product_texts = list(map(create_product_text, products_filtered))
product_embeddings = create_embeddings(product_texts)

# Closest remaining products to the averaged history embedding (default n=3)
hits = find_n_closest(mean_history_embeddings, product_embeddings)

# Hit indexes refer to products_filtered, which is what was embedded
for hit in hits:
  product = products_filtered[hit['index']]
  print(product['title'])

## Embeddings for classification tasks

### Embedding restaurant reviews

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Use the 'label' of each sentiment as its class description
class_descriptions = [entry['label'] for entry in sentiments]

# Embed the class descriptions first, then the reviews
class_embeddings = create_embeddings(class_descriptions)
review_embeddings = create_embeddings(reviews)

### Classifying review sentiment

In [None]:
# Define a function to return the minimum distance and its index
def find_closest(query_vector, embeddings):
  """Return the entry of embeddings with the smallest cosine distance to query_vector.

  The result is a dict with the keys "distance" and "index", where "index"
  is the position of the winning embedding within embeddings. When several
  embeddings share the smallest distance, the earliest one is returned.
  Raises ValueError (from min) when embeddings is empty.
  """
  candidates = (
    {"distance": distance.cosine(query_vector, candidate), "index": position}
    for position, candidate in enumerate(embeddings)
  )
  return min(candidates, key=lambda entry: entry["distance"])

for index, review in enumerate(reviews):
  # review_embeddings is indexed in step with reviews, so reuse the loop index
  review_vector = review_embeddings[index]
  closest = find_closest(review_vector, class_embeddings)
  # closest['index'] is a position in class_embeddings; use it to pick the sentiment
  matched_sentiment = sentiments[closest['index']]
  label = matched_sentiment['label']
  print(f'"{review}" was classified as {label}')

"""
    "The food was delicious!" was classified as Positive
    "The service was a bit slow but the food was good" was classified as Neutral
    "Never going back!" was classified as Positive
"""

### Embedding more detailed descriptions

In [None]:
# Instantiate the OpenAI client with your API key
client = OpenAI(api_key="<OPENAI_API_TOKEN>")

# Use the 'description' of each sentiment as its class description
class_descriptions = [entry['description'] for entry in sentiments]

# Embed the class descriptions first, then the reviews
class_embeddings = create_embeddings(class_descriptions)
review_embeddings = create_embeddings(reviews)

def find_closest(query_vector, embeddings):
  """Return the entry of embeddings with the smallest cosine distance to query_vector.

  The result is a dict with the keys "distance" and "index", where "index"
  is the position of the winning embedding within embeddings. Ties go to
  the earliest embedding; an empty embeddings raises ValueError (from min).
  """
  scored = [
    {"distance": distance.cosine(query_vector, candidate), "index": position}
    for position, candidate in enumerate(embeddings)
  ]
  return min(scored, key=lambda entry: entry["distance"])

for index, review in enumerate(reviews):
  # Match the review's embedding against the description-based class embeddings
  review_vector = review_embeddings[index]
  closest = find_closest(review_vector, class_embeddings)
  # Report the label of the sentiment at the winning position
  matched_sentiment = sentiments[closest['index']]
  label = matched_sentiment['label']
  print(f'"{review}" was classified as {label}')

# 3. Vector Databases

## Vector databases for embedding systems

## Creating vector databases with ChromaDB

### Getting started with ChromaDB

In [None]:
# Create a persistent client
client = chromadb.PersistentClient()

# Create the netflix_titles collection using the OpenAI embedding function.
# get_or_create_collection is used instead of create_collection so that
# re-running this cell when the collection has already been persisted reuses
# it rather than failing because the name is taken.
collection = client.get_or_create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(api_key="<OPENAI_API_TOKEN>")
)

# List the collections
print(client.list_collections())

# [Collection(name=netflix_titles)]

### Estimating embedding costs with tiktoken

In [None]:
# Load the tokenizer that matches the text-embedding-ada-002 model
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

# Count the tokens in each document, then add the counts together
token_counts = [len(enc.encode(text)) for text in documents]
total_tokens = sum(token_counts)

# Price used for the estimate, expressed per 1,000 tokens
cost_per_1k_tokens = 0.0001

# Report the token total and the estimated cost
print('Total tokens:', total_tokens)
print('Cost:', cost_per_1k_tokens * total_tokens/1000)

"""
    Total tokens: 51226
    Cost: 0.005122600000000001
"""

### Adding data to the collection

In [None]:
# Get the netflix_titles collection, creating it if it does not exist yet.
# The same name was already created earlier through this persistent client,
# so a plain create_collection call would fail on a top-to-bottom run.
collection = client.get_or_create_collection(
  name="netflix_titles",
  embedding_function=OpenAIEmbeddingFunction(api_key="<OPENAI_API_TOKEN>")
)

# Add the documents and IDs to the collection
# NOTE(review): if this cell is re-run, these IDs are already stored; how add()
# treats existing IDs may depend on the ChromaDB version — confirm.
collection.add(ids=ids, documents=documents)

# Print the collection size and a preview of its contents
# (peek() is called with its default limit)
print(f"No. of documents: {collection.count()}")
print(f"First ten documents: {collection.peek()}")

## Querying and updating the database

### Querying the Netflix collection

In [None]:
# Look up the existing netflix_titles collection, supplying the OpenAI
# embedding function for it to use
openai_ef = OpenAIEmbeddingFunction(api_key="<OPENAI_API_TOKEN>")
collection = client.get_collection(name="netflix_titles", embedding_function=openai_ef)

# Ask for the three results closest to "films about dogs"
result = collection.query(query_texts=["films about dogs"], n_results=3)

print(result)

### Updating and deleting items from a collection

In [None]:
# Look up the existing netflix_titles collection, supplying the OpenAI
# embedding function for it to use
openai_ef = OpenAIEmbeddingFunction(api_key="<OPENAI_API_TOKEN>")
collection = client.get_collection(name="netflix_titles", embedding_function=openai_ef)

# Split new_data into parallel lists of IDs and documents, then upsert them
new_ids = [doc['id'] for doc in new_data]
new_documents = [doc['document'] for doc in new_data]
collection.upsert(ids=new_ids, documents=new_documents)

# Remove the item with ID "s95"
collection.delete(ids=["s95"])

# Run the "films about dogs" query again and show the result
result = collection.query(query_texts=["films about dogs"], n_results=3)
print(result)

## Multiple queries and filtering

### Querying with multiple texts

In [None]:
# Look up the existing netflix_titles collection, supplying the OpenAI
# embedding function for it to use
openai_ef = OpenAIEmbeddingFunction(api_key="<OPENAI_API_TOKEN>")
collection = client.get_collection(name="netflix_titles", embedding_function=openai_ef)

reference_ids = ['s999', 's1000']

# Fetch the stored items for reference_ids and keep their documents
reference_items = collection.get(ids=reference_ids)
reference_texts = reference_items['documents']

# Use the fetched documents as query texts, asking for three results
result = collection.query(query_texts=reference_texts, n_results=3)

print(result['documents'])

### Filtering using metadata

In [None]:
# Look up the existing netflix_titles collection, supplying the OpenAI
# embedding function for it to use
collection = client.get_collection(
  embedding_function=OpenAIEmbeddingFunction(api_key="<OPENAI_API_TOKEN>"),
  name="netflix_titles"
)

reference_texts = ["children's story about a car", "lions"]

# Both conditions must hold: rating equal to "G" and release_year below 2019
metadata_filter = {
  "$and": [
    {"rating": {"$eq": "G"}},
    {"release_year": {"$lt": 2019}},
  ]
}

# Query with reference_texts, asking for two results and applying the filter
result = collection.query(
  query_texts=reference_texts,
  n_results=2,
  where=metadata_filter
)

print(result['documents'])