# Prep OpenAI embeddings

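This notebook uses OpenAI embedding models (accessed through an Azure OpenAI endpoint) to generate embeddings for a set of movie titles and a set of common English words, and saves the results as JSON files.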

In [1]:
import os

from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from openai import OpenAI

# Pull settings from a local .env file (if present) so the os.environ lookups
# below can find them.
load_dotenv()

# Keyless authentication: the token provider is handed to the client in place
# of a static API key, scoped to the Cognitive Services resource.
azure_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
openai_client = OpenAI(
    # Strip any trailing slash first: an endpoint written as
    # "https://<resource>.openai.azure.com/" would otherwise produce a
    # malformed "...azure.com//openai/v1/" base URL.
    base_url=os.environ["AZURE_OPENAI_ENDPOINT"].rstrip("/") + "/openai/v1/",
    api_key=token_provider,
)
# Embedding deployment to call, and the vector size requested from it.
# Both are required; a missing variable raises KeyError here rather than later.
MODEL_NAME = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
MODEL_DIMENSIONS = int(os.environ["AZURE_OPENAI_EMBEDDING_DIMENSIONS"])

def get_embeddings(words, batch_size=2048):
    """Calculate embeddings for ``words`` with the configured OpenAI deployment.

    Words are sent in batches of at most ``batch_size`` inputs per request
    (2048 is the documented maximum array size for the embeddings endpoint),
    so any input that fits in one batch is still embedded in a single call.

    Args:
        words: A string, or any iterable of strings, to embed. A single string
            is treated as one word. Duplicate words collapse into one key.
        batch_size: Maximum number of inputs per request; must be at least 1.

    Returns:
        A dict mapping each word to the ``embedding`` value returned for it,
        in input order. Empty input returns ``{}`` without calling the API.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        RuntimeError: If a response holds a different number of embeddings
            than the number of inputs sent.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string would otherwise be paired character-by-character with the
    # single embedding returned for the whole string.
    if isinstance(words, str):
        words = [words]
    # Materialize once: a generator would be consumed by the request and leave
    # nothing to pair with the response.
    words = list(words)

    word_vectors = {}
    for start in range(0, len(words), batch_size):
        batch = words[start:start + batch_size]
        embeddings_response = openai_client.embeddings.create(
            model=MODEL_NAME,
            input=batch,
            dimensions=MODEL_DIMENSIONS,
        )
        # Pair by each item's reported position rather than trusting the
        # order of the response list.
        ordered = sorted(embeddings_response.data, key=lambda item: item.index)
        if len(ordered) != len(batch):
            raise RuntimeError(
                f"Expected {len(batch)} embeddings but received {len(ordered)}"
            )
        for word, embedding_object in zip(batch, ordered):
            word_vectors[word] = embedding_object.embedding
    return word_vectors

In [None]:
# Load the existing movie embeddings file (a dict mapping movie title -> embedding)
# and compute a new embedding for each title with the currently configured model
import json

from tqdm.notebook import tqdm

# Load the previously generated movie embeddings. Only the keys (movie titles)
# are reused below; the stored vectors themselves are ignored.
with open("embeddings/movies_text-embedding-ada-002.json", encoding="utf-8") as f:
    existing_movie_vectors = json.load(f)

# Embed each title with the current model, one request per title so the
# progress bar advances per movie. The input file is already closed here, so
# no file handle is held open while the requests run.
new_movie_vectors = {}
for movie in tqdm(existing_movie_vectors, desc="Computing new embeddings"):
    new_movie_vectors[movie] = get_embeddings([movie])[movie]

# Write the new embeddings to a file named after the model and dimensions.
# NOTE(review): this lands in the working directory, whereas the words file is
# written under embeddings/ - confirm that is intended.
filename = f"openai_movies_{MODEL_NAME}-{MODEL_DIMENSIONS}.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(new_movie_vectors, f, indent=4)

Computing new embeddings:   0%|          | 0/573 [00:00<?, ?it/s]

In [3]:
import csv
import json

# Read the first column of most-common-nouns-english.csv as the word list.
# newline="" leaves line-ending handling to the csv module, as its docs advise.
with open("embeddings/most-common-nouns-english.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    # Skip the first row (presumably the header); the default avoids a
    # StopIteration if the file is empty.
    next(reader, None)
    # Ignore blank lines (row[0] would raise IndexError) and empty first
    # cells (an empty string is not a valid embedding input).
    words = [row[0] for row in reader if row and row[0].strip()]

# Calculate embeddings for all words
word_vectors = get_embeddings(words)

# Save embeddings to a file named after the model and dimensions
filename = f"embeddings/words_{MODEL_NAME}-{MODEL_DIMENSIONS}.json"

with open(filename, "w", encoding="utf-8") as f:
    json.dump(word_vectors, f, indent=4)
