In [None]:
# The Oxford 3000: "The 3000 most important words in the English language"

import requests

url = "https://raw.githubusercontent.com/sapbmw/The-Oxford-3000/master/The_Oxford_3000.txt"


def fetch_oxford_3000(timeout=10):
    """Download the Oxford 3000 word list and return it line by line.

    The list is fetched from the module-level ``url``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on a stalled
            connection and hang the notebook cell.

    Returns:
        The response body split into lines, or an empty list (after printing
        the status code) when the server answers with anything other than 200.

    Note:
        Network-level failures, including hitting ``timeout``, are not caught
        here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error {response.status_code}: Unable to fetch data")
        return []
    return response.text.splitlines()

def filter_alpha(strings):
    """Return the purely alphabetic entries of ``strings``, lowercased.

    Entries containing anything other than letters (digits, spaces,
    punctuation) and empty strings are dropped. Order and duplicates among
    the surviving entries are preserved.
    """
    kept = []
    for candidate in strings:
        # str.isalpha() is False for "" and for any non-letter character.
        if not candidate.isalpha():
            continue
        kept.append(candidate.lower())
    return kept

# Build the working vocabulary: download the list, then keep only the
# alphabetic entries in lowercase.
oxford_3000 = filter_alpha(fetch_oxford_3000())
# Sanity check: total count followed by the first ten entries.
print(f"{len(oxford_3000)} {oxford_3000[:10]}")

3389 ['abandon', 'abandoned', 'ability', 'able', 'about', 'above', 'abroad', 'absence', 'absent', 'absolute']


In [None]:
word_embeddings_map = {}

import os
try:
    import openai
except Exception:
    !pip install openai
    import openai

openai.api_key = None # ENTER AN OPENAI API KEY
if not openai.api_key:
    raise Exception("Missing OpenAI Key")

model = "text-embedding-ada-002"
n_items = len(oxford_3000)
batch_size = 1000
n_batches = (n_items + batch_size - 1) // batch_size
for i in range(n_batches):
    start, end = i * batch_size, (i + 1) * batch_size
    input = oxford_3000[start:end]
    response = openai.Embedding.create(input=input, model=model)
    embeddings = [i["embedding"] for i in response["data"]]
    for word, embedding in zip(input, embeddings):
        word_embeddings_map[word] = embedding

print(len(word_embeddings_map))

3387


In [None]:
# Installations and imports
try:
    import numpy as np
    from sklearn.decomposition import PCA
    import plotly.express as px
except Exception:
    !pip install sklearn plotly
    import numpy as np
    from sklearn.decomposition import PCA
    import plotly.express as px

# Data setup
# Both sequences come from the same dict, whose keys and values iterate in the
# same order, so row k of `embeddings` belongs to words[k].
words = list(word_embeddings_map)
embeddings = np.array(list(word_embeddings_map.values()))

# Perform 3D PCA
# Fix the seed: for an input of this size scikit-learn may select its
# randomized SVD solver, and without random_state the projection (and hence
# the plot) can differ from run to run.
pca = PCA(n_components=3, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings)


import plotly.graph_objects as go

def visualize_3d_highlight(words_to_highlight):
    """Show all embedded words in 3D PCA space, emphasising the chosen ones.

    Reads the module-level ``words`` list and ``reduced_embeddings`` array
    (row k holds the three PCA coordinates of ``words[k]``) and displays an
    interactive Plotly figure.

    Args:
        words_to_highlight: Iterable of words to draw as large, fully opaque,
            labelled red markers. Every other word is drawn as a small, faint,
            unlabelled blue marker. Words that are not in ``words`` are ignored.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # A set makes each membership test O(1) instead of scanning a list per word.
    highlighted = set(words_to_highlight)

    # Scatter3d accepts only a scalar marker.opacity, so per-point opacity is
    # carried in the alpha channel of each marker's rgba colour instead.
    highlight_color = "rgba(255, 0, 0, 1.0)"   # red, opaque
    background_color = "rgba(0, 0, 255, 0.1)"  # blue, faint

    colors, sizes, texts = [], [], []
    for word in words:
        if word in highlighted:
            colors.append(highlight_color)
            sizes.append(10)
            texts.append(word)
        else:
            colors.append(background_color)
            sizes.append(5)
            texts.append(None)  # no label for background points

    # Create the scatter plot
    scatter = go.Scatter3d(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        z=reduced_embeddings[:, 2],
        mode='markers+text',
        marker=dict(color=colors, size=sizes),
        text=texts
    )
    layout = go.Layout(height=800)  # Set height as per your preference
    fig = go.Figure(data=[scatter], layout=layout)
    fig.show()

# Example run: highlight three pairs of opposites.
# Swap in whichever words you want to inspect.
words_to_highlight = [
    "life", "death",
    "awake", "asleep",
    "day", "night",
]
visualize_3d_highlight(words_to_highlight)