In [37]:
# 1. Producing a batch request dataset and saving to disk.
import json
from datetime import datetime

def write_embedding_batch_dataset(story_filepath, model, output_dir="data/embeddings"):
    """Write an OpenAI batch request file that embeds every story in a JSONL file.

    Each non-blank line of ``story_filepath`` must be a JSON object with the
    keys ``"generation_id"`` and ``"story"``. For every such line one
    ``POST /v1/embeddings`` request is emitted, using the generation id as the
    request's ``custom_id`` so results can be matched back to stories later.

    The requests are written to
    ``<output_dir>/embedding_batch_<YYYY-MM-DD-HH-MM-SS>.jsonl``, one JSON
    object per line.

    Args:
        story_filepath: Path to the input JSONL file of stories.
        model: Embedding model name placed in every request body.
        output_dir: Directory for the generated batch file; created if it
            does not exist. Defaults to the original hard-coded location.

    Raises:
        FileNotFoundError: If ``story_filepath`` does not exist.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If an entry lacks ``"generation_id"`` or ``"story"``.
    """
    # Local import so the function is self-contained within the notebook.
    import os

    # Parse the whole input BEFORE creating the output file, so a malformed
    # line does not leave an empty, timestamped batch file behind.
    requests = []
    with open(story_filepath, "r", encoding="utf-8") as in_f:
        for line in in_f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            entry = json.loads(line)
            requests.append(json.dumps(
                {
                    "custom_id": entry["generation_id"],
                    "method": "POST",
                    "url": "/v1/embeddings",
                    "body": {"model": model, "input": entry["story"]}
                }
            ))

    if model == "text-embedding-3-large":
        # 0.0414 is presumably the approximate size in MB of one returned
        # embedding record for this model -- TODO confirm.
        # ".0f" (not "0f") rounds the estimate to whole megabytes.
        print(f"Expect a disk size of {len(requests) * 0.0414:.0f} MB for the embeddings.")

    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    out_path = os.path.join(output_dir, f"embedding_batch_{timestamp}.jsonl")
    with open(out_path, "w", encoding="utf-8") as out_f:
        # Terminate every record with a newline, as is conventional for JSONL.
        out_f.writelines(request + "\n" for request in requests)

# Generate the batch request file for the stories in data/claude_200_completions.jsonl,
# requesting embeddings from the text-embedding-3-large model.
write_embedding_batch_dataset("data/claude_200_completions.jsonl", "text-embedding-3-large")


Expect a disk size of 56.842200 MB for the embeddings.


2. Retrieving the batch results through the OpenAI web UI and saving them to disk (to be implemented in code if needed).

In [45]:
# 3. Analyzing the embeddings

import json
import pandas as pd
import numpy as np
import umap
import plotly.express as px
import textwrap

embeddings_file = 'data/embeddings/claude_200_completions.jsonl'
stories_file = 'data/claude_200_completions.jsonl'

# Parse both JSONL files: one JSON object per line.
with open(embeddings_file, 'r') as f:
    embeddings_data = [json.loads(line) for line in f]

with open(stories_file, 'r') as f:
    stories_data = [json.loads(line) for line in f]

# Look-up table from a story's generation_id to its position in stories_data.
story_dict = {}
for position, record in enumerate(stories_data):
    story_dict[record['generation_id']] = position

# Pair each embedding result with the story whose generation_id equals the
# result's custom_id, keeping the fields needed for plotting.
matched_data = []
for result in embeddings_data:
    source_story = stories_data[story_dict[result['custom_id']]]
    vector = result['response']['body']['data'][0]['embedding']
    matched_data.append({
        'embedding': vector,
        # Hard-wrap the text at 50 characters so hover labels stay narrow.
        'story': '\n'.join(textwrap.wrap(source_story['story'], width=50)),
        'theme': source_story['theme'],
        'topic': source_story['topic']
    })

# Stack the vectors into one array and project it down to 2-D with UMAP.
embeddings = np.array([record['embedding'] for record in matched_data])
umap_reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,  # fixed seed for a repeatable layout
)
embedding_2d = umap_reducer.fit_transform(embeddings)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
# Find the two most dissimilar stories (largest pairwise cosine distance).
# The earlier version was marked for deletion as too slow because it called
# cosine_similarity once per pair inside a Python double loop; the similarity
# matrix is now computed in a single vectorized call instead.

from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Convert the list of embeddings into a numpy array
embeddings = np.array([entry['embedding'] for entry in matched_data])

max_distance = -1
story_pair = (None, None)

# At least two stories are needed to form a pair.
if len(embeddings) >= 2:
    # Full (n x n) cosine-distance matrix in one call. Memory grows as n^2,
    # which is fine for a few thousand stories.
    distances = 1 - cosine_similarity(embeddings)

    # Consider every unordered pair exactly once (i < j). triu_indices lists
    # the pairs in row-major order and argmax returns the first maximum, so
    # ties resolve to the same pair the nested i/j loop would have picked.
    rows, cols = np.triu_indices(len(embeddings), k=1)
    best = int(np.argmax(distances[rows, cols]))
    story_pair = (int(rows[best]), int(cols[best]))
    max_distance = float(distances[story_pair])

if story_pair[0] is not None and story_pair[1] is not None:
    story1 = matched_data[story_pair[0]]['story']
    story2 = matched_data[story_pair[1]]['story']
    print("Story 1 with maximal cosine distance:")
    print(story1)
    print("\nStory 2 with maximal cosine distance:")
    print(story2)
else:
    print("No stories to compare.")


In [46]:
# 4. Visualization with Plotly

# Assemble the plotting frame: 2-D UMAP coordinates plus per-story metadata.
df = pd.DataFrame(embedding_2d, columns=['x', 'y']).assign(
    # Plotly hover labels render <br> as a line break, so convert the
    # newline-wrapped story text accordingly.
    story=[record['story'].replace("\n", "<br>").strip() for record in matched_data],
    theme=[record['theme'] for record in matched_data],
    topic=[record['topic'] for record in matched_data],
)

# Show only the story text on hover; the empty <extra> tag hides the
# secondary trace-name box.
hover_template = "<b>Story:</b><br>%{customdata[0]}<br><extra></extra>"

fig = px.scatter(
    df,
    x='x',
    y='y',
    symbol='theme',
    color='topic',
    hover_data={'story': True, 'theme': False, 'x': False, 'y': False},
    custom_data=['story'],
    title="UMAP of Story Embeddings",
)

fig.update_traces(hovertemplate=hover_template)

# Centered title, no axis titles, tight margins, untitled legend.
layout_options = {
    'title': {'x': 0.5},
    'xaxis_title': None,
    'yaxis_title': None,
    'margin': {'l': 0, 'r': 0, 't': 50, 'b': 0},
    'legend_title_text': '',
    'hoverlabel': {'font_size': 11},
}
fig.update_layout(**layout_options)

# Render inline, then export a standalone HTML page.
fig.show()
fig.write_html("data/embeddings/web/index.html")
