Set everything up

In [None]:
import openai
import json
import csv
import os
import matplotlib.pyplot as plt
import random
import numpy as np
from collections import Counter
from collections import OrderedDict
import re

def open_file(filepath):
    """Return the full contents of the text file at ``filepath``, decoded as UTF-8."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents


# Strip surrounding whitespace so that a trailing newline in the key file
# does not become part of the API key.
openai.api_key = open_file('openaiapikey.txt').strip()

def gpt3_embedding(content, engine='text-similarity-ada-001'):
    """Request an embedding for ``content`` from the OpenAI Embedding endpoint.

    Returns the ``embedding`` field of the first entry in the response's
    ``data`` list.
    """
    reply = openai.Embedding.create(input=content, engine=engine)
    first_entry = reply['data'][0]
    return first_entry['embedding']

If movie synopsis embeddings data doesn't exist yet, load movie synopsis data and compute embeddings for the movie synopses

Note: Try the synopsis by itself, and the synopsis with a prefix of "Movie synopsis: " (to provide more context to the LLM).
My theory is that providing the prefix will produce a more tightly clustered set of resulting embeddings, as the model will have more understanding of the context of the paragraphs.

In [None]:
json_file = 'index.json'

# Only build the index when the file is missing: one embedding request is
# made per synopsis.
if not os.path.exists(json_file):
    with open('imdb_top_1000.csv', 'r', encoding='utf-8', errors='replace') as csvfile:
        csvreader = csv.reader(csvfile)

        # Skip the header row
        next(csvreader, None)

        # The synopsis is the 8th field (index 7) of each row.
        # To experiment with a context prefix, use: "Movie synopsis: " + row[7]
        synopses = [row[7] for row in csvreader]

    result = []
    for synopsis in synopses:
        # Non-ASCII characters are dropped from the text sent for embedding;
        # the stored 'content' keeps the original synopsis.
        ascii_synopsis = synopsis.encode(encoding='ASCII', errors='ignore').decode()
        result.append({'content': synopsis, 'vector': gpt3_embedding(ascii_synopsis)})

    # Write to the same path that the existence check above tested
    with open(json_file, 'w') as outfile:
        json.dump(result, outfile, indent=2)


Iterate through the embeddings to create lower and upper bound vectors (for each vector parameter / dimension), and the size of the range of values at each parameter position / dimension.

This establishes a bounded subset of the latent space, within which all of the synopses exist.


In [None]:
with open('index.json', 'r') as infile:
    data = json.load(infile)

dimensionality = len(data[0]['vector'])

# Stack the embeddings into one array (one row per synopsis) so that the
# per-dimension bounds come from the data itself, with no assumed starting
# values for the minimum and maximum.
vectors = np.array([entry['vector'] for entry in data])

# Convert back to plain lists of floats for indexing and plotting
min_vector = vectors.min(axis=0).tolist()
max_vector = vectors.max(axis=0).tolist()

# Width of the observed interval in each dimension
range_vector = [high - low for low, high in zip(min_vector, max_vector)]

print("Min vector")
print(min_vector)
print("Max vector")
print(max_vector)
print("Range vector")
print(range_vector)

This is debug.

This step looks into the sizes of the ranges for each vector dimension. Plot range for each vector dimension. Perhaps sorted from highest range to lowest. It will be interesting to see what this looks like; could be one of...
a) range on all dimensions is high (bad)
b) range on some dimensions is high, but low for the (majority of?) others. This is my guess.
c) range on all dimensions is low.

In [None]:
# Order the per-dimension ranges from widest to narrowest
range_sorted_vector = list(range_vector)
range_sorted_vector.sort(reverse=True)

# Draw the sorted ranges as a solid line with pixel markers
plt.plot(range_sorted_vector, linestyle='-', marker=',')

# Title, axis labels and grid
plt.title('Plot of Range for each vector dimension')
plt.xlabel('Index')
plt.ylabel('Value')
plt.grid(True)

plt.show()

This is debug.

Turns out there is a small number of embedding dimensions that have a larger range. Drill into the dimensions where the range is greater than 0.15.

In [None]:
# Dimensions whose range exceeds this threshold are listed below
threshold = 0.15

# Positions in range_vector whose value is above the threshold
indices = [index for index, value in enumerate(range_vector) if value > threshold]

print(f"Indices of values above {threshold}:", indices)
for index in indices:
    print(f"Index: {index}  Range: {range_vector[index]}")


This is debug.

Side-track.
Plot the values for embedding dimension 822 across all of the synopsis embeddings.

In [None]:
# Embedding dimension to inspect; change this one value to look at another
dimension = 822

# Value of the chosen dimension for every entry in data
sidetrack_vector = [entry['vector'][dimension] for entry in data]

lowest = min(sidetrack_vector)
highest = max(sidetrack_vector)
print(f"Min: {lowest}, Max: {highest}, Range: {highest - lowest}")

# Create the plot
plt.plot(sidetrack_vector, marker=',', linestyle='-')

# Add labels and title
plt.xlabel('Index')
plt.ylabel('Value')
plt.title(f'Plot of embedding index {dimension} values')

# Show grid
plt.grid(True)

# Show the plot
plt.show()

Create 1,000 random embeddings (potential novel outliers), with all elements of each vector between the lower and upper bounds

There is an option here to extend the possible range for the random numbers to some percentage below the lower bound, and above the upper bound to increase the possible candidate space.

In [None]:
# Build 1,000 candidate vectors. Every component is drawn uniformly between
# the minimum and maximum recorded for that dimension.
randoms = [
    [random.uniform(min_vector[d], max_vector[d]) for d in range(dimensionality)]
    for _ in range(1000)
]
    
# print(randoms[0])

In [None]:
# Visual check that the first random vector lies between the min and max
# values in each dimension.
for series, marker, linestyle, label in (
    (max_vector, ',', '-', 'Max Vector'),
    (randoms[0], 'x', '--', 'First Random Vector'),
    (min_vector, ',', '-.', 'Min Vector'),
):
    plt.plot(series, marker=marker, linestyle=linestyle, label=label)

# Title, axis labels, legend and grid
plt.title('Plot of Three Vectors')
plt.xlabel('Index')
plt.ylabel('Value')
plt.legend()
plt.grid(True)

plt.show()

For each random embedding, compute pairwise distances against all of the synopsis embeddings (the cosine similarity distance of the random embedding against every synopsis embedding). 

Initially I looked at the average cosine similarity for each random embedding against all the movie synopses. However, a better measure of novelty is the lowest maximum cosine similarity.

This step is computationally expensive, and takes a while.



In [None]:
# Stack the synopsis embeddings and compute their norms once, so this work
# is not repeated for every random vector.
synopsis_matrix = np.array([entry['vector'] for entry in data])
synopsis_norms = np.linalg.norm(synopsis_matrix, axis=1)

max_CS_vector = []

# For each of the random vectors...
for i, candidate in enumerate(randoms):
    np_candidate = np.array(candidate)

    # Cosine similarity of this candidate against every synopsis at once:
    # (a . b) / (|a| * |b|) for each row a of synopsis_matrix
    similarities = (synopsis_matrix @ np_candidate) / (
        synopsis_norms * np.linalg.norm(np_candidate)
    )

    # The highest similarity belongs to the synopsis closest to this candidate
    max_cosine_similarity = float(similarities.max())

    max_CS_vector.append(max_cosine_similarity)
    print(f"randoms[{i}] max cosine similarity: {max_cosine_similarity}")
    


Pick the random embedding with the lowest maximum cosine similarity from all the movie synopsis embeddings.
(1 is identical, 0 is orthogonal, -1 is diametrically opposed)
Max cosine similarity is the movie synopsis that is closest to the random embedding.
Random embedding with the lowest maximum cosine similarity is the one that is furthest from the closest established synopsis.

In [None]:
# Position of the smallest entry in max_CS_vector (first occurrence on ties)
lowest_max_CS_index = min(range(len(max_CS_vector)), key=max_CS_vector.__getitem__)
lowest_max_CS = max_CS_vector[lowest_max_CS_index]

# The random embedding at that position is the chosen outlier
outlier = randoms[lowest_max_CS_index]
print(f"The lowest maximum cosine similarity is {lowest_max_CS} (index {lowest_max_CS_index})")

Write the outlier to a file, for future reference.

In [None]:
# Store the outlier and its score as a single-record JSON list
result = [{'vector': outlier, 'max CS': lowest_max_CS}]

with open('outlier.json', 'w') as outfile:
    json.dump(result, outfile, indent=2)


Invert the outlier embedding to text
This is the hard bit.

"Embedding inversion"

https://www.sisap.org/2023/accepted.html - Fabio Carrara, Claudio Gennaro, Lucia Vadicamo and Giuseppe Amato. Vec2Doc: Transforming Dense Vectors into Sparse Representations for Efficient Information Retrieval

https://www.reddit.com/r/LanguageTechnology/comments/itb55x/vec2doc_reverse_document_embeddings_does_this/

https://arxiv.org/pdf/2004.00053.pdf#:~:text=First%2C%20embedding%20vectors%20can%20be,some%20of%20the%20input%20data.

https://arxiv.org/abs/2305.03010 - "Given the black-box access to a language model, we treat sentence embeddings as initial tokens' representations and train or fine-tune a powerful decoder model to decode the whole sequences directly."

https://arxiv.org/abs/2004.00053?ref=hackernoon.com

https://stats.stackexchange.com/questions/422430/inverse-word-embedding-vector-to-word

https://community.openai.com/t/embeddings-converting-a-embedded-vector-back-to-natural-language/202320

https://hackernoon.com/embeddings-arent-human-readable-and-other-nonsense - good

https://arxiv.org/pdf/2205.05124.pdf - may be of use

Might need to brute-force a solution. 
Generate embeddings for a limited vocabulary of key words. 
See which one is closest to the outlier embedding
Repeat for next word.
...


This is debug.

Calculate cosine similarity of the outlier against a test synopsis.

In [None]:
test_string = "In a post-apocalyptic California, a gripping saga chronicles the high-stakes relationships between cold-blooded killers and desperate survivors, who make a perilous journey to Arkansas, where sacrifice and relationship struggles become the ultimate test of humanity."

# Non-ASCII characters are dropped before the text is embedded
ascii_test_string = test_string.encode(encoding='ASCII', errors='ignore').decode()
embedding = gpt3_embedding(ascii_test_string)

np_vector_a = np.array(embedding)  # The test synopsis embedding
np_vector_b = np.array(outlier)  # The outlier embedding

# Cosine similarity = (a . b) / (|a| * |b|)
norm_product = np.linalg.norm(np_vector_a) * np.linalg.norm(np_vector_b)
cosine_similarity = np.dot(np_vector_a, np_vector_b) / norm_product
print(f"Cosine similarity: {cosine_similarity}")

So we are going to try brute forcing the inversion of the outlier embedding.

First create a vocabulary. Start with all the unique words from the original dataset. Then remove stop words.
Then create corresponding embeddings for each remaining word in the vocabulary.

There's some cost associated with this step.

It also takes a long time to run.

We write the results into a file, so you should only need to run this once, since the vocabulary won't change (unless you change the data set) and the embeddings won't change (unless you change the model).

In [None]:
# Initialize a Counter object to hold the vocabulary and frequencies
vocab_counter = Counter()

# Open the CSV file
with open('imdb_top_1000.csv', 'r', encoding='utf-8', errors='replace') as csvfile:
    csvreader = csv.reader(csvfile)
    
    # Skip the header row if your CSV has one
    next(csvreader, None)
    
    # Loop through each row in the CSV
    for row in csvreader:
        # Extract the sentence from the 8th field (index 7)
        sentence = row[7]
        
        # Tokenize the sentence into words
        # Using regex to split by non-alphabetic characters for simplicity
        words = re.split(r'\W+', sentence.lower())
        
        # Update the vocabulary and frequencies
        vocab_counter.update(words)

# Remove empty string if exists
if '' in vocab_counter:
    del vocab_counter['']

# Read stop words from the text file into a set
with open('stop words.txt', 'r', encoding='utf-8') as f:
    stop_words = set(line.strip() for line in f)

# Remove stop words from the vocabulary
filtered_vocab = {word: freq for word, freq in vocab_counter.items() if word not in stop_words}

# Sort the filtered vocabulary by frequency (most common words first)
sorted_vocab = OrderedDict(sorted(filtered_vocab.items(), key=lambda x: x[0], reverse=False))

# Now sorted_vocab holds the sorted vocabulary
#print("Sorted Vocabulary:", sorted_vocab)

vocab = list(sorted_vocab.keys())
print(vocab)

# Check if the JSON file does NOT exist
if not os.path.exists('dictionary.json'):
    # File doesn't exist - generate embeddings for first word dictionary
    result = []

    for word in vocab:
        embedding = gpt3_embedding(word.encode(encoding='ASCII',errors='ignore').decode())
        info = {'word': word, 'vector': embedding}
        print(info, '\n\n\n')
        result.append(info)
    
    with open('dictionary.json', 'w') as outfile:
        json.dump(result, outfile, indent=2)



#print(len(data))
#print(data[0])
#print(len(sorted_vocab))
#print(len(list(sorted_vocab.keys())))
#print(list(sorted_vocab.keys())[0])

Next, pairwise compare the vocabulary vectors to the outlier vector to find the words that match best.
Store the 20(?) best match words.

In [None]:
# Load the word embeddings under their own name so that `data` keeps holding
# the synopsis embeddings if earlier cells are re-run.
with open('dictionary.json', 'r') as infile:
    dictionary = json.load(infile)

# The outlier is the same for every comparison, so convert it and take its
# norm once.
outlier_array = np.array(outlier)
outlier_norm = np.linalg.norm(outlier_array)

# Cosine similarity of each dictionary word against the outlier embedding
similarities_vector = []
for entry in dictionary:
    word_array = np.array(entry['vector'])
    cosine_similarity = np.dot(word_array, outlier_array) / (
        np.linalg.norm(word_array) * outlier_norm
    )
    similarities_vector.append(cosine_similarity)

# Rank word positions by similarity, best first. Ranking positions instead of
# looking values up again keeps words with equal similarity distinct.
ranked_indices = sorted(
    range(len(similarities_vector)),
    key=similarities_vector.__getitem__,
    reverse=True,
)

# Keep the 20 best matches (fewer if the dictionary is smaller)
best_words = [dictionary[k]['word'] for k in ranked_indices[:20]]

for word in best_words:
    print(word)

Call the openAI API to generate permutations of synopses based on the best match words. Use the best available model for this. No issue that I can see with this being a different model to what I've been using for embedding generation.

Get the 5 permutations out of the response from the LLM.

Calculate embeddings for each of the 5 permutations / candidates.

Finally, calculate the cosine similarity for each permutation / candidate against the outlier that we generated earlier. Higher value equals a better match.

This stage is annoying, because the format of the output from the LLM is somewhat non-deterministic. Sometimes you end up with name / value pairs where the name is "synopsis" (good). Other times the model will name the synopses as "name", or "synopsis1", "synopsis2", etc. My python isn't good enough to handle this. I suggest re-running this cell if it breaks. Not too expensive, since it's only one prompt per time.

In [None]:
def _extract_synopses(parsed):
    """Collect synopsis strings from the parsed JSON reply, whatever its shape.

    A dict with a string under the key 'synopsis' contributes just that
    string. Any other dict or list is searched recursively and every string
    found is collected, which covers replies such as a list of strings or
    keys named 'synopsis1', 'synopsis2', 'name', and so on. Non-string
    scalars are ignored.
    """
    if isinstance(parsed, str):
        return [parsed]
    if isinstance(parsed, dict):
        if isinstance(parsed.get('synopsis'), str):
            return [parsed['synopsis']]
        children = parsed.values()
    elif isinstance(parsed, list):
        children = parsed
    else:
        return []

    found = []
    for child in children:
        found.extend(_extract_synopses(child))
    return found


best_words_string = " ".join(best_words)
print(best_words_string)

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful special-purpose assistant. The user is working on an embedding inversion project. The user will provide you with a list of words that have some semantic similarity to the embedding vector that we are trying to invert. You will use combinations of these words to create 5 candidate narratives in the form of movie synopses. You will return the candidate synopses in JSON format. Don't prefix the JSON response with anything - just provide the bare JSON. In the JSON, don't provide a title - just provide the synopses. In the name / value pairs in the JSON, use the name 'synopsis' for all values."},
        {"role": "user", "content": best_words_string},
    ]
)

content = response['choices'][0]['message']['content']
print(content)

# Parse the JSON data and pull out the synopsis strings
synopses_data = json.loads(content)
synopses_list = _extract_synopses(synopses_data)
if not synopses_list:
    raise ValueError("No synopses found in the model response; re-run this cell.")

print(synopses_list)

# Embed every candidate that came back, rather than assuming exactly five
result = []
for synopsis in synopses_list:
    embedding = gpt3_embedding(synopsis.encode(encoding='ASCII', errors='ignore').decode())
    info = {'synopsis': synopsis, 'vector': embedding}
    print(info, '\n\n\n')
    result.append(info)

# The outlier is the same for every comparison
np_vector_b = np.array(outlier)
magnitude_b = np.linalg.norm(np_vector_b)

for info in result:
    np_vector_a = np.array(info['vector'])  # The embedding of the candidate synopsis

    # Cosine similarity = (a . b) / (|a| * |b|); higher means a better match
    dot_product = np.dot(np_vector_a, np_vector_b)
    magnitude_a = np.linalg.norm(np_vector_a)
    cosine_similarity = dot_product / (magnitude_a * magnitude_b)
    print(f"Synopsis: {info['synopsis']},  Cosine similarity: {cosine_similarity}")
 

Thank you for following to the end.