In [1]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search


# models
# Embedding model name; passed as `model=` to openai.Embedding.create when
# strings_ranked_by_relatedness embeds a search query.
EMBEDDING_MODEL = "text-embedding-ada-002"
# GPT model name; not referenced in the visible part of this notebook --
# presumably consumed by later cells (TODO confirm).
GPT_MODEL = "gpt-3.5-turbo"

In [3]:
# data source: https://cdn.openai.com/API/examples/data/winter_olympics_2022.csv
embeddings_path = "./winter_olympics_2022.csv"

# Load the CSV and parse each stringified embedding back into a Python list;
# the resulting frame has two columns: "text" and "embedding".
df = pd.read_csv(embeddings_path).assign(
    embedding=lambda frame: frame["embedding"].map(ast.literal_eval)
)
df  # display

                                                   text  \
0     Lviv bid for the 2022 Winter Olympics\n\n{{Oly...   
1     Lviv bid for the 2022 Winter Olympics\n\n==His...   
2     Lviv bid for the 2022 Winter Olympics\n\n==Ven...   
3     Lviv bid for the 2022 Winter Olympics\n\n==Ven...   
4     Lviv bid for the 2022 Winter Olympics\n\n==Ven...   
...                                                 ...   
6054  Anaïs Chevalier-Bouchet\n\n==Personal life==\n...   
6055  Uliana Nigmatullina\n\n{{short description|Rus...   
6056  Uliana Nigmatullina\n\n==Biathlon results==\n\...   
6057  Uliana Nigmatullina\n\n==Biathlon results==\n\...   
6058  Uliana Nigmatullina\n\n==Biathlon results==\n\...   

                                              embedding  
0     [-0.005021067801862955, 0.00026050032465718687...  
1     [0.0033927420154213905, -0.007447326090186834,...  
2     [-0.00915789045393467, -0.008366798982024193, ...  
3     [0.0030951891094446182, -0.006064314860850573,...  
4

In [None]:
# search function
def strings_ranked_by_relatedness(
        query: str,
        df: pd.DataFrame,
        relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
        top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Rank the texts in `df` by how related their embeddings are to `query`.

    The query is embedded with `openai.Embedding.create` using
    EMBEDDING_MODEL, then scored against every row's stored embedding.

    Args:
        query: Text to embed and compare against the rows of `df`.
        df: DataFrame with a "text" column and an "embedding" column holding
            one vector per row.
        relatedness_fn: Callable `(query_embedding, row_embedding) -> score`
            where a higher score means more related. Defaults to cosine
            similarity (1 - cosine distance).
        top_n: Maximum number of results to return.

    Returns:
        Two parallel lists `(strings, relatednesses)`, sorted from most to
        least related and truncated to `top_n` entries. Both lists are empty
        when `df` has no rows.
    """
    # An empty frame has nothing to rank: return before spending an API call.
    # (Unpacking zip(*pairs) into two names fails when there are zero pairs.)
    if len(df) == 0:
        return [], []

    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Walk the two columns directly; df.iterrows() would build a Series per
    # row just to read two fields from it.
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    # list.sort is stable (also with reverse=True), so ties keep row order.
    strings_and_relatednesses.sort(key=lambda pair: pair[1], reverse=True)

    top = strings_and_relatednesses[:top_n]
    # Return real lists so the result matches the declared return type.
    return [text for text, _ in top], [score for _, score in top]