In [None]:
!pip install openai requests --quiet

In [None]:
import os
import openai

# Take the API key from the environment; the value is None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")


In [None]:
import pandas as pd

# The file is read without a header row (header=None), so the single column
# is named explicitly. Fields are split on "|" rather than on commas.
art_facts_df = pd.read_csv(
    "data/art-facts.csv",
    sep="|",
    header=None,
    names=["fact"],
)
art_facts_df.head(5)

In [None]:
# Hand-written art facts (not read from the CSV). They are embedded separately
# and appended to the loaded facts so they can be used as similarity queries.
new_facts = [
    "Yayoi Kusama, known for her polka dots, has been a major figure in the avant-garde movement since the 1960s.",
    "Banksy, an anonymous England-based street artist, is renowned for his politically themed and satirical street art.",
    "Zaha Hadid, known as the 'Queen of the Curve,' was the first woman to receive the Pritzker Architecture Prize in 2004.",
    "Jean-Michel Basquiat went from being homeless to selling a painting for over $100 million, highlighting his meteoric rise in the art world.",
    "The Louvre Museum, originally a royal palace, holds over 380,000 objects and displays 35,000 works of art, including the Mona Lisa.",
]

In [None]:
from openai import OpenAI
# Shared API client; get_embedding() below sends its requests through it.
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the module-level ``client``.

    Newline characters in ``text`` are replaced with single spaces before the
    request is made. ``model`` is forwarded to the embeddings call unchanged.
    The result is the ``embedding`` attribute of the first item in the response.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed every art fact (one get_embedding call per row) and store the vectors
# in a new column.
art_facts_df["ada_embedding"] = art_facts_df["fact"].apply(
    get_embedding, model="text-embedding-3-small"
)

In [None]:
# Wrap the hand-written facts in a one-column frame and embed them the same way
# as the loaded facts.
new_facts_df = pd.DataFrame({"fact": new_facts})
new_facts_df["ada_embedding"] = new_facts_df["fact"].apply(
    get_embedding, model="text-embedding-3-small"
)


In [None]:
# Label each frame with its origin, then stack them under a fresh 0..n-1 index
# so rows can be addressed by position.
art_facts_df["dataset"] = "training"
new_facts_df["dataset"] = "new"
final_art_facts_df = pd.concat(
    [art_facts_df, new_facts_df],
    ignore_index=True,
)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Row position in final_art_facts_df of the fact whose nearest neighbours are shown below.
# NOTE(review): presumably this points at one of the appended "new" facts — confirm
# against the number of rows in data/art-facts.csv.
fact_to_compare_index = 51

def get_similar_facts(index, df):
    """Rank every row of ``df`` by cosine similarity to the row at ``index``.

    Parameters
    ----------
    index : int
        Positional (0-based) row number of the reference fact. It indexes the
        stacked embedding array, not the DataFrame's index labels, so the two
        only coincide when ``df`` has a default 0..n-1 index.
    df : pandas.DataFrame
        Must contain an "ada_embedding" column with one embedding vector per row.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus a "similarity" column, sorted from
        most to least similar. The reference row itself is included.
        ``df`` is not modified.

    Raises
    ------
    IndexError
        If ``index`` is outside the range of rows in ``df``.
    """
    # Stack the per-row embeddings into one array (2-D when all vectors share a length).
    embedding_matrix = np.array([np.asarray(vec) for vec in df["ada_embedding"]])

    # cosine_similarity expects 2-D inputs, so the reference vector is wrapped in a list;
    # the result has a single row holding the score against every fact.
    similarities = cosine_similarity([embedding_matrix[index]], embedding_matrix)

    # Attach the scores to a copy instead of writing a column into the caller's
    # frame, so calling this function never changes notebook state.
    scored = df.assign(similarity=similarities[0])
    return scored.sort_values(by="similarity", ascending=False)


# Five best matches for the chosen fact (the fact itself is included in the ranking).
get_similar_facts(fact_to_compare_index, final_art_facts_df).head(5)

In [None]:
# Load the two extra topics with the same "|"-separated, header-less layout as
# the art facts, tagging each frame with its topic as it is read.
baseball_facts_df = pd.read_csv(
    "data/baseball-facts-01.csv", sep="|", header=None, names=["fact"]
).assign(dataset="baseball")
jazz_facts_df = pd.read_csv(
    "data/jazz-facts-01.csv", sep="|", header=None, names=["fact"]
).assign(dataset="jazz")

In [None]:
# Stack all three topics under a fresh 0..n-1 index.
full_df = pd.concat(
    [art_facts_df, baseball_facts_df, jazz_facts_df],
    ignore_index=True,
)
# NOTE: this embeds every row of full_df, including the art facts whose
# embeddings were already computed when art_facts_df was embedded earlier.
full_df["ada_embedding"] = full_df["fact"].apply(
    get_embedding, model="text-embedding-3-small"
)

In [None]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
full_df.sample(n=5, random_state=42)

In [None]:
# Second batch of baseball and jazz facts, tagged "new ..." so they can be told
# apart from the first batch after everything is combined.
new_baseball_facts = pd.read_csv(
    "data/baseball-facts-02.csv", sep="|", header=None, names=["fact"]
).assign(dataset="new baseball")
new_jazz_facts = pd.read_csv(
    "data/jazz-facts-02.csv", sep="|", header=None, names=["fact"]
).assign(dataset="new jazz")

# NOTE: this rebinds new_facts_df, which earlier held the hand-written art facts.
new_facts_df = pd.concat([new_baseball_facts, new_jazz_facts], ignore_index=True)
new_facts_df["ada_embedding"] = new_facts_df["fact"].apply(
    get_embedding, model="text-embedding-3-small"
)


In [None]:
# Append the new facts after the existing ones and renumber rows 0..n-1 so that
# positional lookups in get_similar_facts line up with the index shown.
combined_df = pd.concat([full_df, new_facts_df], ignore_index=True)

In [None]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
combined_df.sample(n=5, random_state=42)

In [None]:
# Number of rows carrying each dataset label.
combined_df["dataset"].value_counts()

In [None]:
# Rows whose dataset label contains "new"; their index values are the positions
# usable as queries for get_similar_facts.
combined_df.loc[combined_df["dataset"].str.contains("new")]

In [None]:
# Query by row position in combined_df and show the five closest facts.
new_index = 61
get_similar_facts(new_index, combined_df).head(5)

In [None]:
# Same lookup for a different row position in combined_df.
new_index = 67
get_similar_facts(new_index, combined_df).head(5)