In [1]:
!pip install openai requests --quiet

In [2]:
import os
import openai

# The key is taken from the environment so it never ends up in the notebook.
# Stop here with an actionable message instead of storing None and failing
# later inside the first API call.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]


In [3]:
import pandas as pd

# The file has no header row; it is read with "|" as the separator into a
# single column named "fact".
art_facts_df = pd.read_csv(
    "data/art-facts.csv",
    sep="|",
    header=None,
    names=["fact"],
)
art_facts_df.head()

Unnamed: 0,fact
0,Vincent van Gogh sold only one painting during...
1,Leonardo da Vinci was ambidextrous and could w...
2,Pablo Picasso could draw before he could walk.
3,Frida Kahlo began painting after a severe bus ...
4,Michelangelo's David was sculpted from a singl...


In [4]:
# Hand-written facts that are not read from data/art-facts.csv. A later cell
# embeds them and labels them "new" so they can be compared against the
# facts loaded from the CSV.
new_facts = [
    "Yayoi Kusama, known for her polka dots, has been a major figure in the avant-garde movement since the 1960s.",
    "Banksy, an anonymous England-based street artist, is renowned for his politically themed and satirical street art.",
    "Zaha Hadid, known as the 'Queen of the Curve,' was the first woman to receive the Pritzker Architecture Prize in 2004.",
    "Jean-Michel Basquiat went from being homeless to selling a painting for over $100 million, highlighting his meteoric rise in the art world.",
    "The Louvre Museum, originally a royal palace, holds over 380,000 objects and displays 35,000 works of art, including the Mona Lisa.",
]

In [5]:
from openai import OpenAI
# Shared API client used by get_embedding. It is constructed with no explicit
# arguments (presumably it picks up OPENAI_API_KEY from the environment -- confirm).
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Each call issues one request through the module-level ``client``.

    Parameters
    ----------
    text : str
        Text to embed. Newlines are replaced with spaces before the request.
    model : str, optional
        Name of the embedding model to request. Defaults to
        ``"text-embedding-3-small"``, the model this notebook used originally.

    Returns
    -------
    The ``embedding`` attribute of the first item in the API response
    (presumably a list of floats; a later cell reports a length of 1536).
    """
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# One get_embedding call per fact. The column keeps the "ada_embedding" name
# even though get_embedding requests "text-embedding-3-small".
art_facts_df["ada_embedding"] = art_facts_df["fact"].apply(get_embedding)

In [6]:
# Wrap the hand-written facts in a one-column frame, then embed each of them.
new_facts_df = pd.DataFrame({"fact": new_facts})
new_facts_df["ada_embedding"] = new_facts_df["fact"].apply(get_embedding)


In [7]:
# Tag each frame with where its rows came from, then stack them under a fresh
# 0..n-1 index so row labels and row positions coincide.
art_facts_df["dataset"] = "training"
new_facts_df["dataset"] = "new"
final_art_facts_df = pd.concat([art_facts_df, new_facts_df], ignore_index=True)

In [8]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
final_art_facts_df.sample(5, random_state=42)

Unnamed: 0,fact,ada_embedding,dataset
13,Gustav Klimt's most famous painting is 'The Ki...,"[-0.05941743403673172, -0.0163740124553442, -0...",training
9,Andy Warhol was a leading figure in the visual...,"[-0.03704332560300827, -0.016979891806840897, ...",training
37,Klimt was a central figure in the Vienna Seces...,"[-0.08770947903394699, -0.03105640411376953, -...",training
15,Edvard Munch's most famous work is 'The Scream'.,"[-0.01564689725637436, 0.033545296639204025, -...",training
12,Johannes Vermeer specialized in domestic inter...,"[-0.0278290007263422, 0.04824869707226753, 0.0...",training


In [10]:
# Number of components in the first row's embedding vector.
len(final_art_facts_df["ada_embedding"].iloc[0])

1536

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Row position of the first fact labelled "new". It is looked up from the
# "dataset" column rather than hard-coded, so it stays correct if the number
# of training facts in the CSV changes.
fact_to_compare_index = int(
    np.flatnonzero(final_art_facts_df["dataset"].eq("new").to_numpy())[0]
)

def get_similar_facts(index, df):
    """Rank every row of ``df`` by cosine similarity to the row at ``index``.

    Parameters
    ----------
    index : int
        Row *position* of the query fact in ``df``. It is applied to the
        stacked embedding matrix, so it only coincides with the index label
        when ``df`` has a default 0..n-1 index (for example after
        ``reset_index(drop=True)``).
    df : pandas.DataFrame
        Frame with an ``"ada_embedding"`` column holding one numeric sequence
        per row, all of the same length.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added ``"similarity"`` column, sorted from
        most to least similar. ``df`` itself is not modified, so repeated
        queries never leave a stale ``"similarity"`` column behind.
    """
    # Stack the per-row embeddings into a single (n_rows, n_dims) matrix.
    dense_vectors = np.array(df["ada_embedding"].tolist())

    # cosine_similarity needs 2-D inputs, so the query vector is passed as a
    # one-row matrix; the result has one row with one score per row of df.
    query_vector = dense_vectors[index].reshape(1, -1)
    similarities = cosine_similarity(query_vector, dense_vectors)

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    scored = df.assign(similarity=similarities[0])
    return scored.sort_values(by="similarity", ascending=False)


# Five best matches for the chosen fact (the fact itself is included).
get_similar_facts(fact_to_compare_index, final_art_facts_df).head(5)

Unnamed: 0,fact,ada_embedding,dataset,similarity
51,"Yayoi Kusama, known for her polka dots, has be...","[-0.00819129217416048, -0.01896316185593605, 0...",new,1.0
23,Kandinsky is credited with painting the first ...,"[-0.028202960267663002, -0.007871092297136784,...",training,0.441183
9,Andy Warhol was a leading figure in the visual...,"[-0.03704332560300827, -0.016979891806840897, ...",training,0.439029
28,Kahlo's work is celebrated for its uncompromis...,"[0.02692466787993908, 0.014319660142064095, -0...",training,0.430462
30,O’Keeffe has been called the 'Mother of Americ...,"[0.07060316205024719, -0.018229320645332336, -...",training,0.409512


In [12]:
# Same single-column, headerless layout as the art facts; each frame is
# labelled with its topic as it is loaded.
baseball_facts_df = pd.read_csv(
    "data/baseball-facts-01.csv", sep="|", header=None, names=["fact"]
).assign(dataset="baseball")
jazz_facts_df = pd.read_csv(
    "data/jazz-facts-01.csv", sep="|", header=None, names=["fact"]
).assign(dataset="jazz")

In [14]:
# art_facts_df already carries an "ada_embedding" column from the earlier
# embedding cell, so only the baseball and jazz facts are sent to the API.
# Re-embedding the combined frame would repeat one request per art fact.
topic_facts_df = pd.concat([baseball_facts_df, jazz_facts_df], ignore_index=True)
topic_facts_df["ada_embedding"] = topic_facts_df["fact"].apply(get_embedding)

full_df = pd.concat([art_facts_df, topic_facts_df]).reset_index(drop=True)

# Catches the case where this cell runs before the art facts were embedded.
assert full_df["ada_embedding"].notna().all(), "some facts have no embedding"

In [15]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
full_df.sample(5, random_state=42)

Unnamed: 0,fact,ada_embedding,dataset
5,Claude Monet founded the French Impressionist ...,"[-0.015810076147317886, 0.013010584749281406, ...",training
58,Influential Figures: Some of the most influent...,"[0.013417782261967659, -0.03790966793894768, 0...",jazz
51,"Origins: The origins of baseball are debated, ...","[-0.03109973855316639, -0.0036125697661191225,...",baseball
41,Titian's real name was Tiziano Vecellio.,"[-0.007993349805474281, -0.012574519030749798,...",training
14,Caravaggio is known for his use of dramatic li...,"[-0.010245920158922672, -0.03482024371623993, ...",training


In [17]:
# Second batch of baseball and jazz facts, labelled "new ..." so they can be
# told apart from the first batch.
new_baseball_facts = pd.read_csv(
    "data/baseball-facts-02.csv", sep="|", header=None, names=["fact"]
).assign(dataset="new baseball")
new_jazz_facts = pd.read_csv(
    "data/jazz-facts-02.csv", sep="|", header=None, names=["fact"]
).assign(dataset="new jazz")

# NOTE: this rebinds new_facts_df, which an earlier cell used for the
# hand-written art facts.
new_facts_df = pd.concat([new_baseball_facts, new_jazz_facts], ignore_index=True)
new_facts_df["ada_embedding"] = new_facts_df["fact"].apply(get_embedding)


In [18]:
# Stack the first-batch and second-batch facts under a fresh 0..n-1 index.
combined_df = pd.concat([full_df, new_facts_df], ignore_index=True)

In [19]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
combined_df.sample(5, random_state=42)

Unnamed: 0,fact,ada_embedding,dataset
70,International Jazz Day: UNESCO declared April ...,"[-0.026415400207042694, -0.011050738394260406,...",new jazz
5,Claude Monet founded the French Impressionist ...,"[-0.015810076147317886, 0.013010584749281406, ...",training
49,Michelangelo's 'The Last Judgment' covers the ...,"[-0.008831469342112541, 0.01284672599285841, 0...",training
39,Munch's 'The Scream' has been stolen twice.,"[-0.0054969401098787785, 0.05982716381549835, ...",training
48,Rothko's art is seen as a precursor to abstrac...,"[-0.029300494119524956, 0.027020489796996117, ...",training


In [20]:
# How many facts carry each dataset label.
combined_df["dataset"].value_counts()

dataset
training        51
baseball         5
jazz             5
new baseball     5
new jazz         5
Name: count, dtype: int64

In [22]:
# Boolean mask: True where the dataset label contains the substring "new".
# "new" has no regex metacharacters, so a plain substring match is equivalent.
combined_df["dataset"].str.contains("new", regex=False)

0     False
1     False
2     False
3     False
4     False
      ...  
66     True
67     True
68     True
69     True
70     True
Name: dataset, Length: 71, dtype: bool

In [21]:
# Rows whose dataset label contains "new" (the second-batch facts).
combined_df.loc[combined_df["dataset"].str.contains("new")]

Unnamed: 0,fact,ada_embedding,dataset
61,Hall of Fame: The National Baseball Hall of Fa...,"[-0.02534695342183113, -0.018908988684415817, ...",new baseball
62,Record Holders: Some notable record holders in...,"[0.05110907182097435, 0.009580457583069801, 0....",new baseball
63,Integration: Jackie Robinson broke the basebal...,"[-0.04465446248650551, -0.04984288662672043, 0...",new baseball
64,Economic Impact: Baseball is a significant eco...,"[-0.011238553561270237, -0.008753512986004353,...",new baseball
65,Cultural Impact: Baseball has had a profound i...,"[0.040041785687208176, 0.006350216455757618, -...",new baseball
66,Instrumentation: Common jazz instruments inclu...,"[-0.013267251662909985, 0.019517024978995323, ...",new jazz
67,Jazz Standards: Jazz standards are musical com...,"[-0.005088659469038248, -0.04408976435661316, ...",new jazz
68,Influence on Other Genres: Jazz has influenced...,"[-0.027099695056676865, -0.041977569460868835,...",new jazz
69,Jazz Education: There are numerous institution...,"[-0.033211350440979004, -0.00677696755155921, ...",new jazz
70,International Jazz Day: UNESCO declared April ...,"[-0.026415400207042694, -0.011050738394260406,...",new jazz


In [23]:
# Row 61 is the first "new baseball" fact in the filtered listing above.
new_index = 61
get_similar_facts(new_index, combined_df).head(5)

Unnamed: 0,fact,ada_embedding,dataset,similarity
61,Hall of Fame: The National Baseball Hall of Fa...,"[-0.02534695342183113, -0.018908988684415817, ...",new baseball,1.0
65,Cultural Impact: Baseball has had a profound i...,"[0.040041785687208176, 0.006350216455757618, -...",new baseball,0.44244
62,Record Holders: Some notable record holders in...,"[0.05110907182097435, 0.009580457583069801, 0....",new baseball,0.434316
55,International Play: Baseball is played profess...,"[-0.02373475767672062, -0.029355548322200775, ...",baseball,0.423117
53,Major Leagues: Major League Baseball (MLB) in ...,"[-0.059779442846775055, -0.003021507291123271,...",baseball,0.408175


In [24]:
# Row 67 is the "Jazz Standards" fact from the "new jazz" batch.
new_index = 67
get_similar_facts(new_index, combined_df).head(5)

Unnamed: 0,fact,ada_embedding,dataset,similarity
67,Jazz Standards: Jazz standards are musical com...,"[-0.005088659469038248, -0.04408976435661316, ...",new jazz,1.0
66,Instrumentation: Common jazz instruments inclu...,"[-0.013267251662909985, 0.019517024978995323, ...",new jazz,0.53412
59,Jazz Styles: Jazz has evolved into many differ...,"[-0.020899906754493713, -0.004004450514912605,...",jazz,0.521962
57,Improvisation: A key feature of jazz music is ...,"[-0.004410018678754568, 0.002082218648865819, ...",jazz,0.491001
60,"The Jazz Age: The 1920s, known as the Jazz Age...","[-0.03348676115274429, -0.010527645237743855, ...",jazz,0.482505
