In [1]:
!pip install gensim scikit-learn --quiet

In [2]:
import pandas as pd

art_facts_df = pd.read_csv("data/art-facts.csv", sep="|", header=None, names=["fact"])
art_facts_df.head()

Unnamed: 0,fact
0,Vincent van Gogh sold only one painting during...
1,Leonardo da Vinci was ambidextrous and could w...
2,Pablo Picasso could draw before he could walk.
3,Frida Kahlo began painting after a severe bus ...
4,Michelangelo's David was sculpted from a singl...


In [3]:
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def preprocess(text):
    return [word for word in simple_preprocess(text) if word not in STOPWORDS]

art_facts_df["preprocessed"] = art_facts_df['fact'].apply(preprocess)


In [5]:
art_facts_df.sample(5)

Unnamed: 0,fact,preprocessed
21,Turner is known for his expressive colorizatio...,"[turner, known, expressive, colorizations, ima..."
34,Dalí designed the logo for Chupa Chups lollipops.,"[dalí, designed, logo, chupa, chups, lollipops]"
23,Kandinsky is credited with painting the first ...,"[kandinsky, credited, painting, purely, abstra..."
17,Titian is considered the greatest Venetian art...,"[titian, considered, greatest, venetian, artis..."
37,Klimt was a central figure in the Vienna Seces...,"[klimt, central, figure, vienna, secession, mo..."


In [6]:
from gensim import corpora

dictionary = corpora.Dictionary(art_facts_df.preprocessed.values)
art_facts_df["corpus"] = [dictionary.doc2bow(doc) for doc in art_facts_df.preprocessed.values]


In [7]:
art_facts_df.head()

Unnamed: 0,fact,preprocessed,corpus
0,Vincent van Gogh sold only one painting during...,"[vincent, van, gogh, sold, painting, lifetime]","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]"
1,Leonardo da Vinci was ambidextrous and could w...,"[leonardo, da, vinci, ambidextrous, write, dra...","[(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11,..."
2,Pablo Picasso could draw before he could walk.,"[pablo, picasso, draw, walk]","[(8, 1), (13, 1), (14, 1), (15, 1)]"
3,Frida Kahlo began painting after a severe bus ...,"[frida, kahlo, began, painting, severe, bus, a...","[(2, 1), (16, 1), (17, 1), (18, 1), (19, 1), (..."
4,Michelangelo's David was sculpted from a singl...,"[michelangelo, david, sculpted, single, block,...","[(22, 1), (23, 1), (24, 1), (25, 1), (26, 1), ..."


In [8]:
from gensim.models import LdaModel # Latent Dirichlet Allocation

lda_model = LdaModel(corpus=art_facts_df["corpus"], id2word=dictionary, num_topics=5, passes=15)


In [9]:
lda_model.num_topics

5

In [13]:
import numpy as np

def sparse_to_dense(sparse):
    dense = np.zeros(lda_model.num_topics)
    for (topic, prob) in sparse:
        dense[topic] = prob
    return dense

art_facts_df["lda_vector"] = [lda_model[doc] for doc in art_facts_df["corpus"]]
art_facts_df["lda_vector_dense"] = art_facts_df["lda_vector"].apply(sparse_to_dense)

In [11]:
art_facts_df.head()

Unnamed: 0,fact,preprocessed,corpus,lda_vector,lda_vector_dense
0,Vincent van Gogh sold only one painting during...,"[vincent, van, gogh, sold, painting, lifetime]","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]","[(0, 0.028622186), (1, 0.02865547), (2, 0.0285...","[0.028622185811400414, 0.02865546941757202, 0...."
1,Leonardo da Vinci was ambidextrous and could w...,"[leonardo, da, vinci, ambidextrous, write, dra...","[(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11,...","[(0, 0.89931744), (1, 0.025005257), (2, 0.0253...","[0.8993174433708191, 0.02500525675714016, 0.02..."
2,Pablo Picasso could draw before he could walk.,"[pablo, picasso, draw, walk]","[(8, 1), (13, 1), (14, 1), (15, 1)]","[(0, 0.8396275), (1, 0.040339265), (2, 0.04001...","[0.8396275043487549, 0.04033926501870155, 0.04..."
3,Frida Kahlo began painting after a severe bus ...,"[frida, kahlo, began, painting, severe, bus, a...","[(2, 1), (16, 1), (17, 1), (18, 1), (19, 1), (...","[(0, 0.02508626), (1, 0.8994526), (2, 0.025005...","[0.025086259469389915, 0.8994526267051697, 0.0..."
4,Michelangelo's David was sculpted from a singl...,"[michelangelo, david, sculpted, single, block,...","[(22, 1), (23, 1), (24, 1), (25, 1), (26, 1), ...","[(0, 0.028605813), (1, 0.8856594), (2, 0.02857...","[0.028605813160538673, 0.885659396648407, 0.02..."


In [12]:
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.035*"known" + 0.022*"art" + 0.022*"famous" + 0.015*"figure" + 0.015*"visual"')
(1, '0.025*"michelangelo" + 0.017*"works" + 0.017*"vermeer" + 0.017*"known" + 0.017*"painting"')
(2, '0.022*"paintings" + 0.022*"turner" + 0.022*"matisse" + 0.022*"spanish" + 0.022*"fauvism"')
(3, '0.026*"movement" + 0.026*"impressionist" + 0.026*"monet" + 0.014*"considered" + 0.014*"greatest"')
(4, '0.036*"painting" + 0.036*"van" + 0.025*"rembrandt" + 0.014*"work" + 0.014*"real"')


In [14]:
new_facts = [
    "Yayoi Kusama, known for her polka dots, has been a major figure in the avant-garde movement since the 1960s.",
    "Banksy, an anonymous England-based street artist, is renowned for his politically themed and satirical street art.",
    "Zaha Hadid, known as the 'Queen of the Curve,' was the first woman to receive the Pritzker Architecture Prize in 2004.",
    "Jean-Michel Basquiat went from being homeless to selling a painting for over $100 million, highlighting his meteoric rise in the art world.",
    "The Louvre Museum, originally a royal palace, holds over 380,000 objects and displays 35,000 works of art, including the Mona Lisa.",
]

new_facts_df = pd.DataFrame(new_facts, columns=["fact"])
new_facts_df["preprocessed"] = new_facts_df['fact'].apply(preprocess)
new_facts_df["corpus"] = [dictionary.doc2bow(doc) for doc in new_facts_df.preprocessed.values]
new_facts_df["lda_vector"] = [lda_model[doc] for doc in new_facts_df["corpus"]]
new_facts_df["lda_vector_dense"] = new_facts_df["lda_vector"].apply(sparse_to_dense)
new_facts_df

Unnamed: 0,fact,preprocessed,corpus,lda_vector,lda_vector_dense
0,"Yayoi Kusama, known for her polka dots, has be...","[yayoi, kusama, known, polka, dots, major, fig...","[(33, 1), (38, 1), (53, 1)]","[(0, 0.7982775), (1, 0.050204575), (2, 0.05043...","[0.7982774972915649, 0.050204575061798096, 0.0..."
1,"Banksy, an anonymous England-based street arti...","[banksy, anonymous, england, based, street, ar...","[(52, 1), (93, 1)]","[(0, 0.06923429), (1, 0.06667639), (2, 0.06758...","[0.06923428922891617, 0.06667639315128326, 0.0..."
2,"Zaha Hadid, known as the 'Queen of the Curve,'...","[zaha, hadid, known, queen, curve, woman, rece...","[(38, 1)]","[(0, 0.5965774), (1, 0.10160036), (2, 0.100837...","[0.5965774059295654, 0.10160035640001297, 0.10..."
3,Jean-Michel Basquiat went from being homeless ...,"[jean, michel, basquiat, went, homeless, selli...","[(2, 1), (52, 1), (143, 1)]","[(0, 0.4948869), (1, 0.05107212), (2, 0.350040...","[0.4948869049549103, 0.051072120666503906, 0.3..."
4,"The Louvre Museum, originally a royal palace, ...","[louvre, museum, originally, royal, palace, ho...","[(52, 1), (108, 1), (140, 1), (141, 1)]","[(0, 0.04249632), (1, 0.38199356), (2, 0.49486...","[0.042496319860219955, 0.3819935619831085, 0.4..."


In [15]:
art_facts_df["dataset"] = "training"
new_facts_df["dataset"] = "new"
final_art_facts_df = pd.concat([art_facts_df, new_facts_df]).reset_index(drop=True)
final_art_facts_df.sample(5)

Unnamed: 0,fact,preprocessed,corpus,lda_vector,lda_vector_dense,dataset
7,Rembrandt is considered one of the greatest vi...,"[rembrandt, considered, greatest, visual, arti...","[(40, 1), (41, 1), (42, 1), (43, 1), (44, 1), ...","[(0, 0.88465565), (1, 0.028577391), (2, 0.0285...","[0.8846556544303894, 0.028577391058206558, 0.0...",training
45,Turner's full name is Joseph Mallord William T...,"[turner, joseph, mallord, william, turner]","[(120, 2), (208, 1), (209, 1), (210, 1)]","[(0, 0.03344322), (1, 0.03333733), (2, 0.86654...","[0.03344321995973587, 0.03333732858300209, 0.8...",training
35,Raphael's 'The School of Athens' features repr...,"[raphael, school, athens, features, representa...","[(66, 1), (174, 1), (175, 1), (176, 1), (177, ...","[(0, 0.025004335), (1, 0.89970523), (2, 0.0251...","[0.02500433474779129, 0.8997052311897278, 0.02...",training
51,"Yayoi Kusama, known for her polka dots, has be...","[yayoi, kusama, known, polka, dots, major, fig...","[(33, 1), (38, 1), (53, 1)]","[(0, 0.7982775), (1, 0.050204575), (2, 0.05043...","[0.7982774972915649, 0.050204575061798096, 0.0...",new
29,"Monet's 'Impression, Sunrise' gave the Impress...","[monet, impression, sunrise, gave, impressioni...","[(31, 1), (32, 1), (33, 1), (152, 1), (153, 1)...","[(0, 0.028648475), (1, 0.02857416), (2, 0.0285...","[0.02864847518503666, 0.028574159368872643, 0....",training


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

final_art_facts_df[final_art_facts_df["dataset"] == "new"]

Unnamed: 0,fact,preprocessed,corpus,lda_vector,lda_vector_dense,dataset
51,"Yayoi Kusama, known for her polka dots, has be...","[yayoi, kusama, known, polka, dots, major, fig...","[(33, 1), (38, 1), (53, 1)]","[(0, 0.7982775), (1, 0.050204575), (2, 0.05043...","[0.7982774972915649, 0.050204575061798096, 0.0...",new
52,"Banksy, an anonymous England-based street arti...","[banksy, anonymous, england, based, street, ar...","[(52, 1), (93, 1)]","[(0, 0.06923429), (1, 0.06667639), (2, 0.06758...","[0.06923428922891617, 0.06667639315128326, 0.0...",new
53,"Zaha Hadid, known as the 'Queen of the Curve,'...","[zaha, hadid, known, queen, curve, woman, rece...","[(38, 1)]","[(0, 0.5965774), (1, 0.10160036), (2, 0.100837...","[0.5965774059295654, 0.10160035640001297, 0.10...",new
54,Jean-Michel Basquiat went from being homeless ...,"[jean, michel, basquiat, went, homeless, selli...","[(2, 1), (52, 1), (143, 1)]","[(0, 0.4948869), (1, 0.05107212), (2, 0.350040...","[0.4948869049549103, 0.051072120666503906, 0.3...",new
55,"The Louvre Museum, originally a royal palace, ...","[louvre, museum, originally, royal, palace, ho...","[(52, 1), (108, 1), (140, 1), (141, 1)]","[(0, 0.04249632), (1, 0.38199356), (2, 0.49486...","[0.042496319860219955, 0.3819935619831085, 0.4...",new


In [17]:
fact_to_compare_index = 51

def get_similar_facts(index, df):
    # Ensure that the "lda_vector_dense" column is correctly formatted as a list of numpy arrays
    dense_vectors = np.array(list(df["lda_vector_dense"].apply(lambda x: np.array(x))))
    
    # Compute cosine similarity
    # The dense_vectors[index] needs to be reshaped to ensure it's 2D if it's not already
    similarities = cosine_similarity([dense_vectors[index]], dense_vectors)
    
    # Assign similarities back to the DataFrame
    df["similarity"] = similarities[0]
    
    # Return the DataFrame sorted by similarity
    return df.sort_values(by="similarity", ascending=False)


get_similar_facts(fact_to_compare_index, final_art_facts_df)[:5]

Unnamed: 0,fact,preprocessed,corpus,lda_vector,lda_vector_dense,dataset,similarity
51,"Yayoi Kusama, known for her polka dots, has be...","[yayoi, kusama, known, polka, dots, major, fig...","[(33, 1), (38, 1), (53, 1)]","[(0, 0.7982775), (1, 0.050204575), (2, 0.05043...","[0.7982774972915649, 0.050204575061798096, 0.0...",new,1.0
2,Pablo Picasso could draw before he could walk.,"[pablo, picasso, draw, walk]","[(8, 1), (13, 1), (14, 1), (15, 1)]","[(0, 0.83963), (1, 0.040336784), (2, 0.0400102...","[0.8396300077438354, 0.04033678397536278, 0.04...",training,0.999535
39,Munch's 'The Scream' has been stolen twice.,"[munch, scream, stolen, twice]","[(87, 1), (88, 1), (189, 1), (190, 1)]","[(0, 0.8399655), (1, 0.04000675), (2, 0.040008...","[0.8399655222892761, 0.04000674933195114, 0.04...",training,0.999528
13,Gustav Klimt's most famous painting is 'The Ki...,"[gustav, klimt, famous, painting, kiss]","[(2, 1), (78, 1), (79, 1), (80, 1), (81, 1)]","[(0, 0.8657531), (1, 0.033551335), (2, 0.03333...","[0.8657531142234802, 0.03355133533477783, 0.03...",training,0.998833
15,Edvard Munch's most famous work is 'The Scream'.,"[edvard, munch, famous, work, scream]","[(78, 1), (86, 1), (87, 1), (88, 1), (89, 1)]","[(0, 0.86604536), (1, 0.033338204), (2, 0.0336...","[0.866045355796814, 0.03333820402622223, 0.033...",training,0.998823


In [18]:
fact_to_compare_index = 52
get_similar_facts(fact_to_compare_index, final_art_facts_df)[:5]

Unnamed: 0,fact,preprocessed,corpus,lda_vector,lda_vector_dense,dataset,similarity
52,"Banksy, an anonymous England-based street arti...","[banksy, anonymous, england, based, street, ar...","[(52, 1), (93, 1)]","[(0, 0.06923429), (1, 0.06667639), (2, 0.06758...","[0.06923428922891617, 0.06667639315128326, 0.0...",new,1.0
47,Kandinsky taught at the Bauhaus school of art ...,"[kandinsky, taught, bauhaus, school, art, design]","[(52, 1), (129, 1), (179, 1), (213, 1), (214, ...","[(0, 0.028859511), (1, 0.028751874), (2, 0.028...","[0.028859511017799377, 0.028751874342560768, 0...",training,0.993017
50,Da Vinci's 'The Last Supper' has been the subj...,"[da, vinci, supper, subject, restoration, atte...","[(7, 1), (11, 1), (225, 1), (226, 1), (227, 1)...","[(0, 0.028724184), (1, 0.028575232), (2, 0.028...","[0.0287241842597723, 0.0285752322524786, 0.028...",training,0.993002
29,"Monet's 'Impression, Sunrise' gave the Impress...","[monet, impression, sunrise, gave, impressioni...","[(31, 1), (32, 1), (33, 1), (152, 1), (153, 1)...","[(0, 0.028648475), (1, 0.02857416), (2, 0.0285...","[0.02864847518503666, 0.028574159368872643, 0....",training,0.992983
5,Claude Monet founded the French Impressionist ...,"[claude, monet, founded, french, impressionist...","[(2, 1), (28, 1), (29, 1), (30, 1), (31, 1), (...","[(0, 0.025188208), (1, 0.025151433), (2, 0.025...","[0.025188207626342773, 0.025151433423161507, 0...",training,0.991935
