In [None]:
!pip install gensim scikit-learn --quiet

In [None]:
import pandas as pd

# Load the facts file into a single-column frame named "fact".
# header=None makes pandas treat the first line as data rather than column names.
# NOTE(review): the "|" separator is presumably chosen so commas inside a fact
# do not split it into several columns — confirm against the data file.
art_facts_df = pd.read_csv(
    "data/art-facts.csv",
    sep="|",
    names=["fact"],
    header=None,
)
# Preview the first rows of the loaded frame.
art_facts_df.head(n=5)

In [None]:
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def preprocess(text):
    """Tokenize ``text`` and drop stop words.

    The text is tokenized with gensim's ``simple_preprocess``; every token that
    appears in gensim's ``STOPWORDS`` collection is discarded.

    Args:
        text: Raw text to tokenize.

    Returns:
        list: The remaining tokens, in the order ``simple_preprocess`` produced them.
    """
    kept_tokens = []
    for token in simple_preprocess(text):
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Tokenize every fact; each cell of the new column holds that fact's token list.
art_facts_df["preprocessed"] = art_facts_df["fact"].map(preprocess)


In [None]:
# Preview the first five rows, now including the "preprocessed" column.
art_facts_df.head(n=5)

In [None]:
from gensim import corpora

# Build the gensim vocabulary from every tokenized fact.
dictionary = corpora.Dictionary(art_facts_df["preprocessed"].tolist())
# Encode each tokenized fact with the dictionary's doc2bow (bag-of-words) method.
art_facts_df["corpus"] = art_facts_df["preprocessed"].map(dictionary.doc2bow)


In [None]:
from gensim.models import LdaModel

# Train a 5-topic LDA model on the bag-of-words corpus, making 15 passes over it.
# LDA training is stochastic, so `random_state` is fixed to make the learned topics
# (and any similarity scores computed from them) reproducible on a fresh kernel.
# The corpus is handed over as a plain list so gensim iterates ordinary Python
# objects instead of a pandas Series.
lda_model = LdaModel(
    corpus=art_facts_df["corpus"].tolist(),
    id2word=dictionary,
    num_topics=5,
    passes=15,
    random_state=42,
)


In [None]:
# Display the number of topics held by the trained model.
lda_model.num_topics

In [None]:
import numpy as np

def sparse_to_dense(sparse, num_topics=None):
    """Expand a sparse list of ``(topic, prob)`` pairs into a dense vector.

    Args:
        sparse: Iterable of ``(topic, prob)`` pairs; ``topic`` is used as the
            position in the output vector.
        num_topics: Length of the output vector. When omitted, it falls back to
            ``lda_model.num_topics`` from the notebook's global model.

    Returns:
        numpy.ndarray: Vector of length ``num_topics`` holding ``prob`` at each
        listed ``topic`` position and ``0.0`` everywhere else.

    Raises:
        IndexError: If a ``topic`` is outside ``[-num_topics, num_topics)``.
    """
    if num_topics is None:
        # Look the global up only when needed, so callers that pass `num_topics`
        # explicitly do not require a trained model in scope.
        num_topics = lda_model.num_topics
    dense = np.zeros(num_topics)
    for topic, prob in sparse:
        dense[topic] = prob
    return dense

# Query the model with each fact's bag-of-words to get its sparse topic vector.
art_facts_df["lda_vector"] = art_facts_df["corpus"].map(lambda bow: lda_model[bow])
# Expand each sparse vector to a fixed-length dense array.
art_facts_df["lda_vector_dense"] = art_facts_df["lda_vector"].map(sparse_to_dense)

In [None]:
# Preview the first five rows, now including the topic-vector columns.
art_facts_df.head(n=5)

In [None]:
# Fetch the topics, limited to five words each, and print one topic per line.
topics = lda_model.print_topics(num_words=5)
print(*topics, sep="\n")


In [None]:
# Unseen facts to place in the topic space learned from the training facts.
new_facts = [
    "Yayoi Kusama, known for her polka dots, has been a major figure in the avant-garde movement since the 1960s.",
    "Banksy, an anonymous England-based street artist, is renowned for his politically themed and satirical street art.",
    "Zaha Hadid, known as the 'Queen of the Curve,' was the first woman to receive the Pritzker Architecture Prize in 2004.",
    "Jean-Michel Basquiat went from being homeless to selling a painting for over $100 million, highlighting his meteoric rise in the art world.",
    "The Louvre Museum, originally a royal palace, holds over 380,000 objects and displays 35,000 works of art, including the Mona Lisa.",
]

# Run the new facts through the same steps as the training facts:
# tokenize -> bag-of-words (existing dictionary) -> sparse topics -> dense vector.
new_facts_df = (
    pd.DataFrame({"fact": new_facts})
    .assign(preprocessed=lambda frame: frame["fact"].map(preprocess))
    .assign(corpus=lambda frame: frame["preprocessed"].map(dictionary.doc2bow))
    .assign(lda_vector=lambda frame: frame["corpus"].map(lambda bow: lda_model[bow]))
    .assign(lda_vector_dense=lambda frame: frame["lda_vector"].map(sparse_to_dense))
)
new_facts_df

In [None]:
# Tag each frame with its origin so rows remain distinguishable once combined.
new_facts_df["dataset"] = "new"
art_facts_df["dataset"] = "training"
# Stack training facts first, then new facts, renumbering rows from 0.
final_art_facts_df = pd.concat([art_facts_df, new_facts_df], ignore_index=True)
# Spot-check five randomly chosen rows.
final_art_facts_df.sample(n=5)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Show only the rows tagged "new"; their row labels are the candidates for lookup.
final_art_facts_df.loc[final_art_facts_df["dataset"].eq("new")]

In [None]:
# Row position (within final_art_facts_df) of the fact to find neighbours for.
# NOTE(review): presumably this points at one of the appended "new" facts; that
# depends on how many rows the training CSV contains — confirm before relying on it.
fact_to_compare_index = 51

def get_similar_facts(index, df):
    """Rank every row of ``df`` by cosine similarity to the row at ``index``.

    Args:
        index: Row position in ``df`` of the query fact.
        df: DataFrame with an ``"lda_vector_dense"`` column whose cells are
            equal-length numeric vectors.

    Returns:
        pandas.DataFrame: A copy of ``df`` with an added ``"similarity"`` column
        (cosine similarity of each row to the query row), sorted from most to
        least similar. The query row itself is part of the result.

    Note:
        ``df`` itself is not modified; the ``"similarity"`` column is added to a
        copy, so calling this repeatedly never alters the caller's frame.
    """
    # Stack the per-row vectors into a single 2-D matrix (one row per fact).
    dense_vectors = np.stack(df["lda_vector_dense"].to_numpy())

    # cosine_similarity expects 2-D inputs, so the query vector is reshaped into
    # a single-row matrix; the result then has one row of scores.
    query_vector = dense_vectors[index].reshape(1, -1)
    similarities = cosine_similarity(query_vector, dense_vectors)

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    scored = df.assign(similarity=similarities[0])
    return scored.sort_values(by="similarity", ascending=False)


# Show the five rows ranked most similar to the selected fact.
get_similar_facts(fact_to_compare_index, final_art_facts_df).head(5)

In [None]:
# Repeat the lookup for the fact at row position 52 and show its top five matches.
fact_to_compare_index = 52
get_similar_facts(fact_to_compare_index, final_art_facts_df).head(5)