In [None]:
# Import packages

import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
import numpy as np
import spacy

In [None]:
# Load the spaCy English pipeline once; it is reused for every description
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

In [None]:
# Define a part-of-speech extraction function

def extract_pos(text, keep_pos=("NOUN", "ADJ", "VERB")):
    """Reduce a text to the lemmas of its tokens with selected parts of speech.

    The text is parsed with the module-level spaCy pipeline ``nlp``; every
    token whose coarse part-of-speech tag (``token.pos_``) is in ``keep_pos``
    contributes its lemma (``token.lemma_``) to the result.

    Parameters
    ----------
    text : str
        The text to filter. Any non-string value (for example the float NaN
        that pandas uses for a missing CSV field, or ``None``) and the empty
        string are treated as "no text".
    keep_pos : collection of str, optional
        Part-of-speech tags to keep. Defaults to nouns, adjectives and verbs.

    Returns
    -------
    str
        The kept lemmas in document order, joined by single spaces. An empty
        string when there is no text or no token matches.
    """
    # Guard before calling nlp(): missing values reach this function as
    # non-string objects when it is applied to a DataFrame column.
    if not isinstance(text, str) or not text:
        return ""
    return " ".join(
        token.lemma_ for token in nlp(text) if token.pos_ in keep_pos
    )

In [5]:
# Load the exhibition dataset from CSV
raw_csv_path = "metexhibitions_2015-2024.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Run every exhibition description through extract_pos, store the result in a
# new "Description_filtered" column, and save the augmented table to disk

pos_csv_path = "metexhibitions_2015-2024_pos.csv"
description_lemmas = df["Description"].apply(extract_pos)
df["Description_filtered"] = description_lemmas
df.to_csv(pos_csv_path, index=False)

In [8]:
# Reload the table that includes the "Description_filtered" column
# (presumably so the spaCy filtering cell does not have to be re-run)
filtered_csv_path = "metexhibitions_2015-2024_pos.csv"
df = pd.read_csv(filtered_csv_path)

In [None]:
# TF-IDF vectorize the filtered descriptions

# A description that was filtered down to "" is written to CSV as an empty
# field and comes back from pd.read_csv as NaN, which TfidfVectorizer rejects.
# Replace NaN with an empty document rather than dropping the row, so the
# matrix keeps one row per row of df.
filtered_descriptions = df["Description_filtered"].fillna("")
vectorizer = TfidfVectorizer(stop_words="english")
vectorized_data = vectorizer.fit_transform(filtered_descriptions)

In [None]:
# Factorize the TF-IDF matrix with NMF into 10 components (topics);
# random_state is fixed so repeated runs give the same factorization

nmf_settings = {"n_components": 10, "random_state": 1}
nmf = NMF(**nmf_settings)
doc_topic_dist_nmf = nmf.fit_transform(vectorized_data)

In [None]:
# Build a topics-by-vocabulary table of NMF component weights and print the
# ten highest-weighted words of each topic

vocabulary = vectorizer.get_feature_names_out()
topic_words_df = pd.DataFrame(nmf.components_, columns=vocabulary)
for topic_idx in topic_words_df.index:
    ranked_weights = topic_words_df.loc[topic_idx].sort_values(ascending=False)
    leading_words = ", ".join(ranked_weights.index[:10])
    print(f"Topic {topic_idx}: {leading_words}")

Topic 0: exhibition, artist, work, make, possible, support, catalogue, painting, art, provide
Topic 1: tree, scene, holiday, eighteenth, light, installation, continue, presentation, angel, lifelike
Topic 2: card, baseball, sport, history, era, game, collection, golden, electrician, batch
Topic 3: student, art, young, work, creativity, select, understanding, school, help, host
Topic 4: drawing, print, work, artist, paper, installation, rotation, highlight, range, period
Topic 5: chinese, lacquer, jade, exhibition, animal, dynasty, collection, art, century, lapidary
Topic 6: painting, dutch, seventeenth, realism, praise, display, art, century, gallery, woman
Topic 7: sixteenth, value, cost, worth, cm, market, century, raw, stain, tapestry
Topic 8: antique, object, identity, roman, archaeological, late, early, jewelry, art, pottery
Topic 9: design, book, designer, cover, textile, pattern, fashion, woman, feature, furniture


In [None]:
# Label each topic with its five highest-weighted words, comma-separated

topics = [
    ", ".join(weights.sort_values(ascending=False).index[:5])
    for _, weights in topic_words_df.iterrows()
]

In [None]:
# Index the document-topic weights by exhibition year, then average the
# weights of each topic within every year

year_index = df["Year"]
df_topics = pd.DataFrame(doc_topic_dist_nmf, columns=topics, index=year_index)
df_topics_per_year = df_topics.groupby(level="Year").mean()

In [None]:
# Draw one line per topic showing its mean weight for each year

axis_labels = {"value": "Topic Weight", "variable": "Topic"}
fig = px.line(
    df_topics_per_year,
    title="Average Topic Weights by Year",
    labels=axis_labels,
    markers=True,
)

fig.show()