# 2 Topic Model Text
We have too much data so maybe we do a topic model and limit ourselves to certain videos

In [45]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\serca\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
# Load the CSV file
df = pd.read_csv("../data/full_video_dataset.csv")

In [47]:
df.head(2)

Unnamed: 0,Title,Description,Video URL,Europeana URL,Date,Subjects,Places,Provider
0,WIELERWEDSTRIJDEN IN HET VONDELPARK,Bioscoopjournaals waarin Nederlandse onderwerpen van een bepaalde week worden gepresenteerd.,https://www.openbeelden.nl/media/1005287,https://www.europeana.eu/item/2051906/data_euscreenXL_https___www_openbeelden_nl_media_1005287?utm_source=api&utm_medium=api&utm_campaign=illecible,1923-09-21,"Road bicycle racing, moving images, road bicycle racing, Sports competition, sports competition","Kingdom of the Netherlands, The Netherlands, Eurozone",Netherlands Institute for Sound & Vision
1,VOETBALWEDSTRIJD TUSCHINSKI - CINEMA ROYAL,Bioscoopjournaals waarin Nederlandse onderwerpen van een bepaalde week worden gepresenteerd.,https://www.openbeelden.nl/media/1005717,https://www.europeana.eu/item/2051906/data_euscreenXL_https___www_openbeelden_nl_media_1005717?utm_source=api&utm_medium=api&utm_campaign=illecible,1928-01-01,"Association football, moving images, association football","Kingdom of the Netherlands, The Netherlands, Eurozone",Netherlands Institute for Sound & Vision


## PREP

In [48]:
# 🔹 Define stopwords (Dutch + English + Custom)
custom_stopwords = set([
    'weekjournaal', 'hollands', "nederlands", 'nieuws', 'polygoon', 'uit', 
    'van', 'week', 'sep', 'moving', "de", "het", "een", "en", "voor", "op", "aan"
])  
dutch_stopwords = set(stopwords.words('dutch'))  # Load Dutch stopwords
english_stopwords = set(ENGLISH_STOP_WORDS)  # Load English stopwords
all_stopwords = custom_stopwords | dutch_stopwords | english_stopwords  # Combine all sets

# 🔹 Text preprocessing function
def clean_text(text):
    if pd.isna(text):  
        return ""  # Handle NaN values safely
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"\b(" + "|".join(all_stopwords) + r")\b", "", text)  # Remove stopwords
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# 🔹 Apply preprocessing to each text field
df["Title"] = df["Title"].astype(str).apply(lambda x: x.title())  # Title case
df["Description"] = df["Description"].astype(str).apply(clean_text)
df["Subjects"] = df["Subjects"].astype(str).apply(clean_text)
df["Places"] = df["Places"].astype(str).apply(clean_text)

# 🔹 Create 'docs' column (Lowercased and joined with `[SEP]`)
df["docs"] = df.apply(
    lambda row: f"{row['Title'].lower()} [SEP] {row['Description']} [SEP] {row['Subjects']} [SEP] {row['Places']}",
    axis=1
)


In [49]:
df.head(2)

Unnamed: 0,Title,Description,Video URL,Europeana URL,Date,Subjects,Places,Provider,docs
0,Wielerwedstrijden In Het Vondelpark,bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd,https://www.openbeelden.nl/media/1005287,https://www.europeana.eu/item/2051906/data_euscreenXL_https___www_openbeelden_nl_media_1005287?utm_source=api&utm_medium=api&utm_campaign=illecible,1923-09-21,road bicycle racing images road bicycle racing sports competition sports competition,kingdom netherlands netherlands eurozone,Netherlands Institute for Sound & Vision,wielerwedstrijden in het vondelpark [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] road bicycle racing images road bicycle racing sports competition sports com...
1,Voetbalwedstrijd Tuschinski - Cinema Royal,bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd,https://www.openbeelden.nl/media/1005717,https://www.europeana.eu/item/2051906/data_euscreenXL_https___www_openbeelden_nl_media_1005717?utm_source=api&utm_medium=api&utm_campaign=illecible,1928-01-01,association football images association football,kingdom netherlands netherlands eurozone,Netherlands Institute for Sound & Vision,voetbalwedstrijd tuschinski - cinema royal [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] association football images association football [SEP] kingdom nether...


# MODEL

In [50]:

# 🔹 Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # You can change the model if needed

# 🔹 Generate embeddings
embeddings = embedding_model.encode(df["docs"].tolist(), show_progress_bar=True)

# 🔹 Save embeddings to a file
np.save("../data/video_embeddings.npy", embeddings)

print(f"✅ Saved {embeddings.shape[0]} embeddings to '../data/video_embeddings.npy'")


Batches: 100%|██████████| 201/201 [00:33<00:00,  5.93it/s]

✅ Saved 6421 embeddings to '../data/video_embeddings.npy'





In [68]:
# 🔹 Load saved embeddings
embeddings = np.load("../data/video_embeddings.npy")

# Initialize BERTopic WITHOUT automatic topic reduction AND verbose
topic_model = BERTopic(language="multilingual", verbose=True, nr_topics="auto")  

# 🔹 Fit BERTopic using precomputed embeddings
topics, probs = topic_model.fit_transform(df["docs"].tolist(), embeddings=embeddings)

# 🔹 Add topics to the dataframe
df["Topic"] = topics
df["Topic_Probability"] = probs

# 🔹 Save results
df.to_csv("../data/full_video_dataset_with_topics.csv", index=False)

print(f"✅ Saved topic modeling results to '../data/full_video_dataset_with_topics.csv'")

2025-02-17 14:55:08,859 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2025-02-17 14:55:10,492 - BERTopic - Dimensionality - Completed ✓
2025-02-17 14:55:10,493 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-17 14:55:10,648 - BERTopic - Cluster - Completed ✓
2025-02-17 14:55:10,648 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-17 14:55:10,820 - BERTopic - Representation - Completed ✓
2025-02-17 14:55:10,821 - BERTopic - Topic reduction - Reducing number of topics
2025-02-17 14:55:10,962 - BERTopic - Topic reduction - Reduced number of topics from 116 to 59


✅ Saved topic modeling results to '../data/full_video_dataset_with_topics.csv'


In [69]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)

In [70]:
# 🔹 Print topic overview
topic_model.get_topic_info().to_csv("../data/topic_model_info.csv", index=False)
topic_model.get_topic_info().head(5)



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2001,-1_sep_kingdom_images_netherlands,"[sep, kingdom, images, netherlands, van, de, amsterdam, in, shots, het]","[bathing children in the amstel canal [SEP] newsreels dutch subjects certain presented [SEP] images [SEP] amsterdam kingdom netherlands amsterdam netherlands amsterdam kingdom netherlands, opening..."
1,0,1319,0_sep_images_van_kingdom,"[sep, images, van, kingdom, de, het, rotterdam, netherlands, quot, in]","[tewaterlating van de &quot;jacob van heemskerck&quot; [SEP] [SEP] images [SEP] kingdom netherlands, nieuws uit west-indie - het vertrek van de &quot;johan maurits&quot; en de aankomst van de &qu..."
2,1,805,1_bioscoopjournaals_gepresenteerd_bepaalde_waarin,"[bioscoopjournaals, gepresenteerd, bepaalde, waarin, onderwerpen, nederlandse, netherlands, nsbpropagandafilm, kingdom, amsterdam]","[wielerwedstrijd [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] images [SEP] kingdom netherlands, modeshow [SEP] bioscoopjournaals waarin nederlandse onderwerp..."
3,2,202,2_subjects_certain_presented_newsreels,"[subjects, certain, presented, newsreels, dutch, text, of, amsterdam, the, netherlands]","[mother's day [SEP] newsreels dutch subjects certain presented [SEP] images [SEP] kingdom netherlands netherlands kingdom netherlands, huishoudbeurs [SEP] newsreels dutch subjects certain presente..."
4,3,135,3_ship_sailing_helder_ships,"[ship, sailing, helder, ships, boat, sea, den, the, shots, rotterdam]","[official opening of the repair ship ""wilhelmina"". [SEP] image report opening recovery boat wilhelmina gunboat converted rehabilitation home located omval amstel skipper husband pose deck gentleme..."


# Outlier Reduction

In [71]:
# 🔹 Reduce outliers using the "probabilities" strategy
print("🔄 Reducing outliers and reassigning topics...")
new_topics = topic_model.reduce_outliers(df["docs"].tolist(), topics, probabilities=probs, strategy="c-tf-idf", threshold=0.01)

🔄 Reducing outliers and reassigning topics...


In [72]:
# 🔹 Update topics in BERTopic model
print("🔄 Updating topics with reassigned outliers...")
topic_model.update_topics(df["docs"].tolist(), topics=new_topics)



🔄 Updating topics with reassigned outliers...


In [73]:
# 🔹 Print topic overview
# topic_model.get_topic_info().to_csv("../data/topic_model_info.csv", index=False)
topic_model.get_topic_info().head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,2316,0_sep_images_van_de,"[sep, images, van, de, kingdom, netherlands, het, quot, rotterdam, in]","[tewaterlating van de &quot;jacob van heemskerck&quot; [SEP] [SEP] images [SEP] kingdom netherlands, nieuws uit west-indie - het vertrek van de &quot;johan maurits&quot; en de aankomst van de &qu..."
1,1,957,1_bioscoopjournaals_bepaalde_gepresenteerd_waarin,"[bioscoopjournaals, bepaalde, gepresenteerd, waarin, onderwerpen, nederlandse, netherlands, amsterdam, kingdom, sep]","[wielerwedstrijd [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] images [SEP] kingdom netherlands, modeshow [SEP] bioscoopjournaals waarin nederlandse onderwerp..."
2,2,327,2_subjects_certain_presented_newsreels,"[subjects, certain, presented, newsreels, dutch, the, amsterdam, netherlands, of, kingdom]","[mother's day [SEP] newsreels dutch subjects certain presented [SEP] images [SEP] kingdom netherlands netherlands kingdom netherlands, huishoudbeurs [SEP] newsreels dutch subjects certain presente..."
3,3,151,3_ship_sailing_helder_ships,"[ship, sailing, helder, ships, boat, den, sea, vlissingen, shots, rotterdam]","[official opening of the repair ship ""wilhelmina"". [SEP] image report opening recovery boat wilhelmina gunboat converted rehabilitation home located omval amstel skipper husband pose deck gentleme..."
4,4,156,4_dogs_animal_dog_shots,"[dogs, animal, dog, shots, pigeon, animals, eggs, div, cattle, horses]",[tracking dog in the service of the police [SEP] demonstration police dogs haarlem shots policemen walking round police dogs sheepdogs div cus dogs demonstration dog pulling man bike staying offic...


In [74]:
df.head(1)

Unnamed: 0,Title,Description,Video URL,Europeana URL,Date,Subjects,Places,Provider,docs,Topic,Topic_Probability
0,Wielerwedstrijden In Het Vondelpark,bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd,https://www.openbeelden.nl/media/1005287,https://www.europeana.eu/item/2051906/data_euscreenXL_https___www_openbeelden_nl_media_1005287?utm_source=api&utm_medium=api&utm_campaign=illecible,1923-09-21,road bicycle racing images road bicycle racing sports competition sports competition,kingdom netherlands netherlands eurozone,Netherlands Institute for Sound & Vision,wielerwedstrijden in het vondelpark [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] road bicycle racing images road bicycle racing sports competition sports com...,30,0.332084


In [None]:
# 🔹 Add updated topics to the dataframe
df["Topic"] = new_topics



# SAVE

In [None]:
# 🔹 Save results
df.to_csv("../data/full_video_dataset_with_topics.csv", index=False)

In [76]:
# 🔹 Print topic overview
topic_model.get_topic_info().to_csv("../data/topic_model_info.csv", index=False)
topic_model.get_topic_info().head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,2316,0_sep_images_van_de,"[sep, images, van, de, kingdom, netherlands, het, quot, rotterdam, in]","[tewaterlating van de &quot;jacob van heemskerck&quot; [SEP] [SEP] images [SEP] kingdom netherlands, nieuws uit west-indie - het vertrek van de &quot;johan maurits&quot; en de aankomst van de &qu..."
1,1,957,1_bioscoopjournaals_bepaalde_gepresenteerd_waarin,"[bioscoopjournaals, bepaalde, gepresenteerd, waarin, onderwerpen, nederlandse, netherlands, amsterdam, kingdom, sep]","[wielerwedstrijd [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] images [SEP] kingdom netherlands, modeshow [SEP] bioscoopjournaals waarin nederlandse onderwerp..."
2,2,327,2_subjects_certain_presented_newsreels,"[subjects, certain, presented, newsreels, dutch, the, amsterdam, netherlands, of, kingdom]","[mother's day [SEP] newsreels dutch subjects certain presented [SEP] images [SEP] kingdom netherlands netherlands kingdom netherlands, huishoudbeurs [SEP] newsreels dutch subjects certain presente..."
3,3,151,3_ship_sailing_helder_ships,"[ship, sailing, helder, ships, boat, den, sea, vlissingen, shots, rotterdam]","[official opening of the repair ship ""wilhelmina"". [SEP] image report opening recovery boat wilhelmina gunboat converted rehabilitation home located omval amstel skipper husband pose deck gentleme..."
4,4,156,4_dogs_animal_dog_shots,"[dogs, animal, dog, shots, pigeon, animals, eggs, div, cattle, horses]",[tracking dog in the service of the police [SEP] demonstration police dogs haarlem shots policemen walking round police dogs sheepdogs div cus dogs demonstration dog pulling man bike staying offic...


In [77]:
# Method 1 - safetensors
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("../data/model", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)


# Evaluate and choose


In [84]:
df_topic_info  = pd.read_csv("../data/topic_model_info_v2_relevant.csv")

In [85]:
df_topic_info .head(10)

Unnamed: 0,Topic,Count,Name,Representation,Trans,Relevant,Representative_Docs
0,0,2316,0_sep_images_van_de,"['sep', 'images', 'van', 'de', 'kingdom', 'netherlands', 'het', 'quot', 'rotterdam', 'in']","['sep', 'images', 'van', 'de', 'kingdom', 'netherlands', 'het', 'quot', 'rotterdam', 'in']",1,"['tewaterlating van de &quot;jacob van heemskerck&quot; [SEP] [SEP] images [SEP] kingdom netherlands', 'nieuws uit west-indie - het vertrek van de &quot;johan maurits&quot; en de aankomst van de ..."
1,1,957,1_bioscoopjournaals_bepaalde_gepresenteerd_waarin,"['bioscoopjournaals', 'bepaalde', 'gepresenteerd', 'waarin', 'onderwerpen', 'nederlandse', 'netherlands', 'amsterdam', 'kingdom', 'sep']","['newsreels', 'certain', 'presented', 'in which', 'subjects', 'dutch', 'netherlands', 'amsterdam', 'kingdom', 'sep']",0,"['wielerwedstrijd [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] images [SEP] kingdom netherlands', 'modeshow [SEP] bioscoopjournaals waarin nederlandse onderw..."
2,2,327,2_subjects_certain_presented_newsreels,"['subjects', 'certain', 'presented', 'newsreels', 'dutch', 'the', 'amsterdam', 'netherlands', 'of', 'kingdom']","['subjects', 'certain', 'presented', 'newsreels', 'dutch', 'the', 'amsterdam', 'netherlands', 'of', 'kingdom']",0,"[""mother's day [SEP] newsreels dutch subjects certain presented [SEP] images [SEP] kingdom netherlands netherlands kingdom netherlands"", 'huishoudbeurs [SEP] newsreels dutch subjects certain prese..."
3,3,151,3_ship_sailing_helder_ships,"['ship', 'sailing', 'helder', 'ships', 'boat', 'den', 'sea', 'vlissingen', 'shots', 'rotterdam']","['ship', 'sailing', 'helder', 'ships', 'boat', 'den', 'sea', 'vlissingen', 'shots', 'rotterdam']",1,"['official opening of the repair ship ""wilhelmina"". [SEP] image report opening recovery boat wilhelmina gunboat converted rehabilitation home located omval amstel skipper husband pose deck gentlem..."
4,4,156,4_dogs_animal_dog_shots,"['dogs', 'animal', 'dog', 'shots', 'pigeon', 'animals', 'eggs', 'div', 'cattle', 'horses']","['dogs', 'animal', 'dog', 'shots', 'pigeon', 'animals', 'eggs', 'div', 'cattle', 'horses']",0,['tracking dog in the service of the police [SEP] demonstration police dogs haarlem shots policemen walking round police dogs sheepdogs div cus dogs demonstration dog pulling man bike staying offi...
5,5,148,5_football_association_voetbalwedstrijd_stadion,"['football', 'association', 'voetbalwedstrijd', 'stadion', 'nederland', 'sep', 'sports', 'het', 'images', 'cue']","['football', 'association', 'football match', 'stadium', 'netherlands', 'sep', 'sports', 'it', 'images', 'cue']",0,"['voetbalwedstrijd [SEP] [SEP] association football images [SEP] kingdom netherlands', 'voetbalwedstrijd holland-belgië [SEP] [SEP] association football images association football [SEP] amsterd..."
6,6,130,6_bridge_train_railway_steam,"['bridge', 'train', 'railway', 'steam', 'station', 'construction', 'shots', 'lock', 'the', 'locomotive']","['bridge', 'train', 'railway', 'steam', 'station', 'construction', 'shots', 'lock', 'the', 'locomotive']",1,['first coal train from germany crosses the meuse at roermond [SEP] mr wg sonsbeeck queens commissioner limburg opens new railway bridge maas near buggenum train german coal enters netherlands bri...
7,7,115,7_equestrian_sport_hippique_concours,"['equestrian', 'sport', 'hippique', 'concours', 'cross', 'country', 'horse', 'amersfoort', 'jachtvereniging', 'sep']","['equestrian', 'sport', 'equestrian', 'competition', 'cross', 'country', 'horse', 'amersfoort', 'hunting club', 'sep']",0,"['cross-country [SEP] [SEP] equestrian sport images [SEP] kingdom netherlands', 'concours hippique [SEP] [SEP] equestrian event equestrian sport images equestrian sport [SEP] bussum', 'cross-cou..."
8,8,111,8_racing_bicycle_race_assen,"['racing', 'bicycle', 'race', 'assen', 'races', 'cycling', 'road', 'auto', 'tt', 'start']","['racing', 'bicycle', 'race', 'assen', 'races', 'cycling', 'road', 'auto', 'tt', 'start']",0,"['internationale wielerwedstrijden [SEP] [SEP] road bicycle racing images road bicycle racing [SEP] kingdom netherlands netherlands eurozone', 'wereldkampioenschappen op de weg [SEP] [SEP] road ..."
9,9,69,9_japanse_propagandafilm_propaganda_japans,"['japanse', 'propagandafilm', 'propaganda', 'japans', 'indonesische', 'japan', 'place', 'indonesia', 'journaal', 'java']","['japanese', 'propaganda film', 'propaganda', 'japanese', 'indonesian', 'japan', 'place', 'indonesia', 'news', 'java']",1,"['japanse industrieen [SEP] japanse propagandafilm film japanse vvv fabricage souvenirs [SEP] porcelain images propaganda [SEP] japan', 'djagalah tanah djawa (bewaak java) [SEP] japanse propaganda..."


In [86]:
# 🔹 Extract lists of relevant and non-relevant topics
relevant_topics = df_topic_info[df_topic_info["Relevant"] == 1]["Topic"].tolist()
non_relevant_topics = df_topic_info[df_topic_info["Relevant"] == 0]["Topic"].tolist()

# 🔹 Split dataset based on topic relevance
df_1 = df[df["Topic"].isin(relevant_topics)]
df_0 = df[df["Topic"].isin(non_relevant_topics)]

# 🔹 Save each dataframe separately
df_1.to_csv("../data/full_video_dataset_relevant.csv", index=False)
df_0.to_csv("../data/full_video_dataset_non_relevant.csv", index=False)

print(f"✅ Saved {len(df_1)} relevant records to 'full_video_dataset_relevant.csv'")
print(f"✅ Saved {len(df_0)} non-relevant records to 'full_video_dataset_non_relevant.csv'")

✅ Saved 3593 relevant records to 'full_video_dataset_relevant.csv'
✅ Saved 2828 non-relevant records to 'full_video_dataset_non_relevant.csv'
