# Topic Modeling

This notebook wants to explore the topic modeling possibilities of the summaries dataset. We will implement the model using Bertopic.

In [None]:
# Modules to import
import sys
import pandas as pd
import numpy as np

In [2]:
paths = ['../data','../scripts','../utils']
for path in paths:
    sys.path.append(path)

In [3]:
from dataLoader import loadDataframe

In [4]:
# Load data
path_to_directory = '../../data/cleanData/'
df_movies = loadDataframe('movies', path_to_directory)
df_summaries = loadDataframe('summaries', path_to_directory)

  df[columns_to_convert] = df[columns_to_convert].applymap(eval)


### Bertopic

In [2]:
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

In [6]:
# Load the spaCy model for English language
nlp = spacy.load("en_core_web_sm")

# Download stopwords and punkt tokenizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Load the English stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arnau\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arnau\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def preprocess_text(text):
    # Tokenisation
    doc = nlp(text.lower())  # Convert to lowercase
    
    # Lemmatisation and remove stopwords and punctuation
    processed_tokens = [
        token.lemma_ for token in doc if token.text not in stop_words and token.text not in string.punctuation
    ]
    
    char_to_remove = ["'s", " "]
    processed_tokens = [token for token in processed_tokens if token not in char_to_remove]
    
    return " ".join(processed_tokens)

In [None]:
# Apply the preprocessing to the summaries
tqdm.pandas()
df_cleaned_summaries = df_summaries.copy()
df_cleaned_summaries["cleaned_summary"] = df_cleaned_summaries["summary"].progress_apply(preprocess_text)

# Drop the original summary column
df_cleaned_summaries.drop(columns=["summary"], inplace=True)

  0%|          | 0/42303 [00:00<?, ?it/s]

100%|██████████| 42303/42303 [39:01<00:00, 18.07it/s]  


All the summaries are not in English, so we use the library `langdetect` to filter out the non-English summaries.

In [25]:
# Reconnaitre la langue de chaque summary
from langdetect import detect

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

df_cleaned_summaries["language"] = df_cleaned_summaries["cleaned_summary"].progress_apply(detect_language)

100%|██████████| 42303/42303 [06:38<00:00, 106.04it/s]


In [11]:
# Save the cleaned summaries
df_cleaned_summaries.to_csv("../../data/topicModelData/cleaned_summaries.csv", index=False)

In [3]:
# Load the cleaned summaries
df_cleaned_summaries = pd.read_csv("../../data/topicModelData/cleaned_summaries.csv")

In [4]:
pourcentage_english = df_cleaned_summaries["language"].value_counts(normalize=True)["en"] * 100
print('Pourcentage of summaries in English: {:.2f}%'.format(pourcentage_english))

Pourcentage of summaries in English: 99.53%


We can just decide to drop the non-English summaries or translate them to English. In this notebook, we will drop the non-English summaries.

In [5]:
df_cleaned_summaries = df_cleaned_summaries[df_cleaned_summaries["language"] == "en"]

In [6]:
docs = df_cleaned_summaries["cleaned_summary"].tolist()

In [7]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


1. Embedding

In [16]:
# Embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the summaries
embeddings = embedding_model.encode(docs, show_progress_bar=True)

Batches: 100%|██████████| 1316/1316 [42:06<00:00,  1.92s/it] 


In [17]:
#Save embeddings
np.save("embeddings.npy", embeddings)

In [8]:
embeddings = np.load("embeddings.npy")

2. UMAP

In [9]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

3. HDBSCAN

In [10]:
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

4. Vectorizer

In [11]:
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [25]:
# Create the model
topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=hdbscan_model, verbose=True)

# Fit the model
topics, probabilities = topic_model.fit_transform(docs)
df_cleaned_summaries['topic'] = topics

2024-11-25 20:55:32,024 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1316/1316 [32:49<00:00,  1.50s/it]
2024-11-25 21:28:24,458 - BERTopic - Embedding - Completed ✓
2024-11-25 21:28:24,458 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-25 21:28:48,963 - BERTopic - Dimensionality - Completed ✓
2024-11-25 21:28:48,965 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-25 21:28:58,777 - BERTopic - Cluster - Completed ✓
2024-11-25 21:28:58,783 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-25 21:29:05,729 - BERTopic - Representation - Completed ✓


In [26]:
# Save the model
topic_model.save("topic_model_min_cluster_size_50")



In [12]:
# Load the model
topic_model = BERTopic.load("topic_model")
topic_info = topic_model.get_topic_info()

ValueError: <class 'numpy.random._mt19937.MT19937'> is not a known BitGenerator module.

In [6]:
# Print the topics
topic_info = topic_model.get_topic_info()
print(topic_info)

     Topic  Count                                 Name  \
0       -1  25994                  -1_find_go_take_get   
1        0   1321                      0_ho_su_li_wong   
2        1   1200          1_mother_husband_year_child   
3        2    921               2_love_marry_singh_get   
4        3    897       3_murder_bank_detective_prison   
..     ...    ...                                  ...   
222    221     10    221_luther_halligan_brewster_jöns   
223    222     10      222_molly_lasch_dwayne_danielle   
224    223     10     223_japanese_chinese_troop_ahmad   
225    224     10  224_bardot_vadim_merteuil_madeleine   
226    225     10        225_pelikán_dato_dániel_zoran   

                                        Representation  \
0    [find, go, take, get, one, leave, tell, man, f...   
1    [ho, su, li, wong, jin, hong, master, eun, chi...   
2    [mother, husband, year, child, life, father, f...   
3    [love, marry, singh, get, father, raja, marria...   
4    [murder,

Our algorithm provides 227 different topics. We have to explore the topics to understand if they make sense and if some are redundant or useless.

In [7]:
# Visualise topics by dominant words
topic_model.visualize_barchart()

In [8]:
# Visualise topics on a map
topic_model.visualize_topics()

In [21]:
# Save the topics
df_cleaned_summaries.to_csv("../../data/topicModelData/summaries_with_topics.csv", index=False)

### Review topics

Merge similar topics

Drop non-informative topics

In [9]:
df_cleaned_summaries = pd.read_csv("../../data/topicModelData/summaries_with_topics.csv")
docs = df_cleaned_summaries["cleaned_summary"].tolist()

In [35]:
# Load new model
topic_model = BERTopic.load("topic_model_min_cluster_size_50")
topic_info = topic_model.get_topic_info()

In [53]:
topics = topic_model.get_topics()

In [36]:
# Print the topics
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,25785,-1_find_go_take_one,"[find, go, take, one, get, tell, leave, man, f...",[gary nightclub manager fly group woman new yo...
1,0,1845,0_love_get_father_marry,"[love, get, father, marry, come, go, family, k...",[geeta devi one daughter law prominent wealthy...
2,1,1292,1_ho_su_master_wong,"[ho, su, master, wong, li, hong, man, chinese,...",[film third chapter share story begin day wild...
3,2,1251,2_mother_father_family_life,"[mother, father, family, life, year, child, ol...",[bitter teen broken home send live remarried f...
4,3,1120,3_murder_police_prison_detective,"[murder, police, prison, detective, crime, ban...",[young man accuse murder master detective set ...
5,4,995,4_love_marry_family_get,"[love, marry, family, get, son, marriage, daug...",[kavitha work woman middle class family work h...
6,5,784,5_police_kill_car_find,"[police, kill, car, find, shoot, man, frank, g...",[film open man frank johnson walk dog city nig...
7,6,658,6_film_life_movie_story,"[film, life, movie, story, young, woman, follo...",[movie set around small group character experi...
8,7,584,7_kill_tokyo_find_take,"[kill, tokyo, find, take, samurai, conan, one,...",[ninja resurrection take place tokugawa era ti...
9,8,519,8_life_love_young_woman,"[life, love, young, woman, antonio, film, live...",[young man believe girlfriend cheat ex boyfrie...


Using the HDBSCAN model with a minimum cluster size of 50, we reduce the number of topicsfrom 225 to 50, which represents a more manageable number of topics. Now, can merge similar topics and drop non-informative topics.

In [38]:
topic_model.visualize_heatmap()

In [49]:
# Topics to merge (name, list of topics ids)
topic_model = BERTopic.load("topic_model_min_cluster_size_50")

topics_1 = ['Love and family',0,4,2,11,8]
topic_model.merge_topics(docs, topics_1[1:])

topics_2 = ['Crime and Police', 2,3]
topic_model.merge_topics(docs, topics_2[1:])

topics_3 = ['Nazi germany', 14,15]
topic_model.merge_topics(docs, topics_3[1:])
'''

topics_4 = ['Space and Aliens', 9, 12]
topics_5 = ['Pirates', 21, 30]
topics_6 = ['French Culture and Art', 28,47]'''

"\n\ntopics_4 = ['Space and Aliens', 9, 12]\ntopics_5 = ['Pirates', 21, 30]\ntopics_6 = ['French Culture and Art', 28,47]"

In [50]:
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,25785,-1_find_go_take_one,"[find, go, take, one, get, tell, leave, man, f...",[gary nightclub manager fly group woman new yo...
1,0,4997,0_love_father_get_marry,"[love, father, get, marry, family, go, life, s...",[retire commissioner ashwini kumar railway sta...
2,1,1904,1_police_kill_murder_find,"[police, kill, murder, find, man, car, get, ta...",[plot los angeles detectives graham water part...
3,2,1292,2_ho_su_master_li,"[ho, su, master, li, wong, hong, man, chinese,...",[film third chapter share story begin day wild...
4,3,658,3_film_life_movie_story,"[film, life, movie, story, young, woman, follo...",[film open commentary vice president mgm danie...
5,4,584,4_find_kill_take_tokyo,"[find, kill, take, tokyo, samurai, one, conan,...",[ninja resurrection take place tokugawa era ti...
6,5,490,5_earth_scientist_planet_alien,"[earth, scientist, planet, alien, human, space...",[distant future war human race alien know gami...
7,6,469,6_german_nazi_hitler_war,"[german, nazi, hitler, war, germany, von, camp...",[june 1941 ukrainian villager live peace schoo...
8,7,437,7_school_student_go_tell,"[school, student, go, tell, girl, friend, get,...",[four good friend western michigan high school...
9,8,361,8_earth_ship_planet_alien,"[earth, ship, planet, alien, space, human, des...",[year 2058 earth soon uninhabitable irreversib...
