In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
from nltk.corpus import stopwords
from collections import Counter
import fasttext
from HelperFunction import *

from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.corpora import Dictionary
from gensim.models import HdpModel
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from pprint import pprint
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from umap import UMAP
from hdbscan import HDBSCAN

import warnings
warnings.filterwarnings("ignore")

# 1. Data

## 1.1 Loading Data

In [2]:
# Read the raw rulebook table from disk, report its row count, and preview the first rows
rulebook_data_original = pd.read_csv('boardgames_with_rulebooks.csv')
print('Numebr of rows:', rulebook_data_original.shape[0])
rulebook_data_original.head()

Numebr of rows: 4727


Unnamed: 0,Name,Rulebook_Text
0,1001: Hezar o Yek Sab Règle,
1,1001 Islands,Game Designers:\nGGaammee ccoonntteennttss\nAn...
2,10′ to Kill Rulebook,"components Goal\nIn 10 minutes to kill, you ar..."
3,123 Go ! Règle Multilingue,No new trees were felled to create this produc...
4,123 Puzzle Rulebook,The material contained in the package is usefu...


In [3]:
# Summarise the raw table: column names, non-null counts and dtypes
rulebook_data_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4727 entries, 0 to 4726
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Name           4727 non-null   object
 1   Rulebook_Text  4196 non-null   object
dtypes: object(2)
memory usage: 74.0+ KB


## 1.2 Data Preparation

In [4]:
# Count the missing rulebook texts, then drop exactly those rows.
# The subset is explicit so the printed count always matches the rows removed;
# a bare dropna() would also discard rows that are null in any other column.
null = rulebook_data_original['Rulebook_Text'].isnull().sum()
print('Numebr of null:', null)
rulebook_data_no_null = rulebook_data_original.dropna(subset=['Rulebook_Text'])
print('Number of rows after removing nulls:', len(rulebook_data_no_null))

# Keep only rulebooks whose text has at least 30 whitespace-separated words
word_counts = rulebook_data_no_null['Rulebook_Text'].str.split().str.len()
rulebook_data_no_null = rulebook_data_no_null[word_counts >= 30]
print('Number of rows after removing short texts:', len(rulebook_data_no_null))

rulebook_data_no_null.head()

Numebr of null: 531
Number of rows after removing nulls: 4196
Number of rows after removing short texts: 4152


Unnamed: 0,Name,Rulebook_Text
1,1001 Islands,Game Designers:\nGGaammee ccoonntteennttss\nAn...
2,10′ to Kill Rulebook,"components Goal\nIn 10 minutes to kill, you ar..."
3,123 Go ! Règle Multilingue,No new trees were felled to create this produc...
4,123 Puzzle Rulebook,The material contained in the package is usefu...
5,"1, 2, 3… Trésor à Bâbord Règle",Spielanleitung · Instructions · Règle du jeu\n...


## 1.3 Data Cleaning

In [5]:
# Load the pre-trained fastText language-identification model.
# It gets its own name rather than the generic `model`, because a later cell
# rebinds `model` to a SentenceTransformer; sharing the name would make
# is_english_fasttext() fail if this filter were re-run after that cell.
lang_id_model = fasttext.load_model('lid.176.ftz')

def is_english_fasttext(text, min_confidence=0.8):
    """Return True when fastText labels `text` as English with enough confidence.

    Parameters
    ----------
    text : str
        Raw rulebook text; may contain newlines.
    min_confidence : float, default 0.8
        The top prediction must be '__label__en' with a confidence strictly
        greater than this value.

    Returns
    -------
    bool
        True if the text is confidently English, otherwise False (including
        when the text is empty after stripping).
    """
    # predict() works on one line at a time, so flatten the text first
    text = text.replace('\n', ' ').strip()
    if not text:
        # nothing to classify; avoid indexing into an empty prediction
        return False
    labels, confidences = lang_id_model.predict(text)
    return labels[0] == '__label__en' and confidences[0] > min_confidence

print('Number of rows before language filtering:', len(rulebook_data_no_null))
rulebook_data_no_null = rulebook_data_no_null[
    rulebook_data_no_null['Rulebook_Text'].apply(is_english_fasttext)]
print('Number of rows after removing non-english rows:', len(rulebook_data_no_null))

Number of rows before language filtering: 4152
Number of rows after removing non-english rows: 2709


In [6]:
# Vocabulary inputs for the cleaning step:
#   valid_words    - every whitespace token that occurs more than once across all rulebooks
#   artifact_words - fixed token list handed to clean_text_rulebooks together with valid_words
all_words = " ".join(rulebook_data_no_null['Rulebook_Text']).split()
word_frequencies = Counter(all_words)
valid_words = {word for word, count in word_frequencies.items() if count > 1}
artifact_words = [
    'cid', 'n', 'e', 'player', 'game', 'action', 'token', 'card',
    'place', 'turn', 'may', 'one', 'tile', 'point', 'board', 'space',
]

In [7]:
# Set up lemmatizer and stopwords
# NLTK's English stopword list, held as a set
stop_words = set(stopwords.words("english"))
# NOTE(review): WordNetLemmatizer is not imported by name in the import cell;
# presumably it arrives via `from HelperFunction import *` — TODO confirm and import it explicitly
lemmatizer = WordNetLemmatizer()

In [8]:
print('Number of rows before cleaning:', len(rulebook_data_no_null))
# Work on a copy so the language-filtered frame is left untouched
rulebook_data_cleaned = rulebook_data_no_null.copy()

# Run the cleaning helper over every raw rulebook text; the extra positional
# arguments are forwarded to clean_text_rulebooks after the text itself
rulebook_data_cleaned['clean_rulebook'] = rulebook_data_cleaned['Rulebook_Text'].apply(
    clean_text_rulebooks,
    args=(lemmatizer, stop_words, valid_words, artifact_words),
)
print('Number of rows after cleaning:', len(rulebook_data_cleaned))
rulebook_data_cleaned

Number of rows before cleaning: 2709
Number of rows after cleaning: 2709


Unnamed: 0,Name,Rulebook_Text,clean_rulebook
1,1001 Islands,Game Designers:\nGGaammee ccoonntteennttss\nAn...,designer contents bauza bruno cathala artist c...
2,10′ to Kill Rulebook,"components Goal\nIn 10 minutes to kill, you ar...",component goal minute kil hitman goal kil targ...
4,123 Puzzle Rulebook,The material contained in the package is usefu...,material contained package useful familiarizin...
6,12 Gangsters Rulebook,Your gangster wants to steal the same amount a...,gangster want steal amount bos gangster sucesf...
7,13 Clues Rulebook,"For 2-6 players, ages 8 and up\nLondon, 1899: ...",age london city shaken heinous crime solution ...
...,...,...,...
4720,Zone-A: Le Secret de Tchernobyl Rulebook,I NST RUCT IONS\nINTRODUCTION\nGAMEPLAY\nDETAI...,nst ion introduction gameplay detailed tragic ...
4721,Zooloretto Junior Rulebook,"GAME SET-UP\n• With 3 players, remove the anim...",setup remove animal ofspring two kind animal r...
4722,Zooloretto Rulebook,For 2 to 5 players ages 8 and up\nOVERVIEW\nEa...,age overview zo owner score atracting many vis...
4723,Zooloretto: The Dice Game Rulebook,"Game end\nShould one player, after passing, ha...",end pasing either filed enclosure left single ...


In [6]:
# Cached copy of the cleaned data: the commented-out line below writes the cleaned
# frame to an Excel file (without the index); the active line reads that file back in.
#rulebook_data_cleaned.to_excel('rulebook_data_cleaned.xlsx', index=False)
# NOTE(review): this rebinds rulebook_data_cleaned to the on-disk copy, so the file
# must already exist; the reloaded frame shows a fresh 0-based index in the preview.
rulebook_data_cleaned = pd.read_excel('rulebook_data_cleaned.xlsx')
print('Number of rows:', len(rulebook_data_cleaned))
rulebook_data_cleaned.head()

Number of rows: 2709


Unnamed: 0,Name,Rulebook_Text,clean_rulebook
0,1001 Islands,Game Designers:\nGGaammee ccoonntteennttss\nAn...,designer contents bauza bruno cathala artist c...
1,10′ to Kill Rulebook,"components Goal\nIn 10 minutes to kill, you ar...",component goal minute kil hitman goal kil targ...
2,123 Puzzle Rulebook,The material contained in the package is usefu...,material contained package useful familiarizin...
3,12 Gangsters Rulebook,Your gangster wants to steal the same amount a...,gangster want steal amount bos gangster sucesf...
4,13 Clues Rulebook,"For 2-6 players, ages 8 and up\nLondon, 1899: ...",age london city shaken heinous crime solution ...


# 2. Topic Modelling

## 2.1 Sentence Embeddings and K-Means Clustering

In [10]:
# Generate one sentence embedding per cleaned rulebook
# NOTE(review): all-MiniLM-L6-v2 truncates long inputs (its model card lists a
# 256 word-piece limit), so only the start of each rulebook is likely embedded — confirm
model = SentenceTransformer("all-MiniLM-L6-v2")  # or other pre-trained models
embeddings = model.encode(rulebook_data_cleaned["clean_rulebook"].tolist())

# Cluster the embeddings; random_state is fixed for repeatable assignments
kmeans = KMeans(n_clusters=8, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Assign clusters to rulebooks
rulebook_data_cleaned["Cluster"] = labels

# Print the first five texts of each cluster. Iterate over the number of
# clusters actually fitted: the previous hard-coded range(10) printed two
# empty clusters (8 and 9) that KMeans never produces.
for cluster in range(kmeans.n_clusters):
    print(f"Cluster {cluster}:")
    print(rulebook_data_cleaned[rulebook_data_cleaned["Cluster"] == cluster]["clean_rulebook"].head(5))

Cluster 0:
1      designer contents bauza bruno cathala artist c...
75     content setup midle table shufle exploration f...
135    lim dig site island began simple map said lead...
177    folow lead famous navigator discoverer new lan...
286    around world content rule year min content tra...
Name: clean_rulebook, dtype: object
Cluster 1:
14    new component overview puting dice dice used p...
16    sumer frantic race reach south pole first tok ...
24    age duration minute piece number intermediate ...
38    tricky shape search age familiar use either si...
39    overview special example played low playing ca...
Name: clean_rulebook, dtype: object
Cluster 2:
17    minute year old botleger acquired establishmen...
53    bauza give lever stand move earth content wond...
57    great wal special rule stage great wal built o...
58    adventurous race around world inspired diary e...
99    important make sure take joker asistant played...
Name: clean_rulebook, dtype: object
Cluster 3:
4  

## 2.2 Non-Negative Matrix Factorization (NMF)

In [11]:
# Vectorise the cleaned rulebooks as TF-IDF weights, capped at 5,000 features
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(rulebook_data_cleaned["clean_rulebook"])

# Factorise into 8 components: W comes from fit_transform, H from components_
nmf_model = NMF(n_components=8, random_state=13)
W = nmf_model.fit_transform(tfidf_matrix)
H = nmf_model.components_

# For each row of H, list the 10 highest-weighted terms.
# argsort is ascending, so the strongest term is printed last.
words = vectorizer.get_feature_names_out()
for i, topic in enumerate(H):
    top_indices = topic.argsort()[-10:]
    top_terms = [words[j] for j in top_indices]
    print(f"Topic {i}: {top_terms}")

Topic 0: ['end', 'discard', 'score', 'draw', 'pile', 'hand', 'deck', 'round', 'play', 'character']
Topic 1: ['efect', 'overlord', 'campaign', 'damage', 'scenario', 'atack', 'encounter', 'enemy', 'quest', 'hero']
Topic 2: ['infantry', 'terain', 'comand', 'combat', 'enemy', 'army', 'atack', 'batle', 'hex', 'unit']
Topic 3: ['gain', 'track', 'victory', 'build', 'marker', 'coin', 'city', 'resource', 'worker', 'building']
Topic 4: ['equipment', 'weapon', 'zombicide', 'mision', 'dor', 'spawn', 'objective', 'zone', 'survivor', 'zombie']
Topic 5: ['atack', 'treasure', 'rom', 'level', 'combat', 'item', 'munchkin', 'dungeon', 'investigator', 'monster']
Topic 6: ['roled', 'value', 'field', 'color', 'property', 'round', 'number', 'rol', 'die', 'dice']
Topic 7: ['port', 'flet', 'sea', 'planet', 'captain', 'crew', 'treasure', 'island', 'pirate', 'ship']


## 2.3 Latent Dirichlet Allocation (LDA)

In [10]:
# Tokenize the text
# NOTE(review): neither `progress_map` (not a stock pandas method) nor `word_tokenizer`
# is defined in the visible cells; presumably both come from `from HelperFunction import *` — TODO confirm
rulebook_data_cleaned['token_rulebook'] = rulebook_data_cleaned['clean_rulebook'].progress_map(word_tokenizer)
# Preview the frame with the new 'token_rulebook' column
rulebook_data_cleaned.head()

100%|██████████| 2709/2709 [00:00<00:00, 5167.56it/s]


Unnamed: 0,Name,Rulebook_Text,clean_rulebook,token_rulebook
0,1001 Islands,Game Designers:\nGGaammee ccoonntteennttss\nAn...,designer contents bauza bruno cathala artist c...,"[designer, contents, bauza, bruno, cathala, ar..."
1,10′ to Kill Rulebook,"components Goal\nIn 10 minutes to kill, you ar...",component goal minute kil hitman goal kil targ...,"[component, goal, minute, kil, hitman, goal, k..."
2,123 Puzzle Rulebook,The material contained in the package is usefu...,material contained package useful familiarizin...,"[material, contained, package, useful, familia..."
3,12 Gangsters Rulebook,Your gangster wants to steal the same amount a...,gangster want steal amount bos gangster sucesf...,"[gangster, want, steal, amount, bos, gangster,..."
4,13 Clues Rulebook,"For 2-6 players, ages 8 and up\nLondon, 1899: ...",age london city shaken heinous crime solution ...,"[age, london, city, shaken, heinous, crime, so..."


In [11]:
# Collect the per-rulebook token lists; the length reported below is therefore
# the number of documents, not the number of individual words
data_words = rulebook_data_cleaned['token_rulebook'].tolist()
len_data_words = len(data_words)
print("The length of the list of words is:", len_data_words)

The length of the list of words is: 2709


In [12]:
# Map every distinct token in 'data_words' to an integer id
id2word = corpora.Dictionary(data_words)
# The token lists themselves serve as the corpus texts
texts = data_words
# Bag-of-words encoding: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))
# Peek at the first 30 pairs of the first document
print(corpus[0][:30])

[(0, 7), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 2), (10, 2), (11, 1), (12, 2), (13, 1), (14, 3), (15, 9), (16, 2), (17, 2), (18, 2), (19, 1), (20, 1), (21, 1), (22, 2), (23, 6), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1)]


In [13]:
num_topics = 8

# Train the multicore LDA model on the bag-of-words corpus
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    iterations=400,
    random_state=42,
)

# Show the weighted keywords of each topic
topic_keywords = lda_model.print_topics()
pprint(topic_keywords)
# Apply the trained model to the corpus (kept for later use; not read in this cell)
doc_lda = lda_model[corpus]

[(0,
  '0.006*"move" + 0.006*"must" + 0.005*"hero" + 0.005*"rule" + 0.005*"first" + '
  '0.005*"play" + 0.005*"end" + 0.005*"new" + 0.005*"monster" + 0.005*"take"'),
 (1,
  '0.009*"play" + 0.007*"building" + 0.007*"round" + 0.006*"end" + '
  '0.006*"must" + 0.006*"take" + 0.006*"move" + 0.005*"first" + 0.005*"number" '
  '+ 0.005*"two"'),
 (2,
  '0.008*"unit" + 0.007*"must" + 0.006*"character" + 0.006*"play" + '
  '0.006*"number" + 0.006*"take" + 0.005*"deck" + 0.005*"monster" + '
  '0.005*"first" + 0.005*"use"'),
 (3,
  '0.008*"move" + 0.006*"take" + 0.006*"must" + 0.006*"round" + 0.006*"number" '
  '+ 0.005*"play" + 0.005*"dice" + 0.005*"use" + 0.005*"two" + 0.005*"deck"'),
 (4,
  '0.007*"take" + 0.007*"end" + 0.007*"play" + 0.006*"must" + 0.006*"deck" + '
  '0.005*"draw" + 0.005*"number" + 0.005*"efect" + 0.005*"move" + '
  '0.005*"marker"'),
 (5,
  '0.008*"round" + 0.007*"take" + 0.006*"play" + 0.006*"phase" + 0.006*"first" '
  '+ 0.006*"move" + 0.006*"end" + 0.006*"number" + 0.005

In [14]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the LDA model, its corpus and its dictionary
# NOTE(review): `corpus` is rebound by the HDP cell further down; if cells are run
# out of order, make sure it still pairs with `id2word` here
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(vis)

## 2.4 Hierarchical Dirichlet Process (HDP)

In [13]:
# split the 'clean_rulebook' column into whitespace tokens
texts = rulebook_data_cleaned['clean_rulebook'].str.split()

# build the gensim vocabulary and the bag-of-words corpus from those tokens
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]

# fit the HDP model; a topic counts as significant when its top word weight exceeds this threshold
threshold = 0.013
hdp_model = HdpModel(corpus=corpus, id2word=dictionary, random_state=7)

# fetch every topic as a formatted "weight*word + weight*word + ..." string
num_detected_topics = len(hdp_model.get_topics())
all_topics = hdp_model.print_topics(num_topics=num_detected_topics, num_words=10)

# keep the topics whose largest weight, parsed back out of the formatted string,
# is above the threshold
significant_topics = [
    (topic_id, topic_details)
    for topic_id, topic_details in all_topics
    if max(float(term.split("*")[0]) for term in topic_details.split(" + ")) > threshold
]

# report the surviving topics
print(f"Number of significant topics: {len(significant_topics)}")
for topic_id, topic_details in significant_topics:
    print(f"Topic {topic_id}: {topic_details}")

Number of significant topics: 8
Topic 3: 0.016*unit + 0.007*hex + 0.007*move + 0.006*batle + 0.006*god + 0.006*lore + 0.005*take + 0.005*round + 0.005*number + 0.005*must
Topic 6: 0.021*investigator + 0.020*monster + 0.014*gate + 0.010*location + 0.009*encounter + 0.007*check + 0.007*arkham + 0.006*ancient + 0.006*item + 0.006*move
Topic 25: 0.016*hero + 0.009*atack + 0.008*guild + 0.007*monster + 0.007*scenario + 0.004*inferno + 0.004*quest + 0.003*campaign + 0.003*move + 0.003*rulebok
Topic 27: 0.018*building + 0.006*money + 0.006*play + 0.005*rule + 0.005*wal + 0.004*extension + 0.004*new + 0.004*scoring + 0.004*module + 0.004*change
Topic 37: 0.014*investigator + 0.006*enemy + 0.005*deck + 0.005*location + 0.004*test + 0.003*copy + 0.003*play + 0.003*skil + 0.003*scenario + 0.002*fg
Topic 38: 0.015*unit + 0.005*lore + 0.005*hex + 0.004*deployment + 0.004*scenario + 0.003*step + 0.003*comand + 0.003*must + 0.003*terain + 0.003*atack
Topic 39: 0.015*technology + 0.009*enginer + 0.006

## 2.5 BERTopic

In [10]:
# Fix the Python and NumPy random seeds
SEED = 13
random.seed(SEED)
np.random.seed(SEED)

# Documents to model: the cleaned rulebook strings
documents = rulebook_data_cleaned['clean_rulebook'].tolist()

# UMAP with a pinned random state
umap_model = UMAP(n_neighbors=15, metric='cosine', random_state=SEED)

# Topic terms are counted over unigrams and bigrams
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

# nr_topics=9 asks BERTopic to reduce the number of topics;
# min_topic_size is not passed, so it stays at the library default
topic_model = BERTopic(
    nr_topics=9,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
)

# Fit the model and get a topic assignment per document
topics, probs = topic_model.fit_transform(documents)

# Topic overview table, followed by the interactive topic map
topic_info = topic_model.get_topic_info()
print(topic_info)
topic_model.visualize_topics()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marsone\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Topic  Count                               Name  \
0     -1   1024              -1_take_play_end_must   
1      0   1447              0_take_play_move_must   
2      1     73    1_dice_ingredient_number_potion   
3      2     55          2_property_bank_jail_rent   
4      3     45         3_detective_case_clue_lead   
5      4     22           4_child_animal_kid_piece   
6      5     19  5_asasin_location_enemy_character   
7      6     12            6_ghost_taoist_play_tao   
8      7     12   7_patient_doctor_nurse_treatment   

                                      Representation  \
0  [take, play, end, must, move, round, number, f...   
1  [take, play, move, must, number, end, first, t...   
2  [dice, ingredient, number, potion, rol, chip, ...   
3  [property, bank, jail, rent, pay, title, must,...   
4  [detective, case, clue, lead, character, inves...   
5  [child, animal, kid, piece, fort, puzle, suit,...   
6  [asasin, location, enemy, character, artefact,...   
7  [ghost, 

In [11]:
# Bar charts of topic terms from the fitted BERTopic model, requesting 10 words per topic
topic_model.visualize_barchart(n_words = 10)