```python
from bertopic import BERTopic

documents = [
    "My cat is the cutest.",
    "Offer your cat premium food.",
    "The Empire State building is 1,250 feet tall",
]

topic_model = BERTopic()
topic, _ = topic_model.fit_transform(documents)
```

# Load and preprocess the data

In [14]:
import pandas as pd

# Read the curated theses export into memory; the bare expression on the last
# line makes the notebook render a preview of the frame.
raw_csv_path = "./data/theses-soutenues-curated.csv"
df_raw = pd.read_csv(raw_csv_path)
df_raw

Unnamed: 0,CI,year,oai_set_specs,titres.en,resumes.en,lang_res.en,topics.en,titres.fr,resumes.fr,lang_res.fr,topics.fr,swapped
0,CI-0,2010.0,ddc:004,"ViSaGe project : VisageFS, a filesystem with a...","Nowdays, the grid computing enables solutions ...",EN,POSIX (norme),"Projet ViSaGe : VisageFS, systèmes de fichiers...",Les grilles informatiques permettent d'envisag...,FR,Entrepôts de données||Langages de programmatio...,
1,CI-1,2012.0,ddc:570,Neural basis of glaucoma : a new approach comb...,Decreased visual motion sensitivity in early s...,EN,,Bases neuronales du glaucome : une approche co...,La diminution précoce de la sensibilité au mou...,FR,Poursuite oculaire||Glaucome à angle ouvert,
2,CI-2,2010.0,ddc:150,Richard Wagner and the Redemption's opera : co...,Richard Wagner's poetic and musical writing in...,EN,,Richard Wagner et l’Opéra de la Rédemption : c...,L’écriture poétique et musicale de Richard Wag...,FR,"Musique -- 19e siècle -- Thèmes, motifs||Psych...",
3,CI-3,2010.0,ddc:530,Investigation of temperature measurement of ma...,This work investigates the temperature measure...,EN,,Contribution à la mesure de température des ma...,Le cadre de ces travaux concerne la mesure de ...,FR,Thermométrie||Pyrométrie||Choc (mécanique)||Ma...,
4,CI-4,2012.0,ddc:796,Sociology of juvenile prison,Researches in social sciences that deal with t...,EN,,Faire sa peine à l'établissement pénitentiaire...,Les recherches en sciences sociales s'intéress...,FR,Centres pour jeunes délinquants||Détention des...,
...,...,...,...,...,...,...,...,...,...,...,...,...
164374,CI-164374,2011.0,ddc:330,Partial Ownerships and Competition,The literature is unanimous about the harmfuln...,EN,Concurrence,Prises de Participations et Concurrence,La littérature est unanime quand à la nocivité...,FR,Fusion d'entreprises||Offres publiques d'achat...,
164375,CI-164375,2017.0,ddc:530,New dynamics in doped fiber laser cavity : sel...,"Non-linear effects, which depend essentially o...",EN,,Nouvelles dynamiques en cavité laser à fibre d...,Les effets non-linéaires dépendant essentielle...,FR,Interféromètres||Techniques des impulsions (él...,
164376,CI-164376,2011.0,ddc:610,Vectorization of siRNA targeting RET/PTC1 jonc...,The papillary thyroid carcinoma (PTC) is the m...,EN,Nanoparticules,Vectorisation de siRNA dirigés contre l'oncogè...,Le cancer papillaire de la thyroïde (PTC) repr...,FR,Nanomédecine||Squalène||Cancérogenèse||Petit A...,
164377,CI-164377,2012.0,ddc:840,From the knowledge of China to the knowledge o...,Our research focuses on the relationships in V...,EN,"Segalen, Victor (1878-1919)",De la connaissance de la Chine à la connaissan...,La recherche s’attache aux rapports dans l'œuv...,FR,Études chinoises||Théologie||Herméneutique||Li...,


In [15]:
from transformers import AutoConfig 

# Print the maximum number of position embeddings declared in the Hugging Face
# config of each candidate embedding model.
candidate_models = (
    "sentence-transformers/all-MiniLM-L6-v2",
    "Alibaba-NLP/gte-multilingual-base",
)
for model_name in candidate_models:
    config = AutoConfig.from_pretrained(model_name, trust_remote_code = True)
    print(f"Context window size of the model {model_name}: {config.max_position_embeddings}")

Context window size of the model sentence-transformers/all-MiniLM-L6-v2: 512
Context window size of the model Alibaba-NLP/gte-multilingual-base: 8192


In [16]:
# Length (in characters) of each English and French abstract.
# `.str.len()` is the vectorised string accessor: unlike `.apply(len)` it does
# not raise TypeError on a missing abstract, it yields NaN for that row instead.
df_raw["resumes.en.len"] = df_raw["resumes.en"].str.len()
df_raw["resumes.fr.len"] = df_raw["resumes.fr"].str.len()

# Summary statistics of both length columns, shown as the cell output.
df_raw[["resumes.en.len", "resumes.fr.len"]].describe()

Unnamed: 0,resumes.en.len,resumes.fr.len
count,164379.0,164379.0
mean,1777.648082,1984.935119
std,735.027732,802.72081
min,1.0,6.0
25%,1324.0,1508.0
50%,1617.0,1702.0
75%,2080.0,2362.0
max,12010.0,12207.0


In [17]:
from numpy import logical_and

# Keep only the rows whose French and English abstract lengths both fall
# within [min_length, max_length]; `between` is inclusive on both ends.
min_length, max_length = 1000, 4000
length_columns = ["resumes.fr.len", "resumes.en.len"]
valid_index = logical_and.reduce(
    [df_raw[column].between(min_length, max_length) for column in length_columns]
)

df = df_raw.loc[valid_index, :]
print(f"Proportion of the dataset preserved: {100 * len(df) / len(df_raw):.0f} %")

Proportion of the dataset preserved: 89 %


In [18]:
stratification_column = "year"
samples_per_stratum = 500
random_seed = 42  # fixed so that re-running the cell draws the same sample

# Draw the same number of rows from every stratum.
# `GroupBy.sample` replaces `groupby(...).apply(lambda x: x.sample(...),
# include_groups=True)`: that pattern is deprecated in recent pandas versions
# and left `level_0` / `level_1` index columns that had to be dropped by hand.
# As before, a ValueError is raised if a stratum holds fewer rows than
# `samples_per_stratum`, and the result carries a fresh 0..n-1 index.
df_stratified = (
	df
	.groupby(stratification_column)
	.sample(n = samples_per_stratum, random_state = random_seed)
	.reset_index(drop = True)
)
print(f"Size of the dataset: {len(df_stratified)}")

Size of the dataset: 6500


In [19]:
# Write the stratified sample to disk; index=False leaves the row index out of the file.
stratified_csv_path = "./data/theses-soutenues-curated-stratified.csv"
df_stratified.to_csv(stratified_csv_path, index=False)

# Create a BERTopic instance, fit and transform

In [21]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from stopwordsiso import stopwords


language = "english" # or "french"

# Explicit ISO 639-1 codes instead of slicing the first two letters of the
# language name: the slice only happens to be right for "english" and "french"
# (e.g. "german"[:2] is "ge", not "de").
language_codes = {"english": "en", "french": "fr"}
if language not in language_codes:
    raise ValueError(
        f"Unsupported language {language!r}; expected one of {sorted(language_codes)}"
    )
language_short = language_codes[language] # "en" or "fr"

# Abstracts in the selected language, one document per thesis.
docs = df_stratified[f"resumes.{language_short}"] # "resumes.en" or "resumes.fr"

# Unigram bag-of-words with the stop words of the selected language removed;
# BERTopic uses it to build the topic representations.
vectorizer_model = CountVectorizer(
    stop_words = list(stopwords(language_short)),
    ngram_range = (1,1)
)

# NOTE: BERTopic documents `language` as unused when an explicit
# `embedding_model` is given, so switching `language` to "french" does not
# change the embedding model.
# NOTE(review): all-MiniLM-L6-v2 looks like an English-oriented model; a
# multilingual embedding model is presumably needed for the French abstracts -
# confirm before running with language = "french".
# NOTE(review): the saved error output of this cell mentions
# Alibaba-NLP/gte-multilingual-base, which is not the model configured here;
# the output appears stale and the cell should be re-run.
topic_model = BERTopic(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    language = language,
    vectorizer_model = vectorizer_model,
)

# BERTopic documents `documents` as a list of strings, so hand it a plain list
# rather than the pandas Series (`docs` itself is left unchanged).
topic_model.fit(documents=docs.tolist())

ValueError: Alibaba-NLP/new-impl You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-base.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.