In [3]:
import re
from pathlib import Path
import pymupdf
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
path = Path("data")
sub_path_list = [x for x in path.iterdir() if x.is_dir()]
file_list = []
for sub_path in sub_path_list:
    for x in sub_path.iterdir():
        if str(x).endswith("pdf"):
            file_list.append(x)

In [5]:

pattern = re.compile(
    r'(?i)abstract[:\s]*(.+?)(?=(?:\n?\s*(?:\d+\s*\.|introduction|keywords|1\s)))',
     re.DOTALL                  
)

abstracts = []
for pdf in file_list:
    doc = pymupdf.open(pdf)
    page_0 = doc[0]
    text_0 = page_0.get_text()
    text_0 = re.sub(r'\s+',' ',text_0).strip()
    match = pattern.search(text_0)
    if match:
        abstract = match.group(1).strip()
    else:
        abstract = None
    abstracts.append(abstract)
cleanabs = [x for x in abstracts if str(x) != "None"]

In [8]:
len(cleanabs)

151

In [9]:
my_stop_words = ["et", 'locations','location','models',"al", "study", "paper", "research", "introduction", "method","data","model",'map','tasks','task','maps']
all_stop_words = list(ENGLISH_STOP_WORDS) + my_stop_words
vectorizer_model = CountVectorizer(stop_words = all_stop_words)
embedding_model = "pritamdeka/S-Scibert-snli-multinli-stsb"
topic_model = BERTopic(
    verbose=True,
    min_topic_size=5,
    vectorizer_model = vectorizer_model,
    embedding_model=embedding_model,
    )
topics, probabilities = topic_model.fit_transform(cleanabs)

2025-10-22 09:59:30,469 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 5/5 [00:02<00:00,  1.88it/s]
2025-10-22 09:59:35,606 - BERTopic - Embedding - Completed ✓
2025-10-22 09:59:35,606 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-22 09:59:41,801 - BERTopic - Dimensionality - Completed ✓
2025-10-22 09:59:41,803 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-22 09:59:41,823 - BERTopic - Cluster - Completed ✓
2025-10-22 09:59:41,830 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-22 09:59:41,867 - BERTopic - Representation - Completed ✓


In [10]:
topic_info = topic_model.get_topic_info()
print(topic_info)

   Topic  Count                                Name  \
0     -1     17  -1_trajectory_privacy_income_urban   
1      0     54  0_population_synthetic_agent_based   
2      1     27             1_land_use_change_cover   
3      2     14   2_learning_llms_language_networks   
4      3     13    3_urban_game_historical_encoding   
5      4     11           4_changes_land_period_use   
6      5      9   5_historical_landscape_land_cover   
7      6      6     6_trafﬁc_urban_policies_bicycle   

                                      Representation  \
0  [trajectory, privacy, income, urban, hyper, op...   
1  [population, synthetic, agent, based, simulati...   
2  [land, use, change, cover, changes, markov, fu...   
3  [learning, llms, language, networks, geographi...   
4  [urban, game, historical, encoding, analysis, ...   
5  [changes, land, period, use, czech, landscape,...   
6  [historical, landscape, land, cover, topograph...   
7  [trafﬁc, urban, policies, bicycle, effects, ro...   


In [36]:
len(cleanabs)

151

In [35]:
topic_model.get_document_info(cleanabs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,The city-building game Cities: Skylines simula...,6,6_game_encoding_urban_methods,"[game, encoding, urban, methods, vgi, embeddin...",[Shape vectorization is a key stage of the dig...,game - encoding - urban - methods - vgi - embe...,0.973994,True
1,. Statistical modeling is a powerful tool for ...,2,2_agent_modeling_abms_simulation,"[agent, modeling, abms, simulation, based, cri...",[. Agent-based simulation is an alternative ap...,agent - modeling - abms - simulation - based -...,0.829782,False
2,We present an individual-centric agent-based m...,-1,-1_based_macau_hyper_agent,"[based, macau, hyper, agent, issues, microsimu...",[—Most of the machine learning models have ass...,based - macau - hyper - agent - issues - micro...,0.000000,False
3,Transport planning strategies regard cycling p...,7,7_urban_income_trafﬁc_segregation,"[urban, income, trafﬁc, segregation, sprawl, p...",[Urban traffic is a system always prone to ove...,urban - income - trafﬁc - segregation - sprawl...,1.000000,True
4,Topographic maps are powerful tools for the pu...,5,5_historical_land_landscape_cover,"[historical, land, landscape, cover, topograph...",[Many studies of land-cover and structural cha...,historical - land - landscape - cover - topogr...,1.000000,False
...,...,...,...,...,...,...,...,...
146,"three groups of populations: U.S. Corn Belt, n...",4,4_changes_land_period_czech,"[changes, land, period, czech, landscape, use,...",[Central European landscapes have undergone ma...,changes - land - period - czech - landscape - ...,0.425805,False
147,Agent-based models have gained traction in exp...,-1,-1_based_macau_hyper_agent,"[based, macau, hyper, agent, issues, microsimu...",[—Most of the machine learning models have ass...,based - macau - hyper - agent - issues - micro...,0.000000,True
148,With recent advancements in natural language p...,3,3_llms_learning_language_spatial,"[llms, learning, language, spatial, networks, ...",[Spatial representation learning (SRL) aims at...,llms - learning - language - spatial - network...,0.786562,True
149,Reconstructing historical land-use and land-co...,0,0_land_use_change_cover,"[land, use, change, cover, changes, markov, la...",[Land use and land cover change research has b...,land - use - change - cover - changes - markov...,0.754658,False


In [34]:
topic_model.visualize_hierarchy()

In [33]:
topic_model.visualize_barchart()

In [32]:
topic_model.visualize_topics()