# Necessary dependencies

In [None]:
import os
import requests
import time
import pandas as pd
import re
from pathlib import Path
import pymupdf
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from umap import UMAP
from bertopic.representation import KeyBERTInspired


  from .autonotebook import tqdm as notebook_tqdm


# Data Preprocessing

In [30]:
papers_dict= pd.read_excel(io = r"prisma_synpop.xlsx",sheet_name=['prisma',"WoS_extended_search"])

In [31]:
papers = pd.concat([papers_dict['prisma'][['Title','Findability  (DOI)']],papers_dict['WoS_extended_search'][['Title','Findability  (DOI)']]])

In [32]:
papers = papers.dropna()
doilist = papers['Findability  (DOI)'].to_list()

In [None]:
def reconstruct_abstract(inverted_index):
    """
    Converts an OpenAlex-style inverted index into a plain text abstract.
    """
    if not inverted_index:
        return None
    
    # Create a list of the correct size
    max_index = max([pos for positions in inverted_index.values() for pos in positions])
    abstract_list = [''] * (max_index + 1)
    
    # Populate the list with words
    for word, positions in inverted_index.items():
        for pos in positions:
            abstract_list[pos] = word
            
    # Join the words to form the abstract
    return ' '.join(abstract_list)


all_abstract = []
doi_list = doilist


headers = {
    'User-Agent': 'AbstractFetcher/1.0 (mailto:@mail.muni.cz)'
}

for doi in doi_list:
    try:
        url = f"https://api.openalex.org/works/doi:{doi}"
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        data = response.json()
        
        # Get the inverted index
        inverted_abstract = data.get('abstract_inverted_index')
        
        if inverted_abstract:
            # Reconstruct the plain text abstract
            abstract_text = reconstruct_abstract(inverted_abstract)

        else:
            abstract_text = f"Abstract not found in OpenAlex record"

    except requests.exceptions.RequestException as e:
        abstract_text = f"Error fetching data: {e}"
    
    all_abstract.append({
        'doi': doi,
        'abstract': abstract_text,
    })


In [None]:
abstract_df = pd.DataFrame(all_abstract)

In [None]:
abstract_df.to_excel('abstracts.xlsx')

In [None]:
"""
path = Path("data")
sub_path_list = [x for x in path.iterdir() if x.is_dir()]
file_list = []
for sub_path in sub_path_list:
    for x in sub_path.iterdir():
        if str(x).endswith("pdf"):
            file_list.append(x)
""" 

In [None]:
"""
pattern = re.compile(
    r'(?i)abstract[:\s]*(.+?)(?=(?:\n?\s*(?:\d+\s*\.|introduction|keywords|1\s)))',
     re.DOTALL                  
)

abstracts = []
for pdf in file_list:
    doc = pymupdf.open(pdf)
    page_0 = doc[0]
    text_0 = page_0.get_text()
    text_0 = re.sub(r'\s+',' ',text_0).strip()
    match = pattern.search(text_0)
    if match:
        abstract = match.group(1).strip()
    else:
        abstract = None
    abstracts.append(abstract)
cleanabs = [x for x in abstracts if str(x) != "None"]
"""

# Topic modelling

- [ ] LLM for reprsention model   

In [26]:
abstract_df = pd.read_excel('abstracts.xlsx')

# Ensure a plain List[str] with no NaN/sentinels
docs = (
    abstract_df.loc[
        (abstract_df['abstract'] != 'Abstract not found in OpenAlex record'),
        'abstract'
    ].tolist()
)
docs = [s for s in docs if "Error fetching data:" not in s]

In [27]:
len(docs)

237

In [12]:


my_stop_words = ['synthesis','ipf','synthetic','population','data','activity','model','models','approach','framework','characteristics',"les", "et",'dataset']
all_stop_words = list(ENGLISH_STOP_WORDS) + my_stop_words
embedding_model = "pritamdeka/S-Scibert-snli-multinli-stsb"
umap_model = UMAP(n_neighbors=15, n_components=10, random_state=42)
vectorizer_model = CountVectorizer(stop_words = all_stop_words, ngram_range=(1,2))
representation_model = KeyBERTInspired()

topic_model = BERTopic(
    verbose=True,
    min_topic_size=10,
    #top_n_words = 3,
    vectorizer_model = vectorizer_model,
    embedding_model=embedding_model,
    umap_model = umap_model,
    representation_model=representation_model
    )

topics, probabilities = topic_model.fit_transform(docs)

2025-10-29 09:00:27,863 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 8/8 [00:04<00:00,  1.96it/s]
2025-10-29 09:00:34,286 - BERTopic - Embedding - Completed ✓
2025-10-29 09:00:34,286 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-29 09:00:34,683 - BERTopic - Dimensionality - Completed ✓
2025-10-29 09:00:34,685 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-29 09:00:34,690 - BERTopic - Cluster - Completed ✓
2025-10-29 09:00:34,691 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-29 09:00:35,844 - BERTopic - Representation - Completed ✓


In [13]:
topic_info = topic_model.get_topic_info()
print(topic_info)

   Topic  Count                                               Name  \
0     -1    108         -1_modeling_simulations_simulation_spatial   
1      0     43     0_simulation_mobility_case study_micromobility   
2      1     23  1_transportation_modeling_microsimulation_simu...   
3      2     20  2_transportation planning_travel demand_mobili...   
4      3     16                3_modeling_mobility_cities_patterns   
5      4     16  4_transport simulations_transport simulation_t...   
6      5     11  5_pollutant emissions_urban_environmental effe...   

                                      Representation  \
0  [modeling, simulations, simulation, spatial, s...   
1  [simulation, mobility, case study, micromobili...   
2  [transportation, modeling, microsimulation, si...   
3  [transportation planning, travel demand, mobil...   
4  [modeling, mobility, cities, patterns, measure...   
5  [transport simulations, transport simulation, ...   
6  [pollutant emissions, urban, environmental e

In [14]:
topic_model.get_topic(0)

[('simulation', np.float32(0.44926697)),
 ('mobility', np.float32(0.28835297)),
 ('case study', np.float32(0.28181857)),
 ('micromobility', np.float32(0.26874742)),
 ('city', np.float32(0.262754)),
 ('impacts', np.float32(0.25374272)),
 ('electric vehicles', np.float32(0.25269952)),
 ('traffic', np.float32(0.23784317)),
 ('results', np.float32(0.2322515)),
 ('estimated', np.float32(0.21476868))]

In [16]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Models of human mobility have broad applicabil...,-1,-1_modeling_simulations_simulation_spatial,"[modeling, simulations, simulation, spatial, s...","[In this study, an activity-based travel deman...",modeling - simulations - simulation - spatial ...,0.000000,False
1,Models of human mobility have broad applicabil...,-1,-1_modeling_simulations_simulation_spatial,"[modeling, simulations, simulation, spatial, s...","[In this study, an activity-based travel deman...",modeling - simulations - simulation - spatial ...,0.000000,False
2,Multi-agent models for simulating the mobility...,-1,-1_modeling_simulations_simulation_spatial,"[modeling, simulations, simulation, spatial, s...","[In this study, an activity-based travel deman...",modeling - simulations - simulation - spatial ...,0.000000,False
3,Increasing computing capability and high-resol...,-1,-1_modeling_simulations_simulation_spatial,"[modeling, simulations, simulation, spatial, s...","[In this study, an activity-based travel deman...",modeling - simulations - simulation - spatial ...,0.000000,False
4,The execution of agent-based microsimulation r...,1,1_transportation_modeling_microsimulation_simu...,"[transportation, modeling, microsimulation, si...",[The execution of agent-based microsimulation ...,transportation - modeling - microsimulation - ...,1.000000,True
...,...,...,...,...,...,...,...,...
232,Incorporating individual user preferences in s...,0,0_simulation_mobility_case study_micromobility,"[simulation, mobility, case study, micromobili...","[Micromobility refers to small, lightweight ve...",simulation - mobility - case study - micromobi...,0.695303,False
233,Reaction-diffusion mathematical modeling plays...,-1,-1_modeling_simulations_simulation_spatial,"[modeling, simulations, simulation, spatial, s...","[In this study, an activity-based travel deman...",modeling - simulations - simulation - spatial ...,0.000000,False
234,Due to air quality concerns and stricter carbo...,0,0_simulation_mobility_case study_micromobility,"[simulation, mobility, case study, micromobili...","[Micromobility refers to small, lightweight ve...",simulation - mobility - case study - micromobi...,0.999745,False
235,"We applied the PECAS Framework, a spatial econ...",-1,-1_modeling_simulations_simulation_spatial,"[modeling, simulations, simulation, spatial, s...","[In this study, an activity-based travel deman...",modeling - simulations - simulation - spatial ...,0.000000,False


In [17]:
topic_model.visualize_hierarchy()

In [18]:
topic_model.visualize_barchart()

In [19]:
topic_model.visualize_topics()