In [3]:
import labels 
import pandas as pd

In [None]:
def helper(topic_df: pd.DataFrame,origin_df: pd.DataFrame, col_name:str):
    """
    A helper function to combine the topic column from topic mining to orignial dataframe.

    Args:
        topic_df: The dataframe from topic mining.
        origin_df: The original dataframe for topic mining.
        col_name: The name of coloum that contains topics in the origin dataframe
    """
    topic_df['Document'] = topic_df['Document'].apply(lambda x : x.split("[SEP]")[0])
    topic_df = topic_df.set_index('Document')
    origin_df = origin_df.join(topic_df['Name'],on='Title',how='left')
    origin_df = origin_df.rename(columns = {"Name":col_name})

    return origin_df

In [13]:
keywords = ["synthetic population" , "artificial population" , "temporary population" ,  "daytime population" , "mobile population" ,  "service population" , "floating population" , "elusive population" ,  "ambient population" , "seasonal population" , "non-resident population" , "real-time population" , "spatiotemporal population" ,  "spatial temporal population" , "spatial-temporal population" ,  "spatio temporal population" , "visiting population" , "visitor population" ,  "dynamic population" , "commuting population" , "commuter population" , "nighttime population" , "projected population" , "de facto population" , "agent-based population" , "agent based population"]
df = labels.duplicates('savedrecs_populations.xlsx',['Title','DOI/ISBN'])
df = labels.keywords(df,['Title','Abstract'],keywords)

In [14]:
df = df[df['duplicate'] == False]
df = df[~((df['Abstract'].isna()) | (df['Abstract'] == '[No abstract available]'))]

In [38]:
df= df[(df['Rel DS'] == 1) | ((df['Rel LH'] == 1))]

In [7]:
from bertopic import BERTopic
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import torch

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))

cuda available: True
gpu: NVIDIA GeForce RTX 4060 Laptop GPU


In [9]:
# Embedding model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
model.load_adapter("allenai/specter2", source="hf", load_as="specter2",set_active=True)
model.to(device)
model.eval()

Using device: cuda


Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]
There are adapters available but none are activated for the forward pass.


BertAdapterModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttentionWithAdapters(
              (query): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (shared_parameters): ModuleDict()
                (loras): ModuleDict()
              )
              (key): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (shared_parameters): ModuleDict()
                (loras): ModuleDict()
              )
              (value): LoRALinearTorch(
             

In [39]:
# Prepare text batch
text_batch = []
for i, item in df.iterrows():
    text_batch.append(str(item['Title']) + tokenizer.sep_token + str(item['Abstract']))
    
all_embeddings = []
batch_size =   64
with torch.no_grad():
    for start in range(0, len(text_batch), batch_size):
        batch = text_batch[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
            max_length=512,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        output = model(**inputs)
        batch_emb = output.last_hidden_state[:, 0, :].cpu()
        all_embeddings.append(batch_emb)

embeddings = torch.cat(all_embeddings, dim=0)

In [48]:
# embeddinggs
emb_np = embeddings.detach().cpu().numpy()
# Dimensionality reduction model
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, random_state=42, metric="cosine")

#Clustering model
hdbscan_model = HDBSCAN(
    min_cluster_size = 10, 
    min_samples = 5,
    prediction_data = True
    )

# Verctorization model
## Stop words
my_stop_words = ['rights','reserved','population','data','population','synthetic','elsevier','bv','model',
                 'populations','study','models','results','using','floating','analysis','based','paper','china',
                 'spatial','used','areas'
                 ]

all_stop_words = list(ENGLISH_STOP_WORDS) + my_stop_words

vectorizer_model = CountVectorizer(stop_words = all_stop_words, ngram_range=(1,3))

# Representation Model
representation_model = KeyBERTInspired()
# Bertopic pipeline
topic_model = BERTopic(
    verbose=True,                 # track each stage of model
    umap_model = umap_model,         # dimension reduction
    hdbscan_model= hdbscan_model,    # clustering
    vectorizer_model = vectorizer_model, #vectorization
    representation_model= None, # topic representation
    calculate_probabilities=True
    )
# Train
topics, probs = topic_model.fit_transform(documents=text_batch, embeddings = emb_np)


2026-01-07 16:34:57,642 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-07 16:34:58,071 - BERTopic - Dimensionality - Completed ✓
2026-01-07 16:34:58,072 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-07 16:34:58,084 - BERTopic - Cluster - Completed ✓
2026-01-07 16:34:58,086 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-07 16:34:58,243 - BERTopic - Representation - Completed ✓


In [49]:
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,87,-1_crime_urban_mobile_use,"[crime, urban, mobile, use, new, mobile phone,...",[Daily mobility practices through mobile phone...
1,0,82,0_travel_simulation_households_household,"[travel, simulation, households, household, us...",[Synthetic household travel survey data simula...
2,1,41,1_urban_migration_migrants_patterns,"[urban, migration, migrants, patterns, city, e...",[The spatial-temporal patterns of per capita s...
3,2,34,2_projections_fertility_demographic_growth,"[projections, fertility, demographic, growth, ...",[Reverse survival method of fertility estimati...
4,3,32,3_epidemic_disease_network_contact,"[epidemic, disease, network, contact, social, ...",[Epidemic Modelling: Validation of Agent-based...
5,4,26,4_tourism_daytime_diurnal_household,"[tourism, daytime, diurnal, household, visitor...",[Assessing patterns of spatial behavior in hea...
6,5,26,5_mobile_distribution_estimates_human,"[mobile, distribution, estimates, human, studi...",[Dynamic population mapping using mobile phone...
7,6,21,6_exposure_risk_distribution_evacuation,"[exposure, risk, distribution, evacuation, tsu...",[Towards improved risk assessment: Mapping the...
8,7,20,7_distribution_chemical_wastewater_loads,"[distribution, chemical, wastewater, loads, la...",[Systematic and day-to-day effects of chemical...
9,8,16,8_social_network_artificial_service,"[social, network, artificial, service, agents,...",[Artificial society-oriented large-scale road ...


In [34]:
topic_model.reduce_topics(text_batch, nr_topics=10)

2026-01-07 16:30:53,889 - BERTopic - Topic reduction - Reducing number of topics
2026-01-07 16:30:53,900 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-07 16:30:58,357 - BERTopic - Representation - Completed ✓
2026-01-07 16:30:58,370 - BERTopic - Topic reduction - Reduced number of topics from 29 to 10


<bertopic._bertopic.BERTopic at 0x16a784c0800>

In [182]:
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1199,-1_urban_time_management_different,"[urban, time, management, different, species, ...",[Climate change is associated with a higher ex...
1,0,1707,0_water_urban_use_distribution,"[water, urban, use, distribution, area, demand...",[A methodology to prioritize spatio-temporal m...
2,1,1255,1_migrants_migration_urban_social,"[migrants, migration, urban, social, rural, mi...",[Understanding the Gap Between De Facto and De...
3,2,550,2_dynamic_disease_dynamics_epidemic,"[dynamic, disease, dynamics, epidemic, covid19...",[Untangling the interplay between epidemic spr...
4,3,530,3_species_seasonal_dynamics_growth,"[species, seasonal, dynamics, growth, abundanc...",[Central-marginal population dynamics in speci...
5,4,303,4_algorithm_optimization_dynamic_algorithms,"[algorithm, optimization, dynamic, algorithms,...",[Dynamic topology multi force particle swarm o...
6,5,195,5_stars_stellar_mass_planets,"[stars, stellar, mass, planets, formation, gal...",[The New Generation Planetary Population Synth...
7,6,108,6_crime_ambient_street_theft,"[crime, ambient, street, theft, crimes, police...",[Assessing Crime History as a Predictor: Explo...
8,7,52,7_mobile_archaeological_early_late,"[mobile, archaeological, early, late, site, ag...",[Integrated remote sensing and excavation at D...
9,8,47,8_wastewater_normalization_sarscov2_treatment,"[wastewater, normalization, sarscov2, treatmen...",[Current state and future perspectives on de f...


In [185]:
topic_model.visualize_hierarchy()

In [184]:
topic_model.visualize_topics()