In [2]:
import labels 
import pandas as pd
import numpy as np

from bertopic import BERTopic
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import torch

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import MaximalMarginalRelevance

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))

cuda available: True
gpu: NVIDIA GeForce RTX 4060 Laptop GPU


In [None]:
# load data, tag duplicates and keywords
keywords = ["synthetic population" , "artificial population" , "temporary population" ,  "daytime population" , "mobile population" ,  "service population" , "floating population" , "elusive population" ,  "ambient population" , "seasonal population" , "non-resident population" , "real-time population" , "spatiotemporal population" ,  "spatial temporal population" , "spatial-temporal population" ,  "spatio temporal population" , "visiting population" , "visitor population" ,  "dynamic population" , "commuting population" , "commuter population" , "nighttime population" , "projected population" , "de facto population" , "agent-based population" , "agent based population"]
df = labels.duplicates('savedrecs_populations.xlsx',['Title','DOI/ISBN'])
df = labels.keywords(df,['Title','Abstract'],keywords)
df_withoutDup = df[df['duplicate'] == False]
all_abstract = df_withoutDup[~((df_withoutDup['Abstract'].isna()) | (df_withoutDup['Abstract'] == '[No abstract available]'))]
marked_abstract= all_abstract[(all_abstract['Rel DS'] == 1) | ((all_abstract['Rel LH'] == 1))]

In [3]:
def helper(topic_df: pd.DataFrame,origin_df: pd.DataFrame, col_name:str):
    """
    A helper function to combine the topic column from topic mining to orignial dataframe.

    Args:
        topic_df: The dataframe from topic mining.
        origin_df: The original dataframe for topic mining.
        col_name: The name of coloum that contains topics in the origin dataframe
    
    Returns:
        orginal dataframe with topic coloumn
    """
    topic_df['Document'] = topic_df['Document'].apply(lambda x : x.split("[SEP]")[0])
    topic_df = topic_df.set_index('Document')
    origin_df = origin_df.join(topic_df[['Name','Representation']],on='Title',how='left')
    origin_df = origin_df.rename(columns = {"Name":col_name,'Representation':col_name+" fullWords"})

    return origin_df

# Embedding model set up

In [5]:
# Embedding model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
model.load_adapter("allenai/specter2", source="hf", load_as="specter2",set_active=True)
model.to(device)
model.eval()

Using device: cuda


Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]
There are adapters available but none are activated for the forward pass.


BertAdapterModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttentionWithAdapters(
              (query): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (shared_parameters): ModuleDict()
                (loras): ModuleDict()
              )
              (key): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (shared_parameters): ModuleDict()
                (loras): ModuleDict()
              )
              (value): LoRALinearTorch(
             

In [6]:
def text_embedding(df):
# Prepare text batch
    text_batch = []
    for i, item in df.iterrows():
        text_batch.append(str(item['Title']) + tokenizer.sep_token + str(item['Abstract']))
        
    all_embeddings = []
    batch_size =   64
    with torch.no_grad():
        for start in range(0, len(text_batch), batch_size):
            batch = text_batch[start:start + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                return_token_type_ids=False,
                max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            output = model(**inputs)
            batch_emb = output.last_hidden_state[:, 0, :].cpu()
            all_embeddings.append(batch_emb)

    embeddings = torch.cat(all_embeddings, dim=0)
    embeddings = embeddings.detach().cpu().numpy()
    return text_batch, embeddings

In [None]:
# Embeddings for all docs
all_text_batch, all_embeddings = text_embedding(all_abstract)
np.save('all_embeddings.npy', all_embeddings)

In [None]:
# Embeddings for marked docs
marked_text_batch, marked_embeddings = text_embedding(marked_abstract)
np.save('marked_embeddings.npy', marked_embeddings)

# Bertopic setup

In [35]:
def bertopic(text_batch, embeddings,min_cluster_size):
    # embeddinggs
    # Dimensionality reduction model
    umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, random_state=42, metric="cosine")

    #Clustering model
    hdbscan_model = HDBSCAN(
        min_cluster_size = min_cluster_size, 
        min_samples = 5,
        prediction_data = True,
        cluster_selection_method ='eom'

        )

    # Verctorization model
    ## Stop words
    my_stop_words = ['rights','reserved','population','data','population','synthetic','elsevier','bv','model',
                    'populations','study','models','results','using','floating','analysis','based','paper','china',
                    'spatial','used','areas','use','different','dynamic','social','simulation','approach','proposed'
                    ]

    all_stop_words = list(ENGLISH_STOP_WORDS) + my_stop_words

    vectorizer_model = CountVectorizer(stop_words = all_stop_words, ngram_range=(1,2))

    # Representation Model
    representation_model = MaximalMarginalRelevance(diversity=0.8)
    # Bertopic pipeline
    topic_model = BERTopic(
        verbose=True,                 # track each stage of model
        umap_model = umap_model,         # dimension reduction
        hdbscan_model= hdbscan_model,    # clustering
        vectorizer_model = vectorizer_model, #vectorization
        representation_model= representation_model, # topic representation
        calculate_probabilities=True
        )
    # Train
    topic_model.fit_transform(documents=text_batch, embeddings = embeddings)
    return topic_model

In [None]:
# Tokenizer and reprensetation model for outlier reduction, which are them same with topic miningmodel
my_stop_words = ['rights','reserved','population','data','population','synthetic','elsevier','bv','model',
                    'populations','study','models','results','using','floating','analysis','based','paper','china',
                    'spatial','used','areas','use','different','dynamic','social','simulation','approach','proposed'
                    ]

all_stop_words = list(ENGLISH_STOP_WORDS) + my_stop_words

vectorizer_model = CountVectorizer(stop_words = all_stop_words, ngram_range=(1,2))
representation_model = MaximalMarginalRelevance(diversity=0.8)

# Topic mining of All titles with abstracts

In [17]:
all_embeddings = np.load('all_embeddings.npy')
all_topic_model = bertopic(all_text_batch, all_embeddings,min_cluster_size=30)
all_topic_info = all_topic_model.get_topic_info()
all_topic_info

2026-01-09 10:06:42,963 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-09 10:06:47,785 - BERTopic - Dimensionality - Completed ✓
2026-01-09 10:06:47,785 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-09 10:06:48,230 - BERTopic - Cluster - Completed ✓
2026-01-09 10:06:48,234 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-09 10:06:50,250 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1374,-1_urban_development_migration_housing,"[urban, development, migration, housing, citie...",[The recent demographic dynamics in the Sierra...
1,0,515,0_travel_agents_agentbased_household,"[travel, agents, agentbased, household, synthe...",[How to generate micro-agents? A deep generati...
2,1,303,1_algorithm_optimization_algorithms_search,"[algorithm, optimization, algorithms, search, ...",[A Particle Swarm Optimization Algorithm with ...
3,2,280,2_urban_distribution_mobile_phone,"[urban, distribution, mobile, phone, density, ...",[Research on the Correlation between the Dynam...
4,3,243,3_species_seasonal_dynamics_sea,"[species, seasonal, dynamics, sea, females, ab...",[Density and climate influence seasonal popula...
5,4,222,4_epidemic_disease_covid19_transmission,"[epidemic, disease, covid19, transmission, spr...",[Paleoepidemiological Considerations of Mobili...
6,5,209,5_water_supply_groundwater_demand,"[water, supply, groundwater, demand, water sup...",[Assessment of surface water and groundwater s...
7,6,195,6_stars_stellar_mass_planets,"[stars, stellar, mass, planets, galactic, form...",[The New Generation Planetary Population Synth...
8,7,161,7_urban_accessibility_city_mobility,"[urban, accessibility, city, mobility, service...",[Future access to essential services in a grow...
9,8,158,8_mobile_language_migration_political,"[mobile, language, migration, political, migra...",[International migration and liberal democraci...


In [None]:
# Reduce outliers 
all_topics = all_topic_model.reduce_outliers(
    all_text_batch, 
    all_topic_model.topics_, 
    strategy="c-tf-idf"
)
all_topic_model.update_topics(
    all_text_batch, 
    topics=all_topics,
    vectorizer_model = vectorizer_model, #vectorization
    representation_model= representation_model, # topic representation
)
all_topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,653,0_travel_agents_agentbased_information,"[travel, agents, agentbased, information, fram...",[How to generate micro-agents? A deep generati...
1,1,338,1_algorithm_optimization_algorithms_search,"[algorithm, optimization, algorithms, search, ...",[A Particle Swarm Optimization Algorithm with ...
2,2,419,2_urban_distribution_mobile_city,"[urban, distribution, mobile, city, density, n...",[Research on the Correlation between the Dynam...
3,3,342,3_species_seasonal_dynamics_abundance,"[species, seasonal, dynamics, abundance, sea, ...",[Density and climate influence seasonal popula...
4,4,257,4_disease_epidemic_transmission_covid19,"[disease, epidemic, transmission, covid19, spr...",[Paleoepidemiological Considerations of Mobili...
5,5,230,5_water_supply_demand_groundwater,"[water, supply, demand, groundwater, water sup...",[Assessment of surface water and groundwater s...
6,6,198,6_stars_stellar_mass_planets,"[stars, stellar, mass, planets, galactic, form...",[The New Generation Planetary Population Synth...
7,7,268,7_urban_accessibility_city_housing,"[urban, accessibility, city, housing, public, ...",[Future access to essential services in a grow...
8,8,239,8_mobile_migration_language_mobility,"[mobile, migration, language, mobility, articl...",[International migration and liberal democraci...
9,9,257,9_migrants_rural_migrant_migration,"[migrants, rural, migrant, migration, settleme...",[Understanding the role of housing in rural mi...


In [None]:
# Save the topic list of all docs
all_topic_info = all_topic_model.get_topic_info()
all_topic_info.to_excel("FullAbstracts_Topics.xlsx")

In [21]:
all_topic_model.visualize_hierarchy()

In [22]:
all_topic_model.visualize_topics()

In [None]:
# Label the original dataframe with topic for each doc
all_topic_df = all_topic_model.get_document_info(all_text_batch)
df_withTopic = helper(all_topic_df,df,'Topic')

# Topic mining of Marked titles with abstracts

In [27]:
marked_embeddings = np.load("marked_embeddings.npy")
marked_topic_model = bertopic(marked_text_batch, marked_embeddings,10)
marked_topic_info = marked_topic_model.get_topic_info()
marked_topic_info

2026-01-09 10:22:15,372 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-09 10:22:15,794 - BERTopic - Dimensionality - Completed ✓
2026-01-09 10:22:15,794 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-09 10:22:15,806 - BERTopic - Cluster - Completed ✓
2026-01-09 10:22:15,809 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-09 10:22:15,906 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,87,-1_crime_urban_mobile_mobile phone,"[crime, urban, mobile, mobile phone, phone, ne...",[Daily mobility practices through mobile phone...
1,0,82,0_travel_households_household_activity,"[travel, households, household, activity, synt...",[Synthesizing Population for Microsimulation-B...
2,1,41,1_urban_migration_migrants_patterns,"[urban, migration, migrants, patterns, city, e...",[Modeling the spatial distribution of urban po...
3,2,34,2_projections_fertility_demographic_growth,"[projections, fertility, demographic, growth, ...",[Reverse survival method of fertility estimati...
4,3,32,3_epidemic_disease_network_contact,"[epidemic, disease, network, contact, spread, ...",[Synthesis of a high resolution social contact...
5,4,26,4_tourism_daytime_diurnal_household,"[tourism, daytime, diurnal, household, visitor...",[Assessing patterns of spatial behavior in hea...
6,5,26,5_mobile_distribution_estimates_human,"[mobile, distribution, estimates, human, studi...",[Dynamic population mapping using mobile phone...
7,6,21,6_exposure_risk_distribution_evacuation,"[exposure, risk, distribution, evacuation, tsu...",[Towards improved risk assessment: Mapping the...
8,7,20,7_distribution_chemical_wastewater_loads,"[distribution, chemical, wastewater, loads, la...",[Systematic and day-to-day effects of chemical...
9,8,16,8_network_artificial_service_agents,"[network, artificial, service, agents, process...",[Artificial society-oriented large-scale road ...


In [34]:
marked_topic_model.visualize_hierarchy()

In [None]:
# Reduce outliers 
marked_topics = marked_topic_model.reduce_outliers(
    marked_text_batch, 
    marked_topic_model.topics_, 
    strategy="c-tf-idf"
)
marked_topic_model.update_topics(
    marked_text_batch, 
    topics=marked_topics,
    vectorizer_model = vectorizer_model, #vectorization
    representation_model= representation_model, # topic representation
)
marked_topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,106,0_travel_households_household_activity,"[travel, households, household, activity, micr...",[Synthesizing Population for Microsimulation-B...
1,1,51,1_urban_migration_city_migrants,"[urban, migration, city, migrants, patterns, e...",[Modeling the spatial distribution of urban po...
2,2,44,2_projections_growth_fertility_method,"[projections, growth, fertility, method, demog...",[Reverse survival method of fertility estimati...
3,3,40,3_epidemic_disease_hiv_network,"[epidemic, disease, hiv, network, contact, spr...",[Synthesis of a high resolution social contact...
4,4,31,4_tourism_commuting_daytime_diurnal,"[tourism, commuting, daytime, diurnal, househo...",[Assessing patterns of spatial behavior in hea...
5,5,44,5_mobile_phone_mobile phone_distribution,"[mobile, phone, mobile phone, distribution, de...",[Dynamic population mapping using mobile phone...
6,6,27,6_exposure_risk_distribution_evacuation,"[exposure, risk, distribution, evacuation, ass...",[Towards improved risk assessment: Mapping the...
7,7,23,7_water_distribution_chemical_wastewater,"[water, distribution, chemical, wastewater, la...",[Systematic and day-to-day effects of chemical...
8,8,16,8_network_artificial_service_agents,"[network, artificial, service, agents, process...",[Artificial society-oriented large-scale road ...
9,9,15,9_microdata_census_small_acs,"[microdata, census, small, acs, microsimulatio...",[The enhancement of spatial microsimulation mo...


In [30]:
marked_topic_df = marked_topic_model.get_document_info(marked_text_batch)

In [31]:
topic_info = marked_topic_model.get_topic_info()
topic_info.to_excel("MarkedAbstracts_Topics.xlsx")

In [32]:
df_withTopic = helper(marked_topic_df,df_withTopic,'Topic2')

In [33]:
df_withTopic.to_excel('synpop_withTopics.xlsx')