# BERT Topic modelling

### Library imports

#### TODO
- Enable separate saving of embeddings so that the model fitted on a GPU can be used on a CPU 

In [1]:
# Import libraries

import pandas as pd
import os
import sys
import logging
import time
import random
import datetime
import plotly.io as pio
import datapane as dp

from itables import init_notebook_mode, show

In [2]:
# Import BERTopic

from bertopic import BERTopic

### Configurations

In [3]:
%%capture
# Notebook-level logger. Individual cells can raise its level to logging.INFO
# (logger.setLevel(logging.INFO)) for additional informative output.
logger = logging.getLogger(__name__)

# Activate Interactive Tables for better dataframe visualisations: https://mwouts.github.io/itables/quick_start.html
# all_interactive = False: dataframes are only rendered as interactive tables
# when show(...) is called explicitly.
init_notebook_mode(all_interactive = False)

# Display the Plotly charts inside the notebook.
# Alternative renderer, kept for reference:
#pio.renderers.default = "iframe"
pio.renderers.default = "notebook_connected"


### Folders and file paths

In [11]:
# PARAMETERS

project = 'ml_in_medicine'  # the project folder (located under projects/)
docs_label = 'abstract'     # dataframe column that holds the documents to model

# Bibliography files, given without the .csv extension.
# Leave the list empty to read every CSV file found in the data directory:
# biblio_csv_files = []
biblio_csv_files = ['scopus_lens_ml_heart_all_abstract']

model_name = 'bertopic_ng_1_2_scopus_lens_ml_heart_all'

# Sub-folders of the project folder; each is appended directly to root_dir,
# hence the leading and trailing slashes.
data_dir = '/results/'
results_dir = '/results/'
model_dir = '/models/'

# ----------------------------

# When the notebook is started from the notebooks/ folder, move one level up
# so that the projects/ folder is reachable from the working directory.
launched_from = os.path.basename(os.getcwd())
if launched_from == 'notebooks':
    os.chdir('../')

working_dir = os.getcwd()
root_dir = working_dir + '/projects/' + project

print(f'Working directory: {root_dir}')

# Turn the bare file names into CSV file names
biblio_csv_files = [file_stem + '.csv' for file_stem in biblio_csv_files]

# Base name of the saved model: <model_name>_<docs_label>
model_file = '_'.join([model_name, docs_label])

Working directory: /Users/gilbert/Analyses/bibliometrics/projects/ml_in_medicine


### Read bibliography datasets
Read one or more CSV files into a single dataframe and extract the list of documents (the `docs_label` column) to be modelled.

In [12]:
# PARAMETERS

add_id_prefix = False   # prepend the 'id' column to each document
add_id_suffix = False   # append the 'id' column to each document (ignored when add_id_prefix is True)

# ----------------------------

# Get list of CSV files in directory when no file was listed explicitly
if len(biblio_csv_files) == 0:
    biblio_csv_files = [f for f in os.listdir(root_dir + data_dir) if f.endswith('.csv')]

all_f_df = []

# Read all CSV files into a single DataFrame
for f in biblio_csv_files:
    f_df = pd.read_csv(os.path.join(root_dir + data_dir, f))
    print(f'File: {os.path.basename(f)}, Size: {len(f_df)} rows')
    all_f_df.append(f_df)

biblio_df = pd.concat(all_f_df, ignore_index = True)

# Only rows that actually have a document get the id attached: concatenating a
# string id with a missing (NaN) document would raise a TypeError.
has_doc = biblio_df[docs_label].notna()

if add_id_prefix:
    biblio_df.loc[has_doc, docs_label] = biblio_df.loc[has_doc, 'id'] + ' ' + biblio_df.loc[has_doc, docs_label]
elif add_id_suffix:
#    biblio_df[docs_label] = biblio_df.apply(lambda x: x[docs_label] + ' ' + (x['id'].split('_')[0].lstrip('0') or '0'), axis = 1)
    biblio_df.loc[has_doc, docs_label] = biblio_df.loc[has_doc, docs_label] + ' ' + biblio_df.loc[has_doc, 'id']

print(f'Number of documents in the dataset: {len(biblio_df)}')

# Create the list of documents, leaving out the rows whose document is missing (NaN)
n_missing_docs = int(biblio_df[docs_label].isna().sum())
print(f'Removing {n_missing_docs} documents that are NAN...')
docs_all = biblio_df[docs_label].dropna().to_list()

display(docs_all[:5])


File: scopus_lens_ml_heart_all_abstract.csv, Size: 47168 rows
Number of documents in the dataset: 47168
Removing 1737 documents that are NAN...


['According to estimations made by World Health Organization heart disease is the largest cause of mortality throughout the globe and it is safe to assume that diagnosing heart diseases in their earliest stages is very essential. Diagnosis of cardiovascular disease may be carried out by detection of interference in cardiac signals one of which is called phonocardiography and it can be accomplished in a number of various ways. Using phonocardiogram PCG inputs and deep learning the researchers aim to develop a classification system for different types of heart illness. The slicing and normalization of the signal served as the first step in the studysignal preprocessing which was subsequently followed by a wavelet based transformation method that employs mother wavelet analytic morlet. The of the decomposition are first shown with the use of a scalogram afterwards they are utilized as input for the deep CNN. In this investigation the analyzed PCG signals were separated into categories den

In [13]:
# PARAMETERS

n_gram_range = (1, 2)  # n-gram range handed to BERTopic

# ----------------------------

# Documents to model: the complete list by default.
# For a quicker trial run, use a random subset instead:
#docs = random.sample(docs_all, 5000)
docs = docs_all

# Start the stopwatch
start_time = time.time()

# Build the BERTopic model and fit it on the documents.
# NOTE(review): no random seed is set in this cell; the fitted topics may
# differ between runs - confirm whether reproducibility is required.
bertopic_options = dict(
    language = "english",
    n_gram_range = n_gram_range,
    verbose = True,
    calculate_probabilities = True,
)
model = BERTopic(**bertopic_options)
topics, probs = model.fit_transform(docs)

# Report the time needed to fit the BERTopic model
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)


Batches:   0%|          | 0/1420 [00:00<?, ?it/s]

2023-05-01 11:51:42,984 - BERTopic - Transformed documents to Embeddings
2023-05-01 11:52:03,444 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Av

2023-05-01 11:58:16,177 - BERTopic - Clustered reduced embeddings


--- 5083.922977924347 seconds ---


### Save the BERTopic model

In [14]:
# PARAMETERS

check_save = True   # check whether the model was correctly saved
timestamping = True # append the current date and time to the file name

file_suffix = '_probs'
#file_suffix = '_suffix'

# ----------------------------

# Save the model

if timestamping:
    timestamp = '_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
else:
    timestamp = ''

# Build the path once so that saving and the reload check use the same file
model_save_path = root_dir + model_dir + model_file + file_suffix + timestamp + '.sav'

# Create the models folder if needed, so a long fit is not lost to a missing directory
os.makedirs(root_dir + model_dir, exist_ok = True)

model.save(model_save_path)
print(f'Model saved to {model_save_path}')

# Quick check that the model has been saved correctly.
# Note that `model` is replaced by the reloaded copy from here on.
# NOTE(review): the saved file is presumably a pickle - only load model files
# from trusted sources.
if check_save:
    model = BERTopic.load(model_save_path)
    # display(model.get_topic_info())



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



### Load the BERTopic model

In [None]:
# PARAMETERS

# Name of the saved model file, without the .sav extension.
# Earlier choices, kept for reference:
# model_load_file = 'bertopic_scopus_lens_ml_and_anomaly_titles_2023-03-30_10-40-38'
# model_load_file = 'bertopic_scopus_lens_ml_and_anomaly_abstracts_2023-03-30_11-44-11'
model_load_file = 'bertopic_scopus_lens_ml_and_anomaly_' + docs_label

# ----------------------------

# Load the topic model from the models folder of the project
model_load_path = root_dir + model_dir + model_load_file + '.sav'
model = BERTopic.load(model_load_path)

# Show the topic overview of the loaded model
loaded_topic_info = model.get_topic_info()
display(loaded_topic_info)


### Visualise and save results

In [10]:
# PARAMETERS
remove_title_suffix = False  # for titles, if they have a suffix then strip it

show_topic_info = True
show_doc_info = True
show_barchart = True

write_results_csv = True
write_results_xlsx = True

# Output file names without extension; an empty string skips that output
topic_info_file = 'bertopic_topic_info_ng_1_2_ml_heart_titles'
doc_info_file = 'bertopic_doc_info_ng_1_2_ml_heart_titles'
doc_topic_probs_csv = ''
merged_doc_topic_probs_csv = ''

timestamping = False

# ----------------------------

# Column names used for the document tables that are displayed and exported
doc_info_columns = {'Document': docs_label, 'Topic': 'tp_num', 'Name': 'tp_name',
                    'Top_n_words': 'top_n_words', 'Probability': 'prob',
                    'Representative_document': 'representative'}


def _save_results(df, out_dir, file_name, as_csv, as_xlsx):
    """Write a dataframe to out_dir as CSV and/or XLSX, without the index.

    Parameters:
        df: the dataframe to write.
        out_dir: folder path, ending with a path separator.
        file_name: output file name without extension.
        as_csv: write <file_name>.csv when True.
        as_xlsx: write <file_name>.xlsx when True.
    """
    if as_csv:
        print(f'Saving file {file_name}.csv ...')
        df.to_csv(out_dir + file_name + '.csv', index = False)
        print('DONE!')

    if as_xlsx:
        print(f'Saving file {file_name}.xlsx ...')
        df.to_excel(out_dir + file_name + '.xlsx', index = False)
        print('DONE!')


# One timestamp for the whole run, so that all the files written by this cell share it
if timestamping:
    timestamp = '_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
else:
    timestamp = ''

out_dir = root_dir + results_dir

if show_topic_info:
    topic_info_df = model.get_topic_info()
    display(topic_info_df)

    if topic_info_file:
        _save_results(topic_info_df, out_dir, topic_info_file + timestamp,
                      write_results_csv, write_results_xlsx)

if show_doc_info:
    doc_info_df = model.get_document_info(docs).rename(columns = doc_info_columns)

    # Strip the suffix before displaying, so that the table shown matches the files written
    if remove_title_suffix:
        doc_info_df[docs_label] = doc_info_df[docs_label].str.replace(r'\s\w+$', '', regex=True)

    display(doc_info_df)

    if doc_info_file:
        _save_results(doc_info_df, out_dir, doc_info_file + timestamp,
                      write_results_csv, write_results_xlsx)

if doc_topic_probs_csv and write_results_csv:
    # The rows of the probabilities table are the documents and the columns the topics.
    # The probabilities are read from the model rather than from the `probs` variable,
    # which only exists when the model was fitted in this session (not when loaded).
    _save_results(pd.DataFrame(model.probabilities_), out_dir, doc_topic_probs_csv + timestamp,
                  as_csv = True, as_xlsx = False)

if merged_doc_topic_probs_csv and (write_results_csv or write_results_xlsx):
    # Merge the document info with the document-topic probabilities (side by side).
    # A separate variable is used so that doc_info_df from above is left untouched.
    merged_doc_info_df = model.get_document_info(docs).rename(columns = doc_info_columns)
    doc_topic_probs_df = pd.DataFrame(model.probabilities_)
    bertopic_merged_df = pd.concat([merged_doc_info_df, doc_topic_probs_df], axis = 1)

    display(bertopic_merged_df)

    _save_results(bertopic_merged_df, out_dir, merged_doc_topic_probs_csv + timestamp,
                  write_results_csv, write_results_xlsx)

if show_barchart:
    fig = model.visualize_barchart(width = 280, height = 330, top_n_topics = 10, n_words = 10)
    fig.show()


Unnamed: 0,Topic,Count,Name
0,-1,16871,-1_and_of_the_cardiac
1,0,842,0_sleep_apnea_sleep apnea_obstructive sleep
2,1,482,1_heart sound_sound_sounds_heart sounds
3,2,407,2_arrhythmia_cardiac arrhythmia_arrhythmias_ar...
4,3,352,3_activity recognition_activity_human activity...
...,...,...,...
573,572,10,572_diabetes_diabetes using_of diabetes_multil...
574,573,10,573_with congenital_congenital heart_developme...
575,574,10,574_predictive modeling_large clustered_standa...
576,575,10,575_extracorporeal cardiopulmonary_cardiopulmo...


Saving file bertopic_topic_info_ng_1_2_ml_heart_titles.csv ...
DONE!
Saving file bertopic_topic_info_ng_1_2_ml_heart_titles.xlsx ...
DONE!


Unnamed: 0,title,tp_num,tp_name,top_n_words,prob,representative
0,1D convolutional neural network for detecting ...,-1,-1_and_of_the_cardiac,and - of - the - cardiac - in - for - with - n...,0.762688,False
1,2D wavelet encoded deep CNN for image based EC...,57,57_convolutional neural_convolutional_ecg_and ...,convolutional neural - convolutional - ecg - a...,0.063912,False
2,3D ECG display with deep learning approach for...,347,347_12 lead_12_lead_lead electrocardiograms,12 lead - 12 - lead - lead electrocardiograms ...,0.087248,False
3,3D echocardiogram reconstruction employing a f...,8,8_echocardiography_echocardiographic_in echoca...,echocardiography - echocardiographic - in echo...,0.035859,False
4,4D segmentation of the thoracic aorta from 4D ...,-1,-1_and_of_the_cardiac,and - of - the - cardiac - in - for - with - n...,0.972388,False
...,...,...,...,...,...,...
47163,strokes prevented: biosurveillance of NVAF pat...,-1,-1_and_of_the_cardiac,and - of - the - cardiac - in - for - with - n...,0.825005,False
47164,transcriptomic profiling maps anatomically pat...,248,248_single cell_cell_cell rna_rna sequencing,single cell - cell - cell rna - rna sequencing...,1.000000,False
47165,vNN web server for ADMET predictions,-1,-1_and_of_the_cardiac,and - of - the - cardiac - in - for - with - n...,0.261823,False
47166,variables from the CMS heart failure readmissi...,296,296_readmission_30 day_readmissions_30,readmission - 30 day - readmissions - 30 - day...,0.041601,False


Saving file bertopic_doc_info_ng_1_2_ml_heart_titles.csv ...
DONE!
Saving file bertopic_doc_info_ng_1_2_ml_heart_titles.xlsx ...
DONE!


In [None]:
# Recompute the topic representations with a CountVectorizer that uses the
# built-in English stop-word list, then redraw the topic bar chart.
# NOTE(review): update_topics modifies `model` in place, so every cell run after
# this one sees the updated topic words - confirm this is intended.
# NOTE(review): the vectorizer is created with its default n-gram settings; check
# whether it should use the same n_gram_range as the fitted model.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english")
model.update_topics(docs, vectorizer_model=vectorizer_model)
fig = model.visualize_barchart(width = 280, height = 330, top_n_topics = 10, n_words = 10)
fig.show()

In [None]:
# Look up topic 65 in the model's topic frequencies
model.get_topic_freq(65)

In [None]:
# Show BERTopic's topic visualisation for the current model
model.visualize_topics()

In [None]:
# Search the model's topics for the term "finance", asking for 20 results (top_n)
model.find_topics("finance", top_n = 20)

In [None]:
# Topic bar chart with top_n_topics = 60 and n_words = 10 (the results cell uses top_n_topics = 10)
model.visualize_barchart(width = 280, height = 330, top_n_topics = 60, n_words = 10)

In [None]:
# Browse the document info table of the model with itables' show()
show(model.get_document_info(docs))

In [None]:
# Minimal Plotly Express bar chart on dummy data - presumably a check that
# figures render with the configured renderer (TODO confirm)
import plotly.express as px
fig = px.bar(x=["a", "b", "c"], y=[1, 3, 2])
fig.show()

In [None]:
# Topic heatmap of the current model, with n_clusters = 20
model.visualize_heatmap(n_clusters = 20)

In [None]:
# Plot the documents for topics 0-49, using the custom topic labels.
# The documents passed are `docs`, the list the model is fitted on in this notebook;
# the previous argument, `titles`, is not defined anywhere in the notebook and
# raised a NameError on a fresh kernel.
model.visualize_documents(docs, topics = list(range(50)), custom_labels = True, height = 1000)

# Sandbox

In [None]:
# PARAMETERS
file_in = ''  # CSV file in data_dir to read; leave empty to read every CSV file in data_dir
write_merged_csv = False # a CSV file that merges all the files found in data_dir
file_merged_csv_out = 'scopus_merged'
logger.setLevel(logging.INFO)
# --------------------------------------

# Read Lens CSV files
if file_in == '': # read all CSV files in data_dir
    # Only CSV files are read (other files in the folder would break read_csv);
    # sorting makes the row order of the merged dataframe reproducible.
    csv_files = sorted(f for f in os.listdir(root_dir + data_dir) if f.endswith('.csv'))
    df_list = [pd.read_csv(root_dir + data_dir + f, on_bad_lines = 'skip') for f in csv_files]
    lens_df = pd.concat(df_list, ignore_index = True)

else: # read the single file specified in file_in
    lens_df = pd.read_csv(root_dir + data_dir + file_in, on_bad_lines = 'skip')

# Write merged file to CSV (with the original column names)
if write_merged_csv:
    lens_df.to_csv(root_dir + results_dir + file_merged_csv_out + '.csv', index = False)

# Rename and filter the columns of the dataframe
lens_df = lens_df.rename(columns = {'Author/s':'authors',
                                    'Title':'title',
                                    'Abstract':'abstract',
                                    'Publication Year':'year',
                                    'Source Title':'source',
                                    'Fields of Study':'fos',
                                    'Citing Works Count':'cited',
                                    'Keywords':'kws',
                                    'MeSH Terms':'mesh'})
lens_df = lens_df[['authors', 'title', 'abstract', 'year', 'source', 'fos', 'cited', 'kws', 'mesh']]

print(f'\nNumber of Scopus entries: {len(lens_df)}\n')

# Preview the dataframe only when the logger is at INFO level
if logger.getEffectiveLevel() == logging.INFO:
    display(lens_df.head())
