# Exploratory analysis of large bibliographic datasets

Large bibliographic datasets can be difficult to explore at a lower level. This Jupyter notebook provides **several tools**:
* Filtered (stemmed) keywords and associated keywords (HTML).
* Differentiated Scopus search and bibliographic keyword filtering and highlighting (HTML and Excel).
* Researcher, department, and country by number of publications in the full or filtered results.
* Topic model applied to the Scopus keywords.

Currently, this requires **Scopus bibliographic files in CSV** format as an input.

### Before you get started
* Open this notebook from the GitHub repository by clicking on the small 'Open in Colab' button in GitHub. This will open the notebook in your browser.
* On your Google Drive, create a folder 'Biblio Analysis' inside 'My Drive/Colab Notebooks'. Create two subfolders 'Input' and 'Output' inside of 'Biblio Analysis'.
* Save the notebook to 'Biblio Analysis' on your Google Drive by clicking on the menu 'File > Save a copy in Drive' and navigating to 'Biblio Analysis'. You can change the folder names and locations, but you will need to manually change the paths in the 'Folders and file paths' section below.
* Download the bibliographic files from Scopus as CSV. Make sure to check the following categories in the Export Document Settings in Scopus:
  * Citation information
  * Bibliographical information
  * Abstract & keywords
* Upload the Scopus files you want to analyse to the 'Input' folder. They will be bulk-read by the script (unless you specify a single file name in the script parameters).

#### TODO
- Add a module to analyse the topics.

### Imports

In [1]:
# Import libraries

import pandas as pd
import re
import os
import sys
import logging
import pycountry
import datetime
import ast

from IPython.display import display, HTML
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from openpyxl import Workbook
from openpyxl.styles import Font, colors, Alignment
from openpyxl.cell.text import InlineFont
from openpyxl.cell.rich_text import TextBlock, CellRichText
from keybert import KeyBERT

### Configurations

In [2]:
# Notebook-wide logger. Individual cells call logger.setLevel(logging.INFO)
# to switch on their additional informative output.
logger = logging.getLogger(name=__name__)

### Folders and file paths

In [3]:
# PARAMETERS

project = 'ml_in_engineering'  # the project folder

# ----------------------------


# When the kernel was started inside the 'notebooks' folder, move one level up
# so that the project paths below are resolved from the parent directory.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')

# Root directory of the project. Later cells append sub-paths such as
# '/results/' to this string, so it deliberately has no trailing slash.
root_dir = os.getcwd() + '/projects/' + project

print(f'Working directory: {root_dir}')

# Create the folder structure of the project. os.makedirs() also creates the
# intermediate folders (root_dir and root_dir + '/data'), and exist_ok=True
# makes the cell safe to re-run: folders that already exist are left untouched,
# while any sub-folder missing from an existing project is (re)created.
for sub_dir in ('/data/processed', '/data/raw_scopus', '/data/raw_lens', '/models', '/results'):
    os.makedirs(root_dir + sub_dir, exist_ok=True)


Working directory: /Users/gilbert/Analyses/bibliometrics/projects/ml_in_engineering


### Read bibliographic files
Reads previously pre-processed Scopus and Lens files. For raw Scopus or Lens files, it retains a subset of the columns and changes column names.

In [4]:
# PARAMETERS

biblio_source = 'processed'  # 'scopus' or 'lens' or 'processed' (previously processed and saved in one of the cells below)

cutoff = 0   # retain the first 'cutoff' rows of the dataset; set to zero for the full dataset

data_dir = '/results/'
results_dir = '/results/'

biblio_csv_files = []
biblio_csv_files = ['scopus_lens_ml_sim_engineering_all_st.csv'] # comment out to read all csv files in data_dir

logger.setLevel(logging.INFO)
# --------------------------------------

# Read all CSV files in the directory, otherwise those provided in the list biblio_csv_files.
# os.listdir() returns names in arbitrary order, so sort them to keep the row
# order of the concatenated DataFrame reproducible between runs.
if len(biblio_csv_files) == 0:
    biblio_csv_files = sorted(f for f in os.listdir(root_dir + data_dir) if f.endswith('.csv'))

# Fail with a clear message instead of the generic error that pd.concat()
# raises when it is handed an empty list.
if len(biblio_csv_files) == 0:
    raise ValueError(f"No CSV files found in '{root_dir + data_dir}'")

all_f_df = []

# Read all CSV files into a single DataFrame
print('Reading bibliographic files...')

for f in biblio_csv_files:
    f_df = pd.read_csv(os.path.join(root_dir + data_dir, f))
    print(f'File: {os.path.basename(f)}, Size: {len(f_df)} rows')
    all_f_df.append(f_df)

biblio_df = pd.concat(all_f_df, ignore_index = True)

if biblio_source == 'scopus':

    # Retain only the subset of columns used for Scopus data
    biblio_df = biblio_df[['author', 'author_id', 'affiliation', 'title', 'abstract', 'year', 'source', 'cited', 'kws_author', 'kws_index']]

    print(f'\nNumber of Scopus publications: {len(biblio_df)}\n')

elif biblio_source == 'lens':

    # Retain only the subset of columns used for Lens data
    biblio_df = biblio_df.loc[:,['id', 'title', 'abstract', 'kws', 'fos', 'year', 'source']]

    print(f'\nNumber of Lens publications: {len(biblio_df)}\n')

elif biblio_source == 'processed':  # a file that has been previously processed in one of the cells below

    # Convert the string representation of the search term lists to Python lists.
    # Each column is checked on its own so that a file containing only one of
    # the two columns does not raise a KeyError.
    for list_col in ('search_title', 'search_abs'):
        if list_col in biblio_df.columns:
            biblio_df[list_col] = biblio_df[list_col].apply(ast.literal_eval)

    print(f'\nNumber of publications: {len(biblio_df)}\n')

else:
    # ValueError is a subclass of Exception, so existing handlers keep working
    raise ValueError(f"The bibliographic source '{biblio_source}' does not exist. Only 'scopus', 'lens' and 'processed' are valid")

if cutoff > 0:
    biblio_df = biblio_df.head(cutoff)  # this is handy when working with larger dataframes
    print(f'The dataframe was cut off at row {cutoff - 1}')

# Preview the (possibly truncated) result once, for every source type
if logger.getEffectiveLevel() == logging.INFO:
    display(biblio_df.head())

print('DONE!')

Reading bibliographic files...
File: scopus_lens_ml_sim_engineering_all_st.csv, Size: 72967 rows

Number of publications: 72967



Unnamed: 0,id,title,year,abstract,kws,fos,source,lit_review,cited,authors,search_label,search_title,search_abs
0,000000_Wang_2023,2v2 air combat confrontation strategy based on...,2023,Aircraft cluster air combat scenario is a long...,"energy distributions,air combat,long sequences...",,Lecture Notes in Electrical Engineering,0,0,"Wang J., Zhu L., Yang H., Ji Y., Wang X.",scopus_ml_sim_subj_engineering,[reinforcement learning],"[simulation, reinforcement learning, rnn, mult..."
1,000001_Preethi_2023,3D echocardiogram reconstruction employing a f...,2023,Three dimensional 3D echocardiogram enables ca...,"3d ann patch matching,image reconstruction,fli...",Voxel; Computer science; Artificial intelligen...,Computer Systems Science and Engineering,0,0,"Preethi C., Mohamed Sathik M., Shajun Nisha S.","scopus_ml_sim_subj_engineering, lens_ml_sim_su...",[],"[ann, simulation]"
2,000002_Li_2023,3D ground penetrating radar cavity identificat...,2023,3D ground penetrating radar GPR is the main me...,,Ground-penetrating radar; Radar; Identificatio...,Measurement Science and Technology,0,0,Fanruo Li; Feng Yang; Xu Qiao; Wentai Xing; Ch...,lens_ml_sim_subj_engineering,[transfer learning],"[simulation, transfer learning]"
3,000003_Mehrpooya_2023,3D inverse synthetic aperture radar image qual...,2023,Generalisation of one-dimensional dictionary l...,"multidimensional data,generalisation,inverse s...",,"IET Radar, Sonar and Navigation",0,0,"Mehrpooya A., Karbasi S.M., Nazari M., Abbasi ...",scopus_ml_sim_subj_engineering,[],[simulation]
4,000004_Park_2023,3D off grid localization for adjacent cavitati...,2023,The propeller tip vortex cavitation TVC locali...,"off-grids,bayesian networks,noise source,bayes...",,Sensors,0,0,"Park M., Memon S.A., Kim G., Choo Y.",scopus_ml_sim_subj_engineering,[],[simulation]


DONE!


### Search term matches by title and abstract
Create two new columns that hold the search term matches for titles and abstracts, respectively.

#### TODO
- Remove rows where both search_title and search_abs are empty lists

In [None]:
# PARAMETERS

write_csv = False
file_csv_out = ''

logger.setLevel(logging.INFO)
# --------------------------------


'''
    Extract keywords from Lens search term
'''

# Dataset: 
# - [Anomaly]
# biblio_search_term = ''' 
#                     (( TITLE ( anomal* OR outlier OR novelt* ) OR KEY ( anomal* ) OR TITLE-ABS-KEY ( "anomaly detection" OR "detect anomalies" OR "detection of anomalies" OR "detection of point anomalies" OR "detection of collective anomalies" OR "detection of contextual anomalies" OR "outlier detection" OR "detection of outliers" OR "novelty detection" OR "detection of novelties" OR "event detection" OR "deviant discovery" OR "change point detection" ) ) AND TITLE-ABS-KEY ( "neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR transformer OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR "statistical learning" OR lstm OR "neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" ) AND TITLE-ABS-KEY ( time OR timeseries OR temporal OR duration OR frequency OR process ) )
#                   '''

# Dataset: 
# - [Practicum]
# biblio_search_term = '''
# TITLE-ABS-KEY ( "practice placement"  OR  practicum  OR  internship  OR  "work placement"  OR  "clinical experience"  OR  "professional placement"  OR  "clinical rotation"  OR  "practical training"  OR  "practice learning"  OR  "experiential learning" )  AND  ( TITLE-ABS-KEY ( stress  OR  depression  OR  anxiety  OR  burnout  OR  marginalisation  OR  exhaustion  OR  exclusionary  OR  unsupportive  OR  abusive )  OR  TITLE-ABS-KEY ( onboarding  OR  "on-boarding"  OR  orientation  OR  induction  OR  initiation  OR  "ramp-up"  OR  familiarisation  OR  welcome ) )  => 13,348 results
# '''

# I removed linear regression and logistic regression from the search term so that publications that contain only these and no other ML-related terms are removed from the dataset
# biblio_search_term = '''
#     ("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm OR "neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning")
# '''

# Dataset: 
# - ML+SIM in Engineering
# - ML+SIM in Engineering Reviews
# biblio_search_term = '''
# ( ( ( TITLE( simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method" ) OR KEY( simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  
"discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method" ) ) AND (TITLE( "neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR "statistical learning" OR lstm ) OR KEY( "neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR 
"statistical learning" OR lstm ) ) ) OR TITLE-ABS-KEY( "neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" ) )
# '''

# Dataset: 
# - ML in Medicine
# - ML in Medicine Reviews
# biblio_search_term = '''
# (title:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm) OR abstract:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm ) OR title:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine 
learning" OR "physics machine learning" ) OR abstract:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning"))
# '''

# Dataset: 
# - SIM for Heart Reviews
# biblio_search_term = '''
# (title:(*cardi* OR heart OR *ventric* OR coronary OR *atria* OR atrium ) OR keyword:(*cardi* OR heart OR *ventric* OR coronary OR *atria* OR atrium )) AND (title:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method") OR keyword:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  
"computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method") OR keyword:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method"))
# '''

# Dataset: 
# - ML+SIM for Heart
# biblio_search_term = '''
# (((title:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm) OR abstract:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm )) AND (title:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  
"computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method") OR abstract:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method"))) OR (title:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR 
"physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" ) OR abstract:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning"))) AND (title:(*cardi*  OR  heart  OR  *ventric*  OR  coronary  OR  *atria*  OR  atrium ) OR abstract:(*cardi*  OR  heart  OR  *ventric*  OR  coronary  OR  *atria*  OR  atrium )) 
# '''

# Dataset: 
# - ML Heart
biblio_search_term = '''
"neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm OR *cardi*  OR  heart  OR  *ventric*  OR  coronary  OR  *atria*  OR  atrium 
'''

# Dataset: 
# - ML+SIM in Engineering
# biblio_search_term = '''
# ((title:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR "statistical learning" OR lstm) OR abstract:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR "statistical learning" OR lstm )) AND (title:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  
turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method") OR abstract:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle 
dynamics"  OR  "particle method"))) OR (title:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" ) OR abstract:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning"))
# '''

# Dataset: 
# - ML+SIM in Medicine
# biblio_search_term = '''
# ((title:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm) OR abstract:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm )) AND (title:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  
"computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method") OR abstract:(simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method"))) OR (title:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR 
"physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" ) OR abstract:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning")) 
# '''

full_search_terms = ['ann', 'bert', 'gan']  # force a full word match for these terms
# full_search_terms = []
# duplicate_terms = ['']

# Field tags and parentheses that are stripped from the raw query string
strings_to_remove = ['(',')','TITLE-ABS-KEY', 'TITLE', 'TITLE-ABS', 'KEY']  # Scopus items
strings_to_remove += ['(',')','title:', 'abstract:', 'keyword:']    # Lens items

# Strip the longest strings first: if 'TITLE' is removed before 'TITLE-ABS',
# the latter can never match and a stray '-ABS' is left behind in the query
for string_to_remove in sorted(set(strings_to_remove), key = lambda s: (-len(s), s)):
    biblio_search_term = biblio_search_term.replace(string_to_remove, '')

# Split the query on the boolean operators. The group is non-capturing so the
# operators themselves are dropped by the split; no 'OR'/'AND' substring has to
# be removed from the terms afterwards (which would corrupt a term that merely
# contains those letters in upper case).
search_term_candidates = re.split(r'\b(?:OR|AND)\b', biblio_search_term)

# Clean each term: drop quotes and wildcards, replace '-' with whitespace
# (titles and abstracts get the same treatment before matching) and collapse
# whitespace runs, including line breaks inside the multi-line query string
cleaned_search_terms = []
for search_term_candidate in search_term_candidates:
    cleaned_term = search_term_candidate.replace('"', '').replace('*', '').replace('-', ' ')
    cleaned_term = ' '.join(cleaned_term.split())
    if cleaned_term:
        cleaned_search_terms.append(cleaned_term)

# Remove any duplicates from the search terms. dict.fromkeys() keeps the first
# occurrence of every term, so the order is the same on every run.
count_terms = len(cleaned_search_terms)
biblio_search_term = list(dict.fromkeys(cleaned_search_terms))
print(f'Removed {count_terms - len(biblio_search_term)} duplicate search terms')

# Remove the full_search_terms from the search terms list so that later
# the titles and abstracts only match the complete words from the
# full_search_terms list. The comparison ignores case: the query may contain
# "BERT" while full_search_terms lists 'bert'.
full_search_terms_lower = {full_term.lower() for full_term in full_search_terms}
biblio_search_term = [search_term for search_term in biblio_search_term
                      if search_term.lower() not in full_search_terms_lower]

print(f'biblio_search_term: {biblio_search_term}')

# Stem the search terms since in the original search they might have been stemmed (e.g. Lens has that option).
# biblio_search_term_stemmed is index-aligned with biblio_search_term.
stemmer = SnowballStemmer("english")
biblio_search_term_stemmed = [' '.join([stemmer.stem(word) for word in word_tokenize(sentence)]) for sentence in biblio_search_term]
# print(biblio_search_term)
# Find the strings in search_terms and full_search_terms that are in a given sentence and create a new list with them
def filter_strings_by_sentence(row, column):
    """Return the search terms that occur in one text field of a publication.

    The text is lower-cased, '-' is replaced by whitespace and every token is
    stemmed before matching. Terms from full_search_terms must match as
    complete words; terms from biblio_search_term_stemmed only have to match
    from the start of a word.

    Parameters
    ----------
    row : pandas.Series
        One row of biblio_df; row.name is used for the progress printout.
    column : str
        Name of the column to search, e.g. 'title' or 'abstract'.

    Returns
    -------
    list of str
        Matched terms, full_search_terms first. Empty when the field is missing.
    """

    if(row.name % 100 == 0): # print row index
        print(f'{row.name}', end = '\r')

    # 'title' or 'abstract'
    value = row[column]

    if pd.isna(value):
        return []

    sentence = value.lower().replace('-', ' ')
    sentence = ' '.join([stemmer.stem(word) for word in word_tokenize(sentence)])

    # re.escape() makes the terms match literally; without it a term such as
    # 'c++' raises re.error and a '.' in a term matches any character
    matches = [string for string in full_search_terms
               if re.search(r'(?i)\b' + re.escape(string) + r'\b', sentence)]
    matches += [string for string in biblio_search_term_stemmed
                if re.search(r'(?i)\b' + re.escape(string), sentence)]

    return matches

# biblio_df = biblio_df.head(4000).copy()
# biblio_df = biblio_df.iloc[:97].copy()
# display(biblio_df)

# Create lists of search terms as they appear in the title, abstract, and keywords
if logger.getEffectiveLevel() == logging.INFO: print(f'Extracting search terms from titles...')
biblio_df['search_title'] = biblio_df.apply(lambda row: filter_strings_by_sentence(row, 'title'), axis = 1)

if logger.getEffectiveLevel() == logging.INFO: print(f'\nExtracting search terms from abstracts...')
biblio_df['search_abs'] = biblio_df.apply(lambda row: filter_strings_by_sentence(row, 'abstract'), axis = 1)

# TODO Add the keyword matches so that we don't remove publications (see below) that have no title and abstract match but that have a keyword match
# if logger.getEffectiveLevel() == logging.INFO: print(f'\nExtracting search terms from keywords...')
# biblio_df['search_kws'] = biblio_df.apply(lambda row: filter_strings_by_sentence(row, 'kws'), axis = 1)

# Number of publications before applying the search term filter to titles and abstracts. You can apply a
# subset of the original search terms if you want to remove publications that only contain search terms not
# in the subset. Sometimes this is useful when some of the search terms turn out to be adding many
# publications that are not relevant.
n_pubs = len(biblio_df)

# Replace the stemmed search terms in columns 'search_title' and 'search_abs' with the original search terms.
# The two lists are index-aligned, so zip() pairs every stemmed term with its original form.
mapping = dict(zip(biblio_search_term_stemmed, biblio_search_term))

def replace_values(lst):
    """Map every stemmed term in lst back to its original form; unknown terms are kept as they are."""
    return [mapping.get(x, x) for x in lst]

# Series.map is used per column because DataFrame.applymap is deprecated in recent pandas releases
for search_column in ['search_title', 'search_abs']:
    biblio_df[search_column] = biblio_df[search_column].map(replace_values)

# FIXME This is a little dodgy since it might remove publications that were matched by the Scopus or Lens search engine but that for some reason aren't matched here
# Remove publications where search_title and search_abs are empty lists. This happens when you remove search terms from the
# biblio_search_term string, for instance terms that turn out to generate a lot of irrelevant publications.
# The mask is computed once and reused for both the message and the filtering.
has_search_term_match = (biblio_df['search_title'].map(bool).astype(bool)
                         | biblio_df['search_abs'].map(bool).astype(bool))
print(f'Removing {int((~has_search_term_match).sum())} publications where search_title and search_abs are empty...')
biblio_df = biblio_df[has_search_term_match]

n_pubs_filtered = len(biblio_df)

if logger.getEffectiveLevel() == logging.INFO: print(f'Retained {n_pubs_filtered} of {n_pubs} publications.')

# Sort the table and renumber the rows
if logger.getEffectiveLevel() == logging.INFO: print(f'Sorting table...')
biblio_df = biblio_df.sort_values(by = ['year', 'title'], ascending = [False, True]).reset_index(drop = True)

if logger.getEffectiveLevel() == logging.INFO: display(biblio_df.head())

# Write dataframe to CSV
if write_csv:
    if logger.getEffectiveLevel() == logging.INFO: print(f'Saving file {file_csv_out}.csv ...')
    biblio_df.to_csv(root_dir + results_dir + file_csv_out + '.csv', index = False)

print(f'DONE!')

### Generate keyword stats tables
Create tables with counts for search terms, bibliographic keywords, BERT keywords, fields of study, and BERT topics. Which counts are generated depends on the columns that are available in the biblio_df table.

#### TODO
- Add BERTopic count functionality
- Add the abstract keyBERT processing
- Add the probability cutoff functionality

In [None]:
# PARAMETERS

# WARNING: You need to rerun the cells above to reload the biblio_df dataset if you change this filter
filter_st = ""  # if you want to do the keyword count on the subset of publications defined by filter_st

# keyBERT keyword extraction
keybert_title_count = False     # set this to True if you want to compute title keywords with keyBERT
keybert_abstract_count = False  # idem for abstract keywords
n_keybert_kws = 10
keybert_cutoff = 0.4    # probability value below which the keywords are excluded from the count
keybert_use_mmr = False  # use the Maximal Marginal Relevance keyword algorithm
mmr_diversity = 0.5

# Input: optionally reload biblio_df from a CSV file in the results folder
read_csv = False
biblio_file_csv = ''

# Output: count tables
write_csv = False
prefix = ''
suffix = ''

# Output: the publications that remain after applying filter_st
write_st_filtered_csv = False
biblio_st_filtered_file_csv = ''

# File names (without extension) of the count tables
biblio_st_count_csv = f'{prefix}_st_count_{project}{suffix}'
biblio_kw_count_csv = f'{prefix}_kw_count_{project}{suffix}'
biblio_fos_count_csv = f'{prefix}_fos_count_{project}{suffix}'
biblio_bertopic_count_csv = f'{prefix}_bt_count_{project}{suffix}'
biblio_keybert_count_csv = f'{prefix}_kb_count_{project}{suffix}'

timestamping = False

logger.setLevel(logging.INFO)
# --------------------------------

# For large datasets, we read the initial dataframes from file
if read_csv:
    biblio_df = pd.read_csv(f'{root_dir}{results_dir}{biblio_file_csv}.csv')

    # The search term lists are stored in the CSV as their string
    # representation; turn them back into Python lists
    for list_column in ('search_title', 'search_abs'):
        biblio_df[list_column] = biblio_df[list_column].apply(ast.literal_eval)

n_all_pubs = len(biblio_df)


'''
    0. Apply the search term filter
'''

# Break filter_st up at parentheses and at the words 'and' / 'or';
# keep the non-empty pieces, stripped of surrounding whitespace
words = [piece.strip()
         for piece in re.split(r'\(|\)|\b(?:and|or)\b', filter_st)
         if piece.strip()]

# Filter publications by filter_st
def evaluate_expression(lst):
    """Evaluate the boolean filter filter_st against a list of matched search terms.

    Two filter forms are handled:

    * 'only (a, b)' / 'not only (a, b)': return True when every term in lst
      is one of the comma-separated terms between the parentheses. The caller
      negates the result for 'not only'.
    * a boolean expression built from terms, 'and', 'or' and parentheses: each
      term is replaced by whether it is an element of lst and the resulting
      expression is evaluated.

    Reads the module-level variables filter_st and words. A later cell of the
    notebook defines another evaluate_expression with a different signature.

    Parameters
    ----------
    lst : list of str
        Search terms found in a title or abstract.

    Returns
    -------
    bool
        True when lst satisfies the filter; True for an empty filter.

    Raises
    ------
    ValueError
        If an 'only' filter has no parenthesised term list.
    """

    # If the first keyword is 'only', then only include publications where the
    # search_title and search_abs only contain the terms following 'only'
    if words and words[0] in ('only', 'not only'):

        # Find the substring between the parentheses
        only_match = re.search(r'\((.*?)\)', filter_st)
        if only_match is None:
            raise ValueError(f"filter_st '{filter_st}' needs a term list in parentheses after 'only'")

        # Split the substring on commas and strip whitespace
        only_terms = {x.strip() for x in only_match.group(1).split(',')}

        return set(lst).issubset(only_terms)

    # Tokenise the expression once, keeping the operators and parentheses, and
    # substitute each term individually. Replacing terms with str.replace() on
    # the whole expression breaks when one term is a substring of another
    # (e.g. 'neural' and 'neural network').
    parts = []
    for token in re.split(r'(\(|\)|\b(?:and|or)\b)', filter_st):
        token = token.strip()
        if not token:
            continue
        if token in ('(', ')', 'and', 'or'):
            parts.append(token)
        else:
            parts.append(str(token in lst))

    if not parts:   # empty filter: nothing to exclude
        return True

    # The string handed to eval() consists only of True/False, and/or and parentheses
    return eval(' '.join(parts))

# Keep the publications whose title or abstract search terms satisfy filter_st.
# The check on words guards against a filter that holds only whitespace or
# parentheses, for which words is empty and words[0] would raise an IndexError.
if filter_st != '' and words:

    # Evaluate the filter for every title and abstract term list. Series.map is
    # applied per column because DataFrame.applymap is deprecated in recent pandas.
    filter_st_matches = biblio_df[['search_title', 'search_abs']].apply(
        lambda column: column.map(evaluate_expression))

    if words[0] == 'only':
        # both the title and the abstract terms must be within the 'only' terms
        biblio_df = biblio_df[filter_st_matches.all(axis=1)]
    elif words[0] == 'not only':
        biblio_df = biblio_df[~filter_st_matches.all(axis=1)]
    else:
        # a match in either the title or the abstract is enough
        biblio_df = biblio_df[filter_st_matches.any(axis=1)]

print(f'Retained {len(biblio_df)} of {n_all_pubs} publications after applying filter_st')

# display(biblio_df)


'''
    1. Search term count
'''

# Create a count table for the search terms originally used to generate the bibliographic dataset.
# Every publication contributes each of its search terms once, whether the term
# was found in the title, the abstract, or both.
merged_search_terms = pd.Series(
    [sorted(set(title_terms) | set(abstract_terms))
     for title_terms, abstract_terms in zip(biblio_df['search_title'], biblio_df['search_abs'])],
    index = biblio_df.index, dtype = object)

# explode() turns the lists into one term per row (an empty list becomes NaN,
# which is dropped). rename_axis() + reset_index(name=...) names the two
# columns explicitly, so the result does not depend on how the installed
# pandas version labels the output of value_counts().
st_count_df = (
    merged_search_terms
    .explode()
    .dropna()
    .value_counts()
    .rename_axis('search_term')
    .reset_index(name = 'count')
)

if logger.getEffectiveLevel() == logging.INFO:
    print(f'Number of publication search terms: {len(st_count_df)}')


'''
    2. Keywords count
'''

# One row per keyword: split the comma-separated keyword strings and explode
# the resulting lists
kws_df = (
    biblio_df[['kws']]
    .copy()
    .apply(lambda column: column.str.split(','))
    .explode('kws')
    .reset_index(drop = True)
)

# Discard missing keywords, including the literal string 'nan'
kws_df = kws_df.dropna(subset = ['kws'])
kws_df = kws_df[kws_df['kws'] != 'nan']

# Count the keywords; most frequent first, ties in alphabetical order
kw_count_df = pd.DataFrame(kws_df['kws'].value_counts()).reset_index()
kw_count_df.columns = ['kw', 'count']
kw_count_df = (
    kw_count_df
    .sort_values(by = ['count', 'kw'], ascending = [False, True])
    .reset_index(drop = True)
)

if logger.getEffectiveLevel() == logging.INFO:
    print(f'Number of unique publication keywords: {len(kw_count_df)}')
    print(f"{biblio_df['kws'].count()} publications of a total of {n_all_pubs} have keywords")

# The stemmed keyword count is currently disabled (see the string block below),
# so this table stays empty
kw_stem_count_df = pd.DataFrame()

'''
# Create stemmed keywords
stemmer = SnowballStemmer(language='english')
kws_df['kws_stemmed'] = kws_df['kws'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Create a count table for the stemmed Scopus keywords
kw_stem_count_df = pd.DataFrame(kws_df['kws_stemmed'].value_counts()).reset_index()
kw_stem_count_df.columns = ['kw_stem', 'count']
kw_stem_count_df = kw_stem_count_df.sort_values(by = ['count', 'kw_stem'], ascending = [False, True]).reset_index()
kw_stem_count_df.drop(['index'], axis = 1, inplace = True)
'''


'''
    3. Fields of study count
'''

# Both tables stay empty when biblio_df has no fields of study column
fos_df = pd.DataFrame()
fos_count_df = pd.DataFrame()

if 'fos' in biblio_df.columns:

    # One row per field of study: split the ';'-separated strings and explode
    # the resulting lists
    fos_df = (
        biblio_df[['fos']]
        .copy()
        .apply(lambda column: column.str.split(';'))
        .explode('fos')
        .reset_index(drop = True)
    )

    # Discard missing entries, including the literal string 'nan'
    fos_df = fos_df.dropna(subset = ['fos'])
    fos_df = fos_df[fos_df['fos'] != 'nan']

    # Count the fields of study; most frequent first, ties in alphabetical order
    fos_count_df = pd.DataFrame(fos_df['fos'].value_counts()).reset_index()
    fos_count_df.columns = ['fos', 'count']
    fos_count_df = (
        fos_count_df
        .sort_values(by = ['count', 'fos'], ascending = [False, True])
        .reset_index(drop = True)
    )

    if logger.getEffectiveLevel() == logging.INFO:
        print(f'Number of unique fields of study: {len(fos_count_df)}')
        print(f"{biblio_df['fos'].count()} publications of a total of {n_all_pubs} have fields of study")

'''
# Create a table with the count of counts of Lens keywords
kw_count_count_df = pd.DataFrame(kw_count_df['count'].value_counts()).reset_index().rename(columns = {'count': 'count_count', 'index': 'count'})
total_counts = kw_count_count_df['count_count'].sum()
kw_count_count_df['count_count_pct'] = kw_count_count_df['count_count'] / total_counts
kw_count_count_df = kw_count_count_df.sort_values(by = ['count_count', 'count'], ascending = [False, True]).reset_index()
kw_count_count_df.drop(['index'], axis = 1, inplace = True)

if logger.getEffectiveLevel() == logging.INFO:
    display(kw_count_count_df.head())
    print(f'count = 1 are the keywords that appear once in the Scopus keywords etc.\n')
'''


'''
    4. BERTopic count
'''

bertopic_df = pd.DataFrame()

# Placeholder: the BERTopic count is not implemented yet
if 'tp_name' in biblio_df.columns:  # biblio_df has BERTopic information
    pass

'''
    5. keyBERT count
'''

keybert_count_df = pd.DataFrame()

if keybert_title_count or keybert_abstract_count:   # generate title and abstract keywords using keyBERT

    # NOTE(review): KeyBERT is not among the imports visible at the top of the
    # notebook - confirm that it is imported before this cell runs
    kw_model = KeyBERT(model = 'all-MiniLM-L6-v2')

    def generate_keybert_keywords(row):
        """Extract keyBERT keywords from the title and/or abstract of one publication.

        Returns a Series with the columns kbt_<i> / kbt_p_<i> (title) and
        kba_<i> / kba_p_<i> (abstract), holding the i-th keyword and the value
        keyBERT returns with it.
        """
        if row.name % 10 == 0:    # progress printout
            print(f'{row.name}', end = '\r')

        # Options shared by all extract_keywords calls; the MMR options are
        # only passed when keybert_use_mmr is set
        extract_options = {'keyphrase_ngram_range': (1, 2),
                           'stop_words': 'english',
                           'top_n': n_keybert_kws}
        if keybert_use_mmr:
            extract_options.update(use_mmr = True, diversity = mmr_diversity)

        # (enabled, text column, prefix of the generated columns)
        keyword_sources = [(keybert_title_count, 'title', 'kbt'),
                           (keybert_abstract_count, 'abstract', 'kba')]

        new_columns = {}
        for enabled, text_column, column_prefix in keyword_sources:
            if not enabled:
                continue
            scored_keywords = kw_model.extract_keywords(row[text_column], **extract_options)
            for rank, (keyword, score) in enumerate(scored_keywords, start = 1):
                new_columns[f'{column_prefix}_{rank}'] = keyword
                new_columns[f'{column_prefix}_p_{rank}'] = score

        return pd.Series(new_columns)

    # Create new columns in biblio_df with the keybert keyword information
    print('Generating keyBERT keywords...')

    keybert_columns_df = biblio_df.apply(generate_keybert_keywords, axis=1)
    biblio_df = pd.concat([biblio_df, keybert_columns_df], axis=1)

    # Stack the title keyword columns (kbt_<i>, without the kbt_p_<i> columns)
    # into a single column and count the values
    title_keyword_columns = [col for col in biblio_df.columns
                             if col.startswith('kbt_') and not col.startswith('kbt_p')]
    keybert_count_df = pd.melt(biblio_df.loc[:, title_keyword_columns]).drop(columns=['variable'])
    counts = keybert_count_df.value_counts().reset_index()
    keybert_count_df = pd.DataFrame({'string': counts.iloc[:, 0], 'count': counts.iloc[:, 1]})

    if logger.getEffectiveLevel() == logging.INFO:
        print(f'Number of unique keybert keywords: {len(keybert_count_df)}')
        display(keybert_count_df.head(20))

# Optional timestamp that is appended to every output file name
timestamp = ('_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) if timestamping else ''

# Write the publications that remain after applying filter_st
if write_st_filtered_csv:
    print(f'Saving file {biblio_st_filtered_file_csv + timestamp}.csv ...')
    biblio_df.to_csv(root_dir + results_dir + biblio_st_filtered_file_csv + timestamp + '.csv', index = False)

# Write the count tables. The search term and keyword counts are always
# written; the fields of study and keyBERT counts only when they have rows.
if write_csv:
    count_tables_to_save = [
        (st_count_df, biblio_st_count_csv, True),
        (kw_count_df, biblio_kw_count_csv, True),
        (fos_count_df, biblio_fos_count_csv, False),
        (keybert_count_df, biblio_keybert_count_csv, False),
    ]

    for count_table, count_file_name, write_when_empty in count_tables_to_save:
        if not write_when_empty and not len(count_table):
            continue
        print(f'Saving file {count_file_name + timestamp}.csv ...')
        count_table.to_csv(root_dir + results_dir + count_file_name + timestamp + '.csv', index = False)

print('DONE!')

### Filtering Lens keywords & fields of study and associated keywords & fields of study
For a Lens bibliographic dataset with a given number of articles, there can be up to 10 times as many keywords. This script provides boolean filtering on the original or stemmed bibliographic (Lens or Scopus) keywords. It produces two tables: (1) the keywords that match the boolean filter 'filter' (set in the Parameters below) and (2) the associated keywords, i.e. all keywords of the publications whose keywords match 'filter', optionally narrowed down by the boolean filter 'filter_assoc'.

You can use any boolean expression with 'and' and 'or' keywords and with any number of parentheses. Examples
* cardi
* heart and cardi
* (neural network or deep learning) and convoluted

The notebook output is a truncated (see Parameters) HTML table that should be easier to inspect than the full keyword list. You can also write the table to an HTML file, which when opened in the browser gives you an additional way to search by using the browser search feature. This then highlights matches directly in the HTML.

**Parameters**
* filter: a boolean filter term applied to the Scopus keywords of the full dataframe.
* filter_assoc: a boolean filter that creates a table with the keywords that co-occur with the keywords matched by the filter.
* num_cols: number of columns in the output HTML table.
* max_rows: the output table will be truncated at max_rows rows.
* stemmed_kws: whether the filters are applied to the stemmed keywords list.
* write_html: set to True to generate the HTML file of the keywords.
* file_html_out: the output HTML file name.

#### TODO
- Add a topic filter here, using the bertopic numbers. As part of the HTML printouts, display all the topics in an HTML table and also display the filtered topics, as a check.
- Add a keybert filter.

In [None]:
# PARAMETERS

# Boolean filters (terms combined with 'and', 'or' and parentheses).
# Note that the name 'filter' hides Python's built-in filter() in this notebook.
filter = "surrogate model"   # applied to the keywords
filter_assoc = ""            # applied to the keywords that co-occur with the 'filter' matches
filter_fos = ""

# Layout of the HTML tables
num_cols = 5
max_rows = 50
stemmed_kws = False

# Input: optionally reload the count tables from CSV files in the results folder
read_csv = False
lens_kw_count_csv = 'lens_kw_count_ml_in_medicine_all'
lens_st_count_csv = 'lens_st_count_ml_in_medicine_all'
lens_fos_count_csv = 'lens_fos_count_ml_in_medicine_all'

# Output
write_html = False
file_html_out = 'filtered_kws'

# ------------------------------------------

# For large datasets, we read the initial dataframes from file
if read_csv:
    kw_count_df = pd.read_csv(f'{root_dir}{results_dir}{lens_kw_count_csv}.csv')
    st_count_df = pd.read_csv(f'{root_dir}{results_dir}{lens_st_count_csv}.csv')
    fos_count_df = pd.read_csv(f'{root_dir}{results_dir}{lens_fos_count_csv}.csv')


'''
    Count of search terms (no filter is applied here)
'''

# Standard-library helper, imported here because this block introduces it;
# it turns '<', '>' and '&' in a term into HTML entities
from html import escape as html_escape

# One 'term (count)' string per search term. Built with zip() rather than
# DataFrame.apply(axis=1), which does not return a Series for a table without rows.
st_count = [f'{search_term} ({count})'
            for search_term, count in zip(st_count_df['search_term'], st_count_df['count'])]
num_rows = len(st_count) // num_cols + (len(st_count) % num_cols > 0)
num_rows_p = num_rows

# Create an HTML string to display the list of strings in a table. The strings
# are laid out column by column, num_rows per column, in a single table row.
st_html_str = '<table style="width:100%;"><tr>'
for j in range(num_cols):
    column_items = st_count[j * num_rows:(j + 1) * num_rows]
    st_html_str += '<td style="vertical-align:top;">'
    st_html_str += ''.join('{}<br>'.format(html_escape(item)) for item in column_items if item)
    st_html_str += '</td>'
st_html_str += '</tr></table>'

print(f'\nSEARCH TERMS (FULL DATASET)')
print("------------------------------------")
print(f'(displaying {num_rows} rows of {num_rows_p})')
print(f'Number of Scopus search terms: {len(st_count)}')

# Display the HTML table
display(HTML(st_html_str))


'''
    Count of keywords matched by filter
'''

# List of keywords with their counts, as strings
if stemmed_kws:

    # The stats cell leaves kw_stem_count_df empty (its stemming code is
    # disabled), so derive the stemmed counts from kw_count_df when needed:
    # stem every word of each keyword and add up the counts per stemmed keyword.
    if 'kw_stem' not in kw_stem_count_df.columns:
        stemmed_kw_counts = kw_count_df[['kw', 'count']].copy()
        stemmed_kw_counts['kw_stem'] = stemmed_kw_counts['kw'].astype(str).map(
            lambda kw: ' '.join(stemmer.stem(word) for word in kw.split()))
        kw_stem_count_df = (
            stemmed_kw_counts
            .groupby('kw_stem', as_index = False)['count'].sum()
            .sort_values(by = ['count', 'kw_stem'], ascending = [False, True])
            .reset_index(drop = True)
        )

    kws_count = [f'{kw_stem} ({count})'
                 for kw_stem, count in zip(kw_stem_count_df['kw_stem'], kw_stem_count_df['count'])]
else:
    # zip() instead of DataFrame.apply(axis=1): the latter does not return a
    # Series (and so has no tolist()) when the table has no rows
    kws_count = [f'{kw} ({count})'
                 for kw, count in zip(kw_count_df['kw'], kw_count_df['count'])]
# Filter the keywords using the filter provided above
def evaluate_expression(text, expression):
    """Evaluate a boolean filter expression against a text.

    The expression consists of terms combined with 'and', 'or' and
    parentheses, e.g. '(neural network or deep learning) and heart'. A term
    counts as True when it occurs in text as a substring.

    Parameters
    ----------
    text : str
        The text to test, e.g. a keyword or a comma-separated keyword string.
    expression : str
        The boolean filter expression.

    Returns
    -------
    bool
        Whether text satisfies the expression; True for an empty expression.
    """
    # Tokenise the expression once, keeping the operators and parentheses, and
    # substitute each term individually. Replacing terms with str.replace() on
    # the whole expression breaks when one term is a substring of another
    # (e.g. 'learning' and 'deep learning').
    parts = []
    for token in re.split(r'(\(|\)|\b(?:and|or)\b)', expression):
        token = token.strip()
        if not token:
            continue
        if token in ('(', ')', 'and', 'or'):
            parts.append(token)
        else:
            parts.append(str(token in text))

    if not parts:   # empty expression: nothing to exclude
        return True

    # The string handed to eval() consists only of True/False, and/or and parentheses
    return eval(' '.join(parts))

# Keep only the 'keyword (count)' strings that satisfy the filter
if filter != "":
    kws_count = [x for x in kws_count if evaluate_expression(x, filter)]

# Calculate the number of rows in the table
num_rows = len(kws_count) // num_cols + (len(kws_count) % num_cols > 0)
num_rows_p = num_rows   # number of rows before truncation, for the printout

if num_rows > max_rows:
    num_rows = max_rows

# Standard-library helper, imported here because this block introduces it;
# it turns '<', '>' and '&' in a keyword into HTML entities
from html import escape as html_escape

# Create an HTML string to display the list of strings in a table. The strings
# are laid out column by column, num_rows per column, in a single table row.
kws_html_str = '<table style="width:100%;"><tr>'
for j in range(num_cols):
    column_items = kws_count[j * num_rows:(j + 1) * num_rows]
    kws_html_str += '<td style="vertical-align:top;">'
    kws_html_str += ''.join('{}<br>'.format(html_escape(item)) for item in column_items if item)
    kws_html_str += '</td>'
kws_html_str += '</tr></table>'

if stemmed_kws:
    print(f'\nFILTERED STEMMED KEYWORDS')
    print(f'(displaying {num_rows} rows of {num_rows_p})')
    print("--------------------------")
    print(f'Number of unique stemmed Lens keywords: {len(kw_stem_count_df)}')
else:
    print(f'\nFILTERED KEYWORDS')
    print(f'(displaying {num_rows} rows of {num_rows_p})')
    print("--------------------------")
    print(f'Number of unique Lens keywords: {len(kw_count_df)}')

print(f'Number of keyword matches for filter: {len(kws_count)}')
print(f'\nFilter term: {filter}\n')

# Display the HTML table
display(HTML(kws_html_str))


'''
    Associated keywords count after applying the filter 'filter_assoc'. When the 
    filter 'filter' is applied to the keywords in the dataset, an associated set of keywords
    is generated that consists of all keywords in those articles that were matched by the filter
    'filter'. Subsequently, the filter 'filter_assoc' is applied to home in on a particular
    subset of the typically very large associated keyword set.
'''

# Start from the keyword strings of all publications, without missing values
# (including the literal string 'nan' in any casing)
kws_assoc_count_df = biblio_df[['kws']].copy().dropna(subset = ['kws'])
kws_assoc_count_df = kws_assoc_count_df[kws_assoc_count_df['kws'].str.lower() != 'nan']

# Keep the publications whose keyword string satisfies the filter
if filter != "":
    matches_filter = kws_assoc_count_df['kws'].apply(evaluate_expression, expression = filter)
    kws_assoc_count_df = kws_assoc_count_df[matches_filter]

#display(kws_assoc_count_df)

# One row per keyword: split the comma-separated strings and explode the lists
kws_assoc_count_df = (
    kws_assoc_count_df
    .apply(lambda column: column.str.split(','))
    .explode('kws')
    .reset_index(drop = True)
)

#display(kws_assoc_count_df)

# Count the keywords; most frequent first, ties in alphabetical order
kw_assoc_count_df = pd.DataFrame(kws_assoc_count_df['kws'].value_counts()).reset_index()
kw_assoc_count_df.columns = ['kw', 'count']
kw_assoc_count_df = (
    kw_assoc_count_df
    .sort_values(by = ['count', 'kw'], ascending = [False, True])
    .reset_index(drop = True)
)

kw_assoc_stem_count_df = pd.DataFrame()

if stemmed_kws:

    # Stem every word of each keyword
    kws_assoc_count_df['kws_stemmed'] = kws_assoc_count_df['kws'].apply(
        lambda kw: ' '.join([stemmer.stem(word) for word in kw.split()]))

    # Count the stemmed keywords, sorted like the table above
    kw_assoc_stem_count_df = pd.DataFrame(kws_assoc_count_df['kws_stemmed'].value_counts()).reset_index()
    kw_assoc_stem_count_df.columns = ['kw_stem', 'count']
    kw_assoc_stem_count_df = (
        kw_assoc_stem_count_df
        .sort_values(by = ['count', 'kw_stem'], ascending = [False, True])
        .reset_index(drop = True)
    )

#display(kws_assoc_count_df)

# List of keywords with their count, as 'keyword (count)' strings
if (stemmed_kws == True) and (len(kw_assoc_stem_count_df) > 0):
    kws_assoc_count = [kw_stem + ' (' + str(count) + ')'
                       for kw_stem, count in zip(kw_assoc_stem_count_df['kw_stem'], kw_assoc_stem_count_df['count'])]
elif (stemmed_kws == False) and (len(kw_assoc_count_df) > 0):
    kws_assoc_count = [kw + ' (' + str(count) + ')'
                       for kw, count in zip(kw_assoc_count_df['kw'], kw_assoc_count_df['count'])]
else:
    kws_assoc_count = []

# Narrow the associated keywords down with filter_assoc
if filter_assoc != "":
    kws_assoc_count = [x for x in kws_assoc_count if evaluate_expression(x, filter_assoc)]

# Calculate the number of rows in the table
num_rows = len(kws_assoc_count) // num_cols + (len(kws_assoc_count) % num_cols > 0)
num_rows_p = num_rows

if num_rows > 0:

    if num_rows > max_rows:
        num_rows = max_rows

  # Create an HTML string to display the list of strings in a table
    ka_html_str = '<table style="width:100%;">'
    for j in range(num_cols):
        ka_html_str += '<td style="vertical-align:top;">'
        for i in range(num_rows):
            idx = j * num_rows + i
            if idx < len(kws_assoc_count) and kws_assoc_count[idx]:
                ka_html_str += '{}<br>'.format(kws_assoc_count[idx])
        ka_html_str += '</td>'
    ka_html_str += '</table>'
else:
    ka_html_str = ''

if stemmed_kws:
    print(f'\nFILTERED STEMMED ASSOCIATED KEYWORDS')
    print(f'(displaying {num_rows} rows of {num_rows_p})')
    print("------------------------------")
    print(f'Number of kw_assoc_stem_count terms: {len(kw_assoc_stem_count_df)}')
else:
    print(f'\nFILTERED ASSOCIATED KEYWORDS')
    print(f'(displaying {num_rows} rows of {num_rows_p})')
    print("---------------------------")
    print(f'Number of kw_assoc_count terms: {len(kw_assoc_count_df)}')

print(f'Number of keyword matches for filter_assoc: {len(kws_assoc_count)}')
print(f'\nFilter term: {filter_assoc}\n')

# Display the HTML table
display(HTML(ka_html_str))


'''
    Count of fields of study matched by filter
'''

if len(fos_count_df):

    # List of FOS with their counts, as strings
    fos_count = fos_count_df.apply(lambda row: str(row['fos']) + ' (' + str(row['count']) + ')', axis=1).tolist()

    # Filter the FOS using the filter provided above
    def evaluate_expression(text, expression):
        """
        Evaluate a boolean keyword expression against a text.

        The expression consists of search terms combined with the lowercase
        operators 'and' / 'or' and parentheses, e.g. '(deep or shallow) and learning'.
        A term is True when it occurs in `text` as a (case-sensitive) substring.

        Parameters:
            text: the string to search in.
            expression: the boolean filter string.

        Returns:
            The boolean result of the expression.

        Raises:
            SyntaxError: if the expression is empty or the operators/parentheses
                do not form a valid boolean expression.
        """
        # Keep the separators (capturing group) so the expression can be rebuilt token by token.
        # Substituting whole tokens avoids corrupting the expression when one term is a
        # substring of another term or of an already inserted 'True'/'False'.
        tokens = re.split(r'(\(|\)|\b(?:and|or)\b)', expression)
        parts = []

        for token in tokens:
            token = token.strip()

            if not token:
                continue

            if token in ('(', ')', 'and', 'or'):
                parts.append(token)
            else:
                parts.append(str(token in text))

        # Only 'True', 'False', 'and', 'or', and parentheses reach eval() here
        return eval(' '.join(parts))

    # Keep only the fields of study that satisfy the boolean filter
    if filter_fos != "":
        fos_count = [entry for entry in fos_count if evaluate_expression(entry, filter_fos)]

    # Rows needed to spread the entries over num_cols columns (rounded up)
    full_rows, remainder = divmod(len(fos_count), num_cols)
    num_rows = full_rows + (remainder > 0)
    num_rows_p = num_rows

    # Never display more than max_rows rows
    num_rows = min(num_rows, max_rows)

    # Build the table column by column; each column holds num_rows consecutive entries
    fos_cells = []
    for j in range(num_cols):
        column_entries = fos_count[j * num_rows:(j + 1) * num_rows]
        fos_cells.append('<td style="vertical-align:top;">'
                         + ''.join('{}<br>'.format(entry) for entry in column_entries if entry)
                         + '</td>')
    fos_html_str = '<table style="width:100%;">' + ''.join(fos_cells) + '</table>'

    print(f'\nFILTERED FIELDS OF STUDY')
    print(f'(displaying {num_rows} rows of {num_rows_p})')
    print("--------------------------")
    print(f'Number of unique Lens fields of study: {len(fos_count_df)}')
    print(f'Number of fields of study matches for filter: {len(fos_count)}')
    print(f'\nFilter term: {filter_fos}\n')

    # Show the table in the notebook
    display(HTML(fos_html_str))


'''
    Write HTML
'''

# Write the search-term, keyword, associated-keyword, and (if present) field-of-study
# tables built above to a single HTML file
if write_html:
    # Explicit UTF-8 so that non-ASCII keywords do not depend on the platform's default encoding
    with open(root_dir + results_dir + file_html_out + '.html', 'w', encoding = 'utf-8') as f:
        f.write(str(HTML('<h3>Search terms (full dataset)</h3>').data))
        f.write(str(HTML(st_html_str).data))
        f.write(str(HTML('<h3>Filtered bibliographic keywords</h3>').data))
        f.write(str(HTML(kws_html_str).data))
        f.write(str(HTML('<h3>Filtered associated bibliographic keywords</h3>').data))
        f.write(str(HTML(ka_html_str).data))
        if len(fos_count_df):
            # fos_html_str only exists when fos_count_df is non-empty (same guard as where it is built)
            f.write(str(HTML('<h3>Filtered fields of study</h3>').data))
            f.write(str(HTML(fos_html_str).data))
        

### Highlighted search terms and keyword filter terms and save to HTML and/or Excel
When skimming through larger numbers of titles and abstracts from a bibliographic search result, it is possible to speed up the task by highlighting the search terms directly in the titles and abstracts. This script highlights in bold red the search terms used in the original Scopus search and in bold blue the filter terms provided in the Parameter section below. The dataframe can be saved to an HTML file. The next cell builds the Excel file from scratch with the same highlights.

**Parameters**
* filter: a boolean filter string with the same specs as the boolean filters in the keyword searches, with one difference. You can specify where you want to search by prepending T, A, and/or K (standing for title, abstract, and keyword search respectively) to any of the search strings. If you provide a search term with none of the three letters prepended, it will search in all three (that is, in title, abstract, and keyword).
* write_html: set to True to generate the HTML file of the highlights.
* file_html_out: the output HTML file name.

#### TODO
- Add a bertopic filter here, using the topic numbers. As part of the HTML printouts, display all the topics in an HTML table and also display the filtered topics, as a check.
- Add a keybert filter (low priority)
- Add the possibility of colouring additional keywords that are not used to filter the dataset

In [5]:
# PARAMETERS

# filter_words = "Tmachine learning and (cardi or heart or vent)"  # Filter: T = Title, A = Abstract, K = keywords; TA = occurs in title or keywords etc.; no T,A,K == TAK

# Datasets
# - ML or PINN
# filter_words = '''
# "neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR "statistical learning" OR lstm OR TITLE-ABS-KEY( "neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning")
# '''

# Datasets
# - ML in Medicine
# filter_words = '''
# (title:("neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "statistical learning" OR lstm) OR title:("neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" ))
# '''

# SIM + HEART => for highlighting only
# filter_words = '''
# simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method" OR *cardi*  OR  heart  OR  *ventric*  OR  coronary  OR  *atria*  OR  atrium
# '''

# ML + SIM (Engineering, Medicine)
# Terms highlighted in blue in the titles, abstracts, and keywords further down in this cell.
# The string can be pasted straight from a Scopus/Lens query: quotes, '*', parentheses, the
# boolean operators, and field labels such as TITLE-ABS-KEY are stripped before the terms are used.
highlight_words = '''
    "neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR "statistical learning" OR lstm OR "neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" OR simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  OR  
"computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method" OR pinn
'''

# ML + SIM (Heart)
# (commented-out alternative; every line of the string must carry a '#', otherwise the
#  continuation line is parsed as code and the cell fails with a SyntaxError)
# highlight_words = '''
#     "neural network" OR "reinforcement learning" OR "machine learning" OR "deep learning" OR "transformer model" OR "BERT" OR "GPT" OR "adversarial network" OR "gan" OR "natural language processing" OR "word embedding" OR "document embedding" OR "sentence embedding" OR "transfer learning" OR "ensemble learning" OR "learning algorithm" OR "genetic algorithm" OR "evolutionary algorithm" OR "support vector machine" OR "decision tree" OR "bayesian network" OR "q-learning" OR "long short-term memory" OR "classification model" OR "classification algorithm" OR "ann" OR "clustering algorithm" OR "feature extraction" OR "anomaly detection" OR "inference engine" OR "k nearest neighbour" OR "cluster analysis" OR "linear regression" OR "hidden markov" OR perceptron OR "random forest" OR "support vector regression" OR cnn OR rnn OR "predictive model" OR "logistic regression" OR "statistical learning" OR lstm OR "neural differential" OR "neural ordinary" OR "neural ODE" OR "data-driven model" OR "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" OR simulation  OR  "numerical method"  OR  "numerical model"  OR  "navier stokes"  OR  "system dynamics"  OR  "numerical experiment"  OR  fem  OR  turbulence  OR  "numerical analysis"  OR  "multiagent"  OR  "multi-agent"  OR  "surrogate"  OR  pde  OR  "partial differential equation"  OR  "computational fluid"  OR  "computational model*"  OR  "computational method*"  OR  "computational framework"  OR  "computational approach"  OR  "computational experiment"  OR  "computational mechanic*"  OR  "computational technique"  OR  "computational study"  OR  "computational analysis"  OR  "computational science"  OR  "computational electro*"  OR  "computational material"  OR  "computational biomech*"  OR  "computational physics"  OR  "computational research"  
# OR  "computational engineering"  OR  "finite element"  OR  "finite difference"  OR  "finite volume"  OR  "boundary element method"  OR  "discrete element method"  OR  "meshfree method"  OR  "mesh free method"  OR  "meshless method"  OR  "particle hydrodynamics"  OR  "dissipative particle dynamics"  OR  "particle method" OR pinn OR *cardi*  OR  heart  OR  *ventric*  OR  coronary  OR  *atria*  OR  atrium
# '''

# filter_words = '''
#     "physics-informed" OR "physics-constrained" OR "physics-embedded" OR "physics-inspired" OR "physics-aware" OR "physics-enhanced" OR "hidden physics" OR "differentiable physics" OR "scientific machine learning" OR "physics machine learning" OR "inverse model" OR pinn
# '''

filter_words = "surrogate model"

# Terms listed here (upper case) are matched case-sensitively by the row filter below
# filter_acronyms = ['PINN', 'GAN', 'ANN', 'BERT']
filter_acronyms = []

filter_fos = ""

highlights_only = False      # skip the filtering; useful if the dataset is very large and already filtered
highlight_partial = True    # highlight partial matches too

display_html = True
max_html_rows = 20

# NOTE(review): data_dir is defined but the CSV below is read from results_dir; both hold the
# same value at the moment -- confirm which one is intended for the input file
data_dir = '/results/'
results_dir = '/results/'

data_src = 'csv_load'  # data sources: 'csv_load', 'biblio_df', 'csv_loaded'
read_biblio_csv = 'scopus_lens_ml_sim_engineering_all_st_plus_abstract_topics'

write_biblio_highlights_file = 'scopus_lens_ml_sim_engineering_all_highlights_plus_abstract_topics'

write_csv = False  # just the filtered publications without highlights; use e.g. for merging with other datasets
write_html = False
write_xlsx = True

# --------------------------------------------

# Keep the dataframe cached by a previous 'csv_load' run alive when this cell is re-executed.
# Resetting it unconditionally would make data_src = 'csv_loaded' always start from an empty dataframe.
if 'copy_of_biblio_highlights_df' not in globals():
    copy_of_biblio_highlights_df = pd.DataFrame()

def _parse_list_cell(cell):
    """Turn the CSV text of a list cell, e.g. "['a', 'b']", into a list of strings.

    Cells that are not strings (e.g. NaN for an empty CSV cell) yield an empty list.
    """
    if not isinstance(cell, str):
        return []
    return [i.strip().strip("'") for i in cell.strip('[]').split(',')]

if data_src == 'csv_load':  # for large or datasets further processed elsewhere, we read the bibliographic dataset from file

    print(f'\nReading the input file {read_biblio_csv}.csv ...')

    biblio_highlights_df = pd.read_csv(root_dir + results_dir + read_biblio_csv + '.csv') #.head(100)

    # Convert the search_title and search_abs into lists
    for list_col in ['search_title', 'search_abs']:
        biblio_highlights_df[list_col] = biblio_highlights_df[list_col].apply(_parse_list_cell)

    # Cache the freshly loaded data for later runs with data_src = 'csv_loaded'
    copy_of_biblio_highlights_df = biblio_highlights_df.copy()

elif data_src == 'biblio_df':   # use the biblio_df dataset from the previous cells

    biblio_highlights_df = biblio_df.copy()

elif data_src == 'csv_loaded': # for large datasets, use biblio_highlights_df if the CSV file was previously loaded

    if copy_of_biblio_highlights_df.empty:
        raise Exception('data_src is "csv_loaded" but no CSV data has been loaded yet; run this cell once with data_src = "csv_load"')

    biblio_highlights_df = copy_of_biblio_highlights_df.copy()
else:
    raise Exception(f'The variable data_src cannot have the value "{data_src}"')

n_pubs = len(biblio_highlights_df)

def highlight_selected_text(text, search_terms, color, i_row, partial = None):
    """
    Wrap every occurrence of the search terms in `text` in a bold, coloured <span>.

    All terms are matched case-insensitively and literally (regex metacharacters in a
    term have no special meaning). Text that already sits inside a <span>...</span>
    from an earlier call is left untouched, so highlights are never nested and the
    markup itself is never matched.

    Parameters:
        text: the cell content; converted with str() first.
        search_terms: iterable of terms to highlight; empty strings are ignored.
        color: CSS colour name used in the span style.
        i_row: row position, only used for the progress printout (every 100th row).
        partial: True to highlight the whole word that contains a term, False to
            highlight from the start of a term to the end of that word; None (default)
            reads the notebook-level setting highlight_partial.

    Returns:
        The text with the highlight markup inserted.
    """
    text = str(text)

    if(i_row % 100 == 0): # print row index
        print(f'{i_row}', end = '\r')

    # Longest terms first so that a multi-word term is preferred over a shorter one it contains
    terms = sorted({k for k in search_terms if k}, key = lambda k: (-len(k), k))

    if not terms:
        return text

    if partial is None:
        partial = highlight_partial

    # One alternation for all terms: a single pass cannot re-match inside its own markup
    alternation = '|'.join(re.escape(k) for k in terms)

    if partial:
        pattern = r"\b\w*(?:{})\w*\b".format(alternation)
    else:
        pattern = r"\b(?:{})[\w-]*".format(alternation)

    regex = re.compile(pattern, flags = re.IGNORECASE)

    def wrap(match):
        return f'<span style="color: {color}; font-weight: bold">{match.group()}</span>'

    # The capturing group keeps existing spans in the result at the odd positions;
    # only the text between them (even positions) is searched
    segments = re.split(r'(<span\b[^>]*>.*?</span>)', text, flags = re.DOTALL)

    return ''.join(segment if position % 2 else regex.sub(wrap, segment)
                   for position, segment in enumerate(segments))

def explode_filter_kws(kw):
    """
    Expand a filter term into one single-label term per search location.

    A term consisting of up to three leading capital letters followed by text without
    capitals is rewritten as '(<label1><term> or <label2><term> ...)', using the leading
    capitals as labels. A term without leading capitals gets all three labels T, A, and K.
    Terms with exactly one leading capital, and terms that do not fit the pattern, are
    returned unchanged.
    """
    parsed = re.match(r'^([A-Z]{0,3})([^A-Z]+)$', kw)

    if not parsed:
        return kw

    labels = parsed.group(1)

    if len(labels) == 1:
        return kw

    if len(labels) == 0:
        labels = 'TAK'

    bare_term = kw.lstrip('TAK')

    return '(' + ' or '.join(label + bare_term for label in labels) + ')'

# Expand the search terms in the boolean filter string that are not prepended with T/A/K
def expand_search_terms(my_filter):
    """
    Tokenise a boolean filter string.

    Quotes and '*' are removed, then the string is cut at parentheses and at the
    operators and/or (lower or upper case). Returns the list of tokens in their
    original order: search terms (stripped of surrounding whitespace), operators,
    and parentheses. Search terms that end up next to each other are merged into
    one term, separated by a space.
    """
    operators = ['or', 'and', 'OR', 'AND', '(', ')']

    cleaned = my_filter.replace('"', '').replace("'", '').replace("*", '')
    raw_tokens = re.split(r'(\(|\)|\b(?:and|or|AND|OR)\b)', cleaned)
    tokens = [token.strip() for token in raw_tokens if token.strip()]

    result = []
    pending_terms = []   # consecutive search terms waiting to be merged

    for token in tokens:
        if token in operators:
            if pending_terms:
                result.append(' '.join(pending_terms))
                pending_terms = []
            result.append(token)
        else:
            pending_terms.append(token)

    # A search term at the very end has no operator after it to flush it
    if pending_terms:
        result.append(' '.join(pending_terms))

    return result

# Strings to remove from filter_words and highlight_words (this allows direct copying of search terms from Scopus and Lens).
# Longer labels come first: removing 'TITLE' before 'TITLE-ABS' would leave a stray '-ABS' behind.
strings_to_remove = ['TITLE-ABS-KEY', 'TITLE-ABS', 'TITLE', 'KEY', 'title:', 'abstract:', 'keyword:']

# biblio_highlights_df = biblio_highlights_df.head(100)
# display(biblio_highlights_df)

# Filter the titles, abstracts, and/or keywords with filter_words
if not highlights_only:

    for string_to_remove in strings_to_remove:
        filter_words = filter_words.replace(string_to_remove, '')

    # Remove quotes from search term
    filter_words = re.sub(r'[\"\']', '', filter_words)

    # Parse the filter and expand unlabeled (T,A,K) terms to include all labels T, K, A
    filter_exp_list = expand_search_terms(filter_words)
    filter_exp = ' '.join([explode_filter_kws(x) if x not in ['(', ')', 'and', 'or', 'AND', 'OR'] else x.lower() for x in filter_exp_list])

    # Filter the dataframe using the filter provided above.
    # NOTE: this definition replaces the evaluate_expression(text, expression) helper of the
    # keyword cell in the notebook namespace; re-run that cell before repeating the keyword analysis.
    def evaluate_expression(row, filter):
        """
        Decide whether a publication (dataframe row) matches the boolean filter.

        Parameters:
            row: a dataframe row with the columns 'title', 'abstract', 'kws' (and 'fos'
                when filter_fos is set).
            filter: the expanded boolean filter; every term carries a T, A, or K label
                that selects title, abstract, or keywords.

        Returns:
            True if the row passes the field-of-study filter (if any) and the boolean
            filter, False otherwise.
        """
        if(row.name % 100 == 0): # print row index
            print(f'{row.name}', end = '\r')

        # Apply the Field of Study filter first (single keyword only at this moment)
        if filter_fos != "":

            fos_list = str(row['fos']).split(';')

            if not any(filter_fos.lower() in s.lower().strip() for s in fos_list):
                return False

            if filter == "":
                return True

        # Text to search for each label
        field_texts = {'T': str(row['title']), 'A': str(row['abstract']), 'K': str(row['kws'])}

        words = [x for x in expand_search_terms(filter) if x not in ['(', ')', 'and', 'or']]  # remove the parantheses, 'and' and 'or'

        # Prepend TAK to filter keywords that do not start with T, A, or K
        words = ["TAK" + string if string[0] not in ['T', 'A', 'K'] else string for string in words]

        # Truth value of every labelled term, keyed by the text it has in the filter string
        values = {}

        for label, field_text in field_texts.items():
            for word in (string.lstrip('TAK') for string in words if label in string[:3]):
                if word.upper() in filter_acronyms:
                    # Acronyms are matched case-sensitively in their upper-case form
                    values[label + word] = word.upper() in field_text
                else:
                    values[label + word] = word.lower() in field_text.lower()

        # Substitute the longest keys first so that a term that is the beginning of a
        # longer term (e.g. 'Tnet' / 'Tnetwork') cannot corrupt it
        expression = filter

        for key in sorted(values, key = len, reverse = True):
            expression = expression.replace(key, str(values[key]))

        # NOTE(review): eval() runs the substituted filter string; acceptable for a filter typed
        # into this notebook, but do not feed it strings from untrusted sources
        return eval(expression)

    print(f'Filtering the dataframe...')

    # Apply the keyword filter
    if filter_exp != "" or filter_fos != "":
        biblio_highlights_df = biblio_highlights_df[biblio_highlights_df.apply(evaluate_expression, filter = filter_exp, axis = 1)].copy()

    print(f"\n{len(biblio_highlights_df)} filter matches of a total of {n_pubs} publications")

# Strip field labels, quotes, and stars from highlight_words and lower-case what is left
for string_to_remove in strings_to_remove:
    highlight_words = highlight_words.replace(string_to_remove, '')

highlight_words = re.sub(r'[\"\']', '', highlight_words).replace("*", '').lower()

# Cut the string at parentheses and boolean operators, dropping empty pieces
highlights_as_list = [
    piece.strip()
    for piece in re.split(r'(\(|\)|\b(?:and|or)\b)', highlight_words)   # extract all the words used in the filter
    if piece.strip()
]

# All words in highlight_words
# words_filter_all = expand_search_terms(highlight_words)  # extract all the words used in the filter
# words_filter_all = [x.lstrip('TAK') for x in words_filter_all if x not in ['(', ')', 'and', 'or']]  # remove the parantheses, 'and' and 'or'
# words_filter_all = list(set(words_filter_all))

# Keep the unique terms only; parentheses and operators are not terms
highlights_as_list = list({piece for piece in highlights_as_list if piece not in ['(', ')', 'and', 'or']})

# Words in the search term that are not in the filter (to avoid nested <span> tags; the Excel creator function below doesn't like it)
search_term_lists = biblio_highlights_df['search_title'] + biblio_highlights_df['search_abs']
words_st = {x.lower() for lst in search_term_lists for x in lst}
words_diff_st = [x for x in words_st - set(highlights_as_list) if x]     # skip empty items

# biblio_highlights_df = biblio_highlights_df.head(10)

# Highlight the keywords used in the search terms, show the result, and export it
if not biblio_highlights_df.empty:
    biblio_no_highlights_df = pd.DataFrame()

    if write_csv:
        biblio_no_highlights_df = biblio_highlights_df.copy()   # filtered dataset without the highlight markup

    # Two passes over title, abstract, and keywords: the filter keywords in blue first,
    # then the original search terms (minus the filter keywords) in red
    highlight_passes = [('filter keywords', highlights_as_list, 'blue'),
                        ('original search terms', words_diff_st, 'red')]
    highlight_targets = [('title', 'titles'), ('abstract', 'abstracts'), ('kws', 'keywords')]
    first_message = True

    for pass_label, pass_terms, pass_color in highlight_passes:
        for col, col_label in highlight_targets:
            # Every message but the first starts on a new line (the progress counter ends with '\r')
            print(('' if first_message else '\n') + f'Highlighting the {pass_label} in the {col_label}...')
            first_message = False

            # enumerate() supplies the row position for the progress printout directly
            biblio_highlights_df[col] = [highlight_selected_text(value, pass_terms, pass_color, i_row)
                                         for i_row, value in enumerate(biblio_highlights_df[col])]

    print(f'Saving results...')

    if display_html:
        display(HTML(biblio_highlights_df[['title', 'abstract', 'kws', 'fos']].iloc[:max_html_rows].to_html(escape = False)))

    if write_csv:
        print(f'Saving file {write_biblio_highlights_file}.csv ...')
        biblio_no_highlights_df.to_csv(root_dir + results_dir + write_biblio_highlights_file + '.csv', index = False)

    if write_html:
        print(f'Saving file {write_biblio_highlights_file}.html ...')
        # Explicit UTF-8 so that non-ASCII characters do not depend on the platform's default encoding
        with open(root_dir + results_dir + write_biblio_highlights_file + '.html', 'w', encoding = 'utf-8') as f:
            f.write(str(HTML(biblio_highlights_df[['id', 'title', 'abstract', 'kws', 'fos', 'year', 'source']].to_html(escape = False)).data))

    if write_xlsx:

        # Define the rich text formattings
        bold_red = InlineFont(b = True, color = '00FF0000')
        bold_blue = InlineFont(b = True, color = '000000FF')

        # Create a new workbook
        wb = Workbook()

        # Create a new sheet with the name 'TAK Highlights'
        ws = wb.create_sheet('TAK Highlights')

        # Drop the default sheet of a new workbook, if there is one
        if 'Sheet' in wb.sheetnames:
            wb.remove(wb['Sheet'])

        # Make a copy of biblio_highlights_df
        tak_excel_df = biblio_highlights_df.copy()

        # Topic columns are only written when the dataset carries a topic number
        has_topics = 'tp_num' in tak_excel_df.columns

        # Rearrange columns
        if not has_topics:
            new_order = ['title', 'abstract', 'year', 'cited', 'source', 'kws',
                         'search_label', 'authors', 'search_title', 'search_abs',
                         'lit_review']
        else:
            new_order = ['title', 'abstract', 'year', 'cited', 'tp_num', 'tp_name',
                         'top_n_words', 'prob', 'representative', 'source', 'kws',
                         'search_label', 'authors', 'search_title', 'search_abs',
                         'lit_review']
            tak_excel_df = tak_excel_df.sort_values(by = ['tp_num', 'prob'], ascending = [True, False])

        tak_excel_df = tak_excel_df.reindex(columns = new_order)

        # Excel column headers
        for j, header in enumerate(tak_excel_df.columns):
            ws.cell(row = 1, column = j + 1, value = str(header))

        def split_string_at_span(string):
            """Split a string into plain-text pieces and '<span...>...</span>' pieces, in order.

            Whitespace-only pieces are dropped. DOTALL keeps newlines inside the pieces
            instead of silently dropping them.
            """
            lst = re.findall(r"(.*?)(<span.*?/span>|$)", string, flags = re.DOTALL)
            lst = [elem for tup in lst for elem in tup]
            lst = [x for x in lst if x.strip()]
            return lst

        def replace_span_with_textblock(lst):
            """Replace every red/blue '<span...>text</span>' item of the list with a TextBlock.

            The list is modified in place and returned. Plain-text items are left as they are.
            """
            span_fonts = [('<span style="color: red; font-weight: bold">', bold_red),
                          ('<span style="color: blue; font-weight: bold">', bold_blue)]

            is_prev_kw = False  # need to add a space between two consecutive keywords

            for i, item in enumerate(lst):
                span_font = next((font for marker, font in span_fonts if marker in item), None)

                if span_font is None:
                    is_prev_kw = False
                    continue

                # The highlighted text sits between the first '>' and the following '<'
                text = (' ' if is_prev_kw else '') + item.split('>')[1].split('<')[0]
                lst[i] = TextBlock(span_font, text)
                is_prev_kw = True

            return lst

        highlight_cols = ['title', 'abstract', 'kws']
        num_value_cols = ['tp_num', 'prob']

        # Turn each highlighted cell into a list of plain strings and TextBlocks
        for col in highlight_cols:
            tak_excel_df[col] = tak_excel_df[col].apply(split_string_at_span).apply(replace_span_with_textblock)

        # Loop through rows and columns of the dataframe
        for i in range(len(tak_excel_df)):
            for j, col in enumerate(tak_excel_df.columns):

                if col in highlight_cols:
                    rs = CellRichText(tak_excel_df.iloc[i, j])
                elif col in num_value_cols:
                    rs = tak_excel_df.iloc[i, j]    # keep numbers as numbers
                else:
                    rs = str(tak_excel_df.iloc[i, j])

                ws.cell(row = i + 2, column = j + 1, value = rs)

        # Column widths from column A onwards, and the columns whose text is wrapped
        if not has_topics:
            col_widths = [40, 80, 6, 6, 30, 40, 25, 25, 25, 25, 6]
            wrap_col = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
        else:
            col_widths = [40, 80, 6, 6, 8, 35, 30, 8, 6, 30, 35, 25, 25, 25, 25, 6]
            wrap_col = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'J', 'K', 'L', 'M', 'N', 'O']

        for col_letter, col_width in zip('ABCDEFGHIJKLMNOP', col_widths):
            ws.column_dimensions[col_letter].width = col_width

        for col in wrap_col:
            for cell in ws[col]:
                cell.alignment = Alignment(wrap_text = True)

        ws.sheet_view.zoomScale = 140
        ws.freeze_panes = 'A2'

        print(f'Saving file {write_biblio_highlights_file}.xlsx ...')
        wb.save(root_dir + results_dir + write_biblio_highlights_file + '.xlsx')

else:
    print(f"No matches found for boolean search: {filter_words}")

print(f'DONE!')


Reading the input file scopus_lens_ml_sim_engineering_all_st_plus_abstract_topics.csv ...
Filtering the dataframe...
73500
2684 filter matches of a total of 73557 publications
Highlighting the filter keywords in the titles...
2600
Highlighting the filter keywords in the abstracts...
2600
Highlighting the filter keywords in the keywords...
2600
Highlighting the original search terms in the titles...
2600
Highlighting the original search terms in the abstracts...
2600
Highlighting the original search terms in the keywords...
Saving results...


Unnamed: 0,title,abstract,kws,fos
19,A CNN based surrogate model of isogeometric analysis in nonlocal flexoelectric problems,We proposed a convolutional neural network CNN -based surrogate model to predict the nonlocal response for flexoelectric structures with complex topologies. The input i.e. the binary images for the CNN is obtained by converting geometries into pixels while the output comes from simulations of an isogeometric IGA flexoelectric model which in turn exploits the higher-order continuity of the underlying non-uniform rational B-splines NURBS basis functions to fast computing of flexoelectric parameters e.g. electric gradient mechanical displacement strain and strain gradient. To generate the dataset of porous flexoelectric cantilevers we developed a NURBS trimming technique based on the IGA model. As for CNN construction the key factors were optimized based on the IGA dataset including activation functions dropout layers and optimizers. Then the cross-validation was conducted to test the CNNgeneralization ability. Last but not least the potential of the CNN performance has been explored under different model output sizes and the corresponding possible optimal model layout is proposed. The can be instructive for studies on deep learning of other nonlocal mech-physical simulations. 
2022 The Author.,"trimming,network-based,isogeometric analysis,convolutional neural networks,trimming techniques,topology,deep learning,rational functions,nurbs trimming technique,convolution,binary images,nonlocal flexoelectricity,flexoelectric,flexoelectricity,convolutional neural network,non-uniform rational b-splines,non-uniform rational b-spline trimming technique,interpolation,nonlocal",Isogeometric analysis; Convolutional neural network; Computer science; Surrogate model; Generalization; Basis function; Topology (electrical circuits); Algorithm; Applied mathematics; Mathematics; Mathematical optimization; Artificial intelligence; Mathematical analysis; Structural engineering; Finite element method; Combinatorics; Engineering
27,A PINN surrogate modeling methodology for steady state integrated thermofluid systems modeling,Physics-informed neural networks PINNs were developed to overcome the limitations associated with the acquisition of large training data sets that are commonly encountered when using purely data-driven machine learning methods. This paper proposes a PINN surrogate modeling methodology for steady-state integrated thermofluid systems modeling based on the mass energy and momentum balance equations combined with the relevant component characteristics and fluid property relationships. The methodology is applied to two thermofluid systems that encapsulate the important phenomena typically encountered namely a heat exchanger network with two different fluid streams and components linked in series and parallel and ii a recuperated closed Brayton cycle with various turbomachines and heat exchangers. The generated with the PINN models were compared to benchmark solutions generated via conventional physics-based thermofluid process models. The largest average relative errors are 0.17 and 0.93 for the heat exchanger network and Brayton cycle respectively. It was shown that the use of a hybrid Adam-TNC optimizer requires between 180 and 690 fewer iterations during the training process thus providing a significant computational advantage over a pure Adam optimization approach. The resulting PINN models can make predictions 75 to 88 times faster than their respective conventional process models. This highlights the potential for PINN surrogate models as a valuable engineering tool in component and system design and optimization as well as in real-time simulation for anomaly detection diagnosis and forecasting.,,
38,A bayesian based inspection monitoring data fusion approach for historical buildings and its post earthquake application to a monumental masonry palace,Many countries exposed to high levels of seismic risk including Italy are facing a huge challenge in promptly quantifying post-earthquake damages to their built historical heritage. In this context structural health monitoring plays a fundamental role allowing to continuously track changes in selected damage-sensitive features. However monitoring data interpretation is often not univocal and may be affected by large uncertainty provoking false positives and false negatives. Hence this research proposes a novel approach for post-earthquake structural condition assessment by exploiting the aggregation of different sources of information notably steering from both monitoring and visual inspection campaigns in order to take risk-informed decisions. More in depth an automatic tool is proposed to detect and locate structural damages in monumental structures with the aid of a data fusion approach including vibration-based system identification static and dynamic measurements finite element FE and surrogate modeling Bayesian-based model updating and visual inspections. In a preliminary phase potential damage-sensitive regions in the structure are identified through FE-based numerical analysis and engineering judgment. Then the solution of the inverse problem aimed at deriving the Bayesian posterior statistics of the uncertain parameters is entrusted to a computational-effective surrogate model SMFinally the Bayesian-based updated parameters are adjusted considering the different allowable sources of information to achieve a final assessment. 
The effectiveness of the proposed approach is demonstrated by using the recorded data acquired in the Consoli Palace an historical building located in Umbria central Italy which has been continuously monitored since 2017 using dynamic static and environmental sensors and which has been hit by low-intensity earthquakes in 2021. 2022 The Authorunder exclusive licence to Springer Nature B.V.,"bayesian,risk assessment,continuous monitoring,bayesian networks,historic masonry construction,inverse problems,uncertainty analysis,historic masonry,data interpretation,earthquake damages,historical buildings,earthquakes,data fusion,finite element method,masonry materials,masonry construction,damage detection,damage classification,masonry,umbria,bayesian model updating,health monitoring,building,monitoring,post-earthquake damage classification,surrogate modeling,structural health monitoring,italy",
41,A bayesian optimization algorithm for the optimization of mooring system design using time domain analysis,Dynamic analysis can consider the complex behavior of mooring systems. However the relatively long analysis time of the dynamic analysis makes it difficult to use in the design of mooring systems. To tackle this we present a Bayesian optimization algorithm BOA which is well known as fast convergence using a small number of data points. The BOA evaluates design candidates using a probability-based function which is updated during the optimization process as more data points are achieved. In a case study we applied the BOA to improve an initial mooring system that had been designed by human experts. The BOA was also compared with a genetic algorithm GA that used a pre-trained surrogate model for fast evaluation. The optimal designs that were determined by both the BOA and GA have a 50 lower maximum tension than the initial design. However the computation time of the GA needed 20 times more than that of the BOA because of the training time of the surrogate model.,,
42,A beam on elastic foundation method for predicting deflection of braced excavations considering uncertainties,Predicting wall deflection is important to provide a critical reference to evaluate the current construction conditions and prevent potential damage risks of adjacent facilities during excavations. This paper presents a combination of a beam on elastic foundation model BEFM and the Bayesian framework to realize effective probabilistic predictions of wall deflection at various depths in braced excavations. First a finite element solving algorithm to calculate wall deflection for the BEFM is developed and incorporated into the Bayesian framework. Next the most suitable distribution pattern for soil resistance and an appropriate set of uncertain parameters in the BEFM are determined through the application of the Bayesian model selection technique. Meanwhile the uncertain parameters are updated. A prediction is then made using the optimal model and corresponding posterior probability distributions of the updated parameters at each stage. The parameter updating and prediction process are repeated as additional field observations become available during construction. The performance of the proposed method is examined using a field case study. The show that this method provides a satisfactory approach to predict both the magnitudes and profile of deflection when considering uncertainties. Additionally comparisons with a Bayesian updating framework using a surrogate model i.e. the KJHH model indicate higher updating efficiency of the developed method. 
2022 John Wiley Sons Ltd.,"wall deflection,uncertain parameters,bayesian networks,deflection (structures),beam on elastic foundation,elastic foundation model,beam on elastic foundation method,bayesian frameworks,uncertainty,forecasting,uncertainty analysis,wall,probability,bayesian analysis,braced excavations,finite element method,probability distributions,bayesian probabilistic method,excavation,bayesian probabilistic methods,current construction,wall deflections,prediction,beams on elastic foundation",
67,A comparative study of learning techniques for the compressible aerodynamics over a transonic RAE2822 airfoil,In this study the modeling of the compressible pressure field on the RAE 2822 airfoil using deep learning DL is investigated. The is to generate at low cost the complete Mach envelope from a given aerodynamic database. A dataset with more than 2.000 RANS simulations at various angles of attack and Mach numbers has been created and validated to train fast and accurate surrogate models model of the pressure field. The procedure is inspired by the well-known analytical Prandtl Glauert transformation which maps solutions from one Mach number to another but however fails in the transonic regime because of the strong non-linearities due to shock waves. Thus instead of a classical direct generation of the pressure fields DL is used here to perform a geometric transformation from an incompressible pressure field towards a compressible solution. The key question addressed in this study is how accurate and sample efficient are DL techniques compared with classical surrogate modeling tools To answer this question two types of DL are investigated CNN and GCNN and compared with Proper Orthogonal Decomposition POD coupled to a Gaussian Process Regression GPRThese methods are trained on various input resolutions and database sizes. show that DL models are able to accurately predict the intensity and location of the shock over a wide range of angles of attack and Mach numbers up to the transonic regime. Predictions of the lift coefficient reveal a relative error lower thanwith respect to the high-fidelity data. Because of their inherent nonlinear nature CNN and GCNN provide more accurate even for small training datasets whereas POD encounters difficulties to reconstruct properly shock waves. 
Since CNN requires inputs and outputs in a pixel-like format it suffers from interpolation errors that can only be mitigated with fine resolutions making the training more difficult. By performing convolutions directly on unstructured data GCNN eliminates this interpolation error and provides the best accuracy and sample efficiency. This study shows that advanced DL techniques such as CNN and GCNN are capable of predicting complex flows and outperform classical tools on such tasks. 2022 Elsevier Ltd,"rae2822,learning techniques,shock waves,shock-waves,transonic regime,forecasting,errors,proper orthogonal,deep learning,incompressible flow,mach number,orthogonal decomposition,cfd,computational fluid dynamics,airfoils,interpolation error,pressure-field,angle of attack,principal component analysis,interpolation,surrogate modeling,transonic aerodynamics",
87,A computational strategy for determining the optimal scaled wind speed in icing wind tunnel experiments,A new computational strategy for determining the optimal scaled wind speed in icing wind tunnel experiments. Icing numerical computation is involved in icing scaling analysis. A high-efficient surrogate model and Differential Evolution Genetic Algorithm are used in the method. The new method shows improvement in finding reasonable scaling icing parameters. The icing wind tunnel experiment is one of the most important methods to investigate icing problems. Due to the blockage or capability limitations of icing wind tunnels the geometric size of the testing model is usually needed to be sub-scaled and the corresponding icing conditions are required to be converted. However the icing phenomenon is affected by various parameters and how to determine the optimal subscale wind speed remains inconclusive. To solve this problem a new computational strategy named Improved Ruff Icing Scaling Method IRISM is proposed. In IRISM the Ruff icing scaling theory is firstly applied to calculate the basic sub-scale temperature and the related cloudy parameters etc. and then the icing numerical computation is utilized to evaluate the influence of sub-scale icing conditions on the similarity of the reference and subscale ice shapes. Finally the optimal scaled wind speed is calculated by Differential Evolution Genetic Algorithm based on a high-efficient surrogate model in which the ice shape consistency especially the ice horn structure is set as the optimizationThe IRISM is comprehensively analyzed by icing numerical computations as well as icing wind tunnel experiments. The show that the IRISM is capable of automatically providing the optimal scaled wind speed with the acceptable icing scaling error. 
As an important supplement to the current icing scaling methods the IRISM can find the reasonable experimental parameters of scaled icing wind tunnel tests which provides a new way for icing scaling analysis.,,Wind tunnel; Meteorology; Environmental science; Wind speed; Computational fluid dynamics; Icing; Marine engineering; Computer science; Mechanics; Physics; Engineering
88,A computationally efficient high fidelity multi physics design optimization of traction motors for drive cycle loss minimization,Continuous improvement in performance of interior permanent magnet IPM machines is critical for electric vehicle traction applications. However due to the cross-coupling and saturation effects a significant amount of time-consuming finite element analysis FEA simulations are required to accurately estimate machine performance. Moreover iterative design optimization will take significantly longer. In this article an improved rapid performance estimation technique utilizing surrogate models is developed and coupled with a design optimization algorithm. The proposed framework has significantly less computational cost than alternative surrogate-based approaches and efficiently employs drive cycle loss minimization for a multi-physics multi- traction motor design optimization. Simulation and experimental suggest the proposed optimization framework yields optimal designs more efficiently than existing methods while maintaining accuracy. 1972-2012 IEEE.,"traction motors,computational modelling,predictive models,design optimization,drive cycle,multi-physic optimization,drive cycles,interior permanent magnet,iterative methods,optimisations,multi-physics,electric traction,couplings,electric losses,finite element method,multi-physics optimization,surrogate model,permanent magnets,surrogate modeling",
114,A data driven model of the yield and strain hardening response of commercially pure titanium in uniaxial stress,This study presents a technique to develop data-driven constitutive models for the elastic-plastic response of materials and applies this technique to the case of commercially pure titanium. The complex yield and strain hardening characteristics of this solid are captured for random non-monotonic uniaxial loading without relying on specific theoretical descriptions. The surrogate model is obtained by supervised machine learning relying on feed-forward neural networks trained with data obtained from random loading of titanium specimens in uniaxial stress. Uniaxial tests are conducted in strain control applying random histories of axial strain in the range 0.04 0.04 to prevent the occurrence of significant damage. The corresponding stress versus strain histories are subdivided into a finite number of increments and machine learning is applied to predict the change in stress in each increment. A suitable architecture of the data-driven model key to obtaining accurate predictions is presented. The predictions of the surrogate model are validated by comparing to experiments not used in the training process and compared to those of an established theoretical model. An excellent agreement is obtained between the measurements and the predictions of the data-driven surrogate model. 2023 The Authors,"hardening response,uniaxial stress,elastic-plastic response,machine-learning,machine learning,titanium compounds,learning systems,elastoplasticity,monotonics,plasticity,cyclic loading,tensile stress,feedforward neural networks,commercially pure titanium,forecasting,supervised learning,compressive stress,surrogate model,data driven,stress analysis,strain hardening,data-driven model,surrogate modeling",
123,A data driven tip flow loss prediction method for a transonic fan under boundary layer ingesting inflow distortion,In a boundary layer ingesting BLI propulsion system the fan blades need to operate continuously under large-scale inflow distortion. The distortion will lead to serious aerodynamic losses in the fan degrading the fan performance and the overall aerodynamic benefits of the aircraft. Therefore in the preliminary design of a BLI propulsion system it is necessary to evaluate the influence of the fuselage boundary layer under different flight conditions on the fan aerodynamic performance. However a gap exists in the current computational methods for BLI fan performance evaluations. The full-annulus unsteady Reynolds-averaged Navier-Stokes URANS simulations can provide reliable predictions but are computationally expensive for design iterations. The low-order computational methods are cost-efficient but rely on the loss models for accurate prediction. The conventional empirical or physics-based loss models show notable limitations under complex distortion-induced off-design working conditions in a BLI fan especially in the rotor tip region compromising the reliability of the low-order computational methods. To balance the accuracy and cost of loss prediction the paper proposes a data-driven tip flow loss prediction framework for a BLI fan. It employs a neural network to build a surrogate model to predict the tip flow loss at complex non-uniform aerodynamic conditions. Physical understandings of the flow features in the BLI fan are integrated into the datadriven modeling process to further reduce the computational cost and improve the methodapplicability. The data-driven prediction method shows good accuracy in predicting the overall values and radial distributions of fan rotor tip flow loss under various BLI inflow distortion conditions. 
Not only does it have higher accuracy than the conventional physics-based loss models but also needs much less computational time than the fullannulus time-accurate simulations. The present work has demonstrated a significant potential of data-driven approaches in complex aerodynamic loss modeling and will contribute to future BLI fan design. Copyright 2022 by ASME.,"bli fan,neural network,tip flow,forecasting,aerodynamics,loss prediction method,boundary layer ingesting fan,loss model,boundary layers,flow loss,loss prediction,navier stokes equations,neural-networks,data driven,fans,prediction methods,computational methods,tip flow loss",Aerodynamics; Distortion (music); Inflow; Computational fluid dynamics; Boundary layer; Computer science; Fuselage; Engineering; Aerospace engineering; Simulation; Marine engineering; Structural engineering; Mechanics; Physics; Electronic engineering; Amplifier; CMOS


Saving file scopus_lens_ml_sim_engineering_all_highlights_plus_abstract_topics.xlsx ...
DONE!


### Researchers, departments, and countries

The script extracts the author names and affiliations and creates tables with:
* Number of articles per country.
* Number of articles per institution/department.
* Number of articles per author.

The tables can be saved in a sheet of an Excel workbook.

**Parameters**
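* use_highlights_df: set to True to compute the counts from the filtered highlights table instead of the full bibliographic table.
* write_excel: set to True to generate the Excel file with the counts.
* file_xlsx_out: the output Excel file name (the '.xlsx' extension is added by the script).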

In [None]:
# PARAMETERS
use_highlights_df = False
write_excel = True
file_xlsx_out = 'country_dept_author_counts'
logger.setLevel(logging.INFO)
# --------------------------------------------

# Start from a copy of either the highlights table or the full table,
# restricted to the columns used for the counts
ctry_affil_auth_df = (
  titles_highlights_clean_df if use_highlights_df else biblio_df
)[['author', 'author_id', 'affiliation', 'title', 'year', 'source', 'cited']].copy() #.head(20)

# Collect every row whose title occurs more than once (all occurrences are flagged)
dup = ctry_affil_auth_df.duplicated(subset = ['title'], keep = False)
df_dup = ctry_affil_auth_df.loc[dup]

'''
print("DUPLICATE TITLES")
print("----------------")
print(df_dup['title'].value_counts())
'''

# Keep only the first row for each title
ctry_affil_auth_df = ctry_affil_auth_df.drop_duplicates(subset = ['title'], keep = 'first')

# Turn the delimited author, author id and affiliation strings into lists.
# Authors are comma-separated; ids and affiliations are semicolon-separated,
# and a trailing ';' on the id string is removed before splitting.
ctry_affil_auth_df['author'] = ctry_affil_auth_df['author'].map(lambda names: names.split(','))
ctry_affil_auth_df['author_id'] = ctry_affil_auth_df['author_id'].map(lambda ids: str(ids).rstrip(';').split(';'))
ctry_affil_auth_df['affiliation'] = ctry_affil_auth_df['affiliation'].map(lambda affils: str(affils).split(';'))

# Keep only the rows where the three lists have the same number of entries;
# otherwise authors, ids and affiliations cannot be paired up when exploding
n_authors = ctry_affil_auth_df['author'].map(len)
n_author_ids = ctry_affil_auth_df['author_id'].map(len)
n_affiliations = ctry_affil_auth_df['affiliation'].map(len)
ctry_affil_auth_df = ctry_affil_auth_df[(n_authors == n_author_ids) & (n_authors == n_affiliations)]

# Explode the list columns so that every author/affiliation pair gets its own row
ctry_affil_auth_df = (
  ctry_affil_auth_df
  .set_index(['title', 'year', 'source', 'cited'])
  .apply(lambda list_col: list_col.explode())
  .reset_index()
)

# Split each affiliation entry at the commas: the first two parts are joined
# into the author name, the last part is taken as the country, and whatever
# lies in between is kept as the institution/department
affil_parts = ctry_affil_auth_df['affiliation'].map(lambda entry: entry.split(','))
ctry_affil_auth_df['affiliation'] = affil_parts
ctry_affil_auth_df['affil_auth'] = affil_parts.map(lambda parts: ''.join(parts[0:2]))
ctry_affil_auth_df['affil_dept'] = affil_parts.map(lambda parts: ','.join(parts[2:-1]))
ctry_affil_auth_df['affil_country'] = affil_parts.map(lambda parts: parts[-1].strip())

# Order by country, then department, then author, and renumber the rows
ctry_affil_auth_df = (
  ctry_affil_auth_df
  .sort_values(by = ['affil_country', 'affil_dept', 'affil_auth'], ascending = [True, True, True], na_position = 'first')
  .reset_index(drop = True)
)

# Mark rows where affil_country is not a country

def country_name(country):
  """Return the name of the first fuzzy-search match for `country`.

  The lookup is done with pycountry.countries.search_fuzzy(). If it raises a
  LookupError (which also covers indexing an empty result list), the
  placeholder "no_country" is returned instead.
  """
  try:
    return pycountry.countries.search_fuzzy(country)[0].name
  except LookupError:
    return "no_country"


def is_country(country):
  """Return True if pycountry has a country whose `name` matches `country`.

  The check relies on pycountry.countries.get(name=...) returning None when
  there is no match; the result is compared by identity (`is not None`)
  rather than with `==`.
  """
  return pycountry.countries.get(name = country) is not None

# Flag the rows whose affil_country is an exact pycountry country name
ctry_affil_auth_df['is_country'] = ctry_affil_auth_df['affil_country'].apply(is_country)

# Run the fuzzy search once per distinct unrecognised value, then map the
# results back onto the main table; recognised countries are left unchanged
not_countries_df = (
  ctry_affil_auth_df.loc[~ctry_affil_auth_df['is_country'], ['affil_country']]
  .drop_duplicates(subset = ['affil_country'])
  .copy()
)
not_countries_df['fuzzy_search'] = not_countries_df['affil_country'].apply(country_name)
ctry_affil_auth_df = ctry_affil_auth_df.merge(not_countries_df, left_on = 'affil_country', right_on = 'affil_country', how = 'left')
ctry_affil_auth_df['affil_country'] = ctry_affil_auth_df['fuzzy_search'].fillna(ctry_affil_auth_df['affil_country'])
ctry_affil_auth_df = ctry_affil_auth_df.drop(columns = ['is_country', 'fuzzy_search'])

#display(ctry_affil_auth_df)
#display(not_countries_df)

'''
print(f"\nHEAD COUNTRY-AFFILIATION-AUTHOR DF")
print("-------------------------------------")
display(ctry_affil_auth_df[['title', 'year', 'source', 'cited', 'affil_auth', 'affil_dept', 'affil_country']].head())

# Entries where author names and those extracted from the affiliations don't coincide
print(f"\nAUTHOR AND AFFIL AUTHOR DO NOT COINCIDE")
print("------------------------------------------")
display(ctry_affil_auth_df[ctry_affil_auth_df['author'] != ctry_affil_auth_df['affil_auth']])
'''

# The raw list columns are no longer needed once the affil_* columns exist
ctry_affil_auth_df = ctry_affil_auth_df.drop(columns = ['affiliation', 'author', 'author_id'])
#display(ctry_affil_auth_df)

# Number of distinct titles per country, most frequent first
country_freq_df = (
  ctry_affil_auth_df.groupby('affil_country')['title'].nunique()
  .reset_index(name = 'count')
  .sort_values(by = ['count', 'affil_country'], ascending = [False, True])
  .reset_index(drop = True)
)

#display(country_freq_df)

# Number of distinct titles per institution (and its country), most frequent first
dept_freq_df = (
  ctry_affil_auth_df.groupby(['affil_dept', 'affil_country'])['title'].nunique()
  .reset_index(name = 'count')
  .sort_values(by = ['count', 'affil_dept'], ascending = [False, True])
  .reset_index(drop = True)
)

display(dept_freq_df)

# Number of rows per author (and country), most frequent first
auth_freq_df = (
  ctry_affil_auth_df.groupby(['affil_auth', 'affil_country']).size()
  .reset_index(name = 'freq')
  .sort_values(by = ['freq', 'affil_country', 'affil_auth'], ascending = [False, True, True])
  .reset_index(drop = True)
)


#display(auth_freq_df)

# Write the country, institution and author counts side by side into one
# Excel sheet. The workbook is created and saved under the same flag
# (write_excel), so the file written is always the workbook built here.
if write_excel:

  # Create a new workbook
  wb = Workbook()

  # Create a new sheet with the name 'Counts'
  ws = wb.create_sheet('Counts')

  # Remove the default sheet if the workbook has one; wb["Sheet"] would
  # raise a KeyError for a missing sheet, so test the sheet names instead
  if 'Sheet' in wb.sheetnames:
    wb.remove(wb['Sheet'])

  # Each table: (headings for row 1, dataframe with the counts, first worksheet column).
  # One empty column is left between consecutive tables.
  count_tables = [
    (["Country", "Count"], country_freq_df, 1),
    (["Institution", "Country", "Count"], dept_freq_df, 4),
    (["Author", "Country", "Count"], auth_freq_df, 8),
  ]

  for headings, freq_df, first_col in count_tables:
    # Create the headings in the sheet
    for offset, heading in enumerate(headings):
      ws.cell(row = 1, column = first_col + offset, value = heading)

    # Write the count results to the cells, starting below the heading row
    for i in range(len(freq_df)):
      for j in range(len(freq_df.columns)):
        ws.cell(row = i + 2, column = first_col + j, value = freq_df.iloc[i, j])

  col_widths = {'A': 20, 'B': 5, 'D': 70, 'E': 20, 'F': 5, 'H': 20, 'I': 20, 'J': 5}
  for col_letter, width in col_widths.items():
    ws.column_dimensions[col_letter].width = width

  ws.sheet_view.zoomScale = 140

  wb.save(root_dir + results_dir + file_xlsx_out + '.xlsx')

#ctry_affil_auth_df.to_excel('/drive/My Drive/Colab Notebooks/Scopus files/ELEM Hire ML in CE/scopus_ml_ce_PINN_out.xlsx', index=False)

# TODO
# - When authors are affiliated with multiple institutions, these are not split out in the results. Here is an
#   example: "Raghunath, S., Geisinger, Danville, PA, United States, Tempus Labs Inc., Chicago, IL, United States;"
#   It seems the only way to split this is to split the string on country name. I can do this using an array with all
#   country names.


## Sandbox
From this point onward, the cells are for playing around with code.

### keyBERT

In [None]:
from keybert import KeyBERT

# Sample abstract (punctuation already stripped) used to compare the extraction strategies
doc = """
         Since the creation of stock markets there have been attempts to predict their movements and new prediction methodologies have been devised According to a recent study when the Russell industry index starts to rise stocks belonging to the corresponding industry in other countries also rise accordingly Based on this empirical result this study seeks to predict the start date of industry uptrends using the Russell industry index The proposed model in this study predicts future stock prices using a denoising autoencoder DAE long short term memory LSTM model and predicts the existence and timing of future change points in stock prices through Pettitttest The of the empirical analysis confirmed that this proposed model can find the change points in stock prices within days prior to the start date of actual uptrends in selected industries This study contributes to predicting a change point through a combination of statistical and deep learning models and the methodology developed in this study could be applied to various financial time series data for various purposes.
      """
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

# Arguments shared by every extraction run
common_args = dict(keyphrase_ngram_range=(1, 2), stop_words='english', top_n=10)

# One entry per strategy: the heading to print and the strategy-specific arguments
strategies = [
    ('Baseline', {}),
    ('Max Sum Distance', dict(use_maxsum=True, nr_candidates=20)),
    ('Maximal Marginal Relevance', dict(use_mmr=True, diversity=0.5)),
]

print(doc)

# Run the strategies in order, showing each heading followed by its keywords
for heading, extra_args in strategies:
    print(heading)
    keywords = kw_model.extract_keywords(doc, **common_args, **extra_args)
    display(keywords)


### LDA topic modelling
This is a first implementation of LDA topic modelling, following the [video tutorial](https://www.youtube.com/watch?v=TKjjlp5_r7o) by William Mattingly. It uses the Scopus keywords to create the topic model.

In [None]:
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import warnings
import json

import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Lemmatise the keywords
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each text to the lemmas of its tokens with an allowed part of speech.

    Args:
        texts: iterable of strings to lemmatise.
        allowed_postags: part-of-speech tags (compared against ``token.pos_``)
            whose tokens are kept; tokens with any other tag are dropped.

    Returns:
        A list with one string per input text: the kept lemmas joined by
        single spaces (an empty string when no token qualifies).
    """
    # The parser and NER components are disabled when loading the pipeline
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return [
        " ".join(token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags)
        for text in texts
    ]

# Do some pre-processing on the keywords
def gen_words(texts):
    """Tokenise every text with gensim's ``simple_preprocess``.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one token list per input text, in the same order.
        ``deacc=True`` is passed to ``simple_preprocess`` for every text.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

# Create a bag-of-word representation of the keywords
def keywords_to_bow(words, dictionary=None):
  """Convert tokenised documents into their bag-of-words representation.

  Args:
    words: iterable of token lists, one list per document.
    dictionary: object providing ``doc2bow(tokens)``. When omitted, the
      notebook-level ``id2word`` dictionary is used.

  Returns:
    A list with one ``doc2bow`` result per document in ``words``.
  """
  # Fall back to the notebook-level dictionary so existing calls keep working
  if dictionary is None:
    dictionary = id2word
  # Iterate over the argument itself, not the notebook-level `data_words`
  return [dictionary.doc2bow(text) for text in words]


In [None]:
# PARAMETERS
filter_lda = "cardi or heart"  # boolean keyword filter; set to "" to use the keywords of the full dataset
# ------------------------

# Default to the keywords of the full dataset. This also keeps lda_kws_df defined
# (and never stale from an earlier run) when the exploded filter expression is empty.
lda_kws_df = titles_df[['kws']].copy()

if filter_lda != "":
  # Every term except parentheses and the boolean operators is passed through
  # explode_filter_kws; the pieces are then joined back into a single expression.
  filter_lda_exp_list = expand_search_terms(filter_lda)
  filter_lda_exp = ' '.join([explode_filter_kws(x) if x not in ['(', ')', 'and', 'or'] else x for x in filter_lda_exp_list])  # build the exploded string

  if filter_lda_exp != "":
    # Keep only the rows for which the boolean expression evaluates to True
    lda_kws_df = titles_df[titles_df.apply(evaluate_expression, filter = filter_lda_exp, axis = 1)].copy()
    #print(f'Scopus keywords of the dataframe filtered by the boolean expression: {filter_lda}')
    #display(lda_kws_df)

# Combine the multi-word keywords with underscores '_' and convert to a single string for each entry.
# Missing keywords (NaN) are treated as an empty string so that the split cannot fail on them.
lda_kws_df['kws'] = lda_kws_df['kws'].fillna('').str.split(',')
lda_kws_df['kws'] = lda_kws_df['kws'].apply(lambda x: [word.strip().replace(' ', '_') for word in x])
lda_kws_df['kws'] = lda_kws_df['kws'].apply(lambda x: ' '.join(x))

# Create a list of the keyword strings
data = lda_kws_df['kws'].to_list()

''' Remove this comment to run the topic model with William Mattingly's test file
# Get test json file
def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f) 
    return (data)

data = load_data('/content/drive/My Drive/Colab Notebooks/Biblio Analysis/Input_test/ushmm_dn.json')["texts"]
data = data[0:4]
'''
# Lemmatise the keywords
lemmatized_texts = lemmatization(data)

# High-count keywords that don't contribute much to the topics
exclude = ['article', 'adult', 'human', 'male', 'non', 'middle_age', 'female', 'age', 
           'high', 'single', 'child', 'aged', 'animal']
lemmatized_texts = [" ".join([word for word in sentence.split() if word not in exclude]) for sentence in lemmatized_texts]

# Do some pre-processing on the keywords
data_words = gen_words(lemmatized_texts)

# Generate the wordID-to-word dictionary
id2word = corpora.Dictionary(data_words)

# Create a bag-of-word representation of the keywords
corpus = keywords_to_bow(data_words)

# TODO
# - add the search terms to the keyword list; create two corpora: one with and one without the search terms
# - run the lemmatiser on the individual words of the multi-word keywords and remove stop words

In [None]:
# PARAMETERS
lda_num_topics = 4      # number of topics the model should extract
lda_random_state = 100  # fixed seed so that repeated runs produce the same topics
# ------------------------

# Compute LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=lda_num_topics,
                                           random_state=lda_random_state,
                                           update_every=1,   # update the model after every chunk (online training)
                                           chunksize=100,    # number of documents per training chunk
                                           passes=10,        # number of passes through the corpus
                                           alpha="auto")     # learn the document-topic prior from the corpus


In [None]:
# Visualise the topics model
# NOTE(review): newer pyLDAvis releases appear to have renamed `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models` — confirm against the installed version.
# Switch pyLDAvis to notebook mode before preparing the visualisation
pyLDAvis.enable_notebook()
# Build the visualisation from the fitted model, the bag-of-words corpus and the dictionary;
# mds and R are passed through to pyLDAvis (see its `prepare` documentation for their meaning)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# Leave `vis` as the last expression of the cell so that the notebook displays it
vis

### DEPRECATED Associated term clustering
I replaced this with the LDA topic model above.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# Work on a copy of the keyword column so that titles_df is left untouched
df = titles_df[['kws']].copy()

# Split the comma-separated keywords, join multi-word keywords with '_' and trim stray
# underscores. Missing keywords (NaN) are treated as an empty string so the split cannot fail.
df['kws'] = df['kws'].fillna('').str.split(',')
df['kws'] = df['kws'].apply(lambda x: [word.strip().replace(' ', '_').strip('_') for word in x])
# Show a sample only; dumping the whole frame floods the cell output
display(df.head())

# Convert the 'kws' column to a list of strings
docs = [' '.join(kws) for kws in df['kws']]

# Use CountVectorizer to create a co-occurrence matrix.
# '_' counts as a word character, so underscore-joined keywords stay single tokens.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
X = vectorizer.fit_transform(docs)
# '@' is always a matrix product; '*' is element-wise for SciPy sparse arrays
co_occurrence_matrix = X.T @ X

# Cluster the keywords based on their co-occurrence patterns.
# A fixed random_state makes the clustering reproducible between runs.
num_clusters = 10
kmeans = KMeans(n_clusters = num_clusters, n_init = 10, random_state = 100)
kmeans.fit(co_occurrence_matrix)

# Get the cluster labels and print the top keywords in each cluster
cluster_labels = kmeans.labels_
# Fetch the vocabulary once instead of once per keyword and cluster
feature_names = vectorizer.get_feature_names_out()

for i in range(num_clusters):
    cluster_keywords = [feature_names[idx] for idx, label in enumerate(cluster_labels) if label == i]
    print(f'Cluster {i+1}: {" | ".join(cluster_keywords[:100])}')

# TODO
# - Show the most frequent cluster keywords first. Currently they are in alphabetical order.
# - Remove low count keywords directly from df before doing any processing.
