# Utilities for bibliometric analysis

### Library imports

In [1]:
# Import libraries and modules

import os
import sys
import pandas as pd
import datetime
import logging
import re

from IPython.display import display
from itables import init_notebook_mode, show

### Configurations

In [2]:
%%capture

# Notebook-wide logger. Individual cells can switch its level to logging.INFO
# to get additional informative output.
logger = logging.getLogger(__name__)

# Initialise Interactive Tables (itables) with all_interactive = False:
# https://mwouts.github.io/itables/quick_start.html
init_notebook_mode(all_interactive = False)

# Raise the pandas display limits so that wide/long dataframes are shown in full
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 500),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

### Folders

In [3]:
# PARAMETERS

project = 'mabs_repsol'  # the project folder

# ----------------------------


# Set the root directory of the project and make sure its folder structure exists.
# When the notebook is started from the 'notebooks' folder, move one level up first.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')

# NOTE: root_dir carries no trailing slash; the cells below append sub-paths that start with '/'
root_dir = os.getcwd() + '/projects/' + project

print(f'Working directory: {root_dir}')

# exist_ok = True creates the structure for a new project and also restores
# sub-folders that are missing from an already existing project folder
for sub_dir in ['', '/data', '/data/processed', '/data/raw', '/models', '/results']:
    os.makedirs(root_dir + sub_dir, exist_ok = True)


Working directory: /Users/gilbert/Analyses/bibliometrics/projects/mabs_repsol


### Clean bibliography dataset


In [37]:
# PARAMETERS

biblio_source = 'lens'  # 'scopus' or 'lens'
search_label = 'lens_ml_sim_heart_all'   # can be useful to know from which search the publication came from when merging datasets

data_dir = '/data/raw_lens_ml_sim_heart/'
results_dir = '/data/processed/'

biblio_csv_files = []
# biblio_csv_files = ['lens_ml_fos_medicine_2023.csv']  # comment out to read all csv files in data_dir

write_csv = True
file_cleaned_csv = 'lens_ml_sim_heart_all'

timestamping = False

# ----------------------------

# Without an explicit file list, fall back to every CSV file found in the data directory
if not biblio_csv_files:
    biblio_csv_files = [name for name in os.listdir(root_dir + data_dir) if name.endswith('.csv')]

# Load the files one by one (malformed lines are skipped) and report the size of each
frames = []

for file_name in biblio_csv_files:
    frame = pd.read_csv(os.path.join(root_dir + data_dir, file_name), on_bad_lines = 'skip')
    print(f'File: {os.path.basename(file_name)}, Size: {len(frame)} rows')
    frames.append(frame)

# Stack all files into one dataframe with a fresh running index
biblio_df = pd.concat(frames, ignore_index = True)

print(f'Number of publications in the input dataframe: {len(biblio_df)}')

if biblio_source == 'scopus':

    # Rename the Scopus export columns to the short names used in the rest of the notebook
    biblio_df = biblio_df.rename(columns = {'Authors':'authors', 
                                            'Author(s) ID':'author_ids', 
                                            'Authors with affiliations':'affiliations', 
                                            'Title':'title',
                                            'Abstract':'abstract',
                                            'Year':'year',
                                            'Source title':'source',
                                            'Cited by':'cited',
                                            'Author Keywords':'kws_author',
                                            'Index Keywords':'kws_index',
                                            'Document Type':'doc_type'})

    # Keep only the renamed columns. The explicit copy makes the column subset an
    # independent dataframe, so that adding 'search_label' below and the in-place
    # de-duplication further down do not trigger a SettingWithCopyWarning.
    biblio_df = biblio_df[['authors', 'author_ids', 'affiliations', 
                           'title', 'abstract', 'year', 'source', 
                           'cited', 'kws_author', 'kws_index', 'doc_type']].copy()
    
    biblio_df['search_label'] = search_label

elif biblio_source == 'lens':
    
    # Drop columns and change remaining column names.
    # 'Date Published' keeps its original name; it is only used for sorting further below.
    biblio_df = biblio_df.loc[:,['Author/s', 'Title', 'Abstract', 'Keywords', 'MeSH Terms', 
                                 'Fields of Study', 'Publication Year', 'Source Title', 
                                 'Citing Works Count', 'Date Published']]
    biblio_df = biblio_df.rename(columns = {'Author/s':'authors', 
                                            'Title':'title', 
                                            'Abstract':'abstract', 
                                            'Keywords':'lens_kws', 
                                            'MeSH Terms':'mesh', 
                                            'Fields of Study':'fos', 
                                            'Publication Year':'year',
                                            'Source Title':'source', 
                                            'Citing Works Count':'cited'})

    # Type of year should be integer; missing or non-numeric years become 0
    biblio_df['year'] = pd.to_numeric(biblio_df['year'], errors = 'coerce')
    biblio_df['year'] = biblio_df['year'].fillna(0)
    biblio_df['year'] = biblio_df['year'].astype(int)

    biblio_df['search_label'] = search_label
    
else:
    # ValueError is a subclass of Exception, so existing 'except Exception' handlers still apply
    raise ValueError(f'The bibliographic source {biblio_source} does not exist')

# Remove title-author duplicates
count_dupl = biblio_df.duplicated(subset = ['title', 'authors']).sum()
biblio_df = biblio_df.drop_duplicates(subset = ['title', 'authors'])
print(f'Number of duplicate title-author pairs that were removed: {count_dupl}')

# Identify remaining duplicate titles (case-insensitive)
count_dupl_df = biblio_df[biblio_df['title'].str.lower().duplicated(keep = 'first')]

# Remove remaining title duplicates
biblio_df = biblio_df[~biblio_df['title'].str.lower().duplicated(keep = 'first')]
print(f'Number of remaining duplicate titles that were removed: {count_dupl_df.shape[0]}')

# Identify duplicate abstracts. Records without an abstract (NaN) are excluded:
# pandas treats NaN values as equal to each other, so they would otherwise be
# collapsed into a single record although they are different publications.
has_abstract = biblio_df['abstract'].notna()
mask = has_abstract & biblio_df.duplicated(subset = 'abstract', keep = False)
duplicates = biblio_df[mask]
num_unique_duplicates = duplicates['abstract'].nunique()
print(f'Number of unique abstracts that have duplicates: {num_unique_duplicates}')

# Remove the abstract duplicates
count_before_abs_dedup = len(biblio_df)

if biblio_source == 'lens': # Lens can have many duplicate abstracts where the title slightly differs
    # Sort first so that the most recently published record of each abstract is the one kept
    biblio_df = biblio_df.sort_values(by = 'Date Published', ascending = False)
    is_dupl_abstract = biblio_df['abstract'].duplicated(keep = 'first')
else:
    is_dupl_abstract = biblio_df['abstract'].str.lower().duplicated(keep = 'first')

biblio_df = biblio_df[~(is_dupl_abstract & biblio_df['abstract'].notna())]

# Count the rows that were actually dropped
print(f'Number of duplicate abstracts that were removed: {count_before_abs_dedup - len(biblio_df)}')

# Remove empty titles
count_titles_empty = len(biblio_df[biblio_df['title'] == ''])
biblio_df = biblio_df[biblio_df['title'] != '']
print(f'Removed {count_titles_empty} titles that were empty strings')

# Remove all titles that are NaN
count_titles_nan = biblio_df['title'].isna().sum()
biblio_df = biblio_df.dropna(subset = ['title'])
print(f'Removed {count_titles_nan} titles that were NaN')

# Convert the titles to lower case except for the first word
def title_to_lc(s):
    """Return the title with every capitalised word after the first one lower-cased.

    The first word is kept as is. A later word is lower-cased only when it
    starts with an upper-case letter followed by a lower-case letter
    ('Learning' -> 'learning'); acronyms ('ECG'), single letters and words
    such as 'X-Ray' stay untouched. Runs of whitespace collapse to one space.
    An empty or whitespace-only title yields an empty string.
    """
    words = s.split()

    # Nothing to convert; also avoids indexing into an empty word list
    if not words:
        return ''

    def is_capitalised(w):
        return w[0].isupper() and len(w) > 1 and w[1].isalpha() and not w[1].isupper()

    return ' '.join([words[0]] + [w.lower() if is_capitalised(w) else w for w in words[1:]])

biblio_df['title'] = biblio_df['title'].map(title_to_lc)

# Drop every record whose title mentions 'conference', 'workshop', or 'proceedings'
is_proceedings = biblio_df['title'].str.contains('proceedings|conference|workshop', case = False)
count_procs = int(is_proceedings.sum())
biblio_df = biblio_df[~is_proceedings]
print(f'Removed {count_procs} records where the title contained "conference", "workshop", or "proceeding"')

# Missing abstracts become empty strings so that the string operations below work
count_abs_nan = biblio_df['abstract'].isna().sum()
biblio_df['abstract'] = biblio_df['abstract'].fillna('')
print(f'Replaced {count_abs_nan} abtracts that were NaN with an empty string')

# The remaining steps are applied to the title and to the abstract alike
text_columns = ('title', 'abstract')

# Strip everything enclosed in '<' and '>' (non-greedy)
print(f'Removing all strings of the form <some text>')
for column in text_columns:
    biblio_df[column] = biblio_df[column].str.replace(r'<.*?>', '', regex = True)

# Replace each run of characters outside the allowed set (letters, digits, Greek letters,
# whitespace and . , : ’ ( ) $ % ' " -) with a single space
disallowed_chars = re.compile(r'[^a-zA-Z0-9α-ωΑ-Ω\s.,:’()$%\'\"\-]+')
for column in text_columns:
    biblio_df[column] = biblio_df[column].map(lambda text: disallowed_chars.sub(' ', text))

# Turn special whitespace characters into a normal space
for column in text_columns:
    biblio_df[column] = biblio_df[column].str.replace(r'\u2002|\u2003|\u2005|\u2009|\u200a|\u202f|\xa0', ' ', regex = True)

# Turn newline and tab characters into a normal space
for column in text_columns:
    biblio_df[column] = biblio_df[column].str.replace(r'\n|\t', ' ', regex = True)

# Remove the word 'abstract' at the start of any title
# NOTE: the global inline flag (?i) must be the very first element of the pattern;
# placing it after '^' raises re.error on Python 3.11 and later.
print(f'Removing the word "abstract" at the start of the title')
biblio_df['title'] = biblio_df['title'].str.replace(r'(?i)^abstract\s*', '', regex = True)

# Remove the word 'abstract' and 'objective' at the start of any abstract
print(f'Removing the word "abstract" and "objective" at the start of the abstract')
biblio_df['abstract'] = biblio_df['abstract'].str.replace(r'(?i)^abstract\s*', '', regex = True)
biblio_df['abstract'] = biblio_df['abstract'].str.replace(r'(?i)^objective(s)?\s*', '', regex = True)

# Delete the section labels listed below from the abstract, wherever they occur and
# regardless of case
print(f'Removing common terms like "background", "conclusions",... from the abstract')
remove_strings = ['background', 'objective', 'results', 'conclusions', 'introduction']
pattern = "|".join(remove_strings)
common_terms = re.compile(pattern, flags = re.IGNORECASE)
biblio_df['abstract'] = biblio_df['abstract'].map(lambda text: common_terms.sub('', text))

# Colons and semi-colons in the abstract become a white space
print(f'Replacing colons and semi-colons in the abstract with a white space')
abstract_without_colons = biblio_df['abstract'].str.replace('(;|:)', ' ', regex = True)
biblio_df['abstract'] = abstract_without_colons

# Strip a leading run of digits and non-word characters from the title when it is
# followed by whitespace (e.g. a numbering prefix)
leading_numbering = re.compile(r'^[\W\d]+(?=\s)')
biblio_df['title'] = biblio_df['title'].map(lambda text: leading_numbering.sub('', text))

# Remove a leading word that starts with '.' or '-', dangling '.'/'-' runs (title only) and
# every stand-alone single character other than 'a', 'A' and '+' from the title and abstract.
def _single_char_to_space(match):
    """Return ' ' for a stand-alone character match and '' for any other match.

    Group 1 only takes part in the single-character alternative. That alternative
    also consumes the surrounding whitespace, so replacing it with '' would glue
    the neighbouring words together ('vitamin D levels' -> 'vitaminlevels').
    The extra spaces are removed by the whitespace clean-up that follows.
    """
    return ' ' if match.group(1) is not None else ''

biblio_df['title'] = biblio_df['title'].str.replace(r'^[-.]+\s*\w+\s*|[-.]+(?!\w)|(\s|^)[^Aa\s+](\s+|$)', _single_char_to_space, regex = True)
# The abstract pattern omits the '[-.]+(?!\w)' alternative, so periods at the end of
# sentences stay in the abstract.
biblio_df['abstract'] = biblio_df['abstract'].str.replace(r'^[-.]+\s*\w+\s*|(\s|^)[^Aa\s+](\s+|$)', _single_char_to_space, regex = True)

# Collapse runs of whitespace into one space and trim both ends
print(f'Removing excess white space')
for column in ('title', 'abstract'):
    biblio_df[column] = biblio_df[column].str.replace(r'\s+', ' ', regex = True).str.strip()

# The cleaning steps may have emptied some titles; drop those records
count_titles = biblio_df.shape[0]
has_title = biblio_df['title'].str.strip().astype(bool)
biblio_df = biblio_df[has_title]
print(f'Removed {count_titles - biblio_df.shape[0]} titles that were empty strings')

# 2nd sweep: titles that only became identical (ignoring case) through the cleaning
is_repeated_title = biblio_df['title'].str.lower().duplicated(keep = 'first')
count_dupl_df = biblio_df[is_repeated_title]
biblio_df = biblio_df[~is_repeated_title]
print(f'Number of remaining duplicate titles that were removed in 2nd sweep: {count_dupl_df.shape[0]}')


# Newest publications first, titles in ascending order within a year; the record ids
# created next follow this order
biblio_df = biblio_df.sort_values(by = ['year', 'title'], ascending = [False, True])

# Generate the record IDs
counter = 0

def generate_id(row):
    """Build the record id '<6-digit counter>_<author>_<year>' for one row.

    Reads the globals 'counter' (incremented on every successful call) and
    'biblio_source'. The author part is the first space-separated token of
    the first ','-separated author for Scopus, and the last space-separated
    token of the first ';'-separated author for Lens. 'Anonymous' is used
    when the authors value is not a string, is empty, or contains
    'no author name'.
    """
    global counter

    names = row['authors']
    has_named_author = (isinstance(names, str)
                        and names != ""
                        and "no author name" not in names.lower())

    if not has_named_author:
        author = 'Anonymous'
    elif biblio_source == 'scopus':
        author = names.split(',')[0].split(' ')[0]
    elif biblio_source == 'lens':
        author = names.split(';')[0].split(' ')[-1]
    else:
        raise Exception(f'bibliographic source {biblio_source} not recognised')

    record_id = '_'.join([str(counter).zfill(6), author, str(row['year'])])
    counter += 1

    return record_id

biblio_df['id'] = biblio_df.apply(generate_id, axis = 'columns')

# Collect every character left in the titles and abstracts that is not an ASCII
# letter or digit; the set is reported at the end of the cell
non_alphanumeric_characters = set()

for column in ('title', 'abstract'):
    for text in biblio_df[column]:
        non_alphanumeric_characters.update(re.findall('[^a-zA-Z0-9]', text))

# Merge several ';'-separated keyword columns of a row into one keyword string
def _merge_keyword_columns(row, cols):
    """Return the unique, lower-cased keywords found in the given columns of a row.

    Cells that are NaN are skipped. Each keyword is stripped of surrounding
    whitespace, so that 'heart; ecg' does not yield ' ecg' as a keyword that
    differs from 'ecg'; empty keywords are dropped. The result is sorted and
    joined with ','.
    """
    keywords = set()

    for col in cols:
        if pd.isna(row[col]):
            continue

        for keyword in row[col].split(';'):
            keyword = keyword.strip().lower()

            if keyword:
                keywords.add(keyword)

    return ','.join(sorted(keywords))

if biblio_source == 'scopus':

    # Merge index and author keywords
    def unique_keywords(row):
        return _merge_keyword_columns(row, ['kws_author', 'kws_index'])

    biblio_df['kws'] = biblio_df.apply(unique_keywords, axis = 1)
    biblio_df = biblio_df.drop(columns = ['kws_author', 'kws_index'])

    # Add a column that indicates whether this is a literature review
    biblio_df['lit_review'] = biblio_df['doc_type'].apply(lambda x: 1 if x == 'Review' else 0)
    biblio_df = biblio_df.drop(columns = ['doc_type'])

    # Reorder columns before saving to CSV
    biblio_df = biblio_df.reindex(columns = ['id', 'title', 'year', 'abstract', 'kws', 'source', 
                                            'lit_review', 'cited', 'authors', 'affiliations', 
                                            'author_ids', 'search_label'])

elif biblio_source == 'lens':

    # Merge keywords and MeSH terms
    def unique_keywords(row):
        return _merge_keyword_columns(row, ['lens_kws', 'mesh'])

    biblio_df['kws'] = biblio_df.apply(unique_keywords, axis = 1)
    biblio_df = biblio_df.drop(columns = ['lens_kws', 'mesh'])

    # Add a column that indicates whether this is a literature review
    # (heuristic: the title contains 'review' or 'survey')
    biblio_df['lit_review'] = biblio_df['title'].apply(lambda x: 1 if 'review' in x.lower() or 'survey' in x.lower() else 0)

    # Reorder columns before saving to CSV; columns not listed (e.g. 'Date Published') are dropped
    biblio_df = biblio_df.reindex(columns = ['id', 'title', 'year', 'abstract', 'kws', 
                                             'fos', 'source', 'lit_review', 'cited', 
                                             'authors', 'search_label'])
    
# Summary of the cleaned dataset
print(f'Remaining special characters: {non_alphanumeric_characters}')
anonymous_records = biblio_df[biblio_df['id'].str.contains('Anonymous')]
print(f"Number of anonymous authors: {len(anonymous_records)}")
print(f'Number of entries in cleaned dataframe: {len(biblio_df)}')

# Optional timestamp suffix for the output file name
timestamp = ('_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) if timestamping else ''

# Save the cleaned dataset
if write_csv:
    print(f'Saving file {file_cleaned_csv + timestamp}.csv ...')
    output_path = root_dir + results_dir + file_cleaned_csv + timestamp + '.csv'
    biblio_df.to_csv(output_path, index = False)

print(f'DONE!')

File: lens_ml_sim_heart_all.csv, Size: 2113 rows
Number of publications in the input dataframe: 2113
Number of duplicate title-author pairs that were removed: 40
Number of remaining duplicate titles that were removed: 28
Removed 0 titles that were empty strings
Removed 1 titles that were NaN
Removed 13 records where the title contained "conference", "workshop", or "proceeding"
Replaced 22 abtracts that were NaN with an empty string
Removing all strings of the form <some text>
Removing the word "abstract" at the start of the title
Removing the word "abstract" and "objective" at the start of the abstract
Removing common terms like "background", "conclusions",... from the abstract
Replacing colons and semi-colons in the abstract with a white space
Removing excess white space
Removed 2 titles that were empty strings
Number of remaining duplicate titles that were removed in 2nd sweep: 83
Remaining special characters: {"'", 'ψ', 'π', 'γ', ')', ':', '-', '%', 'μ', '.', '’', 'Δ', 'χ', 'β', ','

### Scopus & Lens: Merge Scopus and Lens datasets
**Prerequisite**: create the input datasets using the utilities "Scopus/Lens: Generate a merged ScopusLens file with a reduced set of columns".

In [38]:
# PARAMETERS

data_dir = '/data/processed/'
results_dir = '/data/processed/'

scopus_csv_files = ['scopus_ml_sim_heart_all.csv']
lens_csv_files = ['TEST_lens_ml_sim_heart_all.csv']

write_csv = True
merged_csv_file = 'TEST_scopus_lens_ml_sim_heart_all'

lens_source_filter = []   # only publications with the sources listed here are included; if empty, all are included
add_fos_to_kws = False
timestamping = False

# ----------------------------

input_dir = root_dir + data_dir

# Load the Scopus files (malformed lines are skipped) into one dataframe
scopus_frames = []

for file_name in scopus_csv_files:
    frame = pd.read_csv(os.path.join(input_dir, file_name), on_bad_lines = 'skip')
    print(f'File: {os.path.basename(file_name)}, Size: {len(frame)} rows')
    scopus_frames.append(frame)

scopus_df = pd.concat(scopus_frames, ignore_index = True)

# Load the Lens files into one dataframe
lens_frames = []

for file_name in lens_csv_files:
    frame = pd.read_csv(os.path.join(input_dir, file_name))
    print(f'File: {os.path.basename(file_name)}, Size: {len(frame)} rows')
    lens_frames.append(frame)

lens_df = pd.concat(lens_frames, ignore_index = True)

print(f'Number of entries in the Scopus dataset: {len(scopus_df)}')
print(f'Number of entries in the Lens dataset: {len(lens_df)}')

# Keep only the Lens publications whose source contains one of the filter terms
# (case-insensitive); an empty filter keeps everything
if lens_source_filter:
    def source_is_listed(source):
        return isinstance(source, str) and any(s.lower() in source.lower() for s in lens_source_filter)

    lens_df = lens_df[lens_df['source'].apply(source_is_listed)]
    print(f'Number of entries in the Lens dataset after applying the filter: {len(lens_df)}')

# Reduce both datasets to the columns that take part in the merge
scopus_columns = ['id', 'title', 'year', 'abstract', 'kws', 'source', 'lit_review', 'cited', 'authors', 'search_label']
lens_columns = ['id', 'title', 'year', 'abstract', 'kws', 'fos', 'source', 'lit_review', 'cited', 'authors', 'search_label']
scopus_df = scopus_df.loc[:, scopus_columns]
lens_df = lens_df.loc[:, lens_columns]

# The join key is the lower-cased title
scopus_df['title_lc'] = scopus_df['title'].map(lambda title: title.lower())
lens_df['title_lc'] = lens_df['title'].map(lambda title: title.lower())

# Outer join: every row of both datasets is kept; shared column names get the
# suffixes '_sco' and '_len'
merged_df = scopus_df.merge(lens_df, left_on = 'title_lc', right_on = 'title_lc',
                            suffixes = ('_sco', '_len'), how = 'outer')

print(f'Number of duplicate titles: {len(scopus_df) + len(lens_df) - len(merged_df)}')

# Scopus articles take precedence over Lens articles: the new columns start out with
# the Scopus values (missing years become 0). Whether Scopus data are available for a
# given publication is checked further below.
for column in ('title', 'id', 'year', 'abstract', 'source', 'authors'):
    scopus_values = merged_df[column + '_sco']
    merged_df[column] = scopus_values.fillna(0).astype(int) if column == 'year' else scopus_values

# Use the values from Lens if the Scopus data are not available for a given title 
def len_if_scopus_nan(row):
    """Return title, id, year, abstract, source and authors for one merged row.

    When 'id_sco' is missing, the values come from the '_len' columns (the
    year converted to int); otherwise the values already present in the
    un-suffixed columns are returned. The result is a Series indexed in the
    order title, id, year, abstract, source, authors.
    """
    fields = ['title', 'id', 'year', 'abstract', 'source', 'authors']

    if pd.isna(row['id_sco']):
        values = {field: row[field + '_len'] for field in fields}
        values['year'] = int(values['year'])
    else:
        values = {field: row[field] for field in fields}

    return pd.Series(values)

# Fill the merged columns row by row, then discard the per-source columns they replace
merged_columns = ['title', 'id', 'year', 'abstract', 'source', 'authors']
merged_df[merged_columns] = merged_df.apply(len_if_scopus_nan, axis = 'columns')

obsolete_columns = [column + suffix for column in merged_columns for suffix in ('_sco', '_len')]
merged_df.drop(columns = obsolete_columns, inplace = True)

# Merge the keywords and fields of study
def merge_kws_fos(row):
    """Return the de-duplicated, comma-separated keywords of one merged row.

    Combines 'kws_sco' and 'kws_len' and, when the global flag
    'add_fos_to_kws' is set, also 'fos'. Missing (NaN) cells are skipped.
    Keywords are lower-cased, stripped, de-duplicated and sorted, so the
    output is the same on every run; empty keywords are dropped.
    """
    kws_merged = []

    if not pd.isnull(row['kws_sco']):
        kws_merged += row['kws_sco'].split(',')

    # The cleaning utility joins the Lens keywords with ','. Splitting on ';' alone
    # would keep the whole list as one keyword, which defeats the de-duplication
    # against the Scopus keywords. ';' is still accepted as a separator.
    if not pd.isnull(row['kws_len']):
        kws_merged += re.split(r'[;,]', row['kws_len'])

    if add_fos_to_kws and not pd.isnull(row['fos']):
        kws_merged += row['fos'].split(';')

    unique_kws = {x.lower().strip() for x in kws_merged}
    unique_kws.discard('')

    return ','.join(sorted(unique_kws))

# Build the merged keyword column once, while its source columns still exist.
# (merge_kws_fos reads 'kws_sco' and 'kws_len', so it cannot be applied again
# after these columns have been dropped.)
merged_df['kws'] = merged_df.apply(merge_kws_fos, axis = 1)

# Drop the columns that are now part of 'kws'; 'fos' only when it was merged in
cols_merged_into_kws = ['kws_sco', 'kws_len']

if add_fos_to_kws:
    cols_merged_into_kws.append('fos')

merged_df.drop(cols_merged_into_kws, axis = 1, inplace = True)

# Merge the cited columns
def merge_cited(row):
    """Return the sum of the available citation counts of one merged row.

    'cited_sco' and 'cited_len' are converted to int and added up; a missing
    (NaN) count contributes nothing, so the result is 0 when both are missing.
    """
    # NOTE(review): for a publication found in both sources the two counts are added
    # together - confirm that a sum (rather than e.g. the larger count) is intended.
    return sum(int(row[column])
               for column in ('cited_sco', 'cited_len')
               if not pd.isnull(row[column]))

# One citation count per publication; the per-source counts are no longer needed
merged_df['cited'] = merged_df.apply(merge_cited, axis = 'columns')
merged_df.drop(columns = ['cited_sco', 'cited_len'], inplace = True)

# Merge the literature review columns
def merge_lit(row):
    """Return the combined literature-review flag of one merged row.

    The available 'lit_review_sco' and 'lit_review_len' values are converted
    to int and added up (missing values are skipped); a total of 2, i.e. both
    sources flag a review, is reported as 1.
    """
    flags = [int(row[column])
             for column in ('lit_review_sco', 'lit_review_len')
             if not pd.isnull(row[column])]
    total = sum(flags)

    return 1 if total == 2 else total

# One review flag per publication; the per-source flags are no longer needed
merged_df['lit_review'] = merged_df.apply(merge_lit, axis = 'columns')
merged_df.drop(columns = ['lit_review_sco', 'lit_review_len'], inplace = True)

# Merge the search labels
def merge_search_label(row):
    """Return the search label(s) of one merged row.

    Both labels present: '<scopus label>, <lens label>'. Only one present:
    that label. Neither present: an empty string.
    """
    sco_label = row['search_label_sco']
    len_label = row['search_label_len']

    if pd.isnull(sco_label):
        return '' if pd.isnull(len_label) else len_label

    if pd.isnull(len_label):
        return sco_label

    return sco_label + ', ' + len_label

# One search label per publication; the per-source labels are no longer needed
merged_df['search_label'] = merged_df.apply(merge_search_label, axis = 'columns')
merged_df.drop(columns = ['search_label_sco', 'search_label_len'], inplace = True)

# Rows that repeat an id: only the first occurrence stays
repeated_id = merged_df.duplicated(subset = 'id', keep = 'first')
print(f"Number of duplicate ids: {repeated_id.sum()}")
merged_df = merged_df[~repeated_id]

# Rows that still repeat a title: only the first occurrence stays
repeated_title = merged_df.duplicated(subset = 'title', keep = 'first')
print(f"Number of remaining duplicate titles: {repeated_title.sum()}")
merged_df = merged_df[~repeated_title]

# Newest publications first, titles in ascending order within a year; the ids are
# renumbered in this order next
merged_df = merged_df.sort_values(by = ['year', 'title'], ascending = [False, True])

# Re-index the ids
counter = 0

def reindex_id(id):
    """Replace the first six characters of an id with the zero-padded global counter.

    The global 'counter' is incremented after every successful replacement.
    If building the new id raises a TypeError (e.g. the id is not a string),
    the offending value is printed, an empty string is returned and the
    counter is left unchanged.
    """
    global counter

    try:
        new_id = str(counter).zfill(6) + id[6:]
    except TypeError:
        print(id)
        return ""

    counter += 1
    return new_id

merged_df['id'] = merged_df['id'].apply(reindex_id)
merged_df.reset_index(drop = True, inplace = True)

# Reorder columns. 'fos' is placed before 'source' when it is still present, i.e. when
# it was not merged into the keywords. The column list is built once, so both cases
# share the same names (a missing comma previously fused 'authors' and 'search_label'
# into one non-existent column, which removed both columns from the output).
output_columns = ['id', 'title', 'year', 'abstract', 'kws', 
                  'source', 'lit_review', 'cited', 
                  'authors', 'search_label']

if 'fos' in merged_df.columns:
    output_columns.insert(output_columns.index('source'), 'fos')

merged_df = merged_df.reindex(columns = output_columns)

print(f'Number of publications in the Scopus & Lens dataset: {len(merged_df)}')

# Optional timestamp suffix for the output file name
timestamp = ('_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) if timestamping else ''

# Save the merged dataset
if write_csv:
    print(f'Saving file {merged_csv_file + timestamp}.csv ...')
    output_path = root_dir + results_dir + merged_csv_file + timestamp + '.csv'
    merged_df.to_csv(output_path, index = False)

print(f'DONE!')


File: scopus_ml_sim_heart_all.csv, Size: 1644 rows
File: TEST_lens_ml_sim_heart_all.csv, Size: 1946 rows
Number of entries in the Scopus dataset: 1644
Number of entries in the Lens dataset: 1946
Number of duplicate titles: 150
Number of duplicate ids: 1
Number of remaining duplicate titles: 0
Number of publications in the Scopus & Lens dataset: 3439
Saving file TEST_scopus_lens_ml_sim_heart_all.csv ...
DONE!


### Extract review publications from bibliographic dataset

In [None]:
# PARAMETERS

data_dir = '/data/processed/'
results_dir = '/data/processed/'

biblio_csv_files = ['lens_ml_in_engineering_all.csv']

write_csv = True
review_pubs_csv = 'lens_ml_in_engineering_reviews_all'

timestamping = False

# ----------------------------

# Without an explicit file list, fall back to every CSV file found in the data directory
if not biblio_csv_files:
    biblio_csv_files = [name for name in os.listdir(root_dir + data_dir) if name.endswith('.csv')]

# Load the files one by one (malformed lines are skipped) and report the size of each
frames = []

for file_name in biblio_csv_files:
    frame = pd.read_csv(os.path.join(root_dir + data_dir, file_name), on_bad_lines = 'skip')
    print(f'File: {os.path.basename(file_name)}, Size: {len(frame)} rows')
    frames.append(frame)

biblio_df = pd.concat(frames, ignore_index = True)
print(f'Total number of publications in the dataset: {len(biblio_df)}')

# Keep only the publications flagged as literature review
is_review = biblio_df['lit_review'] == 1
biblio_df = biblio_df[is_review]
print(f'Number of review publications in the dataset: {len(biblio_df)}')

# Optional timestamp suffix for the output file name
timestamp = ('_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) if timestamping else ''

# Save the review publications
if write_csv:
    print(f'Saving file {review_pubs_csv + timestamp}.csv ...')
    output_path = root_dir + results_dir + review_pubs_csv + timestamp + '.csv'
    biblio_df.to_csv(output_path, index = False)

print(f'DONE!')

### Stitch together the original bibliography dataset with the computed topic information from BERTopic
Titles are assumed to have no suffix like 039836_Zheng_2022

In [48]:
# PARAMETERS

match_col = 'abstract'  # 'title' or 'abstract' topic information provided?
has_abs_suffix = False  # is the publication ID appended to the abstract?

data_dir = '/results/'
results_dir = '/results/'

read_biblio_csv = 'scopus_lens_ml_sim_engineering_all_st.csv'     # the original processed bibliographic dataset
read_bertopic_csv = 'bertopic_doc_info_ml_sim_engineering_abstracts.csv' # the title by topic probabilities

write_csv = True
write_xlsx = False
write_biblio_bertopic_file = 'scopus_lens_ml_sim_engineering_all_st_plus_abstract_topics'

timestamping = False

logger.setLevel(logging.ERROR)

# ----------------------------

# Load the bibliographic dataset and the BERTopic document information
input_dir = root_dir + data_dir

biblio_df = pd.read_csv(input_dir + read_biblio_csv)
print(f'File: {read_biblio_csv}, Size: {len(biblio_df)} rows')

bertopic_df = pd.read_csv(input_dir + read_bertopic_csv)
print(f'File: {read_bertopic_csv}, Size: {len(bertopic_df)} rows')

biblio_plus_topic_df = pd.DataFrame()

# Only 'title' and 'abstract' are supported as match column
if match_col not in ('title', 'abstract'):
    raise Exception(f'Match column {match_col} not recognised')

# Left join: every publication of biblio_df is kept and receives the topic columns
if match_col == 'title':
    biblio_plus_topic_df = biblio_df.merge(bertopic_df, on = 'title', how = 'left')
elif has_abs_suffix:
    # The publication id is the last whitespace-separated token of the BERTopic abstract
    bertopic_df['id'] = bertopic_df['abstract'].str.split().str[-1]
    biblio_plus_topic_df = biblio_df.merge(bertopic_df, on = 'id', how = 'left')
    biblio_plus_topic_df = biblio_plus_topic_df.drop(columns = 'abstract_y').rename(columns = {'abstract_x': 'abstract'})
else:
    biblio_plus_topic_df = biblio_df.merge(bertopic_df, on = 'abstract', how = 'left')

print(f'The merged file has {len(biblio_plus_topic_df)} publications')

# Disabled row-count check for the abstract match:
# if len(biblio_plus_topic_df) != len(biblio_df):
#     raise Exception(f'biblio_plus_topic_df has {len(biblio_plus_topic_df)} rows, but it needs to have the same number of rows \
#                     than biblio_df, which are {len(biblio_df)}')

# Publications without a matching BERTopic row get topic number 0.
# NOTE(review): 0 may also be the number of a regular topic (the topic numbers in the
# BERTopic file start at -1) - confirm that 0 is the intended placeholder.
biblio_plus_topic_df['tp_num'] = biblio_plus_topic_df['tp_num'].fillna(0).astype(int)
    
# At INFO level, additionally show the untruncated titles next to their topic names
if logger.getEffectiveLevel() == logging.INFO:
    pd.set_option('display.max_colwidth', 0)
    display(biblio_plus_topic_df[['title', 'tp_name']].head())
    pd.reset_option('display.max_colwidth')
    
# The head of the merged dataframe is always shown (once)
display(biblio_plus_topic_df.head())

# Create timestamp
if timestamping:
    timestamp = '_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
else:
    timestamp = ''

# Write results to file; the messages report the actual file name including the timestamp
if write_csv:
    print(f'Saving file {write_biblio_bertopic_file + timestamp}.csv with {len(biblio_plus_topic_df)} publications...')
    biblio_plus_topic_df.to_csv(root_dir + results_dir + write_biblio_bertopic_file + timestamp + '.csv', index = False)

if write_xlsx:
    print(f'Saving file {write_biblio_bertopic_file + timestamp}.xlsx with {len(biblio_plus_topic_df)} publications...')
    biblio_plus_topic_df.to_excel(root_dir + results_dir + write_biblio_bertopic_file + timestamp + '.xlsx', index = False)

print(f'DONE!')


File: scopus_lens_ml_sim_engineering_all_st.csv, Size: 72967 rows
File: bertopic_doc_info_ml_sim_engineering_abstracts.csv, Size: 74543 rows
The merged file has 73557 publications


Unnamed: 0,id,title,year,abstract,kws,fos,source,lit_review,cited,authors,search_label,search_title,search_abs,tp_num,tp_name,top_n_words,prob,representative
0,000000_Wang_2023,2v2 air combat confrontation strategy based on...,2023,Aircraft cluster air combat scenario is a long...,"energy distributions,air combat,long sequences...",,Lecture Notes in Electrical Engineering,0,0,"Wang J., Zhu L., Yang H., Ji Y., Wang X.",scopus_ml_sim_subj_engineering,['reinforcement learning'],"['simulation', 'reinforcement learning', 'rnn'...",-1,-1_of_the_to_and,of - the - to - and - in - for - is - model - ...,0.821538,False
1,000001_Preethi_2023,3D echocardiogram reconstruction employing a f...,2023,Three dimensional 3D echocardiogram enables ca...,"3d ann patch matching,image reconstruction,fli...",Voxel; Computer science; Artificial intelligen...,Computer Systems Science and Engineering,0,0,"Preethi C., Mohamed Sathik M., Shajun Nisha S.","scopus_ml_sim_subj_engineering, lens_ml_sim_su...",[],"['ann', 'simulation']",-1,-1_of_the_to_and,of - the - to - and - in - for - is - model - ...,0.868854,False
2,000002_Li_2023,3D ground penetrating radar cavity identificat...,2023,3D ground penetrating radar GPR is the main me...,,Ground-penetrating radar; Radar; Identificatio...,Measurement Science and Technology,0,0,Fanruo Li; Feng Yang; Xu Qiao; Wentai Xing; Ch...,lens_ml_sim_subj_engineering,['transfer learning'],"['simulation', 'transfer learning']",346,346_gpr_penetrating_radar_bscan,gpr - penetrating - radar - bscan - undergroun...,0.398205,False
3,000003_Mehrpooya_2023,3D inverse synthetic aperture radar image qual...,2023,Generalisation of one-dimensional dictionary l...,"multidimensional data,generalisation,inverse s...",,"IET Radar, Sonar and Navigation",0,0,"Mehrpooya A., Karbasi S.M., Nazari M., Abbasi ...",scopus_ml_sim_subj_engineering,[],['simulation'],508,508_cs_compressive_sensing_compressed,cs - compressive - sensing - compressed - reco...,0.036927,False
4,000004_Park_2023,3D off grid localization for adjacent cavitati...,2023,The propeller tip vortex cavitation TVC locali...,"off-grids,bayesian networks,noise source,bayes...",,Sensors,0,0,"Park M., Memon S.A., Kim G., Choo Y.",scopus_ml_sim_subj_engineering,[],['simulation'],16,16_aerodynamic_drag_wing_airfoil,aerodynamic - drag - wing - airfoil - lift - p...,0.017674,False


Saving file scopus_lens_ml_sim_engineering_all_st_plus_abstract_topics.csv with 73557 publications...
DONE!


### Generate dataset with titles only
This creates a CSV file with two columns (id and title) for topic modeling.

#### TODO
- This utility and the one for the abstracts aren't needed since I can filter this directly in the BERTopic notebook.

In [42]:
# PARAMETERS

data_dir = '/results/processed/'
results_dir = '/results/'

read_files_csv = ['scopus_lens_ml_heart_all.csv']

write_csv = True
write_title_file_csv = 'scopus_lens_ml_heart_all_title'

timestamping = False

# ----------------------------

# Without an explicit file list, fall back to every CSV file found in the data directory
if not read_files_csv:
    read_files_csv = [name for name in os.listdir(root_dir + data_dir) if name.endswith('.csv')]

# Load the files one by one and report the size of each
frames = []

for file_name in read_files_csv:
    frame = pd.read_csv(os.path.join(root_dir + data_dir, file_name))
    print(f'File: {os.path.basename(file_name)}, Size: {len(frame)} rows')
    frames.append(frame)

# Stack the files and keep only the id and title columns
biblio_df = pd.concat(frames, ignore_index = True)
biblio_df = biblio_df[['id', 'title']]

print(f'Number of entries in dataset: {len(biblio_df)}')

# Optional timestamp suffix for the output file name
timestamp = ('_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) if timestamping else ''

# Save the titles-only dataset
if write_csv:
    print(f'Saving file {write_title_file_csv + timestamp}.csv ...')
    output_path = root_dir + results_dir + write_title_file_csv + timestamp + '.csv'
    biblio_df.to_csv(output_path, index = False)

print(f'DONE!')


File: scopus_lens_ml_heart_all.csv, Size: 47168 rows
Number of entries in dataset: 47168
Saving file scopus_lens_ml_heart_all_title.csv ...
DONE!


### Generate dataset with abstracts only
This creates a CSV file with two columns (id and abstract) for topic modeling.

In [45]:
# PARAMETERS

data_dir = '/results/processed/'
results_dir = '/results/'

read_file_csv = ['scopus_lens_ml_heart_all.csv']

write_csv = True
abs_file = 'scopus_lens_ml_heart_all_abstract'

timestamping = False

# ----------------------------

# Without an explicit file list, fall back to every CSV file found in the data directory
if not read_file_csv:
    read_file_csv = [name for name in os.listdir(root_dir + data_dir) if name.endswith('.csv')]

# Load the files one by one and report the size of each
frames = []

for file_name in read_file_csv:
    frame = pd.read_csv(os.path.join(root_dir + data_dir, file_name))
    print(f'File: {os.path.basename(file_name)}, Size: {len(frame)} rows')
    frames.append(frame)

# Stack the files and keep only the id and abstract columns
biblio_df = pd.concat(frames, ignore_index = True)
biblio_df = biblio_df[['id', 'abstract']]

print(f'Number of entries in dataset: {len(biblio_df)}')

# Optional timestamp suffix for the output file name
timestamp = ('_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) if timestamping else ''

# Save the abstracts-only dataset
if write_csv:
    print(f'Saving file {abs_file + timestamp}.csv ...')
    output_path = root_dir + results_dir + abs_file + timestamp + '.csv'
    biblio_df.to_csv(output_path, index = False)

print(f'DONE!')


File: scopus_lens_ml_heart_all.csv, Size: 47168 rows
Number of entries in dataset: 47168
Saving file scopus_lens_ml_heart_all_abstract.csv ...
DONE!
