**TODO**
- Add the possibility to switch off the logger output.
- Print some basic stats at the end of the code.
- Group the Scopus, Dimensions, and Lens operations so that they can be selected by an if statement in case the user doesn't specify all three (Scopus, Lens, Dimensions) directories.

In [1]:
import sys

from pathlib import Path

# Add the src directory to the Python path.
# sys.path holds plain strings, so the membership test has to use the string
# form of the resolved path; comparing a Path object never matches and would
# insert a duplicate entry every time this cell is re-run.
src_path = Path("..") / "src"
src_dir = str(src_path.resolve())
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from utilities import *
from config import *
from clean import *

In [2]:
# Input parameters
# -----------------------
biblio_project_dir = 'systemic_risk'    # directory for the data and models of your bibliographic project
scopus_input_dir = 'raw/scopus'         # directory with Scopus data; leave empty if you don't have Scopus data
lens_input_dir = 'raw/lens'             # directory with Lens data; leave empty if you don't have Lens data
dims_input_dir = 'raw/dimensions'       # directory with Dimensions data; leave empty if you don't have Dimensions data

output_dir = 'results'                  # directory where you want to save the merged and cleaned bibliographic dataset
output_file = 'biblio_sr_1000.csv'      # filename of the bibliographic dataset; leave empty if you don't want to save the data

n_rows = 0                              # the maximum number of rows read for each dataset; set to 0 if you want to read all the data
# -----------------------

In [3]:
# Pipeline: read -> reshape -> normalise -> merge -> clean -> optionally save.
# Every stage handles Scopus, then Lens, then Dimensions before the next stage
# begins.

# 1. Read the bibliographic datasets (arguments shared by all three reads are
#    collected once)
common_read_kwargs = dict(biblio_project_dir=biblio_project_dir, n_rows=n_rows)

scopus_df = read_biblio_csv_files_to_df(
    input_dir=scopus_input_dir,
    biblio_source=BiblioSource.SCOPUS,
    **common_read_kwargs,
)
lens_df = read_biblio_csv_files_to_df(
    input_dir=lens_input_dir,
    biblio_source=BiblioSource.LENS,
    **common_read_kwargs,
)
dims_df = read_biblio_csv_files_to_df(
    input_dir=dims_input_dir,
    biblio_source=BiblioSource.DIMS,
    **common_read_kwargs,
)

# 2. Select and rename columns, using the reshape definition of each source
scopus_df = modify_cols_biblio_df(biblio_df_=scopus_df, reshape_base=Reshape.SCOPUS_ALL)
lens_df = modify_cols_biblio_df(biblio_df_=lens_df, reshape_base=Reshape.LENS_ALL)
dims_df = modify_cols_biblio_df(biblio_df_=dims_df, reshape_base=Reshape.DIMS_ALL)

# 3. Normalise key variables in each dataset
#    (bib_src, links, keywords, authors, author-affils)
scopus_df = normalise_biblio_entities(biblio_df_=scopus_df)
lens_df = normalise_biblio_entities(biblio_df_=lens_df)
dims_df = normalise_biblio_entities(biblio_df_=dims_df)

# 4. Merge the Scopus, Lens, and Dimensions datasets into one dataframe
biblio_df = merge_biblio_dfs(scopus_df, lens_df, dims_df)

# 5. Clean the title and abstract, remove duplicate titles, and merge values
#    coming from the different bibliographic datasets
biblio_df = clean_biblio_df(biblio_df_=biblio_df)

# 6. Save the results only when an output filename was given
if output_file:
    write_df(
        biblio_df=biblio_df,
        biblio_project_dir=biblio_project_dir,
        output_dir=output_dir,
        output_file=output_file,
    )


2023-05-20 19:09:42,570 - Biblio - Reading 3 CSV files...
2023-05-20 19:09:42,629 - Biblio - File: scopus_systemic_risk_1999_2014.csv, Size: 1621 rows
2023-05-20 19:09:42,688 - Biblio - File: scopus_systemic_risk_2015_2019.csv, Size: 1790 rows
2023-05-20 19:09:42,751 - Biblio - File: scopus_systemic_risk_2020_2023.csv, Size: 1909 rows
2023-05-20 19:09:42,798 - Biblio - Total number of publications in the dataframe: 5320
2023-05-20 19:09:42,800 - Biblio - Reading 1 CSV files...
2023-05-20 19:09:43,094 - Biblio - File: lens_systemic_risk_all.csv, Size: 13813 rows
2023-05-20 19:09:43,148 - Biblio - Total number of publications in the dataframe: 13813
2023-05-20 19:09:43,150 - Biblio - Reading 1 CSV files...
2023-05-20 19:09:43,375 - Biblio - File: dims_systemic_risk_all.csv, Size: 8520 rows
2023-05-20 19:09:43,466 - Biblio - Total number of publications in the dataframe: 8520
2023-05-20 19:09:54,644 - Biblio - Number of publications in the input biblio_df: 27653


Removed 7 titles that were empty strings
Removed 0 titles that were NaN
Removed 104 records where the title contained "conference", "workshop", or "proceeding"
Removed additional 125 titles that were empty strings
Replaced 3093 abtracts that were NaN with an empty string


2023-05-20 19:10:01,591 - Biblio - Number of publications before removing duplicate titles: 27417


Duplicate group: #0 

AttributeError: Can only use .str accessor with string values!

In [None]:
def split_string(string):
    """Split a ';'-separated list of "Name (affiliations)" entries.

    A split happens only at a ';' that directly follows a ')' closing the
    outermost open parenthesis, so semicolons and nested parentheses inside
    an affiliation stay within their entry.

    Args:
        string: The text to split.

    Returns:
        A list of entries with surrounding whitespace removed. Empty entries
        (e.g. from a trailing ';' or from empty input) are not included.
    """
    splits = []
    depth = 0   # Number of currently open, not yet closed parentheses
    start = 0   # Starting index of the current substring

    for i, char in enumerate(string):
        if char == '(':
            depth += 1
        elif char == ')' and depth:
            # An unmatched ')' (depth already 0) is ignored.
            depth -= 1
            # Slicing instead of indexing keeps this safe on the last character.
            if depth == 0 and string[i + 1:i + 2] == ';':
                # Strip the blank that follows the previous ';' separator.
                splits.append(string[start:i + 1].strip())
                start = i + 2   # Continue after the ';'

    # Keep whatever follows the last separator, unless nothing is left.
    remainder = string[start:].strip()
    if remainder:
        splits.append(remainder)
    return splits

# Sample author/affiliation string with nested parentheses and semicolons
# inside the affiliations; adjacent literals are concatenated into one string.
string = (
    "Haas, Armin (Institute for Advanced Sustainability Studies, (IASS), "
    "Potsdam, Brandenburg, Germany; Global Climate Forum (GCF), Berlin, Germany); "
    "Laubichler, Manfred (Arizona State University, Phoenix, AZ, USA); "
    "Applegate, Joffa (Arizona State University, Phoenix, AZ, USA); "
    "Steudle, Gesine (Global Climate Forum (GCF), Berlin, Germany); "
    "Jaeger, Carlo C. (Global Climate Forum (GCF), Berlin, Germany; "
    "Beijing Normal University, Beijing, People's Republic of China)"
)

# Split the sample and show the resulting list of entries.
result = split_string(string)
print(result)
