# Cleaning bibliographic datasets

In [8]:
import sys
import os

# Make the project's src directory importable. The path is resolved against the
# current working directory, so the notebook must be started from its own folder.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from pathlib import Path
from utilities import *
from config import *
from clean import *

### Parameters
- `biblio_project_dir`: Name of the folder containing the bibliographic project data and models (`bibliometrics/data/{biblio_project_dir}` and `bibliometrics/models/{biblio_project_dir}`).
- `scopus_data_dir,...`: Names of the folders inside `biblio_project_dir` where the input files are stored. All files in those folders will be read.
- `output_dir`: Name of the folder inside `biblio_project_dir` where the file should be written to.
- `output_file`: Name of the output file. The file extension can be .xlsx or .csv.
- `n_rows`: The maximum number of rows to be read from the bibliographic dataset. Set this to `0` to read the full dataset.

In [9]:
# Maximum number of rows to read; 0 reads the full dataset.
n_rows = 100

# Project folder that holds the data and models for this analysis.
biblio_project_dir = 'systemic_risk'

# Input folders (inside the project folder), one per bibliographic source.
scopus_data_dir = 'raw/scopus'
lens_data_dir = 'raw/lens'
dims_data_dir = 'raw/dimensions'

# Folder (inside the project folder) that receives the written files.
output_dir = 'results'

# Per-source output file names; the row limit is embedded in each name.
scopus_output_file, lens_output_file, dims_output_file = (
    f'{prefix}_sr_{n_rows}.xlsx' for prefix in ('scopus', 'lens', 'dims')
)

### Read the bibliographic datasets
Read the Scopus, Lens, and Dimensions CSV files and store them in a dataframe.

In [10]:
def _read_source(input_dir, biblio_source):
    """Read one source's CSV files from the current project into a dataframe.

    Args:
        input_dir: Folder inside the project folder that holds the source's files.
        biblio_source: BiblioSource member identifying the source being read.

    Returns:
        Whatever read_biblio_csv_files_to_df returns for that source.
    """
    # The project folder and row limit are shared by all three sources.
    return read_biblio_csv_files_to_df(
        biblio_project_dir_name=biblio_project_dir,
        input_dir=input_dir,
        biblio_source=biblio_source,
        n_rows=n_rows,
    )


scopus_df = _read_source(scopus_data_dir, BiblioSource.SCOPUS)
lens_df = _read_source(lens_data_dir, BiblioSource.LENS)
dims_df = _read_source(dims_data_dir, BiblioSource.DIMS)

2023-05-18 10:10:17,577 - Biblio - Reading 3 CSV files...
2023-05-18 10:10:17,649 - Biblio - File: scopus_systemic_risk_1999_2014.csv, Size: 100 rows
2023-05-18 10:10:17,660 - Biblio - File: scopus_systemic_risk_2015_2019.csv, Size: 100 rows
2023-05-18 10:10:17,677 - Biblio - File: scopus_systemic_risk_2020_2023.csv, Size: 100 rows
2023-05-18 10:10:17,698 - Biblio - Total number of publications in the dataframe: 100
2023-05-18 10:10:17,703 - Biblio - Reading 1 CSV files...
2023-05-18 10:10:17,721 - Biblio - File: lens_systemic_risk_all.csv, Size: 100 rows
2023-05-18 10:10:17,727 - Biblio - Total number of publications in the dataframe: 100
2023-05-18 10:10:17,735 - Biblio - Reading 1 CSV files...
2023-05-18 10:10:17,748 - Biblio - File: dims_systemic_risk_all.csv, Size: 100 rows
2023-05-18 10:10:17,757 - Biblio - Total number of publications in the dataframe: 100


### Select and rename columns
Rename and retain a set of columns from the full bibliographic dataset.

In [11]:
# Reduce each source to its compact column layout.
# Each frame is rebound to its own name, so re-running this cell alone feeds
# already-reshaped data back in; re-run from the read cell instead.
scopus_df = modify_cols_biblio_df(
    biblio_df_=scopus_df,
    reshape_base=Reshape.SCOPUS_COMPACT,
)

lens_df = modify_cols_biblio_df(
    biblio_df_=lens_df,
    reshape_base=Reshape.LENS_COMPACT,
)

dims_df = modify_cols_biblio_df(
    biblio_df_=dims_df,
    reshape_base=Reshape.DIMS_COMPACT,
)

### Normalise the dataset

In [12]:

# Normalise each frame; biblio_source tells the helper which source it came from.
# The frames are rebound in place, so re-run from the read cell rather than
# executing this cell twice.
scopus_df = normalise_biblio_entities(
    biblio_df_=scopus_df,
    biblio_source=BiblioSource.SCOPUS,
)

lens_df = normalise_biblio_entities(
    biblio_df_=lens_df,
    biblio_source=BiblioSource.LENS,
)

dims_df = normalise_biblio_entities(
    biblio_df_=dims_df,
    biblio_source=BiblioSource.DIMS,
)

### Merge the datasets

In [13]:
# Combine the per-source frames into one; the order is Scopus, Lens, Dimensions.
source_dfs = (scopus_df, lens_df, dims_df)
biblio_df = merge_biblio_dfs(*source_dfs)

### Clean the dataset
Clean the bibliographic dataset.

In [7]:
# NOTE(review): the output saved with this cell ends in
# "AttributeError: Can only use .str accessor with string values!", and its
# execution count is lower than that of the cells above, so it was not run as
# part of a top-to-bottom pass.
# TODO: find which column clean_biblio_df applies .str to while it holds
# non-string values (presumably one that is entirely NaN for this sample --
# confirm), fix it in the helper, then Restart Kernel -> Run All.
# biblio_df is rebound to the cleaned result, so this cell is not idempotent.
biblio_df = clean_biblio_df(biblio_df_ = biblio_df)

2023-05-18 09:55:20,690 - Biblio - Number of publications in the input biblio_df: 300
2023-05-18 09:55:20,786 - Biblio - Number of publications before removing duplicate titles: 299


Removed 0 titles that were empty strings
Removed 0 titles that were NaN
Removed 1 records where the title contained "conference", "workshop", or "proceeding"
Removed additional 0 titles that were empty strings
Replaced 14 abtracts that were NaN with an empty string


AttributeError: Can only use .str accessor with string values!

Write the bibliographic dataset to an Excel file.

In [None]:

# Write the merged dataset into output_dir inside the project folder.
# The file name is fixed here and does not use the *_output_file parameters.
write_df(
    biblio_df=biblio_df,
    biblio_project_dir_name=biblio_project_dir,
    output_dir=output_dir,
    output_file='biblio_output.xlsx',
)