<p style="text-align: center;"><a target="_blank" href="https://colab.research.google.com/github/gitwitcho/bibliokeywords/blob/master/notebooks/Clean.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a></p>

# Clean bibliographic datasets
- Read bibliographic datasets
- Select and rename columns
- Normalise entries
- Merge datasets into single dataset
- Clean dataset
- Write dataset to file

**TODO**
- Add the possibility to switch off the logger output.
- Print some basic stats at the end of the code.
- Group the Scopus, Dimensions, and Lens operations so that they can be selected by an if statement in case the user doesn't specify all three (Scopus, Lens, Dimensions) directories.

### Clone the BiblioKeywords project from GitHub

In [None]:
# Fetch a fresh copy of the BiblioKeywords repository and work from its src directory.
# NOTE(review): /content is presumably the Colab working directory - this cell is
# not expected to work unchanged on a local machine.
%cd /content
# Delete any previous checkout so that the clone below starts from a clean state.
%rm -rf bibliokeywords
!git clone https://github.com/gitwitcho/bibliokeywords.git
# Switch into the cloned project's src directory for the remaining cells.
%cd /content/bibliokeywords/src

### Imports and configurations

In [None]:
import sys
from pathlib import Path

import pandas as pd

# Add the src directory to the Python path so that the project modules
# (config, clean) can be imported below.
# sys.path holds strings, so the membership test must use the string form of
# the resolved path; comparing a Path object never matches and would insert a
# duplicate entry every time this cell is re-run.
src_path = Path("..") / "src"
src_path_str = str(src_path.resolve())
if src_path_str not in sys.path:
    sys.path.insert(0, src_path_str)

from config import BiblioSource, Reshape
from clean import (read_biblio_csv_files_to_df, modify_cols_biblio_df,
                   normalise_biblio_entities, merge_biblio_dfs,
                   clean_biblio_df, write_df)

In [None]:
# Input parameters
# -----------------------
# Edit the values in this cell to point the notebook at your own project.
biblio_project_dir = 'example_project'      # directory for the data and models of your bibliographic project
scopus_input_dir = 'raw/scopus'             # directory with Scopus data; leave empty if you don't have Scopus data
lens_input_dir = 'raw/lens'                 # directory with Lens data; leave empty if you don't have Lens data
dims_input_dir = 'raw/dimensions'           # directory with Dimensions data; leave empty if you don't have Dimensions data

output_dir = 'processed'                    # directory where you want to save the merged and cleaned bibliographic dataset
output_file = 'biblio_example_100.csv'      # filename of the bibliographic dataset; leave empty if you don't want to save the data

n_rows = 100                                # the maximum number of rows read for each dataset; set to '0' if you want to read all the data

write_cols = ['authors', 'title', 'abstract', 'year', 'pub_date',       # the columns in the output file
              'n_cited', 'source', 'kws', 'fos', 'anzsrc_2020', 
              'auth_affils', 'link', 'links', 'bib_src', 'scopus_id', 
              'lens_id', 'dims_id']
# -----------------------

In [None]:
# Create a merged bibliographic dataset from the different sources
#   1. Read the bibliographic datasets
#   2. Select and rename columns from the dataset
#   3. Normalise key variables in the dataset (bib_src, links, keywords, authors, author-affils)
#
# The three sources go through exactly the same read -> reshape -> normalise
# steps, so they are described once as (input directory, source, reshape base)
# and processed in a loop. A source whose input directory is left empty
# contributes an empty DataFrame to the merge.
biblio_sources = [
    (scopus_input_dir, BiblioSource.SCOPUS, Reshape.SCOPUS_ALL),
    (lens_input_dir, BiblioSource.LENS, Reshape.LENS_ALL),
    (dims_input_dir, BiblioSource.DIMS, Reshape.DIMS_ALL),
]

source_dfs = []
for input_dir, biblio_source, reshape_base in biblio_sources:
    if input_dir:
        source_df = read_biblio_csv_files_to_df(biblio_project_dir = biblio_project_dir,
                                                input_dir = input_dir,
                                                biblio_source = biblio_source,
                                                n_rows = n_rows)
        source_df = modify_cols_biblio_df(biblio_df_ = source_df,
                                          reshape_base = reshape_base)
        source_df = normalise_biblio_entities(biblio_df_ = source_df)
    else:
        source_df = pd.DataFrame()
    source_dfs.append(source_df)

# Keep the per-source frames available under their usual names for inspection.
scopus_df, lens_df, dims_df = source_dfs

# 4. Merge the datasets from Scopus, Lens, and Dimensions
biblio_df = merge_biblio_dfs(scopus_df, lens_df, dims_df)

# 5. Clean the title and abstract, remove duplicate titles, and merge values from
#    different bibliographic datasets
biblio_df = clean_biblio_df(biblio_df_ = biblio_df)

# 6. Optionally save the results
if output_file:
    # Selecting a column that is absent raises a KeyError, so only the requested
    # columns that actually exist in the merged dataset are written; any that
    # are skipped are reported rather than dropped silently.
    available_cols = [col for col in write_cols if col in biblio_df.columns]
    missing_cols = [col for col in write_cols if col not in biblio_df.columns]
    if missing_cols:
        print(f"Columns not found in the merged dataset and not written: {missing_cols}")

    write_df(biblio_df = biblio_df[available_cols],
             biblio_project_dir = biblio_project_dir,
             output_dir = output_dir,
             output_file = output_file)
