<p style="text-align: center;"><a target="_blank" href="https://colab.research.google.com/github/gitwitcho/bibliokeywords/blob/master/notebooks/Highlight.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a></p>

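# Highlight keywords

Read a bibliographic dataset from a CSV file, highlight the configured keywords in the selected columns, and save the result as an Excel workbook.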

### Clone the BiblioKeywords project from GitHub and install the required packages.

In [None]:
%cd /content
%rm -rf bibliokeywords
!git clone https://github.com/gitwitcho/bibliokeywords.git
%cd /content/bibliokeywords/src
!pip install webcolors
!pip install lxml
!pip install openpyxl==3.1.2

### Imports and configurations

In [None]:
import sys

from pathlib import Path

# Add the src directory to the Python path.
# sys.path holds strings, so the membership test must use the string form of the
# resolved path; comparing the Path object itself never matches and would prepend
# a duplicate entry every time this cell is re-run.
src_path = Path("../") / "src"
src_path_str = str(src_path.resolve())
if src_path_str not in sys.path:
    sys.path.insert(0, src_path_str)

from config import *
from utilities import *
from highlight import *     # openpyxl needs lxml installed, otherwise it throws an error

In [None]:
# Input parameters
# -----------------------
biblio_project_dir = 'example_project'          # directory for the data and models of your bibliographic project
biblio_input_dir = 'processed'                  # directory containing the input file
biblio_input_file = 'biblio_example_all.csv'    # input file (bibliographic dataset)

output_dir = 'results'                          # directory where you want to save the highlighted Excel file
output_file = 'highlights_example.xlsx'         # filename of the Excel file; leave empty if you don't want to save the data

n_rows = 0              # the maximum number of rows read for each dataset; set to 0 if you want to read all the data

# One entry per highlighting rule:
#   strings - words to highlight
#   targets - columns to apply the highlighting to
#   colour  - highlight colour
highlight_params = [
    dict(
        strings=['human', 'covid'],
        targets=['title', 'abstract'],
        colour='red',
    ),
    dict(
        strings=['var', 'risk assess', 'risk manag', 'risk analys'],
        targets=['title', 'abstract'],
        colour='magenta',
    ),
    dict(
        strings=['systemic risk', 'contagion', 'vulnerab', 'fragil', 'spillover'],
        targets=['title', 'abstract'],
        colour='blue',
    ),
]

# Excel sheet and column configuration parameters.
# The order of the entries in 'cols' is also the order of the columns in the Excel sheet.
excel_params = dict(
    cols=[
        dict(col='title', heading='Title', width=40, wrap=True),
        dict(col='abstract', heading='Abstract', width=80, wrap=True),
        dict(col='year', heading='Year', width=6, wrap=False),
    ],
    sheet_name='Highlights',
    zoom=140,
    freeze_panes='A2',
)
# -----------------------

In [None]:
# 1. Read the bibliographic datasets
biblio_df = read_biblio_csv_files_to_df(biblio_project_dir = biblio_project_dir,
                                        input_dir = biblio_input_dir,
                                        input_files = biblio_input_file,
                                        biblio_source = BiblioSource.BIBLIO,
                                        n_rows = n_rows)

# 2. Highlight keywords in Excel (only the workbook, the second return value, is used here)
_, excel_wb = highlight_keywords(biblio_df_ = biblio_df,
                                highlight_params = highlight_params,
                                excel_params = excel_params)

# 3. Save the workbook to an Excel file
if output_file and excel_wb:
    output_path = get_root_dir() / 'data' / biblio_project_dir / output_dir / output_file
    # Create the output directory first: saving fails if the target directory is missing.
    # Path(...) is used defensively in case get_root_dir() returns a pure path type.
    Path(output_path).parent.mkdir(parents = True, exist_ok = True)
    print(f"Writing Excel ({len(biblio_df)} records) to file '{output_file}'...")
    excel_wb.save(output_path)