# Keyword count
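- Reads a bibliographic dataset and counts the keywords in the selected keyword columns.
- Also counts the keywords associated with the keyword filter `assoc_filter`.
- Prints the counts as a multi-column table and optionally saves them to an output file.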

In [1]:
import sys

from pathlib import Path

# Add the src directory to the Python path.
# sys.path holds plain strings, so the membership test must use the string
# form of the resolved path; comparing a Path object against it never matches
# and would prepend a duplicate entry every time this cell is re-run.
src_path = Path("../") / "src"
if str(src_path.resolve()) not in sys.path:
    sys.path.insert(0, str(src_path.resolve()))

from config import *
from utilities import *
from count import *

[nltk_data] Downloading package wordnet to /Users/gilbert/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Input parameters
# -----------------------
biblio_project_dir = 'example_project'          # directory for the data and models of your bibliographic project
biblio_input_dir = 'processed'                  # directory containing the input file
biblio_input_file = 'biblio_example_all.csv'    # input file (bibliographic dataset)

output_dir = 'results'                          # directory where the keyword count file is saved
output_file = 'keyword_count_example.xlsx'      # filename of the output file; leave empty ('') to skip saving

n_rows = 100                                    # maximum number of rows read for each dataset; set to 0 to read all the data

keyword_cols = ['kws']                          # columns whose keywords are counted separately
assoc_filter = "[risk, model] in* {}"           # filter for the associated keywords

# -----------------------

In [3]:
# Step 1: load the bibliographic dataset(s) into a dataframe
biblio_df = read_biblio_csv_files_to_df(
    biblio_project_dir=biblio_project_dir,
    input_dir=biblio_input_dir,
    input_files=biblio_input_file,
    biblio_source=BiblioSource.BIBLIO,
    n_rows=n_rows,
)

# Step 2: compute the keyword counts for every column in keyword_cols,
#         plus the counts of the keywords associated with assoc_filter
keywords_dict = generate_keyword_stats(
    biblio_df_=biblio_df,
    cols=keyword_cols,
    assoc_filter=assoc_filter,
    singularise=True,
    stem=True,
)

# Step 3: print the counts to the console as a multi-column table
write_keyword_count_to_console(
    keywords_dict=keywords_dict,
    max_n_rows=10,
    display_width=180,
)

# Step 4: stack the per-column counts into one dataframe that write_df(...) accepts
keywords_stacked_df = stack_keyword_count_dfs(keywords_dict=keywords_dict)

# Step 5: save the stacked counts to the output file; skipped when output_file is empty
if output_file:
    write_df(
        biblio_df=keywords_stacked_df,
        biblio_project_dir=biblio_project_dir,
        output_dir=output_dir,
        output_file=output_file,
    )

2023-05-22 11:42:32,031 - Biblio - Reading 1 CSV files...
2023-05-22 11:42:32,084 - Biblio - File: biblio_example_all.csv, Size: 100 rows
2023-05-22 11:42:32,099 - Biblio - Total number of publications in the dataframe: 100
Singularising the keywords in 'kws': 100%|██████████| 100/100 [00:01<00:00, 53.12it/s]
Stemming the keywords in 'kws': 100%|██████████| 100/100 [00:00<00:00, 12372.94it/s]
2023-05-22 11:42:34,066 - Biblio - Writing biblio_df (394 records) to file 'keyword_count_example.xlsx'...



Keywords for column 'kws'
Displaying 100 keywords of a total of 394
-------------------------------------------
systemic risk (31)               immun (2)                                     anterior chamb (1)         cardiovascular diseas (1)               
human (10)                       male (2)                                      anti bacterial ag (1)      cascading failur (1)                    
covid 19 (7)                     markov chain monte carlo (2)                  antibiotic prophylaxi (1)  cascading failure model (1)             
risk assess (7)                  model (2)                                     apolipoprotein a1 (1)      case control studi (1)                  
commerc (5)                      network topolog (2)                           apolipoprotein b (1)       cataract extract (1)                    
risk factor (5)                  optical coherence tomographi (2)              asset market (1)           central bank polici (1)                 
finan