# Filter bibliographic datasets
- Read a previously cleaned bibliographic dataset
- Filter the dataset using the `filter_query` expression

In [1]:
import sys

from pathlib import Path

# Add the src directory to the Python path so the project modules can be imported.
# sys.path holds plain strings, so the membership test must use the string form
# of the resolved path; testing the Path object itself never matches and would
# prepend a duplicate entry every time this cell is re-run.
src_path = Path("../../") / "src"
_src_dir = str(src_path.resolve())
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

from utilities import *
from filter import *

In [2]:
# Input parameters
# -----------------------
biblio_project_dir = 'example_project'          # directory for the data and models of your bibliographic project
biblio_input_dir = 'processed'                  # directory containing the input file
biblio_input_file = 'biblio_example_all.csv'    # input file (bibliographic dataset)

output_dir = 'results'                          # directory where you want to save the filtered bibliographic dataset
output_file = 'filtered_example.csv'            # filename of the filtered dataset; leave empty if you don't want to save the data

n_rows = 0      # the maximum number of rows read for each dataset; set to 0 if you want to read all the data

# Filter examples (uncomment a filter to try it out)
# Keep exactly one assignment active: if several are uncommented, the last one wins.
# --------------------------------------------------
filter_query = "cris in* title"
# filter_query = "public in* title | extreme in* abstract"
# filter_query = "013 in* lens_id"
# filter_query = "'exists' in* abstract"
# filter_query = "['modules', extreme] in* title"
# filter_query = "extreme in* [title, abstract]"
# filter_query = "[[systemic, 'equity']] in* title"
# filter_query = "[[procyc, 'module']] in* [title, abstract]"
# filter_query = "['modules', extreme] in* [title, abstract]"
# filter_query = "~systemic in* title"
# filter_query = "~(~public in* title | extreme in* abstract)"
# filter_query = "~'exists' in* abstract"
# filter_query = "~['modules', extreme] in* title"
# filter_query = "~[[systemic risk, 'equity']] in* title"
# filter_query = "(risk in* title) & (year == 2014)"

# -----------------------

In [3]:
# Step 1: load the previously cleaned bibliographic dataset from the project's
# input directory (n_rows caps the number of rows read; 0 means read everything)
biblio_df = read_biblio_csv_files_to_df(
    biblio_project_dir=biblio_project_dir,
    input_dir=biblio_input_dir,
    input_files=biblio_input_file,
    biblio_source=BiblioSource.BIBLIO,
    n_rows=n_rows,
)

# Step 2: apply filter_query to the dataset; the name biblio_df is rebound to the result
biblio_df = filter_biblio_df(biblio_df_=biblio_df, query_str=filter_query)

# Step 3: write the filtered dataset to disk, skipped when output_file is left empty
if output_file:
    write_df(
        biblio_df=biblio_df,
        biblio_project_dir=biblio_project_dir,
        output_dir=output_dir,
        output_file=output_file,
    )

2023-05-22 11:44:38,344 - Biblio - Reading 1 CSV files...
2023-05-22 11:44:38,412 - Biblio - File: biblio_example_all.csv, Size: 2668 rows
2023-05-22 11:44:38,421 - Biblio - Total number of publications in the dataframe: 2668
2023-05-22 11:44:38,429 - Biblio - Writing biblio_df (124 records) to file 'filtered_example.csv'...
