# Publication list filtering

-- TODO: describe the purpose of this Notebook

## Way of working

-- TODO: describe how to work with this Notebook

## Feature ideas 

- Pretty-printing to the review interface using Jupyter's Markdown features: https://stackoverflow.com/a/469342041
- support CSV export
- support YAML export
- support Plaintext export

In [None]:
# Install the third-party packages imported further below (bibtexparser, tinydb).
# `sys.executable` is the interpreter running this kernel, so pip installs into
# the same environment the notebook imports from.
import sys
!{sys.executable} -m pip install bibtexparser tinydb

In [2]:
# --- Notebook configuration ---
# Directory that holds the exported reference files.
SMS_DIR = "./resources/citations"
# Output directory for the DB file and its backups. The trailing slash is
# required: paths below are built by plain string concatenation.
SMS_OUTPUT_DIR = "./outputs/sms/"
# Backward-compatible alias: the rest of the notebook refers to the output
# directory through this original (misspelled) name.
SMS_OUTPUT_DUR = SMS_OUTPUT_DIR
SMS_DB_FILE_NAME = "sms_db.json"
SMS_DB_FILE_PATH = f"{SMS_OUTPUT_DUR}{SMS_DB_FILE_NAME}"
# File-name prefixes of the reference exports, one per source.
EXPORT_FILE_PATTERNS = {
    "ACM_EXPORT_FILE_PATTERN": "acm_export",
    "IEEE_EXPORT_FILE_PATTERN": "ieee_export",
}
# File extension (without the dot) of the reference exports.
BIB_EXTENSION = "bib"

## Collecting exported references

This collects all the reference exports into a single TinyDB (JSON-file based) database.   
This database will be used further for the selection based on the selection criteria.


In [None]:
from pathlib import Path, PurePath

# All lookups are made relative to the current working directory.
base_wd_p = Path('.')

# One glob pattern per configured export prefix.
export_globs = [
    f'{SMS_DIR}/{prefix}*.{BIB_EXTENSION}'
    for prefix in EXPORT_FILE_PATTERNS.values()
]
# List of lists: the matching export files, grouped per export prefix.
export_collections = [list(base_wd_p.glob(pattern)) for pattern in export_globs]

# Show which export files were found.
for source_files in export_collections:
    for export_path in source_files:
        print(export_path)

In [None]:
# basic use of bibtexparser
import bibtexparser

# Set to True (and point DEMO_CITATIONS_FILE at a real file) to run the demo.
MAKE_DEMO_PARSING = False
DEMO_CITATIONS_FILE = "<YOUR FILE PATH HERE>"

if MAKE_DEMO_PARSING:
    # Parse a sample export into a bibtexparser database object.
    with open(DEMO_CITATIONS_FILE) as demo_file:
        bib_database = bibtexparser.load(demo_file)

    # Print the field names followed by the full content of the first entries.
    ENTRIES_TO_PRINT = 2
    for demo_entry in bib_database.entries[:ENTRIES_TO_PRINT]:
        print(demo_entry.keys(), demo_entry, sep="\n")

In [None]:
from tinydb import TinyDB

# List any available backup files.
# SMS_OUTPUT_DUR already ends with a slash, so the pattern is appended directly;
# sorting gives a stable listing regardless of the file system's order.
backup_db_files = sorted(base_wd_p.glob(f'{SMS_OUTPUT_DUR}bu.*.json'))
for bdbf in backup_db_files:
    print(bdbf)

# Load the default DB.
# `create_dirs=True` makes TinyDB's JSON storage create the output directory
# when it is missing, so a first run does not fail with FileNotFoundError.
print(f"> Using: {SMS_DB_FILE_PATH}")
sms_db = TinyDB(SMS_DB_FILE_PATH, create_dirs=True)

### Insert the clean entries in the TinyDB

This block populates the TinyDB database with entries.  
It is controlled by `RESTART_DB` flag.  
If `RESTART_DB = False`, no changes will be made to the DB.  
If `RESTART_DB = True`, the DB will be populated with clean data.

> 💡  
> Running this block with `RESTART_DB = True` overwrites the content of the database with clean entries.  
> If this is run unintentionally after reviews have been entered, recover the data from the latest backup.  

In [None]:
from itertools import chain

# Flatten the per-source lists in `export_collections` into one list of files.
all_export_files = list(chain.from_iterable(export_collections))

# Start with clean TinyDB
RESTART_DB = False
# RESTART_DB = True # uncomment to re-populate the DB

print(f"> Restarting the DB, (controlled by `RESTART_DB`): {RESTART_DB}")

if RESTART_DB:
    # Drop every table so that only the freshly parsed entries remain.
    sms_db.drop_tables()

    for export_file in all_export_files:
        print(export_file)

        # NOTE(review): assumes the exports are UTF-8 encoded -- confirm.
        with open(export_file, encoding="utf-8") as bibtex_file:
            bib_collection = bibtexparser.load(bibtex_file)

        print(f"> inserting {len(bib_collection.entries)} entries")
        sms_db.insert_multiple(bib_collection.entries)

print(f"> Available DB entries (might contain duplicates): {len(sms_db)}")

In [None]:
# Small demo

# How many DB entries to print as a quick sanity check.
ENTRY_EXPLORE_COUNT = 2

preview_entries = sms_db.all()[:ENTRY_EXPLORE_COUNT]
for preview_entry in preview_entries:
    print(preview_entry)

In [None]:
# Set to True to check that `input()` works in this environment.
TEST_INPUT = False

if not TEST_INPUT:
    print(f"> Testing input function skipped since `TEST_INPUT = {TEST_INPUT}`")
else:
    test_input = input()

## Perform selection (manual entries per author)

This block iterates over the bib entries and prompts for a review value for each selection criterion.  
To start, one needs to provide the author number (1,2,3,...).  
Only 1 review per author is accepted.

> 💡  
> The iteration of the entries goes in batches, controlled by `BATCH_START; BATCH_END`.  
> To stop the review, click the Stop/Interrupt (⏹️) button in Jupyter Notebook's UI.

In [None]:
from IPython.display import clear_output
from tinydb import Query
from time import sleep

SmsDbEntry = Query()

# Selection criteria dict
# Maps every inclusion criterion to the default value offered during the review.
# Only if all the criteria are "1", then the entry is deemed suitable.
selection_criteria = {
    "sc_<your_selection_criteria_with_default_pass_value>": "1",
    "sc_<your_selection_criteria_with_default_nogo_value>": "0",
}
# The only review values the selection metrics know how to interpret.
VALID_REVIEW_VALUES = ("0", "1")

# Strip stray whitespace so that " 1" and "1" identify the same author.
author_number = input("Author count (e.g. 1, 2, 3, ...):").strip()
if not author_number:
    raise ValueError("`author_number` must be set (e.g. 1, 2, 3, ...)")

# Reviews are stored on the entry under `a_<author>_<criterion>` keys.
# The trailing underscore is part of the prefix on purpose: without it the
# reviews of author 1 would also match those of authors 10, 11, ...
author_prefix = f"a_{author_number}_"

# NOTE: the reviewed batch is the slice [BATCH_START:BATCH_END]; when both
# bounds are equal the slice is empty and no entry is presented.
BATCH_START = 300
BATCH_END_SEED = 300
BATCH_END = min(BATCH_END_SEED, len(sms_db))

for entry in sms_db.all()[BATCH_START:BATCH_END]:
    clear_output(wait=True)
    sleep(1)  # sleep to gracefully handle output refresh

    print("\n" + "=" * 5 + f" Details: Entry with ID {entry['ID']}")
    for key, value in entry.items():
        # Review results (of any author) are not part of the entry details.
        if any(criterion in key for criterion in selection_criteria):
            continue
        print(f"{key}: {value}")
    print("~" * 5 + f" Author check: #{author_number}")
    author_review_items = {
        key: value for key, value in entry.items() if key.startswith(author_prefix)
    }
    sleep(1)  # sleep to allow the user to read the output

    # The entry counts as reviewed only when this author has a stored value
    # for every current criterion (not merely the right number of values).
    already_reviewed = all(
        f"{author_prefix}{criterion}" in entry for criterion in selection_criteria
    )
    if already_reviewed:
        print("~" * 5 + f" Author check already made: #{author_number}")
        print(author_review_items)
    else:
        applied_criteria = {}
        for key, value in selection_criteria.items():
            print(f"--- Criteria {key}: {value}")
            # Ask again until the answer is "0", "1" or empty (= default).
            while True:
                author_value = input(
                    f"Please add your review on {key} (0/1, default: {value})").strip()
                sc_result_check_value = author_value or value
                if sc_result_check_value in VALID_REVIEW_VALUES:
                    break
                print(f"> '{author_value}' is not a valid review value, please enter 0 or 1")
            print(f"entered: {author_value}, result: {sc_result_check_value}")
            applied_criteria[f"{author_prefix}{key}"] = sc_result_check_value
        print("~" * 5 + f" Author check results: #{author_number}")
        print(applied_criteria)
        print("~" * 5 + f" Updating record with ID: {entry['ID']}")
        # Merge the new review values into the stored entry.
        upserted_ids = sms_db.upsert({**entry, **applied_criteria}, SmsDbEntry.ID == entry["ID"])
        print(f"> updated_ids == {upserted_ids}")
    _ = input("Enter anything when ready to proceed (this will clear the output and go to next entry):...")
    sleep(1)

### Backup progress

In [None]:
import shutil
import datetime

# Copy the DB file to `bu.<timestamp>.<db file name>` in the output directory.
backup_timestamp = datetime.datetime.now().timestamp()
SMS_DB_BACKUP_FILE_NAME = "bu.{}.{}".format(backup_timestamp, SMS_DB_FILE_NAME)
SMS_DB_BACKUP_FILE_PATH = SMS_OUTPUT_DUR + SMS_DB_BACKUP_FILE_NAME

shutil.copyfile(SMS_DB_FILE_PATH, SMS_DB_BACKUP_FILE_PATH)

## Check the selection metrics

In [None]:
METRIC_DEBUG_PRINT = True

def check_selection_presence(entry):
    """Tell whether `entry` has at least one key for every selection criterion.

    A key belongs to a criterion when the criterion name occurs anywhere in
    the key (substring match). The criteria are read from the module-level
    `selection_criteria` dict.
    """
    for criterion in selection_criteria:
        if not any(criterion in entry_key for entry_key in entry.keys()):
            return False
    return True

def get_entry_checks(entry):
    """Collect the review values of `entry`, grouped per selection criterion.

    Returns a dict with the entry's "ID" and a "checks" list that holds, for
    each criterion in `selection_criteria` (in order), a dict of all entry
    items whose key contains the criterion name.
    """
    entry_id = entry["ID"]
    checks_per_criterion = []
    for criterion in selection_criteria:
        criterion_checks = {}
        for entry_key, entry_value in entry.items():
            if criterion in entry_key:
                criterion_checks[entry_key] = entry_value
        checks_per_criterion.append(criterion_checks)
    return {"ID": entry_id, "checks": checks_per_criterion}

def _review_values(entry_check):
    """Yield every review value of `entry_check`, across all its criteria."""
    for criterion_checks in entry_check["checks"]:
        yield from criterion_checks.values()


all_entries = sms_db.all()

print("---")
# Keep only the entries that carry a review for every selection criterion.
checked_entries = [entry for entry in all_entries if check_selection_presence(entry)]
print(f"> Checked entries, nr: {len(checked_entries)}")


print("---")
# De-duplicate by title, keeping the first occurrence of each title.
unique_titles = set()
checked_unique_entries = []
for entry in checked_entries:
    if entry['title'] not in unique_titles:
        unique_titles.add(entry['title'])
        checked_unique_entries.append(entry)
print(f"> Checked unique entries, nr: {len(checked_unique_entries)}")

# override the checked entries with only unique ones:
checked_entries = checked_unique_entries

print("---")
entry_checks = [get_entry_checks(entry) for entry in checked_entries]
if METRIC_DEBUG_PRINT:
    print(f"~> Examples: {entry_checks[:1]}", "...")

print("---")
# Approved: every review value of the entry is "1".
approved_entry_checks = [
    entry_check for entry_check in entry_checks
    if all(value == "1" for value in _review_values(entry_check))
]
print(f"> Approved entries, nr: {len(approved_entry_checks)}")
if METRIC_DEBUG_PRINT:
    print(f"~> Approved entries: {[ec['ID'] for ec in approved_entry_checks]}")

print("---")
# Conflicting: at least one review value of the entry is "0".
# NOTE: this also includes the fully discarded entries computed below.
conflicting_entry_checks = [
    entry_check for entry_check in entry_checks
    if any(value == "0" for value in _review_values(entry_check))
]
print(f"> Conflicting entries, nr: {len(conflicting_entry_checks)}")
if METRIC_DEBUG_PRINT:
    print(f"~> Conflicting entries: {[ec['ID'] for ec in conflicting_entry_checks]}")

print("---")
# Fully discarded: every review value of the entry is "0".
discarded_entry_checks = [
    entry_check for entry_check in entry_checks
    if all(value == "0" for value in _review_values(entry_check))
]
print(f"> Fully discarded entries, nr: {len(discarded_entry_checks)}")
if METRIC_DEBUG_PRINT:
    print(f"~> Fully discarded entries: {[ec['ID'] for ec in discarded_entry_checks]}")

## Export selection results

### Markdown export

In [12]:
def write_entry_markdown(entry, header_level=2):
    header = "#" * header_level + f" ID: {entry['ID']}, title: \"{entry['title']}\""
    data_header = "| Attribute | Value |\n| --- | --- |"
    data = "\n".join((f"| *{key}* | {value} |" for key, value in entry.items()))
    return header + "\n\n" + data_header + "\n" + data + "\n\n"


def _export_entries_markdown(file_path, heading, entry_selections, entries):
    """Write the details of the selected entries to a Markdown file.

    Args:
        file_path: path of the Markdown file to (over)write.
        heading: text of the top-level heading of the file.
        entry_selections: iterable of dicts with an "ID" key; selects the
            entries to export.
        entries: the full entries; each one whose "ID" equals the "ID" of a
            selection is written.
    """
    # Explicit UTF-8: the entries may contain non-ASCII text, which the
    # platform's default encoding is not guaranteed to be able to encode.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(f"# {heading}\n\n")
        for selection in entry_selections:
            for entry in entries:
                if entry['ID'] == selection['ID']:
                    f.write(write_entry_markdown(entry))


# SMS_OUTPUT_DUR already ends with a slash, so the file name is appended directly.
MD_EXPORT_APPROVED = f"{SMS_OUTPUT_DUR}approved_entries.md"
_export_entries_markdown(
    MD_EXPORT_APPROVED, "Approved entries", approved_entry_checks, checked_entries)


MD_EXPORT_CONFLICTING = f"{SMS_OUTPUT_DUR}conflicting_entries.md"
_export_entries_markdown(
    MD_EXPORT_CONFLICTING, "Conflicting entries", conflicting_entry_checks, checked_entries)

### More formats

TODO:
- [ ] support CSV export
- [ ] support YAML export
- [ ] support Plaintext export