# Script for evaluation of scan logs from literature research
This script processes literature research logs by creating a
CSV file and a readme file that give an overview of the literature
scan. Moreover, it extracts the BibTeX info of all positively
scanned publications into a separate file.

In [733]:
# Root folder of the literature scan: it is searched recursively for *.bib
# files, and the summary readme is written directly into it.
start_directory = "Hierarchical Forecasting"

## Imports

In [734]:
import re
import pandas as pd
import Levenshtein as lev
import itertools
from pathlib import Path

In [735]:
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode, homogenize_latex_encoding

## Functions

In [736]:
def scan_to_df(scan, finished=True):
    """Parse the text of a literature-scan log into a publication table.

    The log consists of a header made of ``key = {value}`` fields, the
    DETAILS separator, and a body of BibTeX-like entries, each of which
    ends with a closing brace followed by a blank line.

    Args:
        scan: Full text of the scan log.
        finished: When true, the log is validated (number of entries and
            presence of a ``take`` tag on each entry) and the ``take`` field
            is split into a boolean decision and a free-text explanation.

    Returns:
        A tuple ``(publicationFrame, header_infos, publications_raw)``: the
        DataFrame with one row per entry (lower-cased column names), the
        header fields as a dict (``found`` converted to int), and the list of
        raw entry strings in file order.

    Raises:
        IndexError: If the DETAILS separator is missing.
        KeyError, ValueError: If the header has no integer ``found`` field.
        UserWarning: If ``finished`` and the entry count differs from
            ``found``, or an entry has no ``take`` tag.
        AssertionError: If ``finished`` and a ``take`` tag starts with
            something other than ``Yes`` or ``No``.
    """
    BODY_SEPARATOR = "#-------------------------\nDETAILS ------------------\n#-------------------------"
    # Raw strings: "\{" and "\." are regex escapes, not Python escapes.
    BIBTEX_PATTERN = r"(.+?) ?= ?\{(.*?)\}"
    PUBLICATION_PATTERN = r"@[A-Za-z]+?\{.*?\}\n\n"

    sections = scan.split(sep=BODY_SEPARATOR)
    scan_header, scan_body = sections[0], sections[1]

    header_infos = dict(re.findall(BIBTEX_PATTERN, scan_header))
    header_infos["found"] = int(header_infos["found"])
    try:
        header_infos["kept_after_scan"] = int(header_infos["kept_after_scan"])
    except (KeyError, ValueError):
        # Best effort: an unfinished scan may not report this number yet.
        pass

    publications_raw = re.findall(PUBLICATION_PATTERN, scan_body, re.DOTALL)
    publications = [
        dict(re.findall(BIBTEX_PATTERN, publication))
        for publication in publications_raw
    ]

    # check integrity
    if finished:
        # is number of found publications equal to reported number of publications?
        if len(publications) != header_infos["found"]:
            raise UserWarning(
                "The number of found publications does not match the reported number of publications."
            )

        # have all publications been tagged correctly with a "take" decision?
        for publication in publications:
            if "take" not in publication:
                # .get() keeps the intended warning even if the title is missing too.
                raise UserWarning(
                    f"Publication {publication.get('title', '<untitled>')} has not valid take tag"
                )

    # make dataframe
    publicationFrame = pd.DataFrame(publications)
    publicationFrame.columns = [col.lower() for col in publicationFrame.columns]

    # An empty frame has no "take" column, so there is nothing to split.
    if finished and not publicationFrame.empty:
        # "Yes. some reason" -> ("Yes", "some reason"); a tag without a period
        # is kept whole as the decision, with an empty explanation.
        take_parts = publicationFrame["take"].str.partition(".")
        publicationFrame["take"] = take_parts[0]
        publicationFrame["take_explanation"] = take_parts[2].str.strip()

        # Every tag must be Yes or No, but a scan in which all entries share
        # the same decision is valid, so test for a subset, not equality.
        if not set(publicationFrame["take"]) <= {"Yes", "No"}:
            raise AssertionError("Not all take tags could be classified as Yes or No.")

        publicationFrame["take"] = publicationFrame["take"] == "Yes"

        # Fields missing from an entry become empty strings. Columns without
        # gaps (such as the boolean "take") are left untouched.
        for col in publicationFrame.columns:
            missing = publicationFrame[col].isnull()
            if missing.any():
                publicationFrame.loc[missing, col] = ""

    return publicationFrame, header_infos, publications_raw


def pub_to_md(pub):
    """Render one publication as a markdown snippet.

    ``pub`` is a mapping-like record (a dict or a DataFrame row) with the
    keys ``title``, ``author``, ``year`` and ``take_explanation``, and
    optionally ``journal`` and ``booktitle``.

    The snippet is a level-3 heading with the title, followed by a citation
    line and, when the explanation is non-empty, a "Comment" paragraph.
    """
    # Prefer the journal as venue; fall back to the book title when the
    # journal is absent or blank.
    venue = ""
    if "journal" in pub:
        venue = "*" + pub["journal"].strip() + "*"
    if venue in ("", "**") and "booktitle" in pub:
        venue = "*" + pub["booktitle"].strip() + "*"

    title = pub["title"].strip()
    author = pub["author"].strip()
    markdown = "\n".join(
        [f"### **{title}**", f"{author}. ({pub['year']}). {title}. {venue}"]
    )

    comment = pub["take_explanation"]
    if comment != "":
        markdown += "\n\n**Comment:** " + comment.strip()
    return markdown


def df_to_AccDis(pubframe):
    """Split a publication table into accepted, discarded and pending markdown.

    Args:
        pubframe: DataFrame with a string ``take`` column plus the columns
            ``pub_to_md`` reads.

    Returns:
        A tuple ``(accepted, discarded, to_decide)`` of markdown strings; the
        entries of each group are rendered with ``pub_to_md`` and joined by
        blank lines. A group without entries yields an empty string.
    """
    # Classify by how the tag *starts* ("Yes..." / "No..."), ignoring case
    # and surrounding whitespace. A substring search would also file a tag
    # such as "Yes. Not peer reviewed" under the discarded entries.
    decision = pubframe["take"].str.strip().str.lower()
    groups = (
        decision.str.startswith("yes", na=False),
        decision.str.startswith("no", na=False),
        pubframe["take"] == "",
    )
    accepted, discarded, to_decide = (
        "\n\n".join(pub_to_md(pub) for _, pub in pubframe[mask].iterrows())
        for mask in groups
    )
    return accepted, discarded, to_decide


def read_file(path):
    """Return the full text of the file at ``path``, decoded as latin-1."""
    with open(path, encoding="latin-1") as handle:
        return handle.read()


def write_file(text, path):
    """Write ``text`` to ``path`` as latin-1, replacing any existing file."""
    with open(path, mode="w", encoding="latin-1") as handle:
        handle.write(text)
        
def already_tagged(new, already, threshold=4):
    """Find new publications whose title closely matches an already tagged one.

    Every title in ``new`` is compared with every title in ``already`` by
    Levenshtein distance. Pairs with a distance of at most ``threshold`` are
    returned together with the decision stored for the known publication.

    Returns:
        DataFrame with the columns ``index_new``, ``title_new``,
        ``title_already``, ``index_already``, ``dist``, ``take`` and
        ``take_explanation``.
    """
    # A constant helper key turns the merge into a full cross join.
    candidates = new[["title"]].assign(key=1).reset_index()
    known = already[["title", "take", "take_explanation"]].assign(key=1).reset_index()
    pairs = candidates.merge(known, on="key", suffixes=("_new", "_already"))

    pairs["titles"] = tuple(zip(pairs["title_new"], pairs["title_already"]))
    pairs["dist"] = pairs["titles"].apply(lambda pair: lev.distance(*pair))

    wanted_columns = [
        "index_new",
        "title_new",
        "title_already",
        "index_already",
        "dist",
        "take",
        "take_explanation",
    ]
    close_pairs = pairs[pairs["dist"] <= threshold].reset_index()
    return close_pairs[wanted_columns]

def update_scan(new_scan, new_raw, already, already_header_info):
    """Insert known decisions into the text of a new scan.

    For each row of ``already``, the raw entry ``new_raw[index_new]`` gets a
    ``take`` field ("Yes."/"No." plus the explanation) and a ``first_seen``
    field (date and source of the earlier scan) inserted before its closing
    brace; the entry is then replaced inside ``new_scan``.

    Returns:
        The updated scan text.
    """
    for _, match in already.iterrows():
        verdict = "Yes." if match["take"] else "No."
        tag_text = (verdict + " " + match["take_explanation"]).strip()
        origin = already_header_info["date"] + " " + already_header_info["where"]

        entry = new_raw[match["index_new"]]
        tagged_entry = entry.replace(
            "\n}\n\n", f"\ntake={{{tag_text}}},\nfirst_seen={{{origin}}},\n}}\n\n"
        )
        new_scan = new_scan.replace(entry, tagged_entry)
    return new_scan

def export_readme(path,df,header_infos):
    """Write the markdown readme of a single scan to ``path``.

    The readme lists the header fields of the scan (source, date, search
    terms, criteria, result counts, notes), followed by the accepted and the
    discarded publications and, if there are any, the pending ones.
    """
    accepted, discarded, to_decide = df_to_AccDis(df.fillna("").astype("str"))

    pending = ""
    if to_decide != "":
        pending = "\n\n-----\n\n## **Pending Publications**\n\n" + to_decide

    # Sections are separated by one blank line each.
    sections = [
        "# **Literature Research Scan**",
        f"**Source:** {header_infos['where']}",
        f"**Date:** {header_infos['date']}",
        f"**Search Terms:** {header_infos['terms']}",
        f"**Search Criteria:** {header_infos['criterion']}",
        f"**Results:** {header_infos['found']} publications were scanned by title and abstract, {header_infos['kept_after_scan']} were kept. {header_infos['notes']}",
        "-----",
        "## **Accepted Publications**",
        accepted,
        "-----",
        "## **Discarded Publications**",
        discarded,
    ]
    write_file("\n\n".join(sections) + pending, path)
    
def export_summary_readme(path,df,header_infos):
    """Write the markdown readme summarising all scans to ``path``.

    The readme lists the sources, search terms, criteria and overall result
    counts from ``header_infos``, followed by the accepted and the discarded
    publications and, if there are any, the pending ones.
    """
    accepted, discarded, to_decide = df_to_AccDis(df.fillna("").astype("str"))

    pending = ""
    if to_decide != "":
        pending = "\n\n-----\n\n## **Pending Publications**\n\n" + to_decide

    # Sections are separated by one blank line each.
    sections = [
        "# **Summary of Literature Research Scan**",
        f"**Sources** (links to individual scan summaries): {header_infos['where']}",
        f"**Search Terms:** {header_infos['terms']}",
        f"**Search Criteria:** {header_infos['criterion']}",
        f"**Results:** Overall, {header_infos['found']} different publications were scanned by title and abstract, {header_infos['kept_after_scan']} were kept.",
        "-----",
        "## **Accepted Publications**",
        accepted,
        "-----",
        "## **Discarded Publications**",
        discarded,
    ]
    write_file("\n\n".join(sections) + pending, path)
    
def get_bib_duplicates(df):
    """Return the rows of ``df`` whose titles are near-duplicates of each other.

    Every pair of rows is compared by the Levenshtein distance of the
    lower-cased titles; a distance below 4 counts as a duplicate. Both rows
    of each such pair are selected through their ``index`` column value, so
    a row that takes part in several pairs appears several times.
    """
    rows = df[["title", "index"]].values.tolist()
    duplicate_labels = []
    for (title_a, label_a), (title_b, label_b) in itertools.combinations(rows, 2):
        if lev.distance(title_a.lower(), title_b.lower()) < 4:
            duplicate_labels.extend((label_a, label_b))
    dups = df.loc[duplicate_labels]
    return dups
    
def load_bibtex(path, verbatim=True):
    """Load a BibTeX scan file into a bibtexparser database and a DataFrame.

    Args:
        path: Location of the ``.bib`` file (read as latin-1).
        verbatim: When true, print ``path`` before loading.

    Returns:
        A tuple ``(bib_database, header_infos, df)``: the parsed database,
        its ``strings`` mapping (used as the scan header by the callers in
        this file), and a DataFrame with one row per entry. The frame has an
        ``index`` column with the position of the entry in the file, a
        ``path`` column, and always both ``take`` and ``take_explanation``
        columns.
    """
    if verbatim:
        print(path)
    with open(path, "r", encoding="latin-1") as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)

    header_infos = bib_database.strings

    df = pd.DataFrame(bib_database.entries).assign(path=path).reset_index()

    # Create each tag column only if it is missing, so that a file which has
    # just one of the two neither lacks the other nor loses existing values.
    for tag_column in ("take", "take_explanation"):
        if tag_column not in df.columns:
            df[tag_column] = None

    # Strip BibTeX braces and line breaks from titles. regex=False makes the
    # literal replacement explicit instead of relying on the pandas default.
    df["title"] = (
        df["title"]
        .str.replace("{", "", regex=False)
        .str.replace("}", "", regex=False)
        .str.replace("\n", " ", regex=False)
    )

    return bib_database, header_infos, df

In [737]:
# Very important columns ("vic"): a short column selection for inspecting
# entries by hand. Note that "match_id" only exists on all_entries once the
# match IDs have been added further below.
vic = ["index","title","author","journal","take","take_explanation","path","match_id"]

# Matching

In [738]:
# Collect every BibTeX file below the start directory (searched recursively).
sources = [*Path(start_directory).glob("**/*.bib")]
sources

[WindowsPath('Hierarchical Forecasting/ACM Digital Library/scan.bib'),
 WindowsPath('Hierarchical Forecasting/EBSCO Host/scan.bib'),
 WindowsPath('Hierarchical Forecasting/Scopus/scan.bib'),
 WindowsPath('Hierarchical Forecasting/Web of Science/scan.bib')]

Load

In [739]:
#sources = ["20_04_16 Nowcasting/Scopus/scan2.bib","20_04_16 Nowcasting/EBSCO/scan2.bib"]
# Load every source and regroup the per-file (database, header, frame)
# triples into three parallel tuples.
bib_list, header_list, df_list = zip(*map(load_bibtex, sources))

Hierarchical Forecasting\ACM Digital Library\scan.bib
Hierarchical Forecasting\EBSCO Host\scan.bib
Hierarchical Forecasting\Scopus\scan.bib
Hierarchical Forecasting\Web of Science\scan.bib


In [740]:
# A non-empty list means that some entries were parsed as comments,
# which may indicate a problem in that file.
pd.Series(dict(zip(map(str, sources), (bib.comments for bib in bib_list))))

Hierarchical Forecasting\ACM Digital Library\scan.bib    []
Hierarchical Forecasting\EBSCO Host\scan.bib             []
Hierarchical Forecasting\Scopus\scan.bib                 []
Hierarchical Forecasting\Web of Science\scan.bib         []
dtype: object

In [741]:
# Number of rows flagged as near-duplicate titles within each source file
pd.Series(dict(zip(map(str, sources), (len(get_bib_duplicates(df)) for df in df_list))))

Hierarchical Forecasting\ACM Digital Library\scan.bib    0
Hierarchical Forecasting\EBSCO Host\scan.bib             0
Hierarchical Forecasting\Scopus\scan.bib                 0
Hierarchical Forecasting\Web of Science\scan.bib         0
dtype: int64

In [742]:
# Titles flagged as near-duplicates within the first source file
get_bib_duplicates(df_list[0]).loc[:, ["title"]]

Unnamed: 0,title


Match

In [743]:
# Titles closer than this Levenshtein distance are listed for inspection.
max_dist = 7
all_entries = pd.concat(df_list, sort=False, ignore_index=True)

# Row labels of all_entries, grouped by the source file they came from.
all_indexes = [
    all_entries.index[all_entries["path"] == source_path].tolist()
    for source_path in all_entries["path"].unique()
]

# Every pair of entries that come from two different source files.
to_compare = list(
    itertools.chain.from_iterable(
        itertools.product(first_file, second_file)
        for first_file, second_file in itertools.combinations(all_indexes, 2)
    )
)

# Case-insensitive title distance for each of those pairs.
dists = {
    (left, right): lev.distance(
        all_entries.at[left, "title"].lower(),
        all_entries.at[right, "title"].lower(),
    )
    for left, right in to_compare
}

# Close pairs, keyed by their position within dists.
small_dists = {
    position: (
        all_entries.at[left, "title"],
        all_entries.at[right, "title"],
        distance,
        left,
        right,
        all_entries.at[left, "path"],
        all_entries.at[right, "path"],
    )
    for position, ((left, right), distance) in enumerate(dists.items())
    if distance < max_dist
}
print(f"Found {len(small_dists)} matching pairs.")

Found 80 matching pairs.


Show most distant matches for quality control

In [744]:
# One row per close title pair, for manual quality control.
matches = pd.DataFrame.from_dict(
    small_dists,
    orient="index",
    columns=["title_x", "title_y", "distance", "index_x", "index_y", "path_x", "path_y"],
)
# None switches off column truncation. The former value -1 was deprecated in
# pandas 1.0 and is rejected by current pandas versions.
with pd.option_context("display.max_colwidth", None):
    display(matches.sort_values("distance", ascending=False).query("distance>1"))

Unnamed: 0,title_x,title_y,distance,index_x,index_y,path_x,path_y
2313,Grouped multivariate and functional time series forecasting:An application to annuity pricing.,Grouped multivariate and functional time series forecasting: An application to annuity pricing,2,14,90,Hierarchical Forecasting\EBSCO Host\scan.bib,Hierarchical Forecasting\Web of Science\scan.bib
1173,Grouped multivariate and functional time series forecasting:An application to annuity pricing.,Grouped multivariate and functional time series forecasting: An application to annuity pricing,2,14,61,Hierarchical Forecasting\EBSCO Host\scan.bib,Hierarchical Forecasting\Scopus\scan.bib
2434,Forecasting of a Hierarchical Functional Time Series on Example of Macromodel for the Day and Night Air Pollution in Silesia Region -- A Critical Overview.,Forecasting of a Hierarchical Functional Time Series on Example of Macromodel for the Day and Night Air Pollution in Silesia Region - A Critical Overview,2,18,95,Hierarchical Forecasting\EBSCO Host\scan.bib,Hierarchical Forecasting\Web of Science\scan.bib
1346,Forecasting of a Hierarchical Functional Time Series on Example of Macromodel for the Day and Night Air Pollution in Silesia Region -- A Critical Overview.,Forecasting of a hierarchical functional time series on example of macromodel for the day and night air pollution in silesia region - A critical overview,2,18,50,Hierarchical Forecasting\EBSCO Host\scan.bib,Hierarchical Forecasting\Scopus\scan.bib


Add match-IDs and check for unilateral matches

In [745]:
# For every entry, collect the row labels of all pairs (distance below 4)
# that the entry takes part in.
match_groups = {label: set() for label in all_entries.index}
for matched_pair, pair_distance in dists.items():
    if pair_distance < 4:
        for label in matched_pair:
            match_groups[label].update(matched_pair)

# The match ID is the sorted label list as text; unmatched entries get "[]".
all_entries["match_id"] = [
    str(sorted(match_groups[label])) for label in all_entries.index
]
all_entries["match_count"] = all_entries.groupby("match_id")["take"].transform(
    lambda group: len(group)
)
print(f"Matched {all_entries['match_id'].nunique()-1} different titles")

Matched 30 different titles


In [746]:
# Show entries which have not been correctly matched: rows whose match_id
# occurs only once, first as full rows and then as group sizes.
lonely_rows = all_entries[all_entries["match_count"] < 2]
print(lonely_rows)
print("------------------------------------------------------")
group_sizes = all_entries.groupby("match_id")["take"].size()
print(group_sizes[group_sizes < 2])

Empty DataFrame
Columns: [index, year, url, title, take_explanation, take, series, publisher, path, pages, numpages, location, keywords, isbn, doi, booktitle, author, address, ENTRYTYPE, ID, articleno, volume, number, month, journal, issue_date, issn, abstract, source, note, document_type, author_keywords, affiliation, art_number, unique-id, researcherid-numbers, orcid-numbers, eissn, article-number, match_id, match_count]
Index: []

[0 rows x 41 columns]
------------------------------------------------------
Series([], Name: take, dtype: int64)


Copy take and take_explanation for all matches

In [747]:
def copy_func(x):
    """Return the single non-null value of the series ``x``.

    Returns ``None`` when ``x`` has no non-null value and raises an
    ``AssertionError`` carrying ``x`` when it has more than one distinct
    non-null value.
    """
    distinct_values = x.dropna().unique()
    if len(distinct_values) > 1:
        raise AssertionError(x)
    if len(distinct_values) == 1:
        return x.dropna().iloc[0]
    return None

# Within every group of matched entries, spread the one decision (and its
# explanation) that has been recorded to all members of the group.
has_match = all_entries["match_id"] != "[]"
for tag_column in ("take", "take_explanation"):
    all_entries.loc[has_match, tag_column] = (
        all_entries.loc[has_match].groupby("match_id")[tag_column].transform(copy_func)
    )

Update bibs and write to file system

In [748]:
# For every source file: pull the propagated decisions back in, refresh the
# header counts, rewrite the .bib file and export a CSV and a readme next to it.
for bib,df,header,source in zip(bib_list,df_list,header_list,sources):
    # Replace the tags read from the file by the ones in all_entries, which
    # include the decisions copied over from matching entries.
    df_updated = df.drop(["take","take_explanation"],axis=1,errors='ignore').merge(
        all_entries[["index","take","take_explanation","path"]],
        on=["index","path"],
        how="left",
    )
    df_updated = df_updated.drop(["index"],axis=1)

    is_kept = df_updated["take"].str.lower().str.contains("yes")
    header['found'] = str(df_updated.shape[0])
    header['kept_after_scan'] = str(is_kept.sum())
    header['discarded_after_scan'] = str((1-is_kept).sum())
    header['not_yet_decided'] = str(df_updated["take"].isnull().sum())

    # Mark entries without a decision so that they are easy to find in the file.
    df_updated["to_decide"]=None
    df_updated.loc[df_updated["take"].isnull(),"to_decide"]="Yes"

    bib.strings = header

    # Write back only the fields that have a value, as strings.
    bib.entries = [{k:str(v) for k,v in m.items() if pd.notnull(v)} for m in df_updated.to_dict('records')]
    with open(source, 'w', encoding="latin-1") as bibtex_file:
        bibtexparser.dump(bib, bibtex_file)

    print(f"Exporting {source}")
    # Derive the sibling file names with pathlib: `os` is not imported in this
    # notebook, and string replacement would also hit ".bib" or the file name
    # when they occur inside a directory name.
    source_path = Path(source)
    # NOTE(review): this exports `df` as loaded, not `df_updated` with the
    # propagated decisions - confirm that this is intended.
    df.to_csv(str(source_path.with_suffix(".csv")), index=False)
    export_readme(str(source_path.with_name("readme.md")),df_updated,header)

Exporting Hierarchical Forecasting\ACM Digital Library\scan.bib
Exporting Hierarchical Forecasting\EBSCO Host\scan.bib
Exporting Hierarchical Forecasting\Scopus\scan.bib
Exporting Hierarchical Forecasting\Web of Science\scan.bib


### Export summary from all databases which have been searched

In [749]:
# One row per matched group (the first entry of each) plus every unmatched entry.
in_match_group = all_entries["match_id"] != "[]"
summary = pd.concat(
    [
        all_entries[in_match_group].drop_duplicates(subset="match_id"),
        all_entries[~in_match_group],
    ]
).drop(columns=["index"])
def to_rel_path(s,base):
    """Return the link target of the readme that lies next to the file ``s``.

    The file name of ``s`` is replaced by ``readme.md``, backslashes become
    forward slashes, every occurrence of ``base`` is removed and spaces are
    percent-encoded.

    Args:
        s: Path (string or path object) of a file below ``base``.
        base: Directory prefix to strip from the result.
    """
    # Swap the file name first, and do it with pathlib: `os` is not imported
    # in this notebook, and replacing the name after the spaces have been
    # encoded would miss file names that contain spaces.
    readme = Path(s).with_name("readme.md")
    return str(readme).replace("\\","/").replace(base,"").replace(" ","%20")
                     
# Markdown links to the readme of every individual scan.
source_links = [
    f"[{h['where']}](.{to_rel_path(s,start_directory)})"
    for h, s in zip(header_list, sources)
]
# Search terms and criterion are taken from the last loaded scan.
last_header = header_list[-1]
header = {
    "where": "\n-  " + "\n - ".join(source_links),
    "terms": last_header["terms"],
    "criterion": last_header["criterion"],
}
summary_is_kept = summary["take"].str.lower().str.contains("yes")
header.update(
    found=str(summary.shape[0]),
    kept_after_scan=str(summary_is_kept.sum()),
    discarded_after_scan=str((1 - summary_is_kept).sum()),
)
export_summary_readme(f"{start_directory}/readme.md", summary, header)