In [3]:
import os
import os.path as op
import re
import shutil
import sys
from collections import OrderedDict as od

import Bio.Entrez as Entrez
from PyPDF2 import PdfFileReader

# Definitions

In [34]:
def linklib(parent, check=None, parse_pdf=False, mode: str = "rename", verbose=False):
    """Link to each file at each level above it in the file hierarchy.

    Broken symlinks under ``parent`` are removed first. Every regular,
    non-hidden ``.pdf`` file under ``parent`` is then symlinked into
    ``check`` (if given), and finally ``fill_symlinks`` links each file
    into every directory between its own location and ``parent``.

    Parameters
    ----------
    parent : str
        The path to the parent directory.
    check : str, optional
        The path to the directory that checks all file names for
        duplicates with non-identical inodes. If None, no links are
        created in a check directory and no files are renamed.
    parse_pdf : bool, optional
        If True (and ``check`` is given), each file is first passed to
        ``rename_paper``. Files for which that fails are recorded in
        "failed_lookups" and are not linked into ``check``.
    mode : str
        Determines behavior if a symbolic link already exists.
        "stop"
            An error is raised and the function stops running.
        "skip"
            The function skips to the next file and returns a list
            of src paths it failed to create links to.
        "rename"
            The file is renamed with a letter suffix. E.g. if the
            link file1.py exists, the code would try to create
            file1a.py. If file1a.py exists, it would try to create
            file1b.py, etc.
    verbose : bool, optional
        If True, progress and summary counts are printed.

    Returns
    -------
    output : OrderedDict
        "renamed_files" : OrderedDict
            Maps each old path to the path it was renamed to.
        "failed_lookups" : list
            Paths that ``rename_paper`` could not rename.
        "duplicates" : list
            Paths skipped because of a name clash (mode "skip").
        "files_renamed" : int
        "symlinks_created" : int
        "symlinks_removed" : int

    Raises
    ------
    ValueError
        If ``mode`` is not one of "rename", "skip", "stop".
    FileNotFoundError
        If ``check`` is given but does not exist.
    FileExistsError
        If ``mode`` is "stop" and a clashing name exists in ``check``.
    """
    if mode not in ("rename", "skip", "stop"):
        raise ValueError(
            "mode must be 'rename', 'skip' or 'stop'; got {!r}".format(mode)
        )
    if check is not None and not op.exists(check):
        raise FileNotFoundError(check)

    # Find and remove broken symlinks.
    symlinks_removed = rm_symlinks(parent)
    if verbose and symlinks_removed > 0:
        print("Removed {} symlinks before new link creation.".format(symlinks_removed))

    # Get regular files (not dirs or symlinks).
    files = find_files_at_depth(
        parent,
        show_files=True,
        show_symlinks=False,
        show_hidden_files=False,
        keep_ext=[".pdf"],
    )
    if verbose:
        print("Found {} PDF files in {}".format(len(files), parent))

    # Create a symlink to each file in the master directory,
    # checking for and handling duplicate file names along the way.
    renamed_files = od()
    failed_lookups = []
    duplicates = []
    files_renamed = 0
    symlinks_created = 0
    for src in files:
        if check is None:
            # Nothing to link into; only fill_symlinks() below applies.
            continue

        # Rename the paper by scraping the PDF and searching for
        # metadata in PubMed.
        if parse_pdf:
            try:
                newsrc = rename_paper(src, check, inplace=True, verbose=verbose)
            except Exception as err:
                # Best effort: one unparseable PDF or failed lookup must not
                # abort the whole run. KeyboardInterrupt still propagates.
                if verbose:
                    print("Failed to rename {}: {!r}".format(src, err))
                newsrc = None
            if newsrc is None:
                failed_lookups.append(src)
                continue
            if newsrc != src:
                renamed_files[src] = newsrc
                files_renamed += 1
                src = newsrc

        srcdir, base = op.split(src)
        dst = op.join(check, base)

        # A dangling link would make os.symlink() fail even though
        # op.exists() reports False, so clear it out first.
        if op.islink(dst) and not op.exists(dst):
            os.remove(dst)
            symlinks_removed += 1

        if not op.exists(dst):
            # Link to the absolute path: a relative target would be resolved
            # relative to the link's own directory and point nowhere.
            os.symlink(op.abspath(src), dst)
            symlinks_created += 1
        elif same_file(src, dst):
            # Already linked to this very file.
            pass
        elif mode == "stop":
            raise FileExistsError(dst)
        elif mode == "skip":
            duplicates.append(src)
        else:  # mode == "rename"
            newsrc = get_unique_name(src, [check, srcdir])
            os.rename(src, newsrc)
            renamed_files[src] = newsrc
            src = newsrc
            dst = op.join(check, op.basename(src))
            os.symlink(op.abspath(src), dst)
            files_renamed += 1
            symlinks_created += 1

    # Find and remove broken symlinks (renames above invalidate old links).
    symlinks_removed += rm_symlinks(parent)

    # Create links at each level from parent to each file's location.
    symlinks_created += fill_symlinks(parent)

    # Print runtime details.
    if verbose:
        if parse_pdf:
            print("Failed to parse {} files".format(len(failed_lookups)))
        if mode == "skip":
            print("Skipped {} duplicate files".format(len(duplicates)))
        print("Renamed {} files".format(files_renamed))
        print("Created {} symlinks".format(symlinks_created))
        print("Removed {} broken symlinks".format(symlinks_removed))

    output = od(
        [
            ("renamed_files", renamed_files),
            ("failed_lookups", failed_lookups),
            ("duplicates", duplicates),
            ("files_renamed", files_renamed),
            ("symlinks_created", symlinks_created),
            ("symlinks_removed", symlinks_removed),
        ]
    )
    return output


def alphabet_generator():
    """Yield letter suffixes forever, starting at "a".

    Each following value is whatever next_letter() returns for the
    value yielded before it.
    """
    suffix = "a"
    yield suffix
    while True:
        suffix = next_letter(suffix)
        yield suffix


def next_letter(current_letter):
    """Return the letter suffix that follows ``current_letter``.

    Suffixes count like spreadsheet columns in lowercase:
    "a" ... "z", "aa", "ab" ... "az", "ba" ... "zz", "aaa", and so on.

    Parameters
    ----------
    current_letter : str
        A suffix made of lowercase letters. The empty string is
        treated as "nothing yet" and is followed by "a".

    Returns
    -------
    str
        The next suffix in the sequence.
    """
    letters = list(current_letter)
    pos = len(letters) - 1
    # Roll trailing "z"s over to "a", carrying leftwards.
    while pos >= 0 and letters[pos] == "z":
        letters[pos] = "a"
        pos -= 1
    if pos < 0:
        # Every position rolled over (or the input was empty): grow by one.
        return "a" + "".join(letters)
    letters[pos] = chr(ord(letters[pos]) + 1)
    return "".join(letters)


def find_files_at_depth(
    parent,
    depth=None,
    show_files=True,
    show_symlinks=False,
    show_hidden_files=True,
    show_hidden_dirs=True,
    keep_ext=None,
):
    """Return all files in parent and its nested subdirs up to depth.

    Parameters
    ----------
    parent : str
        The path to the parent directory.
    depth : int, optional
        The depth of subdirectories to search.
        If None, all files in the hierarchy are found.
        If 0, only files in parent are found.
        If 1, only files in parent and its immediate subdirectories are
        found.
        Etc.
    show_files : bool, optional
        Whether to return regular files (i.e. not symlinks). The default
        value is True.
    show_symlinks : bool, optional
        Whether to return symlinks. The default value is False.
    show_hidden_files : bool, optional
        Whether to return hidden files. The default value is True.
    show_hidden_dirs : bool, optional
        Whether to search for files in hidden directories. The default
        value is True.
    keep_ext : list or str, optional
        A list of filename extensions (with or without the leading
        dot), or a single extension. Only files with these extensions
        will be returned. If empty or None, no extension filter is
        applied.

    Returns
    -------
    list
        The paths of all matching files, each joined onto the directory
        in which os.walk found it.
    """
    # Normalise the extension filter once instead of once per directory.
    extensions = None
    if keep_ext:
        if isinstance(keep_ext, str):
            keep_ext = [keep_ext]
        extensions = {
            ext if ext.startswith(".") else ".{}".format(ext) for ext in keep_ext
        }

    files = []
    for root, dirs, filenames in os.walk(parent):
        # Skip hidden directories if show_hidden_dirs is False
        if not show_hidden_dirs:
            dirs[:] = [d for d in dirs if not d.startswith(".")]

        if depth is not None:
            # Measure depth from the relative path so that a trailing
            # separator or other non-normalised form of parent cannot skew it.
            rel = op.relpath(root, parent)
            level = 0 if rel == os.curdir else rel.count(op.sep) + 1
            if level >= depth:
                # Nothing deeper can qualify; stop os.walk from descending.
                dirs[:] = []
            if level > depth:
                continue

        for fname in filenames:
            if not show_hidden_files and fname.startswith("."):
                continue
            if extensions is not None and op.splitext(fname)[1] not in extensions:
                continue
            path = op.join(root, fname)
            is_link = op.islink(path)
            if is_link and not show_symlinks:
                continue
            if not is_link and not show_files:
                continue
            files.append(path)

    return files


def same_file(file1, file2):
    """Return whether two paths refer to the same file.

    Symlinks are followed. Both device and inode number are compared
    (via os.path.samefile), because inode numbers are only unique
    within a single device.

    Returns
    -------
    bool or None
        True or False, or None if either path does not exist.
    """
    try:
        return op.samefile(file1, file2)
    except FileNotFoundError:
        return None


def rm_symlinks(parent, assert_same_basename=True):
    """Delete unwanted symlinks found anywhere beneath parent.

    A symlink is deleted when its target no longer exists or, if
    assert_same_basename is True, when the link's own file name differs
    from the file name of the path it resolves to.

    Returns the number of symlinks removed.
    """

    def _should_remove(path):
        # Broken link: the link itself is there but its target is not.
        if op.islink(path) and not op.exists(path):
            return True
        # Link whose name no longer matches the file it points at.
        if assert_same_basename:
            return op.basename(path) != op.basename(op.realpath(path))
        return False

    removed_count = 0
    for path in find_files_at_depth(parent, show_files=False, show_symlinks=True):
        if _should_remove(path):
            os.remove(path)
            removed_count += 1
    return removed_count


def fill_symlinks(parent):
    """Recursively link from parent down to each file's bottom dir.

    For every regular, non-hidden ``.pdf`` file under ``parent``, a
    symlink with the same file name is created in each directory from
    the file's own directory up to and including ``parent``, wherever
    no entry of that name exists yet.

    Returns the number of symlinks created.
    """
    n_created = 0
    files = find_files_at_depth(
        parent,
        show_files=True,
        show_symlinks=False,
        show_hidden_files=False,
        keep_ext=[".pdf"],
    )
    # Compare normalised absolute paths: with a raw string comparison a
    # parent such as "lib/" never equals any dirname, and the climb below
    # would run past parent and loop forever at the filesystem root.
    top = op.abspath(parent)
    for src in files:
        # Absolute targets keep the links valid from any directory; a
        # relative target is resolved relative to the link's location.
        src = op.abspath(src)
        cwd, base = op.split(src)
        while True:
            dst = op.join(cwd, base)
            # lexists also sees dangling links, which would otherwise make
            # os.symlink() raise FileExistsError.
            if not op.lexists(dst):
                os.symlink(src, dst)
                n_created += 1
            if cwd == top:
                break
            up = op.dirname(cwd)
            if up == cwd:
                # Hit the filesystem root without meeting parent; stop
                # rather than spin.
                break
            cwd = up
    return n_created


def get_unique_name(src, check):
    """Return a unique filename checking src file against check dirs.

    The candidate name is the basename of ``src`` with its extension
    replaced by ".pdf". If that name is taken in any check directory by
    something other than ``src`` itself, letter suffixes from
    ``alphabet_generator`` are appended to the original stem ("name",
    "namea", "nameb", ...) until a name is found that does not exist
    in any of the check directories.

    Parameters
    ----------
    src : str
        Path of the file that needs a unique name.
    check : str or list of str
        Directory, or directories, in which the name must be unique.

    Returns
    -------
    str
        The unique file name joined onto the directory of ``src``.
    """
    srcdir = op.dirname(src)
    stem = op.splitext(op.basename(src))[0]
    ext = ".pdf"
    check_dirs = [check] if isinstance(check, str) else list(check)

    def _is_free(base, allow_src):
        # A name is free only if no check directory holds an entry of that
        # name (dangling links count as entries), except that an entry which
        # is src itself is acceptable when allow_src is set.
        for _check in check_dirs:
            dst = op.join(_check, base)
            if not op.lexists(dst):
                continue
            if allow_src and same_file(src, dst):
                continue
            return False
        return True

    base = stem + ext
    if _is_free(base, allow_src=True):
        return op.join(srcdir, base)

    # Always suffix the original stem and test each candidate against every
    # directory, so suffixes never pile up and no directory is left unchecked.
    for suffix in alphabet_generator():
        base = stem + suffix + ext
        if _is_free(base, allow_src=False):
            return op.join(srcdir, base)


def rename_paper(infile, check, inplace=False, verbose=False):
    """Rename paper by looking up author and publication year.
    Based on last name of the first author and year of publication.

    The PDF's document-info metadata is used to build a PubMed query
    (the DOI if present, otherwise author / title / journal). The new
    name is only derived when PubMed returns exactly one result.

    Parameters
    ----------
    infile : str
        The path to the file to be renamed.
    check : str
        The path to the directory that checks all file names for
        duplicates with non-identical inodes.
    inplace : bool, optional
        If True, the file is actually renamed on disk. If False, the
        proposed path is only returned.
    verbose : bool, optional
        If True, progress messages are printed.

    Returns
    -------
    outfile : str or None
        Path to the renamed file. Outfile is infile if no renaming was
        needed, or is None if the PubMed search failed (no usable
        metadata, zero or several results, or no author/year in the
        result).

    Raises
    ------
    FileExistsError
        If ``inplace`` is True and the chosen name is already taken by
        a different file in the directory of ``infile``.
    """
    # Scrape document info from the PDF. The wanted fields are copied
    # while the file is still open, in case PyPDF2 resolves the values
    # lazily from the stream (assumption; the copy is harmless either way).
    wanted = ("/doi", "/Author", "/Title", "/Subject")
    with open(infile, "rb") as fileobj:
        doc_info = PdfFileReader(fileobj).getDocumentInfo()
        pdf_info = {
            key: doc_info[key] for key in wanted if doc_info and key in doc_info
        }

    # Create a Pubmed query to find the article.
    if pdf_info.get("/doi"):
        qry = "{}[aid]".format(pdf_info["/doi"])
    else:
        terms = []
        if pdf_info.get("/Author"):
            terms.append("{}[Author]".format(pdf_info["/Author"]))
        if pdf_info.get("/Title"):
            terms.append("{}[Title]".format(pdf_info["/Title"]))
        subject = pdf_info.get("/Subject")
        # The journal is taken to be the text before the first comma of the
        # subject field. (A publication-date term was tried and disabled.)
        if subject and subject.find(",") > 0:
            terms.append("{}[Journal]".format(subject.split(",")[0]))
        qry = " AND ".join(terms)
    if not qry:
        if verbose:
            print("No usable metadata found in {}".format(infile))
        return None

    # Run the query and create a preliminary name for the paper if
    # PubMed finds a single matching result.
    id_list = pubmed_search(qry)["IdList"]
    if verbose:
        print("Found {} results".format(len(id_list)))
    if len(id_list) != 1:
        return None
    try:
        paper = fetch_details(id_list[:1])["PubmedArticle"][0]
        article = paper["MedlineCitation"]["Article"]
        name = "{}{}".format(
            article["AuthorList"][0]["LastName"],
            article["ArticleDate"][0]["Year"],
        )
    except (KeyError, IndexError, TypeError):
        if verbose:
            print("Failed to find author name and publication year in PubMed result")
        return None

    # Check the name against the database, and get a unique name if
    # needed.
    outfile = op.join(op.dirname(infile), "{}.pdf".format(name))
    outfile = get_unique_name(outfile, check)

    # Rename the paper.
    if inplace and not (infile == outfile):
        # os.rename() can silently replace an existing file, so refuse to
        # clobber a different paper that already carries this name.
        if op.lexists(outfile) and not same_file(infile, outfile):
            raise FileExistsError(outfile)
        os.rename(infile, outfile)
        if verbose:
            print("Renamed {} to {}".format(infile, outfile))

    return outfile


def pubmed_search(query):
    """Search PubMed for ``query`` and return the parsed result.

    Up to 20 hits are requested, sorted by relevance. The matching
    PubMed IDs are available under the "IdList" key of the result.

    Parameters
    ----------
    query : str
        A PubMed search term, e.g. "10.1000/xyz[aid]".

    Returns
    -------
    The structure produced by ``Entrez.read`` for the esearch response.
    """
    # Keep an address the caller has already configured; only fall back to
    # the placeholder when none is set.
    if not Entrez.email:
        Entrez.email = "your.email@example.com"
    handle = Entrez.esearch(
        db="pubmed", sort="relevance", retmax="20", retmode="xml", term=query
    )
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even if parsing fails.
        handle.close()


def fetch_details(id_list):
    """Fetch the PubMed records for the given IDs.

    Parameters
    ----------
    id_list : list of str
        PubMed IDs, e.g. the "IdList" of a ``pubmed_search`` result.

    Returns
    -------
    The structure produced by ``Entrez.read`` for the efetch response.
    Article records are available under its "PubmedArticle" key.
    """
    ids = ",".join(id_list)
    # Keep an address the caller has already configured; only fall back to
    # the placeholder when none is set.
    if not Entrez.email:
        Entrez.email = "your.email@example.com"
    handle = Entrez.efetch(db="pubmed", retmode="xml", id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even if parsing fails.
        handle.close()

# Execute library linking

In [36]:
# Link every PDF under the library root into the hidden ".all" check
# directory, renaming files whose names clash ("rename" mode).
# NOTE: the library root is an absolute, machine-specific path.
lib = "/Users/dschonhaut/Box/library"
parent, check = lib, op.join(lib, ".all")
parse_pdf, mode, verbose = False, "rename", True

output = linklib(
    parent,
    check=check,
    parse_pdf=parse_pdf,
    mode=mode,
    verbose=verbose,
)

Found 186 PDF files in /Users/dschonhaut/Box/library
Renamed 0 files
Created 0 symlinks
Removed 0 broken symlinks


# Parse PDF for metadata

In [8]:
# Read the document-info metadata of one example paper from the library.
# Relies on `lib` defined in the library-linking cell above.
infile = op.join(lib, "ndd", "cohorts", "adni", "1-s2.0-S0197458022002007-main.pdf")
# NOTE(review): the file object opened here is never explicitly closed.
pdf_toread = PdfFileReader(open(infile, "rb"))
pdf_info = pdf_toread.getDocumentInfo()

Xref table not zero-indexed. ID numbers for objects will be corrected.


In [21]:
# Look the paper up in PubMed by the DOI stored in the PDF metadata and
# print the title of every record that comes back.
# Raises KeyError if the PDF metadata has no "/doi" entry.
qry = "{}[aid]".format(pdf_info["/doi"])
results = pubmed_search(qry)
id_list = results["IdList"]
papers = fetch_details(id_list)
# `paper` stays bound to the last record after the loop and is reused by a
# later cell.
for i, paper in enumerate(papers["PubmedArticle"]):
    print("{}) {}".format(i + 1, paper["MedlineCitation"]["Article"]["ArticleTitle"]))

1) Genome-wide association study of brain tau deposition as measured by <sup>18</sup>F-flortaucipir positron emission tomography imaging.


In [78]:
# Fallback PubMed query for PDFs without a DOI: combine whichever of
# author, title, journal and publication-year window the metadata holds.
# Each key is tested for presence first so missing metadata does not raise
# KeyError.
qry = ""
if "/Author" in pdf_info and pdf_info["/Author"]:
    qry += "{}[Author] ".format(pdf_info["/Author"])
if "/Title" in pdf_info and pdf_info["/Title"]:
    qry += "{}[Title] ".format(pdf_info["/Title"])
if "/Subject" in pdf_info and pdf_info["/Subject"]:
    # The journal name is taken to be the text before the first comma.
    journal = pdf_info["/Subject"].split(",")[0]
    qry += "{}[Journal] ".format(journal)
    # Raw string: "\d" in a normal string literal is an invalid escape.
    years = re.findall(r"(\d\d\d\d)", pdf_info["/Subject"])
    if years:
        # Search one year either side of the first four-digit number found.
        year = int(years[0])
        qry += "{}:{}[Publication Date]".format(year - 1, year + 1)
results = pubmed_search(qry)

'Yu Guo[Author] Genome-wide association study of brain tau deposition as measured by 18F-flortaucipir positron emission tomography imaging[Title] Neurobiology of Aging[Journal] 2021:2023[Publication Date]'

In [139]:
# Show the first PubMed ID, kept as a one-element list; this is the form
# rename_paper passes to fetch_details.
id_list[:1]

['36195041']

In [90]:
# NOTE(review): this cell raises TypeError (see the recorded output below).
# results["IdList"] is a list, so it cannot be indexed with the string
# "Year"; the publication year is read from the fetched article record
# instead (see the print cell that follows).
results["IdList"]["Year"]

TypeError: list indices must be integers or slices, not str

In [66]:
# Build the "<first author's last name><year>" stem that rename_paper uses
# for the new file name. Relies on `paper`, which is presumably still bound
# from the loop over papers["PubmedArticle"] in an earlier cell.
print(
    "{}{}".format(
        paper["MedlineCitation"]["Article"]["AuthorList"][0]["LastName"],
        paper["MedlineCitation"]["Article"]["ArticleDate"][0]["Year"],
    )
)

Guo2022
