In [21]:
import configparser
import datetime as dt
import inspect
import logging
import os
import sys
from collections import defaultdict
from datetime import datetime

from pyzotero import zotero

# Layout of the timestamp strings parsed by `date_added` (e.g. "2021-03-04T05:06:07Z").
# The time part is spelled out as %H:%M:%S instead of %X: %X means "the locale's
# time representation", so it only matches HH:MM:SS under C/English-like locales.
DATE_FMT = "%Y-%m-%dT%H:%M:%SZ"
STATUS_OK = True

# Setup flags
Set these variables before running the rest of the notebook.

## Delete own attachments 
<a id='del-attach'></a>

Important when merging duplicate items. See explanations in 
[this cell](merge_duplicate_items.ipynb#del-attach).


## Delete Tags 

Zotero creates special tags to mark duplicates (`duplicate-citation-key`).
Set this to `True` to remove them in [this cell](remove_duplicate_attachments.ipynb#tags).

In [2]:
# Flag for the duplicate-merging step (see the "Delete own attachments" notes above).
# NOTE(review): presumably True lets the merge delete an item's own attachments —
# confirm against merge_duplicate_items.ipynb#del-attach.
DELETE_OWN_ATTACHMENTS = True

In [3]:
# Set to True to remove the special `duplicate-citation-key` tags that Zotero
# adds to mark duplicates.
DELETE_TAGS = False  # @todo: check if this is necessary

# Load config

In [4]:
# Read the Zotero credentials from an INI-style file in the working directory.
# Expected layout: a [zotero-config] section with library_id, api_key, library_type.
config_file = r"config.cfg"
if not os.path.exists(config_file):
    print(f"config file {config_file} does not exist!")
    # A missing config is a failure, so stop with a non-zero status
    # (status 0 would report success to whatever launched the run).
    sys.exit(1)

configFilePath = config_file  # alias kept so later cells using this name still work
configParser = configparser.RawConfigParser()
configParser.read(configFilePath)
# getint() does the int() conversion and raises ValueError on a non-numeric id.
library_id = configParser.getint("zotero-config", "library_id")
api_key = configParser.get("zotero-config", "api_key")
library_type = configParser.get("zotero-config", "library_type")

# Define functions

In [5]:
def date_added(_item):
    """Return the item's ``dateAdded`` value parsed into a ``datetime``.

    The string found at ``_item["data"]["dateAdded"]`` is parsed with the
    module-level ``DATE_FMT`` layout.
    """
    added_raw = _item["data"]["dateAdded"]
    return datetime.strptime(added_raw, DATE_FMT)

In [22]:
# Log to both a file and the notebook output (stdout).
logfile = "zotero.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s - %(message)s",
    handlers=[
        # mode="w" starts every run with an empty log file. Deleting the file
        # *after* the handler has opened it (as was done before) detaches the
        # open handle from the path on POSIX, so the log never appears on disk,
        # and fails on Windows because the file is still open.
        logging.FileHandler(filename=logfile, mode="w"),
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger()

In [18]:
def attachment_is_pdf(_child):
    """Tell whether a child item is a PDF attachment backed by a file.

    True only when the child's ``itemType`` is ``"attachment"``, its
    ``contentType`` is ``"application/pdf"`` and its ``linkMode`` is one of
    ``imported_file``, ``linked_file`` or ``imported_url``.
    """
    data = _child["data"]
    # Checks run in this order so non-attachments are rejected before
    # attachment-only keys are looked up.
    if data["itemType"] != "attachment":
        return False
    if data["contentType"] != "application/pdf":
        return False
    return data["linkMode"] in ("imported_file", "linked_file", "imported_url")
# https://www.zotero.org/support/dev/web_api/v3/file_upload

In [7]:
def get_items_with_duplicate_pdf(_zot, _items):
    """Find the items that have more than one PDF attachment.

    Args:
        _zot: client object providing ``children(key)``, which returns the
            child items of the item with that key.
        _items: iterable of item dicts (each with ``"key"`` and ``"data"``).

    Returns:
        A tuple ``(items, pdf_names)``: ``items`` is the list of
        non-standalone items with more than one PDF child, and ``pdf_names``
        is a ``defaultdict(list)`` mapping every inspected item key to the
        names of its PDF children (an empty list when it has none).
    """
    _items_duplicate_attach = []
    _pdf_attachments = defaultdict(list)
    for _item in _items:
        if is_standalone(_item):
            continue

        key = _item["key"]
        for child in _zot.children(key):
            if not attachment_is_pdf(child):
                continue
            child_data = child["data"]
            # "filename" is not guaranteed: linked_file attachments (accepted by
            # attachment_is_pdf) are described by "path" in the Zotero attachment
            # templates, so fall back instead of raising KeyError.
            label = (
                child_data.get("filename")
                or child_data.get("path")
                or child_data.get("title", "")
            )
            _pdf_attachments[key].append(label)

        # Indexing the defaultdict also records keys that have no PDF at all.
        if len(_pdf_attachments[key]) > 1:
            _items_duplicate_attach.append(_item)

    return _items_duplicate_attach, _pdf_attachments

In [None]:
def get_items_with_no_pdf_attachments(_zot, _items):
    """Collect the non-standalone items that have no PDF attachment.

    Standalone items are skipped. For every other item the children are
    fetched through ``_zot.children`` and the item is kept when none of them
    passes ``attachment_is_pdf``.
    """
    pdf_less = []
    for entry in _items:
        if is_standalone(entry):
            continue

        children = _zot.children(entry["key"])
        # any() stops at the first PDF child, like the original early break.
        if not any(attachment_is_pdf(child) for child in children):
            pdf_less.append(entry)

    return pdf_less

In [8]:
def get_standalone_items(_items):
    """Return, in their original order, the items for which ``is_standalone`` is true."""
    return [candidate for candidate in _items if is_standalone(candidate)]

In [9]:
def is_standalone(_item):
    """Return True when the item's ``itemType`` is ``"note"`` or ``"attachment"``."""
    item_type = _item["data"]["itemType"]
    return item_type in ("note", "attachment")

In [10]:
def retrieve_data():
    """Connect to the configured Zotero library and download its top-level items.

    Uses the module-level ``library_id``, ``library_type`` and ``api_key``.

    Returns:
        A tuple ``(zot, lib_items)``: the ``zotero.Zotero`` client and the
        list returned by ``zot.everything(zot.top())``.
    """
    log.info("Retrieving Library...")
    zot = zotero.Zotero(library_id, library_type, api_key)
    lib_items = zot.everything(zot.top())
    finished = dt.datetime.now()
    log.info(f"Got {len(lib_items)} items")
    # Zero-padded HH:MM:SS; the unpadded fields used before printed e.g. "9:5:3".
    log.info(f"Done at {finished:%H:%M:%S}")
    return zot, lib_items

In [11]:
def get_items_by_doi_or_isbn(_lib_items):
    """Group items by their DOI, or by their ISBN when they have no DOI.

    Items whose identifier is missing *or empty* are left out. Grouping on
    mere key presence would put every item with a blank ``DOI`` field under
    the ``""`` key, making unrelated items look like duplicates, and would
    also keep such items from being grouped by their ISBN.

    Args:
        _lib_items: iterable of item dicts carrying a ``"data"`` mapping.

    Returns:
        A ``defaultdict(list)`` mapping each non-empty identifier to the list
        of items that carry it, in input order.
    """
    _items_by_doi_isbn = defaultdict(list)
    for _item in _lib_items:
        data = _item["data"]
        # DOI takes precedence; ISBN is used only when there is no usable DOI.
        identifier = data.get("DOI") or data.get("ISBN")
        if identifier:
            _items_by_doi_isbn[identifier].append(_item)

    return _items_by_doi_isbn

In [16]:
def log_item(_item):
    """Log a multi-line summary of an item.

    Title, key and item type are always logged. For non-standalone items the
    first creator of type ``author``, the linked attachment (if any) and the
    number of children are added.

    Args:
        _item: item dict with ``"data"``; non-standalone items also need
            ``"links"`` and ``"meta"``.
    """
    data = _item["data"]
    ttt = f"{data['title']}" if "title" in data else ""

    msg = f"""Item:
        Title: {ttt}
        Key: {data['key']}
        ItemType: {data['itemType']}
        """
    if not is_standalone(_item):
        # Everything below reads the *parameter*; the earlier code referenced an
        # undefined/global `item`, which raised NameError or logged a stale item.
        firstname = ""
        lastname = ""
        for creator in data.get("creators", []):  # could be author or editor
            if creator.get("creatorType") == "author":
                # Creators stored as one "name" field have no firstName/lastName.
                firstname = creator.get("firstName", "")
                lastname = creator.get("lastName", "") or creator.get("name", "")
                break

        links = _item["links"]
        if "attachment" in links:
            attach = links["attachment"]["href"].split("/")[-1]
            type_attach = links["attachment"]["attachmentType"]
        else:
            attach = "NO_ATTACHMENT"
            type_attach = "NO_TYPE"

        msg += f"""
            Author: {firstname}, {lastname}
            File: {attach} | Type: {type_attach}
            Num Attach: {_item['meta']['numChildren']}
            ----"""

    log.info(inspect.cleandoc(msg))

In [13]:
def delete_pdf_attachments(_children, ask=False):
    """Delete the PDF attachments found after the first entry of ``_children``.

    The first child is never touched; among the remaining ones only those
    accepted by ``attachment_is_pdf`` are deleted (notes, zip files, etc. are
    kept). Deletion goes through the module-level ``zot`` client, which must
    exist in the notebook namespace before this function is called.

    Args:
        _children: sequence of child items of one parent item.
        ask: when True, prompt before each deletion; only the answer ``"y"``
            deletes.

    Returns:
        True if at least one attachment was deleted, otherwise False.
    """
    deleted_attachment = False
    for child in _children[1:]:
        if not attachment_is_pdf(child):
            continue  # only for pdf files. Other files, like notes, zip, etc, should not be deleted, anyway.

        child_data = child["data"]
        # "filename" is not guaranteed: linked_file attachments (accepted by
        # attachment_is_pdf) are described by "path" in the Zotero attachment
        # templates, so fall back instead of raising KeyError.
        label = (
            child_data.get("filename")
            or child_data.get("path")
            or child_data.get("title", "")
        )

        if ask and input(f"delete {label}? (y[N])") != "y":
            continue

        log.warning(f"deleting {label}")
        zot.delete_item(child)
        deleted_attachment = True

    return deleted_attachment

In [19]:
def log_title(_item):
    """Log the item's title at INFO level; an empty title is logged when the item has none."""
    fields = _item["data"]
    title_text = f"{fields['title']}" if "title" in fields.keys() else ""
    log.info(f"Title: {title_text}")