# Get config parameters

## Data needed
We need three pieces of information:
- the **ID of the group** library.  
  Can be found by opening the group’s page: https://www.zotero.org/groups/groupname,   
  and hovering over the group settings link.
- the **API key** from the Zotero [site](https://www.zotero.org/settings/keys/new)
- **library_type** 
  - own Zotero library --> user
  - shared library --> group
  
## Config file

Rename `config_template.cfg` to `config.cfg` and populate it with the three pieces of information described above.

In [None]:
import configparser

# Path of the config file holding the Zotero credentials.
configFilePath = r'config.cfg'
configParser = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the config file is missing.
if not configParser.read(configFilePath):
    raise FileNotFoundError(
        f"Could not read '{configFilePath}'. Rename config_template.cfg to "
        "config.cfg and fill in library_id, api_key and library_type."
    )
# Connection settings, all taken from the [zotero-config] section.
library_id = int(configParser.get('zotero-config', 'library_id'))
api_key = configParser.get('zotero-config', 'api_key')
library_type = configParser.get('zotero-config', 'library_type')

# Find items with duplicate attachments

In [None]:
from pyzotero import zotero
from datetime import datetime
from collections import defaultdict


# Layout of the ``dateAdded`` timestamps, e.g. "2020-01-02T03:04:05Z".
# The time is spelled out as %H:%M:%S rather than %X: %X is the locale's own
# time representation and only matches HH:MM:SS under some locales.
DATE_FMT = "%Y-%m-%dT%H:%M:%SZ"


def date_added(item):
    """Return the ``dateAdded`` timestamp of *item* as a naive ``datetime``.

    Args:
        item: Mapping with a ``'data'`` mapping whose ``'dateAdded'`` entry is
            a string laid out as ``DATE_FMT``.

    Returns:
        The parsed ``datetime`` (no tzinfo attached), usable as a sort key.

    Raises:
        KeyError: If ``'data'`` or ``'dateAdded'`` is missing.
        ValueError: If the string does not match ``DATE_FMT``.
    """
    return datetime.strptime(item['data']['dateAdded'], DATE_FMT)


print("Retrieving Library...")
zot = zotero.Zotero(library_id, library_type, api_key)
# Load all top-level items of the library.
lib_items = zot.everything(zot.top())

print("Resolving duplicates...")
# Items with more than one child are the candidates for duplicate attachments.
items_duplicate_attach = []
for item in lib_items:
    data = item['data']
    num_children = item['meta'].get('numChildren', 0)
    if num_children > 1:
        items_duplicate_attach.append(item)

    attachment_link = item['links'].get('attachment')
    if attachment_link:
        # Last path segment of the attachment link.
        attach = attachment_link['href'].split("/")[-1]
        type_attach = attachment_link.get('attachmentType', "NO_TYPE")
    else:
        attach = "NO_ATTACHMENT"
        type_attach = "NO_TYPE"

    # Not every item has creators, and a creator may carry a single 'name'
    # instead of 'firstName'/'lastName'; fall back instead of crashing the report.
    creators = data.get('creators') or [{}]
    first_creator = creators[0]
    if 'firstName' in first_creator or 'lastName' in first_creator:
        author = f"{first_creator.get('firstName', '')}, {first_creator.get('lastName', '')}"
    else:
        author = first_creator.get('name', "NO_AUTHOR")
    title = data.get('title', "NO_TITLE")

    print(f"""
    Key: {data['key']}\n
    Title: {title}\n
    Author: {author}\n
    File: {attach} | Type: {type_attach}\n
    Num Attach: {num_children}\n
    ----""")

num_duplicates = len(items_duplicate_attach)
print(f">> Found {num_duplicates} items with duplicate attach: ")

#found_duplicate_attachements = True if num_duplicates else False

# Check if Trash is empty

In [None]:
# Remind the user when the library's trash still holds items.
trashed_items = zot.trash()
if trashed_items:
    print("Trash is not empty. Consider emptying it!")

# Report items with duplicate attachments

In [None]:
# List the key of every child, grouped under the key of its parent item.
# (Optionally run lib_items.sort(key=date_added) first to report oldest first.)
for parent in lib_items:
    parent_key = parent['key']
    for child in zot.children(parent_key):
        child_key = child['key']
        print(f"key: {parent_key} | child: {child_key}")
    print("---")

# Remove duplicate attachments from items

**WARNING**: This cell is dangerous!


In [None]:
# For every flagged item keep the first file attachment returned by the API
# and remove (or offer to remove) the remaining file attachments.
for item in items_duplicate_attach:
    title = item['data'].get('title', "NO_TITLE")

    # Only children with a filename are file attachments. Notes have no
    # filename and must neither count as duplicates nor be deleted.
    attachments = [
        child for child in zot.children(item['key'])
        if 'filename' in child['data']
    ]
    for attachment in attachments:
        print(attachment['data']['filename'], title)

    # A single file (possibly next to some notes) is not a duplicate.
    if len(attachments) < 2:
        continue

    extra_attachments = attachments[1:]
    filenames = {attachment['data']['filename'] for attachment in attachments}

    # DANGER AREA!!
    if len(filenames) == 1:
        # All files carry the same name, meaning --> duplicates.
        print("all files are the same. Proceed deleting ..")
        zot.delete_tags('#duplicate-citation-key')
        for child in extra_attachments:
            print(f"delete {child['data']['filename']}")
            zot.delete_item(child)
    else:
        # Some items have different files, like suppl materials. These should
        # not be deleted blindly, so ask for confirmation per file.
        for child in extra_attachments:
            print(f"delete {child['data']['filename']}")
            answer = input("y[N]?")
            if answer == "y":
                zot.delete_item(child)

# Merge duplicates 

In [None]:
print("Resolving duplicates...")
# Group items by identifier: the DOI when it is set, otherwise the ISBN.
by_doi = defaultdict(list)
for item in lib_items:
    data = item['data']
    # An empty identifier identifies nothing. Grouping on it would mark
    # unrelated items as duplicates of each other, so such items are skipped.
    identifier = (data.get('DOI') or data.get('ISBN') or "").strip()
    if identifier:
        by_doi[identifier].append(item)

# delete_items: duplicates (and superseded children) to remove.
# update_items: children to re-parent onto the item that is kept.
delete_items = []
update_items = []
for items in by_doi.values():
    if len(items) == 1:
        continue

    # sort by age
    items.sort(key=date_added)

    # keep oldest item
    keep = items[0]

    # keep latest attachments: walk from the newest duplicate back to the
    # second-oldest and adopt the children of the first one that has any.
    keep_cs = zot.children(keep['key'])
    for item in items[-1:0:-1]:
        cs = zot.children(item['key'])
        if cs:
            for c in cs:
                c['data']['parentItem'] = keep['key']

            update_items.extend(cs)
            # The kept item's own children are superseded by the adopted ones.
            delete_items.extend(keep_cs)
            break

    # Every item except the oldest is a duplicate. Queue them exactly once,
    # whether or not any children were adopted above.
    delete_items.extend(items[1:])

print(f"Creates list of items to update. {len(update_items)} items found.")
print(f"Creates list of items to delete. {len(delete_items)} items found.")

# Update and delete duplicate items

**WARNING**: This cell is dangerous!


In [None]:
print("Updating library...")
# update first, so we don't delete parents of items we want to keep
for update_item in update_items:
    zot.update_item(update_item)

# now delete: DANGER AREA!
for delete_item in delete_items:
    data = delete_item['data']
    # delete_items can also hold child items (attachments/notes), which have
    # no creators and possibly no title; use fallbacks so that the report
    # does not raise before the deletion is attempted.
    creators = data.get('creators') or [{}]
    first_creator = creators[0]
    if 'firstName' in first_creator or 'lastName' in first_creator:
        author = f"{first_creator.get('firstName', '')}, {first_creator.get('lastName', '')}"
    else:
        author = first_creator.get('name', "NO_AUTHOR")
    title = data.get('title', "NO_TITLE")

    print(f"""deleting ... 
    Key: {data['key']}\n
    Title: {title}\n
    Author: {author}
    """)
    # Safety net: never delete an item that is listed with an attachment.
    # NOTE(review): this inspects the item as it was fetched earlier, not the
    # state after the updates above - confirm this is the intended guard.
    if 'attachment' in delete_item['links']:
        print("item has attachment. Can not delete")
        continue
    zot.delete_item(delete_item)

print("done!")
        

In [None]:
# `found_duplicates` was never assigned, so evaluating it raised NameError.
# Here it is True when the merge step queued anything to re-parent or delete.
# NOTE(review): the intended meaning is inferred from the name - confirm.
found_duplicates = bool(update_items or delete_items)
found_duplicates
