# DSA Annotation Cleanup
Query the annotation documents attached to items on a DSA server, delete those whose names are listed in `DOCS_TO_DELETE`, and report the names of the documents that remain.

In [None]:
# Make custom neurotk Python package importable.
import sys
sys.path.append('..')

# imports
from neurotk import login
from multiprocessing import Pool
from tqdm import tqdm
from collections import Counter

In [None]:
# Authenticate with the DSA server and keep the returned girder client in
# `gc`; the cells below issue all of their GET and DELETE requests through it.
# NOTE(review): presumably login() prompts for credentials - confirm.
gc = login('https://megabrain.neurology.emory.edu/api/v1')

In [None]:
# Run configuration.
nproc = 10  # size of the worker pool used for parallel processing
parent_id = '641ba814867536bb7a225533'  # sent as creatorId when listing annotated images

# Names of the annotation documents to delete; fill in before running.
DOCS_TO_DELETE = {
}

# Echo the selection so it can be checked before anything is deleted.
print('Will delete annotation documents in the following list of names:')
for position, name in enumerate(DOCS_TO_DELETE, start=1):
    print(f'  {position}. {name}')

In [None]:
def process_item(item_id: str, verbose: bool = False) -> list[str]:
    """Delete unwanted annotation documents from one item and report the rest.

    The item's annotation documents are requested through the girder client.
    A document whose name is in ``DOCS_TO_DELETE`` is deleted; the name of
    any other named document is collected. Documents without an
    ``annotation`` entry, or without a name in it, are skipped.

    Args:
        item_id: Item DSA id.
        verbose: True to print a message for each document that is deleted.

    Returns:
        Names of the annotation documents that were not deleted.

    """
    endpoint = (
        f'annotation?itemId={item_id}&limit=0&offset=0&sort=lowerName&sortdir=1'
    )
    kept = []

    for doc in gc.get(endpoint):
        # Skip documents that carry no annotation payload at all.
        annotation = doc.get('annotation')
        if not annotation:
            continue

        # Unnamed documents can be neither matched nor reported.
        name = annotation.get('name')
        if not name:
            continue

        if name not in DOCS_TO_DELETE:
            kept.append(name)
            continue

        if verbose:
            print(f"Deleting annotation document {name} (id: {doc['_id']}).")

        gc.delete(f"annotation/{doc['_id']}")

    return kept

In [None]:
# Ask the server which images (aka items) have annotation documents.
print('Getting list of items with annotation documents...')

items = gc.get(
    f'annotation/images?creatorId={parent_id}&limit=0&offset=0&sort=updated&sortdir=-1'
)

# Hand each item to a pool of worker processes.
print('\nProcessing the items, deleting annotation documents...')

doc_names = []

with Pool(nproc) as pool:
    # Submit everything first so the workers run while results are collected.
    jobs = []
    for item in items:
        jobs.append(pool.apply_async(process_item, (item['_id'], False)))

    # Collect results in submission order; tqdm shows the progress.
    for job in tqdm(jobs):
        doc_names.extend(job.get())

# Summarise how many documents of each name are left.
print('\nDocuments remaining:')
for name, count in Counter(doc_names).items():
    print(f'  {name} (n={count})')