# Get annotations for annotated manuscripts

Currently, annotations tagged `CGType:Manuscript` are used as the indication that we should pull down a manuscript's annotations. There is no dedicated PMID field, so the PMID is extracted from a `PMID:<digits>` entry among the tags of that `CGType:Manuscript` annotation.

**Note**: To use this notebook, you need a hypothes.is API token with permission to access the relevant hypothes.is group. Store the token as a single line in a file called "token" in the working directory.

In [None]:
import requests
import yaml
import os
import re
import json

In [None]:
# Read the hypothes.is API token from the single-line file "token" in the
# working directory. The context manager closes the file handle right away
# instead of leaving it open until garbage collection.
with open('token') as token_file:
    token = token_file.read().rstrip()
# Base URL that request paths such as '/search' are appended to.
api_endpoint = 'https://hypothes.is/api'
# Identifier of the hypothes.is group passed as the 'group' search parameter.
group = 'DRL6xW1v'

In [None]:
def search_for_annotations(querydata, token=token):
    '''Fetch all annotations matching querydata, handling offset pagination.

    querydata -- dict of search parameters sent to the '/search' endpoint.
                 It is copied, so the caller's dict is never modified.
    token     -- API token sent as a Bearer Authorization header; defaults
                 to the module-level token as it was when this was defined.

    Returns the list of fetched rows, minus any whose 'uri' contains
    'libproxy.lib.unc.edu'.

    Raises requests.HTTPError if the API answers with an error status.
    '''
    # FIXME: search_after is described as more efficient in the API docs, but I don't know if it handles timestamp collisions
    # NOTE(review): the hypothes.is API docs describe search as a GET request
    # with query parameters; confirm that POST with form data is intended.
    # Work on a copy so the caller's dict does not gain an 'offset' key.
    query = dict(querydata)
    fetched = []
    while True:
        reply = requests.post(api_endpoint + '/search',
                              data=query,
                              headers={'Authorization': 'Bearer %s' % token})
        # Fail with a clear HTTP error rather than a KeyError on 'rows' below.
        reply.raise_for_status()
        response = reply.json()
        rows = response['rows']
        fetched += rows
        # Stop on an empty page or once we hold at least 'total' rows: an
        # exact-equality test alone would loop forever if the total changes
        # while we are paging or the server returns fewer rows than promised.
        if not rows or len(fetched) >= response['total']:
            # FIXME: kludge! only here until these broken annotations are removed
            return [x for x in fetched if 'libproxy.lib.unc.edu' not in x['uri']]
        query['offset'] = len(fetched)

In [None]:
# Every annotation in the group that carries the CGType:Manuscript tag,
# requested in descending order of creation time, 200 rows per page.
annotations_tagged_manuscript = search_for_annotations(
    dict(
        group=group,
        sort='created',
        order='desc',
        limit=200,
        tag='CGType:Manuscript',
    )
)

In [None]:
# It appears that hypothes.is stores URLs together with their query parameters. Those might be
# necessary in some cases, but I'm not sure they are completely safe to keep, so I don't want
# to keep them. That is what the helpers below are for.

# later on, we'll call this for all of the downloaded data

def json_recursive(data, pre=None, post=None):
    '''Return a rebuilt copy of nested dicts/lists, running hooks on each entry.

    Each hook is called as hook(value, key) and must return a (value, key)
    pair. `pre` runs before descending into a value, `post` after.

    For dict entries the key returned by a hook replaces the original key.
    For list items the hooks receive the item's index; any key they return
    is ignored, and `post` always receives the original index.

    Anything that is neither a dict nor a list is returned unchanged.
    '''
    if isinstance(data, dict):
        rebuilt = {}
        for key, value in data.items():
            value, key = pre(value, key) if pre is not None else (value, key)
            value = json_recursive(value, pre, post)
            value, key = post(value, key) if post is not None else (value, key)
            rebuilt[key] = value
        return rebuilt

    if isinstance(data, list):
        rebuilt = []
        for index, item in enumerate(data):
            # The key half of each hook result is deliberately discarded:
            # list positions cannot be renamed.
            item, _ = pre(item, index) if pre is not None else (item, index)
            item = json_recursive(item, pre, post)
            item, _ = post(item, index) if post is not None else (item, index)
            rebuilt.append(item)
        return rebuilt

    # Scalars (and any other type) pass straight through.
    return data

def urls_without_queryparams(v,k):
    '''Drop everything from the first '?' onward in URL-valued fields.

    Takes and returns a (value, key) pair so it can be passed as the
    pre/post hook of json_recursive. Only string values stored under the
    keys 'uri', 'source' or 'incontext' are changed; every other pair,
    including a non-string value under one of those keys, is returned as-is.
    '''
    # The isinstance guard keeps a null or otherwise non-string value from
    # raising AttributeError on .split().
    if k in ('uri', 'source', 'incontext') and isinstance(v, str):
        v = v.split('?', 1)[0]
    return v, k

In [None]:
def find_pmid(cg_manuscript_annotation):
    '''Return the PMID digits from the first tag containing "PMID:<digits>".

    Scans cg_manuscript_annotation['tags'] in order and returns the digit
    string captured from the first matching tag, or None if no tag matches.
    '''
    hits = (re.search(r'PMID:\s*(\d+)', tag)
            for tag in cg_manuscript_annotation['tags'])
    # The generator is lazy, so scanning stops at the first tag that matches.
    return next((hit.group(1) for hit in hits if hit), None)

In [None]:
# For each manuscript-level annotation that yields a PMID, fetch the group's
# annotations on the same URI and write them to downloaded_data/ as
# PMID<pmid>.json and PMID<pmid>.yaml.
os.makedirs('downloaded_data', exist_ok=True)
for manuscript_tag_annotation in annotations_tagged_manuscript:
    pmid = find_pmid(manuscript_tag_annotation)
    if pmid is None:
        # Without a PMID there is nothing to name the output files after.
        continue
    # Same group, same URI, requested in ascending order of creation time.
    manuscript_annotations_json = search_for_annotations(
        querydata=dict(
            group=group,
            sort='created',
            order='asc',
            limit=200,
            uri=manuscript_tag_annotation['uri'],
        ))
    with open(os.path.join('downloaded_data', 'PMID%s.json'%pmid), 'wt') as jsonf:
        json.dump(manuscript_annotations_json, jsonf, indent=2)
    with open(os.path.join('downloaded_data', 'PMID%s.yaml'%pmid), 'wt') as yamlf:
        yaml.dump(manuscript_annotations_json, yamlf)