In [None]:
import collections
import html
import json
import re

import IPython.display as display
import yaml
from jinja2 import Template

In [None]:
# A little magic: long or multi-line strings are dumped in YAML literal ('|') block form.
def _represent_str(dumper, data):
    '''Represent a str scalar, using literal block style when it contains a
    newline or is longer than 100 characters, and the default style otherwise.'''
    wants_literal_block = '\n' in data or len(data) > 100
    return dumper.represent_scalar('tag:yaml.org,2002:str', data,
                                   style="|" if wants_literal_block else None)

yaml.add_representer(str, _represent_str)

In [None]:
# Load the downloaded annotations for this manuscript.
# JSON text is UTF-8; decode it explicitly instead of relying on the platform's
# default locale encoding, which can mis-decode non-ASCII annotation text.
with open('downloaded_data/PMID16261627.json', 'rt', encoding='utf-8') as jsonin:
    annotations = json.load(jsonin)

In [None]:
def multidict_constructor(items):
    '''Build a pseudo-multidict from an iterable of key/value sequences.

    An entry of length 2 is taken as (key, value); any other entry contributes
    its first element as the key with '' as the value.  A key seen once maps
    to its value; a key seen several times maps to a list of its values in
    the order encountered.

    This is only a multidict for the items given here (later assignments
    behave like a normal dict), but it stays trivially representable in
    json/yaml.'''
    merged = {}
    for entry in items:
        if len(entry) == 2:
            key, value = entry
        else:
            key, value = entry[0], ''

        if key not in merged:
            # first occurrence: store the bare value
            merged[key] = value
            continue

        existing = merged[key]
        if isinstance(existing, list):
            existing.append(value)
        else:
            # second occurrence: promote the stored value to a list
            merged[key] = [existing, value]
    return merged

In [None]:
def tags2dict(tags):
    '''Convert a list of tag strings into a pseudo-multidict.

    A tag containing ':' is split at the first colon into key and value;
    a tag without a colon becomes a key whose value is None.'''
    def split_tag(tag):
        if ':' in tag:
            return tag.split(':', 1)
        return (tag, None)

    return multidict_constructor(map(split_tag, tags))

def annotation2dict(annotation_text):
    '''Parse "**Key**: value" paragraphs of an annotation into a pseudo-multidict.

    The text is cut at every blank line followed by '**'; chunks that do not
    contain '**:' are ignored.  Each remaining chunk is split once at '**:'
    (plus one optional whitespace character) into key and value, and the pairs
    are merged with multidict_constructor, so a repeated key yields a list.'''
    return multidict_constructor(
        # lstrip('*') removes the leading asterisks still present on the first
        # chunk (same character set as the original lstrip('**')).
        # maxsplit is passed by keyword: the positional form is deprecated
        # since Python 3.13.
        re.split(r'\*\*:\s?', x.lstrip('*'), maxsplit=1)
        for x in annotation_text.split('\n\n**') if '**:' in x)

In [None]:
def show_annotation(ann):
    '''Print a plain-text summary of one annotation: its CGType, the tag keys
    and the annotation text (empty when the annotation has no 'text' entry).'''
    tagdict = tags2dict(ann['tags'])
    print('-------')
    print("CGType: %s"%tagdict.get('CGType', ''))
    print('tags:')
    # tagdict is a plain dict (no .data attribute); joining it lists its keys
    print('\t' + '\n\t'.join(tagdict))
    # this function is called precisely for annotations that lack 'text',
    # so the lookup must not raise
    print(ann.get('text', ''))

In [None]:
# Sort every annotation into a per-manuscript collection according to its CGType tag.
# Reset on every run so a missing Manuscript annotation cannot silently reuse
# a value left over from an earlier execution of this cell.
manuscript_general_info = None
manuscript_variants = []
manuscript_variants_by_id = {}
manuscript_experiments = {}
manuscript_results = []

for a in annotations:
    td = tags2dict(a['tags'])
    a['tagdict'] = td
    cgtype = td.get('CGType', None)
    a['cgtype'] = cgtype
    try:
        text = a['text'].strip()
    except KeyError:
        print("WARNING: annotation without text:")
        show_annotation(a)
        text = ''
    attributes = annotation2dict(text)
    record = {
        'tags': td,
        'annotationLinks': a['links']
    }

    # Keep the raw text only when no "**Key**: value" attributes were parsed from it
    if len(attributes) == 0:
        record['text'] = text
    else:
        record.update(attributes)

    if 'Manuscript' == cgtype:
        manuscript_general_info = record

    elif 'Variant' == cgtype:
        manuscript_variants.append(record)
        # Index the same record under every identifier namespace it is tagged with
        for namespace in ('Variant', 'CAID', 'ClinVarID'):
            if namespace in td:
                identifier = '%s:%s'%(namespace, td[namespace])
                if identifier in manuscript_variants_by_id:
                    print("WARNING: possibly redefining variant, identifier %s already seen"%identifier)
                    manuscript_variants_by_id[identifier].update(record)
                else:
                    manuscript_variants_by_id[identifier] = record

    elif 'FunctionalAssay' == cgtype:
        record.update({
            'assayType': 'BAO:%s'%td.get('BAO', ''),
            'assayMaterial': 'CLO:%s'%td['CLO'],
            'results': []
        })
        # Experiments are keyed by the value of their FuncAssay tag
        manuscript_experiments[td['FuncAssay']] = record

    elif 'FunctionalAssayResult' == cgtype:
        manuscript_results.append(record)

if manuscript_general_info is None:
    print("WARNING: no annotation with CGType 'Manuscript' found")

In [None]:
# Link what we can: attach experiment, source and variant to every result

manuscript_results_by_variant = collections.defaultdict(list)

# Start every experiment with an empty result list so that re-running this
# cell does not append the same results a second time.
for experiment in manuscript_experiments.values():
    experiment['results'] = []

# first go through results and lookup variant and experiment
for r in manuscript_results:
    tags = r['tags']
    r['experiment'] = manuscript_experiments[tags['FuncAssay']]
    r['source'] = manuscript_general_info['tags']['PMID']
    for namespace in ('Variant', 'CAID', 'ClinVarID'):
        if namespace in tags:
            v = manuscript_variants_by_id['%s:%s'%(namespace, tags[namespace])]
            r['evaluatedVariantRecord'] = v
            # prefer CAID, accept ClinVarID
            v_identifier = next(('%s:%s'%(k, v['tags'][k])
                                 for k in ('CAID', 'ClinVarID')
                                 if k in v['tags']), '')
            if '' != v_identifier:
                r['evaluatedVariant'] = v_identifier
                manuscript_results_by_variant[v_identifier].append(r)
            else:
                print('ERROR: %s does not have an acceptable external identifier'%v)
            break # only use the first of the namespaces found in tags of result
    # NOTE: this back-reference makes the data structure cyclic
    # (result -> experiment -> results -> result)
    r['experiment']['results'].append(r)

In [None]:
# Inspect one linked experiment record.
# NOTE(review): assumes an assay annotation tagged FuncAssay:1 exists in this dataset.
manuscript_experiments['1']

In [None]:
# Dump all experiments as YAML text for inspection.
print(yaml.dump(manuscript_experiments))

In [None]:
# Inspect the first assay result record (raises IndexError if there are no results).
manuscript_results[0]

In [None]:
def _render_nested_value(value):
    '''Render one non-string value for render_result: dicts one level deep,
    lists/tuples as a bullet list, anything else as escaped text.'''
    if isinstance(value, dict):
        return render_result(value, False)
    if isinstance(value, (list, tuple)):
        return '<ul>%s</ul>' % ''.join(
            '<li>%s</li>' % (html.escape(item) if isinstance(item, str)
                             else _render_nested_value(item))
            for item in value)
    return html.escape(str(value))


def render_result(r, recurse=True):
    '''Render a record dict as an HTML definition list.

    r       -- dict to render.  The 'tags' entry is shown as a bullet list,
               leaving out the 'Variant' and 'FuncAssay' tags.
    recurse -- when true, non-string values (nested dicts, lists of repeated
               attributes, None, ...) are rendered as well, nested dicts one
               level deep only; when false they are skipped.

    All keys and values are HTML-escaped.  Returns the HTML as a str.'''
    h = '<dl>'
    for (k, v) in r.items():
        if k == 'tags':
            # <li> items need an enclosing <ul> to be valid HTML
            h += '<dt>Tags</dt><dd><ul>\n'
            for tk, tv in v.items():
                if tk in ('Variant', 'FuncAssay'):
                    continue
                h += '      <li>%s:%s</li>\n'%(html.escape(str(tk)), html.escape(str(tv)))
            h += '    </ul></dd>\n'
        elif isinstance(v, str):
            h += '  <dt>%s</dt><dd>%s</dd>\n'%(html.escape(str(k)), html.escape(v))
        elif recurse:
            # v is not necessarily a dict: repeated attributes are lists
            h += '  <dt>%s</dt>\n<dd>%s</dd>\n'%(html.escape(str(k)), _render_nested_value(v))
    h += '</dl>'
    return h

def render_assay_result(ar, recurse=True):
    '''Render one assay result record as an HTML definition list.

    ar      -- result dict; must contain a 'tags' dict.
    recurse -- when true, non-string values outside the ordered fields are
               rendered too (dicts via render_result, lists as bullet lists,
               anything else as text); when false they are skipped.

    The fields named in ordered_fields come first, in that order, with a
    leading 'Assay' removed from their label; then the tags (without 'Variant'
    and 'FuncAssay'); then every remaining entry.  All values are HTML-escaped.
    Returns the HTML as a str.'''
    h = '<dl>'
    ordered_fields = ('evaluatedVariant', 'AssayResult', 'AssayResultAssertion', 'source', 'Comment')
    for field in ordered_fields:
        if field in ar:
            label = field[5:] if field.startswith('Assay') else field
            h += '<dt>%s</dt><dd>%s</dd>'%(label, html.escape(str(ar[field])))
    h += '<dt>Tags</dt><dd><ul>'
    for tk, tv in ar['tags'].items():
        if tk in ('Variant', 'FuncAssay'):
            continue
        h += '<li>%s:%s</li>'%(html.escape(str(tk)), html.escape(str(tv)))
    h += '</ul></dd>'
    for (k,v) in ar.items():
        if k in ordered_fields or k == 'tags':
            continue
        if isinstance(v, str):
            h += '  <dt>%s</dt><dd>%s</dd>\n'%(html.escape(str(k)), html.escape(v))
        elif recurse:
            if isinstance(v, dict):
                rendered = render_result(v, False)
            elif isinstance(v, (list, tuple)):
                # repeated attributes are stored as lists
                rendered = '<ul>%s</ul>' % ''.join(
                    '<li>%s</li>' % html.escape(str(item)) for item in v)
            else:
                rendered = html.escape(str(v))
            h += '  <dt>%s</dt>\n<dd>%s</dd>\n'%(html.escape(str(k)), rendered)
    # close the list opened at the top (the closing tag was missing before)
    h += '</dl>'
    return h

In [None]:
# Render the third assay result as HTML, including nested records, as a visual check.
display.display(display.HTML(render_assay_result(manuscript_results[2], True)))

In [None]:
# Look up the results for one variant.  .get() is used because indexing a
# defaultdict with a missing key would insert an empty entry as a side effect.
manuscript_results_by_variant.get('CAID:CA395144928', [])

In [None]:
# Variant identifiers in the CAID or ClinVarID namespaces.
variants = [identifier for identifier in manuscript_variants_by_id
            if identifier.startswith(('CAID', 'ClinVarID'))]
variants

In [None]:
# Jinja2 template for the per-variant results page fragment.
# Expects: variant_id, manuscript_variants_by_id and manuscript_results_by_id
# (results grouped by variant identifier).
# autoescape is enabled because the values come from free-text annotations and
# may contain characters such as '<' or '&'.
variant_results_template = Template('''
<style> 
table td, table th, table tr {text-align:left !important; vertical-align: top !important;}
</style>
{% set v = manuscript_variants_by_id[variant_id] %}
<h3>Results for <a href="http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_canonicalid?canonicalid={{ v['tags']['CAID'] }}">{{ v['tags']['CAID'] }}</a>
   {% if 'HGVS' in v %} ({{ v['HGVS'] }}){% endif %}
   <a href="{{ v['annotationLinks']['html'] }}">&#128279;</a></h3>
{% for assay_result in manuscript_results_by_id[variant_id] %}
{% set assay = assay_result['experiment'] %}
<div>
<table>
<tr><td>Result: <a href="{{ assay_result['annotationLinks']['html'] }}">&#128279;</a></td><td>{{ assay_result['AssayResult'] }}</td></tr>
<tr><td>Author classification:</td><td>{{ assay_result['AssayResultAssertion'] }}</td></tr>
{% if 'Comment' in assay_result %}
<tr><td>Comment:</td><td>{{ assay_result['Comment'] }}</td></tr>
{% endif %}
<tr><td>Assay Information: <a href="{{ assay['annotationLinks']['html'] }}">&#128279;</a></td>
    <td>
        <table>
            <tr><td>Source (PMID):</td><td>
                <a href="https://pubmed.ncbi.nlm.nih.gov/{{ assay_result['source']  }}/">{{ assay_result['source'] }}</a></td></tr>
            <tr><td>General Class:</td><td>{{ assay['AssayGeneralClass'] }}</td></tr>
            <tr><td>Material Used:</td><td>{{ assay['AssayMaterialUsed'] }}</td></tr>
            <tr><td>Description:</td><td>{{ assay['AssayDescription'] }}</td></tr>
            <tr><td>Statistical analysis description</td><td>{{ assay['StatisticalAnalysisDescription'] }}</td></tr>
            <tr><td>Replication:</td><td>{{ assay['Replication'] }}</td></tr>
            <tr><td>Readout:</td><td>{{ assay['AssayReadOutDescription'] }}</td></tr>
            <tr><td>Ranges:</td><td>
                <table>
                    <tr><td>Abnormal:</td><td>{{ assay['AssayAbnormalRange'] }}</td></tr>
                    <tr><td>Indeterminate:</td><td>{{ assay['AssayIndeterminateRange'] }}</td></tr>
                    <tr><td>Normal:</td><td>{{ assay['AssayNormalRange'] }}</td></tr>
                </table>
            </td></tr>
            <tr><td>Validation Controls:</td><td>Pathogenic: {{ assay['ValidationControlPathogenic'] }}<br/>Benign: {{ assay['ValidationControlBenign'] }}</td></tr>
        </table>
    </td></tr>
</table>
<hr>
</div>
{% endfor %}
''', autoescape=True)

def html_for_variant_results(variant_id):
    '''Return the HTML produced by variant_results_template for one variant
    identifier, using the module-level variant and result lookups.'''
    context = {
        'variant_id': variant_id,
        'manuscript_results_by_id': manuscript_results_by_variant,
        'manuscript_variants_by_id': manuscript_variants_by_id,
    }
    return variant_results_template.render(**context)

def show_variant_results(variant_id):
    '''Display the rendered results for one variant identifier in the notebook.'''
    markup = html_for_variant_results(variant_id)
    display.display(display.HTML(markup))

In [None]:
# Display the results section for every identifier collected in `variants`.
for variant_id in variants:
    show_variant_results(variant_id)