In [1]:
import json
import yaml
import collections

In [2]:
# Register a custom str representer: strings that contain a newline or are
# longer than 100 characters are requested in YAML literal ('|') style, all
# other strings keep the dumper's default style.
def _represent_str(dumper, data):
    '''Represent ``data`` as a YAML str scalar, asking for literal style when it is long or multi-line.'''
    wants_literal = '\n' in data or len(data) > 100
    scalar_style = "|" if wants_literal else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=scalar_style)


yaml.add_representer(str, _represent_str)

In [3]:
# Load the annotation records from the downloaded JSON file.  JSON text is
# UTF-8, so decode it explicitly instead of relying on open()'s
# locale-dependent default encoding.
with open('downloaded_data/PMID16261627.json', 'rt', encoding='utf-8') as jsonin:
    annotations = json.load(jsonin)

In [4]:
def multidict_constructor(items):
    '''Build a plain dict from key/value items, gathering repeated keys into a list.

    Each item should be a 2-element sequence ``(key, value)``.  An item of any
    other length contributes its first element as the key and ``''`` as the
    value.  The first value seen for a key is stored as-is; when the key shows
    up again the stored value becomes a list holding every value in arrival
    order.  (A stored value that already is a list is appended to directly.)

    This is only a pseudo multidict -- it is "multi" solely for the items it is
    initially called with -- but the result is an ordinary dict, which makes it
    easy to represent in json/yaml.'''
    merged = {}
    for entry in items:
        if len(entry) == 2:
            key, value = entry
        else:
            key, value = entry[0], ''

        if key not in merged:
            # first occurrence: keep the bare value
            merged[key] = value
            continue

        existing = merged[key]
        if isinstance(existing, list):
            existing.append(value)
        else:
            # second occurrence: promote the single value to a list
            merged[key] = [existing, value]
    return merged

In [5]:
def tags2dict(tags):
    '''Convert an iterable of tag strings into a pseudo multidict.

    A tag containing ':' is split at its first colon into ``[name, value]``;
    a tag without a colon is stored under its full text with the value None.
    Tags that share a name are gathered into a list by multidict_constructor.'''
    def as_pair(tag):
        if ':' not in tag:
            return (tag, None)
        return tag.split(':', 1)

    return multidict_constructor(map(as_pair, tags))

def annotation2dict(annotation_text):
    '''Parse ``**Name**: value`` attributes out of an annotation's text.

    The text is cut into chunks at every blank line that is followed by '**'.
    Chunks without a '**:' marker are ignored.  For the remaining chunks the
    part before the first '**:' (minus leading asterisks) is the attribute
    name and the part after it is the value; one space directly after the
    colon is dropped.  Repeated names are gathered into a list by
    multidict_constructor.

    Returns a dict; it is empty when the text contains no attributes.'''
    pairs = []
    for chunk in annotation_text.split('\n\n**'):
        if '**:' not in chunk:
            continue
        # lstrip takes a *set* of characters, so this removes every leading
        # asterisk (only the first chunk still carries its opening '**').
        name, _, value = chunk.lstrip('*').partition('**:')
        # Split on the same marker the filter above tests for, so that
        # '**Name**:value' (no space after the colon) still yields name/value
        # instead of a single garbage key with an empty value.
        if value.startswith(' '):
            value = value[1:]
        pairs.append((name, value))
    return multidict_constructor(pairs)

In [6]:
def show_annotation(ann):
    '''Print a short human-readable summary of one annotation.

    ann is an annotation dict; it must have a 'tags' entry (an iterable of tag
    strings) and may have a 'text' entry.  Prints a separator line, the CGType
    tag value (empty if absent), the tag names one per line, and the
    annotation text (an empty line when the annotation has no text).'''
    tagdict = tags2dict(ann['tags'])
    print('-------')
    print("CGType: %s"%tagdict.get('CGType', ''))
    print('tags:')
    # tagdict is a plain dict (it has no .data attribute); iterating it yields the tag names
    print('\t' + '\n\t'.join(tagdict))
    # this function is used to report annotations that lack 'text', so it must not require it
    print(ann.get('text', ''))

In [7]:
# Sort every annotation into a per-type collection according to its CGType tag.
# manuscript_general_info is reset together with the other collections so that
# re-running this cell can never leave a stale Manuscript record behind.
manuscript_general_info = None
manuscript_variants = []
manuscript_variants_by_id = {}
manuscript_experiments = {}
manuscript_results = []

for a in annotations:
    td = tags2dict(a['tags'])
    # keep the parsed tags and the type on the annotation itself
    a['tagdict'] = td
    cgtype = td.get('CGType', None)
    a['cgtype'] = cgtype
    try:
        text = a['text'].strip()
    except KeyError:
        print("WARNING: annotation without text:")
        show_annotation(a)
        text = ''
    attributes = annotation2dict(text)
    record = {
        'tags': td
    }

    # the raw text is kept only when no '**Name**:' attributes could be parsed from it
    if len(attributes) == 0:
        record['text'] = text
    else:
        record.update(attributes)

    if 'Manuscript' == cgtype:
        manuscript_general_info = record

    elif 'Variant' == cgtype:
        manuscript_variants.append(record)
        # register the variant under every identifier namespace present in its tags
        for namespace in ('Variant', 'CAID', 'ClinVarID'):
            if namespace in td:
                identifier = '%s:%s'%(namespace, td[namespace])
                if identifier in manuscript_variants_by_id:
                    print("WARNING: possibly redefining variant, identifier %s already seen"%identifier)
                    # merge into the record already registered; values from the new record win
                    manuscript_variants_by_id[identifier].update(record)
                else:
                    manuscript_variants_by_id[identifier] = record

    elif 'FunctionalAssay' == cgtype:
        # NOTE(review): a missing CLO or FuncAssay tag raises KeyError here, whereas a
        # missing BAO tag silently falls back to '' -- confirm the asymmetry is intended.
        record.update({
            'assayType': 'BAO:%s'%td.get('BAO', ''),
            'assayMaterial': 'CLO:%s'%td['CLO'],
            'results': []
        })
        manuscript_experiments[td['FuncAssay']] = record

    elif 'FunctionalAssayResult' == cgtype:
        manuscript_results.append(record)

if manuscript_general_info is None:
    print("WARNING: no annotation tagged CGType:Manuscript was found")

In [8]:
# Link what we can

manuscript_results_by_variant = collections.defaultdict(list)

# Start every experiment with an empty result list so that re-running this
# cell does not append the same results a second time.
for experiment in manuscript_experiments.values():
    experiment['results'] = []

# first go through results and lookup variant and experiment
for r in manuscript_results:
    tags = r['tags']
    # NOTE(review): a result whose FuncAssay tag is missing or unknown raises KeyError here
    r['experiment'] = manuscript_experiments[tags['FuncAssay']]
    r['source'] = manuscript_general_info['tags']['PMID']
    for namespace in ('Variant', 'CAID', 'ClinVarID'):
        if namespace in tags:
            # NOTE(review): raises KeyError when the result refers to a variant that was never defined
            v = manuscript_variants_by_id['%s:%s'%(namespace, tags[namespace])]
            r['evaluatedVariantRecord'] = v
            # prefer CAID, accept ClinVarID
            v_identifier = next(('%s:%s'%(k, v['tags'][k])
                                 for k in ('CAID', 'ClinVarID')
                                 if k in v['tags']), '')
            if '' != v_identifier:
                r['evaluatedVariant'] = v_identifier
                manuscript_results_by_variant[v_identifier].append(r)
            else:
                print('ERROR: %s does not have an acceptable external identifier'%v)
            break # only use the first of the namespaces found in tags of result
    # Back-reference from the experiment to its results.  This makes the data
    # structure cyclic (result -> experiment -> results -> result); yaml.dump
    # represents the shared records with anchors/aliases.
    r['experiment']['results'].append(r)

In [9]:
# Dump the experiments as YAML.  Experiments and their results reference one
# another, so the dumper emits anchors (e.g. &id001) for the shared records.
print(yaml.dump(manuscript_experiments))

'1': &id001
  AssayAbnormalRange: Absence or decreased abundance of full-length cochlin/altered
    migration pattern
  AssayDescription: "Whole cell lysates from HEK293 cells transiently transfected\
    \ with HA-tagged human COCH constructs were analyzed by western blot following\
    \ non-reducing gel electrophoresis to assess steady-state levels of the protein\
    \ and disulfide bond formation. "
  AssayGeneralClass: BAO:0002424 western blot
  AssayIndeterminateRange: Not reported
  AssayMaterialUsed: CLO:0001230 HEK293 cells
  AssayNormalRange: 'Presence of full-length cochlin (~60 kDa) '
  AssayRange: Presence, absence, increased abundance, or decreased abundance of full
    length cochlin at ~60 kDa
  AssayReadOutDescription: Protein band size and intensity
  PValueThreshold: Not reported
  Replication: Not reported
  StatisticalAnalysisDescription: Not reported
  ValidationControlBenign: '0'
  ValidationControlPathogenic: '0'
  assayMaterial: CLO:0001230
  assayType: BAO:00

In [10]:
# Dump the result records as YAML.  Each result embeds its linked experiment
# record under the 'experiment' key, so shared records appear behind anchors.
print(yaml.dump(manuscript_results))

- &id001
  AssayResult: Absence of ~60 kDa band
  AssayResultAssertion: Abnormal
  ControlType: Untagged wild type COCH (control for non-specificity in immunoprecipitation)
  experiment: &id002
    AssayAbnormalRange: Absence or decreased abundance of full-length cochlin/altered
      migration pattern
    AssayDescription: "Whole cell lysates from HEK293 cells transiently transfected\
      \ with HA-tagged human COCH constructs were analyzed by western blot following\
      \ non-reducing gel electrophoresis to assess steady-state levels of the protein\
      \ and disulfide bond formation. "
    AssayGeneralClass: BAO:0002424 western blot
    AssayIndeterminateRange: Not reported
    AssayMaterialUsed: CLO:0001230 HEK293 cells
    AssayNormalRange: 'Presence of full-length cochlin (~60 kDa) '
    AssayRange: Presence, absence, increased abundance, or decreased abundance of
      full length cochlin at ~60 kDa
    AssayReadOutDescription: Protein band size and intensity
    PValueThr