In [11]:
import json
import yaml
import collections

In [12]:
# A little magic: register a custom str representer so that strings which span
# several lines, or are longer than 100 characters, are requested in YAML
# literal ('|') block style; all other strings keep the dumper's default style.
def _represent_str(dumper, data):
    '''Represent a Python str as a YAML scalar, asking for '|' style when long or multiline.'''
    block_style = "|" if ('\n' in data or len(data) > 100) else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=block_style)


yaml.add_representer(str, _represent_str)

In [13]:
# Load the downloaded annotation export and keep only its 'rows' entry.
# JSON interchange text is UTF-8, so decode it explicitly instead of relying on
# the platform's default encoding (which differs between operating systems).
with open('downloaded_data/PMID9535769.json', 'rt', encoding='utf-8') as jsonin:
    annotations = json.load(jsonin)['rows']

In [14]:
def multidict_constructor(items):
    '''Creates a pseudo multidict-- only multidict for the items it gets initially called with.

    Not a true multidict, but easily representable in json/yaml...

    items is an iterable of (key, value) pairs.  A key seen once maps directly
    to its value; a key seen several times maps to a list of its values in the
    order they were encountered.

    Returns a plain dict.
    '''
    data = {}
    # Keys whose stored value is a list created *here* to collect repeats.
    # Tracking them explicitly (rather than testing isinstance(value, list))
    # keeps a caller-supplied list value from being mistaken for our
    # accumulator and mutated in place.
    collected = set()
    for k, v in items:
        if k not in data:
            data[k] = v
        elif k in collected:
            data[k].append(v)
        else:
            data[k] = [data[k], v]
            collected.add(k)
    return data

In [15]:
def tags2dict(tags):
    '''Turn an iterable of tag strings into a pseudo multidict.

    A tag of the form 'namespace:value' becomes key 'namespace' with value
    'value'; a tag without a colon becomes a key whose value is None.  Only the
    first colon separates namespace from value, so values that themselves
    contain colons are kept whole instead of breaking the (key, value) unpacking.
    Repeated namespaces are collected into a list by multidict_constructor.
    '''
    return multidict_constructor(t.split(':', 1) if ':' in t else (t, None) for t in tags)

def annotation2dict(annotation_text):
    '''Parse '**Key**: value' fields out of an annotation body into a pseudo multidict.

    The text is cut at every blank line followed by '**'.  Each resulting chunk
    that contains the '**:' marker yields one (key, value) pair: the key is the
    text before the marker with leading asterisks removed, the value is the
    text after it minus one separating space if present.  Chunks without the
    marker are ignored.  Repeated keys are collected into a list by
    multidict_constructor.
    '''
    def fields():
        for chunk in annotation_text.split('\n\n**'):
            # Split on the same marker that selects the chunk, so a field
            # written without a space after the colon ('**Key**:value' or
            # '**Key**:' followed by a newline) is parsed rather than raising
            # on unpacking.
            key, marker, value = chunk.partition('**:')
            if not marker:
                continue
            if value.startswith(' '):
                value = value[1:]
            yield key.lstrip('*'), value

    return multidict_constructor(fields())

In [16]:
def show_annotation(ann):
    '''Print a short diagnostic dump of one annotation.

    ann is a mapping with a 'tags' entry (an iterable of tag strings) and,
    optionally, a 'text' entry.  Prints a separator, the CGType tag value
    (empty if absent), the tag names one per line, and the text.
    '''
    tagdict = tags2dict(ann['tags'])
    print('-------')
    print("CGType: %s"%tagdict.get('CGType', ''))
    print('tags:')
    # tags2dict returns a plain dict, which has no .data attribute; iterating
    # the dict itself yields the tag names.
    print('\t' + '\n\t'.join(tagdict))
    # This helper is used to report annotations that lack 'text', so the key
    # must not be required here.
    print(ann.get('text', ''))

In [17]:
# Sort every annotation into one of the manuscript-level collections according
# to its CGType tag.
manuscript_variants = []            # records tagged CGType:Variant, in input order
manuscript_variants_by_id = {}      # 'namespace:value' identifier -> variant record
manuscript_experiments = {}         # FuncAssay tag value -> FunctionalAssay record
manuscript_results = []             # records tagged CGType:FunctionalAssayResult
# Reset on every run so a re-executed cell never silently reuses the record
# left over from a previous run when the current data has no Manuscript
# annotation.
manuscript_general_info = None

for a in annotations:
    td = tags2dict(a['tags'])
    # the parsed tags and type are also stored back on the raw annotation
    a['tagdict'] = td
    cgtype = td.get('CGType', None)
    a['cgtype'] = cgtype
    try:
        text = a['text'].strip()
    except KeyError:
        print("WARNING: annotation without text:")
        show_annotation(a)
        text = ''
    attributes = annotation2dict(text)
    record = {
        'tags': td
    }

    # Annotations without any '**Key**: value' fields keep their raw text;
    # otherwise the parsed fields become top-level keys of the record.
    if len(attributes) == 0:
        record['text'] = text
    else:
        record.update(attributes)

    if 'Manuscript' == cgtype:
        manuscript_general_info = record

    elif 'Variant' == cgtype:
        manuscript_variants.append(record)
        # index the variant under every identifier namespace it carries
        for namespace in ('Variant', 'CAID', 'ClinVarID'):
            if namespace in td:
                identifier = '%s:%s'%(namespace, td[namespace])
                if identifier in manuscript_variants_by_id:
                    print("WARNING: possibly redefining variant, identifier %s already seen"%identifier)
                    # merge the new fields into the record already indexed
                    manuscript_variants_by_id[identifier].update(record)
                else:
                    manuscript_variants_by_id[identifier] = record

    elif 'FunctionalAssay' == cgtype:
        # NOTE(review): a missing BAO tag yields 'BAO:' while a missing CLO or
        # FuncAssay tag raises KeyError -- confirm whether that asymmetry is
        # intended.
        record.update({
            'assayType': 'BAO:%s'%td.get('BAO', ''),
            'assayMaterial': 'CLO:%s'%td['CLO'],
            'results': []
        })
        manuscript_experiments[td['FuncAssay']] = record

    elif 'FunctionalAssayResult' == cgtype:
        manuscript_results.append(record)

if manuscript_general_info is None:
    print("WARNING: no annotation with CGType 'Manuscript' was found")

In [22]:
# Link what we can: attach to each result its experiment, its source
# manuscript and the variant it evaluates, and index results by variant.

manuscript_results_by_variant = collections.defaultdict(list)

# Start every experiment with an empty result list so that re-running this
# cell does not append the same results a second time.
for experiment in manuscript_experiments.values():
    experiment['results'] = []

# first go through results and lookup variant and experiment
for r in manuscript_results:
    tags = r['tags']
    r['experiment'] = manuscript_experiments[tags['FuncAssay']]
    r['source'] = manuscript_general_info['tags']['PMID']
    for namespace in ('Variant', 'CAID', 'ClinVarID'):
        if namespace in tags:
            v = manuscript_variants_by_id['%s:%s'%(namespace, tags[namespace])]
            r['evaluatedVariantRecord'] = v
            # prefer CAID, accept ClinVarID
            v_identifier = next(('%s:%s'%(k, v['tags'][k])
                                 for k in ('CAID', 'ClinVarID')
                                 if k in v['tags']), '')
            if '' != v_identifier:
                r['evaluatedVariant'] = v_identifier
                manuscript_results_by_variant[v_identifier].append(r)
            else:
                print('ERROR: %s does not have an acceptable external identifier'%v)
            break # only use the first of the namespaces found in tags of result
    # NOTE: this back-reference makes the data structure cyclic
    # (result -> experiment -> results -> result).
    r['experiment']['results'].append(r)

In [23]:
# Serialise the experiments (with their linked results) to YAML text and show it.
experiments_yaml = yaml.dump(manuscript_experiments)
print(experiments_yaml)

'1': &id001
  AssayAbnormalRange: similar to negative control
  AssayDescription: |-
    TR4912 cells were transfected with control and variant cDNA. Cells were harvested after 48 hours and the enzyme activity was measured using the 4-MUG substrate
  AssayGeneralClass: enzyme activity assay
  AssayMaterialUsed: |-
    SV40 transformed acid alpha-glucosidase deficient cell line TR4912 (derived from GM04912 -CLO:0025608)
  AssayNormalRange: similar to wild type
  AssayRange: Nmol/min/gm protein
  AssayReadOutDescription: enzyme activity +/- S.E.M.
  PValueThreshold: p=0.02
  Replication: |-
    two independent clones for each mutation  was isolated and expressed, except for the 2015G>A variant, where only a single clone was isolated and expressed. Each results is an average of 3-12 enzyme activity values
  StatisticalAnalysisDescription: "statistically different from mean enzyme expression\
    \ of antisense normal cDNA. Significance determined with unpaired Welch\u2019\
    s 2 tailed 