In [1]:
import json
import yaml

In [2]:
# Load the downloaded annotation export for this paper and keep only its
# 'rows' entry. JSON text is UTF-8, so decode it explicitly instead of relying
# on the platform's default locale encoding.
with open('downloaded_data/PMID9535769.json', 'rt', encoding='utf-8') as jsonin:
    annotations = json.load(jsonin)['rows']

In [3]:
# annotations

In [4]:
def fix_funcexp(tags):
    """Yield ``tags`` with every ``FuncExp*:<value>`` tag normalized.

    Stop-gap until the tags themselves are fixed/standardized: any tag whose
    name starts with ``FuncExp`` (e.g. ``FuncExpResults:1``) is rewritten as
    ``FuncExp:<value>``, and a trailing ``Results`` on the value is dropped
    (``FuncExp:1Results`` -> ``FuncExp:1``).

    Args:
        tags: iterable of tag strings, normally ``name:value``.

    Yields:
        Each tag, in input order. Tags that do not start with ``FuncExp`` and
        ``FuncExp*`` tags that carry no ``:`` separator are yielded unchanged
        (the latter previously raised ``IndexError``).
    """
    suffix = 'Results'
    for t in tags:
        if t.startswith('FuncExp'):
            _, sep, rest = t.partition(':')
            if sep:
                # Keep only the first field after the tag name, exactly as
                # ``t.split(':')[1]`` did, so downstream parsing is unaffected.
                v = rest.split(':')[0]
                if v.endswith(suffix):
                    v = v[:-len(suffix)]
                t = 'FuncExp:%s' % v
        yield t

In [5]:
def tags2dict(tags):
    """Turn a list of ``name:value`` tag strings into a ``{name: value}`` dict.

    Tags are first passed through ``fix_funcexp``. A tag without ``:`` maps
    to ``None``. Only the first ``:`` separates name from value, so a value
    that itself contains ``:`` is kept whole instead of making ``dict()``
    fail on a three-element item. If a name occurs more than once, the last
    occurrence wins.
    """
    # FIXME: should be a multidict (e.g., if multiple materials are used)
    return dict(
        t.split(':', 1) if ':' in t else (t, None)
        for t in fix_funcexp(tags)
    )

In [24]:
def show_annotation(ann):
    """Print a short summary of one annotation for manual inspection.

    Prints a separator, the annotation's ``CGType`` tag value (empty if
    absent), the tag names one per line, and finally the annotation text.

    Args:
        ann: annotation mapping with a ``'tags'`` list and, normally, a
            ``'text'`` string.
    """
    tagdict = tags2dict(ann['tags'])
    print('-------')
    print("CGType: %s" % tagdict.get('CGType', ''))
    print('tags:')
    # Joining the dict iterates its keys, so only tag names are listed.
    print('\t' + '\n\t'.join(tagdict))
    # This function is used to report annotations that have no 'text', so it
    # must not raise KeyError on that same missing key.
    print(ann.get('text', '<no text>'))

In [34]:
# Lookup tables built in a single pass over the annotations, dispatching on
# each annotation's CGType tag.
paper_variants = {}                 # '<id tag>:<value>' -> variant record
paper_variants_by_external_id = {}  # declared here but not populated by this cell
paper_experiments = {}              # FuncExp value -> experiment description
paper_results_by_allele = {}        # '<id tag>:<value>' -> result record
paper_results_by_experiment = {}    # FuncExp value -> result record
for a in annotations:
    td = tags2dict(a['tags'])
    # Cache the parsed tags and the type on the annotation itself.
    a['tagdict'] = td
    cgtype = td.get('CGType', None)
    a['cgtype'] = cgtype
    if 'VariantID' == cgtype:
        # Only the 'text' lookup can raise KeyError here, so guard just that.
        try:
            variant = {
                'label': a['text'].strip(),
                'tags': td
            }
        except KeyError:
            print("No text for variant:")
            show_annotation(a)
            continue
        # Register the same record under every identifier the variant carries.
        if 'Variant' in td:
            paper_variants['Variant:%s'%td['Variant']] = variant
        if 'CAID' in td:
            paper_variants['CAID:%s'%td['CAID']] = variant
        if 'ClinVarID' in td:
            paper_variants['ClinVarID:%s'%td['ClinVarID']] = variant
    elif 'ExperimentDescription' == cgtype:
        experiment = td.get('FuncExp')
        if experiment is None:
            # Report and skip, as for variants, instead of aborting the pass.
            print("No FuncExp tag for experiment description:")
            show_annotation(a)
            continue
        paper_experiments[experiment] = {
            'assayType': 'BAO:%s'%td.get('BAO', ''),
            # A missing CLO tag is tolerated the same way as a missing BAO tag.
            'assayMaterial': 'CLO:%s'%td.get('CLO', ''),
            'description': a['text'].strip(),
            'tags': td
        }
    elif 'ExperimentResult' == cgtype:
        # hack for now-- fix the tags!
        # Same lookup order as before (FuncExp, then FuncExpResults), but a
        # result with neither tag is reported and skipped rather than raising
        # a KeyError that names only the fallback key.
        experiment = td.get('FuncExp', td.get('FuncExpResults'))
        if experiment is None:
            print("No FuncExp tag for experiment result:")
            show_annotation(a)
            continue
        print(td)
        expresult = {
            'description': a['text'].strip()
        }
        # FIXME: handle when different id is used for variant in different annotations
        # Unlike paper_variants, a result is stored under the first identifier
        # found only.
        if 'Variant' in td:
            paper_results_by_allele['Variant:%s'%td['Variant']] = expresult
        elif 'CAID' in td:
            paper_results_by_allele['CAID:%s'%td['CAID']] = expresult
        elif 'ClinVarID' in td:
            paper_results_by_allele['ClinVarID:%s'%td['ClinVarID']] = expresult
        # FIXME: plain assignment, so a later result for the same experiment
        # replaces the earlier one and only the last result is retained.
        paper_results_by_experiment[experiment] = expresult



{'Variant': '1', 'ClinVarID': '188728', 'CGType': 'ExperimentResult', 'FuncExp': '1'}
{'Variant': '5', 'ClinVarID': '550327', 'CGType': 'ExperimentResult', 'FuncExp': '1'}
{'Variant': '2', 'ClinVarID': '188902', 'CGType': 'ExperimentResult', 'FuncExp': '1'}
{'Variant': '3', 'ClinVarID': '188773', 'CGType': 'ExperimentResult', 'FuncExp': '1'}
{'Variant': '4', 'ClinVarID': '371126', 'CGType': 'ExperimentResult', 'FuncExp': '1'}
{'ValidationControl': 'Pathogenic', 'CAID': 'CA401367186', 'CGType': 'ExperimentResult', 'FuncExp': '1'}
{'ClinVarID': '265160', 'ValidationControl': 'Pathogenic', 'CGType': 'ExperimentResult', 'FuncExp': '1'}
{'ClinVarID': '4030', 'ValidationControl': 'Benign', 'CGType': 'ExperimentResult', 'FuncExp': '1'}


In [37]:
# Show the variant table as YAML. The same variant dict is stored under each of
# its identifier keys, so the dump uses anchors (&idNNN) and aliases for repeats.
print(yaml.dump(paper_variants))

CAID:CA116612: &id006
  label: NM_000152.3:c.2065G>A (Glu689Lys)
  tags:
    CAID: CA116612
    CGType: VariantID
    ClinVarID: '4030'
    ValidationControl: Benign
CAID:CA273892: &id001
  label: NM_000152.3:c.1933G>A (Asp645Asn)
  tags:
    CAID: CA273892
    CGType: VariantID
    ClinVarID: '188728'
    Variant: '1'
CAID:CA273939: &id002
  label: NM_000152.3:c.2014C>T (Arg672Trp)
  tags:
    CAID: CA273939
    CGType: VariantID
    ClinVarID: '188773'
    Variant: '3'
CAID:CA274102: &id003
  label: NM_000152.3:c.1942G>A (Gly648Ser)
  tags:
    CAID: CA274102
    CGType: VariantID
    ClinVarID: '188902'
    Variant: '2'
CAID:CA401367186:
  label: NM_000152.3:c.1555A>G (Met519Val)
  tags:
    CAID: CA401367186
    CGType: VariantID
    ValidationControl: Pathogenic
CAID:CA8815554: &id007
  label: NM_000152.3:c.1941C>G (Cys647Trp)
  tags:
    CAID: CA8815554
    CGType: VariantID
    ClinVarID: '550327'
    Variant: '5'
CAID:CA8815566: &id005
  label: NM_000152.3:c.2015G>A (Arg672Gln)

In [38]:
# Show the experiment descriptions as YAML, keyed by their FuncExp tag value.
print(yaml.dump(paper_experiments))

'1':
  assayMaterial: CLO:0025608
  assayType: BAO:0002994
  description: "**Experiment 1 Description**\n\n**GeneralClass**: BAO:0002994\n\n\
    **MaterialUsed**: TR4912 (derived from GM4912 -CLO:0025608)\n\n**AssayDescription**:\
    \ TR4912 cells (SV40 transformed acid alpha-glucosidase deficient cell line) were\
    \ transfected with control and variant cDNA. Cells were harvested after 48 hours\
    \ and the enzyme activity was measured using the 4-MUG substrate\n\n**AssayReadOutDescription**:\
    \ enzyme activity +/- S.E.M.\n\n**AssayReadOutValues**: Nmol/min/gm protein\n\n\
    **AssayReadOutNormal**: similar to wild type, 2641 +/- 372\n\n**AssayReadOutAbormal**:\
    \ similar to negative control, 0.62 +/- 0.14\n\n**AssayReadOutIndeterminate**:\
    \ unclear results\n\n**AssayControlNormalControl**: WT in sense orientation\n\n\
    **AssayControlAbnormalControl**: WT in antisense orientation \n\n**ValidationControlPathogenic**:\
    \ 2\n\n**ValidationControlBenign**: 1\n\

In [40]:
# Show the per-allele results as YAML; each key is '<id tag>:<value>'.
print(yaml.dump(paper_results_by_allele))

CAID:CA401367186:
  description: '**Pathogenic Control Experiment 1 Result**


    M519V


    **AssayResult**: 22.10 +/- 3.5


    **ExperimentResultAssertion**: abnormal'
ClinVarID:265160:
  description: '**Pathogenic Control Experiment 1 Result**


    W746C


    **AssayResult**: 352.0 +/- 78.1


    **ExperimentResultAssertion**: abnormal'
ClinVarID:4030:
  description: '**Benign Control Experiment 1 Result**


    E689K


    **AssayResult**: 1906.0 +/- 38.1


    **ExperimentResultAssertion**: normal'
Variant:1:
  description: '**Variant 1 Experiment 1 Result**


    D645N


    **AssayResult**: 0.49 +/- 0.10


    **ExperimentResultAssertion**: abnormal'
Variant:2:
  description: '**Variant 2 Experiment 1 Result**


    G648S


    **AssayResult**: 0.68 +/- 0.12


    **ExperimentResultAssertion**: abnormal'
Variant:3:
  description: '**Variant 3 Experiment 1 Result**


    R672W


    **AssayResult**: 1.57 +/- 0.34


    **ExperimentResultAssertion**: abnormal'
Variant:4:
  de

In [41]:
# Show the per-experiment results as YAML. Each experiment key holds a single
# result: every assignment to paper_results_by_experiment[experiment] replaces
# the previous one, so only the last result processed is shown.
print(yaml.dump(paper_results_by_experiment))

'1':
  description: '**Benign Control Experiment 1 Result**


    E689K


    **AssayResult**: 1906.0 +/- 38.1


    **ExperimentResultAssertion**: normal'



In [8]:
# Dump all four lookup tables together as one YAML document.
print(yaml.dump({
    'variants': paper_variants,
    'experiments': paper_experiments,
    'by_allele': paper_results_by_allele,
    'by_experiment': paper_results_by_experiment
}))

# link together what we can...
# paper_variants holds an entry for every identifier a variant carries, while
# each result is stored under a single identifier, so many variant keys have no
# matching result. Link only the keys present on both sides instead of raising
# KeyError on the first unmatched one.
for v in paper_variants:
    if v in paper_results_by_allele:
        paper_results_by_allele[v]['evaluatedVariant'] = paper_variants[v]
# Likewise, an experiment may be described without any recorded result.
for e in paper_experiments:
    if e in paper_results_by_experiment:
        paper_results_by_experiment[e]['experimentDescription'] = paper_experiments[e]

{'1': {'description': '**Variant 1 Experiment 1 Result**\n\nD645N\n\n**AssayResult**: 0.49 +/- 0.10\n\n**ExperimentResultAssertion**: abnormal'},
 '5': {'description': '**Variant 5 Experiment 1 Result**\n\nC647W\n\n**AssayResult**: 0.84 +/- 0.52\n\n**ExperimentResultAssertion**: abnormal'},
 '2': {'description': '**Variant 2 Experiment 1 Result**\n\nG648S\n\n**AssayResult**: 0.68 +/- 0.12\n\n**ExperimentResultAssertion**: abnormal'},
 '3': {'description': '**Variant 3 Experiment 1 Result**\n\nR672W\n\n**AssayResult**: 1.57 +/- 0.34\n\n**ExperimentResultAssertion**: abnormal'},
 '4': {'description': '**Variant 4 Experiment 1 Resul\n\nR672Q\n\n**AssayResult**: 0.63 +/- 0.19\n\n**ExperimentResultAssertion**: abnormal'}}

In [36]:
# Display the variant table as YAML (one entry per identifier key).
print(yaml.dump(paper_variants))

CAID:CA116612: &id006
  label: NM_000152.3:c.2065G>A (Glu689Lys)
  tags:
    CAID: CA116612
    CGType: VariantID
    ClinVarID: '4030'
    ValidationControl: Benign
CAID:CA273892: &id001
  label: NM_000152.3:c.1933G>A (Asp645Asn)
  tags:
    CAID: CA273892
    CGType: VariantID
    ClinVarID: '188728'
    Variant: '1'
CAID:CA273939: &id002
  label: NM_000152.3:c.2014C>T (Arg672Trp)
  tags:
    CAID: CA273939
    CGType: VariantID
    ClinVarID: '188773'
    Variant: '3'
CAID:CA274102: &id003
  label: NM_000152.3:c.1942G>A (Gly648Ser)
  tags:
    CAID: CA274102
    CGType: VariantID
    ClinVarID: '188902'
    Variant: '2'
CAID:CA401367186:
  label: NM_000152.3:c.1555A>G (Met519Val)
  tags:
    CAID: CA401367186
    CGType: VariantID
    ValidationControl: Pathogenic
CAID:CA8815554: &id007
  label: NM_000152.3:c.1941C>G (Cys647Trp)
  tags:
    CAID: CA8815554
    CGType: VariantID
    ClinVarID: '550327'
    Variant: '5'
CAID:CA8815566: &id005
  label: NM_000152.3:c.2015G>A (Arg672Gln)