# Demonstration

In [None]:
# Helper functions

import json
import os

from click.testing import CliRunner
import yaml
from fastavro import writer, reader, parse_schema, json_writer, json_reader
from pprint import pprint, pformat

from pfb import cli
from pfb.cli import main as pfb

def read_yaml(filepath):
    """Parse the YAML document stored at ``filepath`` and return the result.

    The file is opened in text mode and handed to ``yaml.load`` with
    ``yaml.FullLoader``.
    """
    with open(filepath, "r") as stream:
        parsed = yaml.load(stream, Loader=yaml.FullLoader)
    return parsed

def read_json(filepath, default=None):
    """Load and return the JSON document stored at ``filepath``.

    When ``default`` is not ``None`` and ``filepath`` is not an existing
    file, ``default`` is returned instead of opening anything. With
    ``default=None`` a missing file raises whatever ``open`` raises.
    """
    use_fallback = default is not None and not os.path.isfile(filepath)
    if use_fallback:
        return default

    with open(filepath, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def write_json(data, filepath, **kwargs):
    """Serialize ``data`` as JSON into the file at ``filepath``.

    The file is opened in write mode, so any existing content is replaced.
    Output defaults to ``indent=4`` and ``sort_keys=True``; any keyword
    arguments supplied by the caller are forwarded to ``json.dump`` and take
    precedence over those defaults.
    """
    # Start from the defaults and let caller-supplied options override them
    # (previously the caller's kwargs were discarded).
    options = {
        'indent': 4,
        'sort_keys': True
    }
    options.update(kwargs)

    with open(filepath, 'w') as json_file:
        json.dump(data, json_file, **options)
        
def pfb_invoke(*args, **kwargs):
    """Run the ``pfb`` Click command in-process and return the Click result.

    Positional arguments are passed to the CLI as its argument list; keyword
    arguments are forwarded unchanged to ``CliRunner.invoke``.

    A non-zero exit code is not raised: ``result.exc_info`` is printed and
    the result is still returned, so callers must inspect it themselves.
    """
    # Use CliRunner to call Click cli from python
    runner = CliRunner()
    result = runner.invoke(pfb, args, **kwargs)

    # Explicit check rather than assert/except AssertionError: assert
    # statements are removed when Python runs with -O, which would silently
    # drop the failure report.
    if result.exit_code != 0:
        print(str(result.exc_info))

    return result

def print_avro(input_avro, output_json=None):
    """Pretty-print the header metadata and all records of an Avro file.

    The file's reader metadata (with the ``avro.schema`` entry decoded from
    its JSON string) and the list of records under the ``data`` key are
    combined into one dict and pretty-printed. When ``output_json`` is
    truthy, the same dict is also written there via ``write_json``.
    """
    with open(input_avro, 'rb') as avro_stream:
        print(f'Avro file {input_avro} schema:')
        header = reader(avro_stream).metadata
        # The schema sits in the metadata as a JSON string; decode it so it
        # is displayed as a nested structure.
        header['avro.schema'] = json.loads(header['avro.schema'])
        contents = dict(header)

        # Rewind so that a second reader parses the file from the beginning.
        avro_stream.seek(0)
        print(f'\nAvro file {input_avro} data:')
        contents['data'] = list(reader(avro_stream))

        pprint(contents)
        if output_json:
            write_json(contents, output_json)

## Vanilla Avro

In [12]:
# Output avro filepath
# data_file = 'data/kf-vanilla.avro'

from fastavro import writer, reader, parse_schema
from pprint import pprint

# Avro schema describing data that will go into avro file.
# The top-level record (kidsfirst.FileSchema) has a single field, `object`,
# whose type is a union of two named records: `patient` and `specimen`.
schema = {
     "namespace": "kidsfirst",
     "type": "record",
     "name": "FileSchema",
     "fields": [
         {
             'name': 'object',
             'type': [
                {
                    # patient record: every field is nullable with a None default
                    'fields': [
                         {"name": "external_id", "type": ["null", "string"], 'default': None},
                         {"name": "gender",  "type": ["null", "string"], 'default': None},
                         {"name": "race",  "type": ["null", "string"], 'default': None},
                         {"name": "age",  "type": ["null", "int"], 'default': None}
                     ],
                     'type': 'record',
                     'name': 'patient'
                },
                {
                    # specimen record: duo_ids is a nullable array of strings
                    # and, unlike the other fields, declares no default
                    'fields': [
                        {"name": "external_id", "type": ["null", "string"], 'default': None},
                        {"name": "analyte_type",  "type": ["null", "string"], 'default': None},
                        {"name": "composition",  "type": ["null", "string"], 'default': None},
                        {
                            "name": "duo_ids", 
                            "type": ['null', {
                                 'type': 'array',
                                 'items': 'string'
                             }]
                        }
                     ],
                     'type': 'record',
                     'name': 'specimen'
                } 
             ]
         }
     ]
}
# write_json(schema, 'data/kf-vanilla-avro-schema.json')

# Create some data 
# Each `object` value is a (record name, payload) tuple; presumably this is
# fastavro's tuple notation for naming the union branch — confirm in its docs.
# NOTE(review): the specimen's analyte_type is the int 0, while the schema
# only allows null or string for that field. This matches the ValidationError
# recorded in this cell's output — presumably an intentional demonstration of
# validation; confirm.
records = [
    {'object': ("kidsfirst.patient", {"external_id": "Patient1", "gender": "female"})},
    {'object': ("kidsfirst.specimen", {"external_id": "Specimen1", "analyte_type": 0})},
]

# Write the records to out.avro using the parsed schema; validator=True is
# passed so the records are checked against the schema while writing.
with open('out.avro', 'wb') as out:
    writer(out, parse_schema(schema), records, validator=True)
    

# Read the file back and pretty-print each record; return_record_name=True is
# passed to the reader (presumably so union values come back tagged with
# their record name — confirm in the fastavro docs).
with open('out.avro', 'rb') as fo:
    for record in reader(fo, return_record_name=True):
        pprint(record)

ValidationError: [
  "kidsfirst.FileSchema.object.kidsfirst.specimen.analyte_type is <0> of type <class 'int'> expected null",
  "kidsfirst.FileSchema.object.kidsfirst.specimen.analyte_type is <0> of type <class 'int'> expected string"
]

## PFB Avro - Suitable for relational data

In [None]:
# Create test data using gen3 data simulator
# Requires the gen3 data dictionary to be stored on s3
# Paths and identifiers used by the PFB cell that follows:
data_dir = 'data/simulated/'            # directory handed to `pfb from ... json` as its input
gen3_dd = 'data/kf-gen3-datadict.json'  # gen3 data dictionary handed to `pfb from ... dict`
schema_avro = 'data/kf-pfb-schema.avro' # output of the schema step, then passed via -s
output_avro = 'data/kf-pfb.avro'        # output of the data step
program = 'kidsfirst'                   # value for --program
project = 'drc'                         # value for --project

# Execute if you don't have any test data yet
# !data-simulator simulate --url https://s3.amazonaws.com/singhn4-data-dict-bucket/kf-gen3-datadict.json --path data/simulated --program kidsfirst --project drc
# !ls -l data/simulated   

In [None]:
# Create schema avro file from gen3 data dict
# NOTE(review): kf_gen3_dd is loaded here but not used again in this cell;
# the CLI call below receives the dictionary path, not the parsed object.
kf_gen3_dd = read_yaml(gen3_dd)

print('Writing PFB Schema')
# Equivalent to the command line: pfb from -o <schema_avro> dict <gen3_dd>
result = pfb_invoke('from', '-o', schema_avro, 'dict', gen3_dd)

print('************ Display the PFB with relational model only *********')
print_avro(schema_avro)

# Write the test data to the output avro file
print('Writing data to PFB file')
# Equivalent to: pfb from -o <output_avro> json -s <schema_avro>
#                --program <program> --project <project> <data_dir>
result = pfb_invoke('from', '-o', output_avro, 'json',
           '-s', schema_avro, 
           '--program', program,
          '--project', project,
          data_dir)

print('\n************ Display the PFB with the data and relational model *********')
# Also saves the displayed content as JSON to data/kf-pfb.json
print_avro(output_avro, output_json='data/kf-pfb.json')

print('\n************ PFB CLI Specific Functions ************')
# Show the avro schema in pfb file
print('-- pfb show schema -- ')
result = pfb_invoke('show', '-i', output_avro, 'schema')
# pfb_invoke only prints on failure and still returns the result, so
# json.loads raises here if the command output is not valid JSON.
pprint(json.loads(result.output))

# Read the data back out from the pfb file
print('-- pfb show nodes -- ')
result = pfb_invoke('show', '-i', output_avro, 'nodes')
print(result.output)

In [None]:
import json
import os
from itertools import chain

from fastavro import writer, reader, parse_schema
from pprint import pprint, pformat


# Schema file to load; the commented-out lines are alternative schema files.
# test_schema_file = '../../tests/data/test_schema0.json'
# test_schema_file = '../../tests/data/test_schema.json'
test_schema_file = '../../tests/data/pfb_export/pfb_schema.json'

# Avro file written (and read back) further down in this cell
pfb_file = '../../tests/data/test_pfb.avro'
# JSON file holding the metadata entity that is written to the avro file
metadata_file = '../../tests/data/pfb_export/metadata.json'

# Load the schema used by the writer below
with open(test_schema_file) as json_file:
    test_schema = json.load(json_file)

# Load the metadata entity
with open(metadata_file) as json_file:
    metadata = json.load(json_file)

# Entities to write: the metadata entity loaded from metadata_file followed
# by one hand-written biospecimen entity. The biospecimen entity carries an
# id, a name, a namespace, its field values under 'object', and an empty
# 'relations' list.
records = [
    metadata,
    {'id': 'BS_082CXWXG',
     'name': 'biospecimen',
     'namespace': 'pfb.biospecimen',
     'object': {'age_at_event_days': None,
                'analyte_type': 'DNA',
                'composition': 'Blood',
                'concentration_mg_per_ml': None,
                'consent_type': 'DS-MUS-SKEL-IRB',
                'created_at': '2018-09-04T14:43:03.277375+00:00',
                'dbgap_consent_code': 'phs001410.c2',
                'duo_ids': None,
                'external_aliquot_id': '6355001',
                'external_sample_id': '6355001',
                'kf_id': 'BS_082CXWXG',
                'method_of_sample_procurement': None,
                'modified_at': '2018-10-23T04:39:39.365667+00:00',
                'ncit_id_anatomical_site': None,
                'ncit_id_tissue_type': 'NCIT:C14165',
                'participant_id': 'PT_VVP2BNC9',
                'sequencing_center_id': 'SC_X1N69WJM',
                'shipment_date': None,
                'shipment_origin': None,
                'source_text_anatomical_site': None,
                'source_text_tissue_type': 'Normal',
                'source_text_tumor_descriptor': None,
                'spatial_descriptor': None,
                'uberon_id_anatomical_site': None,
                'visible': True,
                'volume_ul': None},
     'relations': []
    }
]



def yield_metadata():
    """Generate a one-item stream holding the module-level ``metadata``."""
    yield from (metadata,)

def yield_row_entities():
    """Generate each entry of the module-level ``records`` list, in order."""
    yield from records

def yield_entities():
    """Return an iterator over the metadata entity, then every row entity.

    NOTE(review): ``records`` already begins with ``metadata``, so the
    metadata entity is produced twice by this iterator — confirm that the
    duplicate is intended.
    """
    streams = (yield_metadata(), yield_row_entities())
    return chain.from_iterable(streams)

# Start from a clean slate: remove any avro file left by a previous run
if os.path.isfile(pfb_file):
    os.remove(pfb_file)

# Write the entities one at a time, each with its own writer() call on the
# same open handle.
# NOTE(review): this relies on fastavro's writer appending to an existing
# Avro file when it is opened in 'a+b' mode — confirm against the fastavro
# version in use.
with open(pfb_file, 'a+b') as avro_file:
    for ent in yield_entities():
        writer(avro_file, test_schema, [ent], validator=True)

# Read the file back and pretty-print every record that was written
with open(pfb_file, 'rb') as fo:
    avro_reader = reader(fo)
    for record in avro_reader:
        pprint(record)


In [None]:
# NOTE(review): absolute path on one developer's machine; this rebinds the
# data_dir set in the earlier PFB cell and will not exist elsewhere.
data_dir = '/Users/singhn4/Projects/kids_first/kf-ingest-packages/data'
# Relative directory under the repository's tests/data tree
output_dir = '../../tests/data/input'
