# Demonstration

In [2]:
# Helper functions

import json
import os

from click.testing import CliRunner
import yaml
from fastavro import writer, reader, parse_schema
from pprint import pprint, pformat

from pfb import cli
from pfb.cli import main as pfb

def read_yaml(filepath):
    """Parse the YAML document stored at ``filepath`` and return the result.

    The file is opened in text mode and handed to ``yaml.load`` with
    ``yaml.FullLoader``.

    NOTE(review): FullLoader is more permissive than SafeLoader; only point
    this helper at files you trust.
    """
    with open(filepath, "r") as stream:
        parsed = yaml.load(stream, Loader=yaml.FullLoader)
    return parsed

def read_json(filepath, default=None):
    """Load and return the JSON document stored at ``filepath``.

    When ``default`` is given (anything other than ``None``) and ``filepath``
    is not an existing file, ``default`` is returned instead of reading.
    With no ``default``, a missing file raises from ``open`` as usual.
    """
    # Read whenever there is no fallback to return or the file is present
    if default is None or os.path.isfile(filepath):
        with open(filepath, 'r') as handle:
            return json.load(handle)

    return default

def write_json(data, filepath, **kwargs):
    """Serialize ``data`` as JSON into the file at ``filepath``.

    The file is opened in text write mode (truncating any existing content).
    By default the output is pretty-printed with ``indent=4`` and
    ``sort_keys=True``; any keyword arguments supplied by the caller are
    forwarded to ``json.dump`` and take precedence over those defaults.
    """
    # Start from the defaults and let caller-supplied options override them.
    # (Previously ``kwargs`` was rebound to the defaults, discarding whatever
    # the caller passed.)
    dump_options = {
        'indent': 4,
        'sort_keys': True
    }
    dump_options.update(kwargs)

    with open(filepath, 'w') as json_file:
        json.dump(data, json_file, **dump_options)
        
def pfb_invoke(*args, **kwargs):
    """Run the pfb Click command in-process and return the invocation result.

    Positional ``args`` are passed to the CLI as its argument list; keyword
    arguments are forwarded to ``CliRunner.invoke``.

    If the command exits with a non-zero code, ``result.exc_info`` is printed.
    The result object is returned in every case, so callers can inspect
    ``result.exit_code`` and ``result.output`` themselves.
    """
    # Use CliRunner to call Click cli from python
    runner = CliRunner()
    result = runner.invoke(pfb, args, **kwargs)

    # Explicit check instead of assert/except AssertionError: assert
    # statements are stripped when Python runs with -O, which would silently
    # disable the failure report.
    if result.exit_code != 0:
        print(str(result.exc_info))

    return result

def print_avro(input_avro, output_json=None):
    """Pretty-print the metadata and records of the Avro file ``input_avro``.

    The file's reader metadata (with the ``avro.schema`` entry decoded from a
    JSON string into Python objects) and the list of all records, stored
    under the ``data`` key, are combined into one dict and pretty-printed.
    When ``output_json`` is truthy the same dict is also written to that path
    via ``write_json``.
    """
    with open(input_avro, 'rb') as avro_file:
        print(f'Avro file {input_avro} schema:')
        # First pass over the file: only the reader metadata is needed
        metadata = reader(avro_file).metadata
        metadata['avro.schema'] = json.loads(metadata['avro.schema'])

        # Second pass: rewind and collect every record
        avro_file.seek(0)
        print(f'\nAvro file {input_avro} data:')
        contents = dict(metadata, data=list(reader(avro_file)))

        pprint(contents)
        if output_json:
            write_json(contents, output_json)

## Vanilla Avro - using fastavro

In [3]:
# Output avro filepath
data_file = 'data/kf-vanilla.avro'

# Avro schema describing data that will go into avro file.
# Every row is a `FileSchema` record with a single `object` field, and that
# field is a union of two named record types: `patient` and `specimen`.
# All leaf fields are nullable unions with a null default, so records may
# omit any of them.
schema = {
     "namespace": "kidsfirst",
     "type": "record",
     "name": "FileSchema",
     "fields": [
         {
             'name': 'object',
             'type': [
                {
                    'fields': [
                         {"name": "external_id", "type": ["null", "string"], 'default': None},
                         {"name": "gender",  "type": ["null", "string"], 'default': None},
                         {"name": "race",  "type": ["null", "string"], 'default': None},
                         {"name": "age",  "type": ["null", "int"], 'default': None}
                     ],
                     'type': 'record',
                     'name': 'patient'
                },
                {
                    'fields': [
                        {"name": "external_id", "type": ["null", "string"], 'default': None},
                        {"name": "analyte_type",  "type": ["null", "string"], 'default': None},
                        {"name": "composition",  "type": ["null", "string"], 'default': None},
                        {
                            "name": "duo_ids",
                            "type": ['null', {
                                 'type': 'array',
                                 'items': 'string'
                             }],
                            # Null default, matching the other nullable fields;
                            # the sample specimen record below omits duo_ids.
                            'default': None
                        }
                     ],
                     'type': 'record',
                     'name': 'specimen'
                }
             ]
         }
     ]
}
# Keep a JSON copy of the schema next to the data file
write_json(schema, 'data/kf-vanilla-avro-schema.json')

# Create some data.
# Each `object` value is a (record name, record dict) tuple, which names the
# branch of the union the value belongs to.
records = [
    {'object': ("kidsfirst.patient", {"external_id": "Patient1", "gender": "female"})},
    {'object': ("kidsfirst.specimen", {"external_id": "Specimen1", "analyte_type": "DNA"})},
]

# Write (validator=True asks fastavro to validate the records while writing)
with open(data_file, 'wb') as out:
    writer(out, parse_schema(schema), records, validator=True)


# Read the records back; return_record_name=True yields union values as
# (record name, record dict) tuples, as in the printed output
with open(data_file, 'rb') as fo:
    for record in reader(fo, return_record_name=True):
        pprint(record)

{'object': ('kidsfirst.patient',
            {'age': None,
             'external_id': 'Patient1',
             'gender': 'female',
             'race': None})}
{'object': ('kidsfirst.specimen',
            {'analyte_type': 'DNA',
             'composition': None,
             'duo_ids': None,
             'external_id': 'Specimen1'})}


## PFB Avro - using pypfb

In [4]:
# Create test data using gen3 data simulator
# Requires the gen3 data dictionary to be stored on s3
#
# Paths and identifiers shared by the PFB cells that follow:
data_dir = 'data/simulated/'             # directory passed to `pfb from ... json` as the data source
gen3_dd = 'data/kf-gen3-datadict.json'   # gen3 data dictionary passed to `pfb from ... dict`
schema_avro = 'data/kf-pfb-schema.avro'  # PFB avro file written from the data dictionary
output_avro = 'data/kf-pfb.avro'         # PFB avro file written from the schema plus the data in data_dir
program = 'kidsfirst'                    # value given to the CLI's --program option
project = 'drc'                          # value given to the CLI's --project option

# Execute if you don't have any test data yet
# (the commands below are left commented out; uncomment them to run)
# !data-simulator simulate --url https://s3.amazonaws.com/singhn4-data-dict-bucket/kf-gen3-datadict.json --path data/simulated --program kidsfirst --project drc
# !ls -l data/simulated   

In [5]:
# Create schema avro file from gen3 data dict
# NOTE(review): kf_gen3_dd is not used again in the cells visible here, and
# the .json dictionary is parsed with the YAML loader — confirm both are intended.
kf_gen3_dd = read_yaml(gen3_dd)

# Step 1: `pfb from -o <schema_avro> dict <gen3_dd>` writes the schema avro file
print('Writing PFB Schema')
result = pfb_invoke('from', '-o', schema_avro, 'dict', gen3_dd)

print('************ Display the PFB with relational model only *********')
print_avro(schema_avro)

# Write the test data to the output avro file
# Step 2: `pfb from -o <output_avro> json -s <schema_avro> --program ... --project ... <data_dir>`
print('Writing data to PFB file')
result = pfb_invoke('from', '-o', output_avro, 'json',
           '-s', schema_avro, 
           '--program', program,
          '--project', project,
          data_dir)

print('\n************ Display the PFB with the data and relational model *********')
# Also dump the combined metadata + records to data/kf-pfb.json
print_avro(output_avro, output_json='data/kf-pfb.json')

print('\n************ PFB CLI Specific Functions ************')
# Show the avro schema in pfb file
print('-- pfb show schema -- ')
result = pfb_invoke('show', '-i', output_avro, 'schema')
# NOTE(review): pfb_invoke returns the result even when the command fails, so
# json.loads below will presumably raise if the output is not valid JSON.
pprint(json.loads(result.output))

# Read the data back out from the pfb file
print('-- pfb show nodes -- ')
result = pfb_invoke('show', '-i', output_avro, 'nodes')
print(result.output)

Writing PFB Schema
************ Display the PFB with relational model only *********
Avro file data/kf-pfb-schema.avro schema:

Avro file data/kf-pfb-schema.avro data:
{'avro.codec': 'null',
 'avro.schema': {'fields': [{'default': None,
                             'name': 'id',
                             'type': ['null', 'string']},
                            {'name': 'name', 'type': 'string'},
                            {'name': 'object',
                             'type': [{'fields': [{'name': 'nodes',
                                                   'type': {'items': {'fields': [{'name': 'name',
                                                                                  'type': 'string'},
                                                                                 {'name': 'ontology_reference',
                                                                                  'type': 'string'},
                                                                       


************ Display the PFB with the data and relational model *********
Avro file data/kf-pfb.avro schema:

Avro file data/kf-pfb.avro data:
{'avro.codec': 'null',
 'avro.schema': {'fields': [{'default': None,
                             'name': 'id',
                             'type': ['null', 'string']},
                            {'name': 'name', 'type': 'string'},
                            {'name': 'object',
                             'type': [{'fields': [{'name': 'nodes',
                                                   'type': {'items': {'fields': [{'name': 'name',
                                                                                  'type': 'string'},
                                                                                 {'name': 'ontology_reference',
                                                                                  'type': 'string'},
                                                                                 {'name': 'valu

             {'default': None,
              'doc': 'Name of organization which governs data access',
              'name': 'data_access_authority',
              'type': ['null', 'string']},
             {'default': None,
              'doc': 'Release status of the study.',
              'name': 'release_status',
              'type': ['null',
                       {'name': 'project_release_status',
                        'symbols': ['Pending',
                                    'Waiting',
                                    'Running',
                                    'Staged',
                                    'Publishing',
                                    'Published',
                                    'Failed',
                                    'Canceled'],
                        'type': 'enum'}]},
             {'default': None,
              'doc': "Tracks a Project's intended release date.",
              'name': 'intended_release_date',
              'type': ['nul