## Using Frictionless Data [datapackage-py](https://github.com/frictionlessdata/datapackage-py) to package data from Figure 2 of this article: https://doi.org/10.7554/eLife.42690.001

In [None]:
pip install datapackage goodtables

In [2]:
from datapackage import Package

In [3]:
# create an empty data package; it is populated from the csv file in the next cell
package = Package()

In [4]:
#infer the JSON schema from our csv data file & save the package
#note, make sure your data is in the same folder where you are running this notebook
package.infer('elife.csv')
#write the inferred descriptor to datapackage.json; the value displayed below is what save() returned
package.save('datapackage.json')

True

In [5]:
# inspect the resource metadata (path, format, column schema) recorded in the package descriptor
package.descriptor['resources']

[{'encoding': 'utf-8',
  'format': 'csv',
  'mediatype': 'text/csv',
  'name': 'elife',
  'path': 'elife.csv',
  'profile': 'tabular-data-resource',
  'schema': {'fields': [{'format': 'default',
     'name': 'Genotype',
     'type': 'string'},
    {'format': 'default', 'name': 'Mean', 'type': 'number'},
    {'format': 'default', 'name': 'Std. Error', 'type': 'number'},
    {'format': 'default', 'name': 'SEM', 'type': 'number'},
    {'format': 'default', 'name': 'N (animals/hemisegment)', 'type': 'string'},
    {'format': 'default', 'name': 'p-value', 'type': 'string'}],
   'missingValues': ['']}}]

In [6]:
# collect the 'Mean' column of the first resource as plain Python floats
means = [
    float(row['Mean'])
    for row in package.resources[0].read(keyed=True)
]

In [7]:
# display the list of means extracted in the previous cell
print(means)

[0.767, 0.0, 0.0, 0.904, 0.967, 0.897, 0.771, 0.0, 0.0, 0.9, 0.567, 0.833, 0.429]


In [8]:
# pair every genotype with its mean (converted to float once per row),
# then print only the pairs whose mean is below 0.6
genotype_means = (
    (row['Genotype'], float(row['Mean']))
    for row in package.resources[0].read(keyed=True)
)
print([pair for pair in genotype_means if pair[1] < 0.6])

[('DIP-α-GAL4>EGFP', 0.0), ('DIP-α1-178', 0.0), ('Elav-GAL4>DIP-α', 0.0), ('Mef2-GAL4>DIP-α', 0.0), ('Elav-GAL4 >UAS-DIP-α RNAi', 0.567), ('Mef2-GAL4 >UAS-DIP-α-RNAi', 0.429)]


In [9]:
# let's validate our datapackage now
from goodtables import validate

In [10]:
#validate without a schema: only the csv file itself is handed to goodtables
report = validate('elife.csv')
#show the report's 'valid' entry
report['valid']

True

In [11]:
#validate with a schema
#remember to only use the JSON 'schema' portion of datapackage.JSON
#schema.json is not produced by any earlier cell, so when it is missing we write it here
#from the schema inferred for the first resource; an existing schema.json is left untouched
import json
from pathlib import Path

schema_path = Path('schema.json')
if not schema_path.exists():
    schema_path.write_text(json.dumps(package.descriptor['resources'][0]['schema'], indent=2))

reportValid = validate('elife.csv', schema='schema.json', order_fields=True)
#show the report's 'valid' entry
reportValid['valid']

True

In [12]:
#what happens when we validate an invalid csv file?
#elife_invalid.csv must be in the same folder as this notebook
reportInvalid = validate('elife_invalid.csv')
#show the report's 'valid' entry
reportInvalid['valid']

False

In [13]:
# show the 'error-count' entry of the report for the invalid file
reportInvalid['error-count']

4

In [14]:
# list the individual errors (code, row/column number, message) recorded for the first table in the report
reportInvalid['tables'][0]['errors']

[{'code': 'missing-value',
  'column-number': 6,
  'message': 'Row 2 has a missing value in column 6',
  'message-data': {},
  'row-number': 2},
 {'code': 'missing-value',
  'column-number': 6,
  'message': 'Row 5 has a missing value in column 6',
  'message-data': {},
  'row-number': 5},
 {'code': 'missing-value',
  'column-number': 6,
  'message': 'Row 8 has a missing value in column 6',
  'message-data': {},
  'row-number': 8},
 {'code': 'missing-value',
  'column-number': 6,
  'message': 'Row 11 has a missing value in column 6',
  'message-data': {},
  'row-number': 11}]