/
schema.py
610 lines (482 loc) · 18.8 KB
/
schema.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import io
import six
import json
from collections import OrderedDict
from copy import deepcopy
from six.moves import zip_longest
from .profile import Profile
from .field import Field
from . import exceptions
from . import helpers
from . import config
from . import types
# Module API
class Schema(object):
    """Schema representation

    # Arguments
        descriptor (str/dict): schema descriptor one of:
            - local path
            - remote url
            - dictionary
        strict (bool): flag to specify validation behaviour:
            - if false, errors will not be raised but instead collected in `schema.errors`
            - if true, validation errors are raised immediately

    # Raises
        TableSchemaException: raise any error that occurs during the process

    """

    # Public

    def __init__(self, descriptor=None, strict=False):
        # A fresh dict per call avoids the shared-mutable-default-argument
        # pitfall; an omitted descriptor still means "empty schema".
        if descriptor is None:
            descriptor = {}

        # Process descriptor
        descriptor = helpers.retrieve_descriptor(descriptor)

        # Set attributes
        self.__strict = strict
        self.__current_descriptor = deepcopy(descriptor)
        self.__next_descriptor = deepcopy(descriptor)
        self.__profile = Profile('table-schema')
        self.__errors = []
        self.__fields = []

        # Build instance
        self.__build()

    @property
    def valid(self):
        """Validation status

        Always true in strict mode.

        # Returns
            bool: validation status

        """
        return not bool(self.__errors)

    @property
    def errors(self):
        """Validation errors

        Always empty in strict mode.

        # Returns
            Exception[]: validation errors

        """
        return self.__errors

    @property
    def descriptor(self):
        """Schema's descriptor

        # Returns
            dict: descriptor

        """
        # Never use this.descriptor inside this class (!!!)
        return self.__next_descriptor

    @property
    def missing_values(self):
        """Schema's missing values

        # Returns
            str[]: missing values

        """
        return self.__current_descriptor.get('missingValues', [])

    @property
    def primary_key(self):
        """Schema's primary keys

        # Returns
            str[]: primary keys

        """
        primary_key = self.__current_descriptor.get('primaryKey', [])
        # The spec allows a single string; normalize to a list for callers
        if not isinstance(primary_key, list):
            primary_key = [primary_key]
        return primary_key

    @property
    def foreign_keys(self):
        """Schema's foreign keys

        # Returns
            dict[]: foreign keys

        """
        foreign_keys = self.__current_descriptor.get('foreignKeys', [])
        # Normalize each key in place: ensure the expected sub-structure
        # exists and that `fields` entries are lists (spec allows strings)
        for key in foreign_keys:
            key.setdefault('fields', [])
            key.setdefault('reference', {})
            key['reference'].setdefault('resource', '')
            key['reference'].setdefault('fields', [])
            if not isinstance(key['fields'], list):
                key['fields'] = [key['fields']]
            if not isinstance(key['reference']['fields'], list):
                key['reference']['fields'] = [key['reference']['fields']]
        return foreign_keys

    @property
    def fields(self):
        """Schema's fields

        # Returns
            Field[]: an array of field instances

        """
        return self.__fields

    @property
    def field_names(self):
        """Schema's field names

        # Returns
            str[]: an array of field names

        """
        return [field.name for field in self.fields]

    def get_field(self, name):
        """Get schema's field by name.

        > Use `table.update_field` if you want to modify the field descriptor

        # Arguments
            name (str): schema field name

        # Returns
            Field/None: `Field` instance or `None` if not found

        """
        for field in self.fields:
            if field.name == name:
                return field
        return None

    def add_field(self, descriptor):
        """ Add new field to schema.

        The schema descriptor will be validated with newly added field descriptor.

        # Arguments
            descriptor (dict): field descriptor

        # Raises
            TableSchemaException: raises any error that occurs during the process

        # Returns
            Field/None: added `Field` instance or `None` if not added

        """
        self.__current_descriptor.setdefault('fields', [])
        self.__current_descriptor['fields'].append(descriptor)
        self.__build()
        return self.__fields[-1]

    def update_field(self, name, update):
        """Update existing descriptor field by name

        The change is made on the *next* descriptor only; call `commit()`
        to apply it.

        # Arguments
            name (str): schema field name
            update (dict): update to apply to field's descriptor

        # Returns
            bool: true on success and false if no field is found to be modified

        """
        for field in self.__next_descriptor['fields']:
            if field['name'] == name:
                field.update(update)
                return True
        return False

    def remove_field(self, name):
        """Remove field resource by name.

        The schema descriptor will be validated after field descriptor removal.

        # Arguments
            name (str): schema field name

        # Raises
            TableSchemaException: raises any error that occurs during the process

        # Returns
            Field/None: removed `Field` instances or `None` if not found

        """
        field = self.get_field(name)
        if field:
            # Keep every field descriptor except the one being removed
            self.__current_descriptor['fields'] = [
                item for item in self.__current_descriptor['fields']
                if item.get('name') != name]
            self.__build()
        return field

    def cast_row(self, row, fail_fast=False, row_number=None, exc_handler=None):
        """Cast row based on field types and formats.

        # Arguments
            row (any[]): data row as an array of values
            fail_fast (bool): if true, re-raise the first cast error
                immediately instead of collecting errors
            row_number (int/None): optional row number used to enrich
                error messages
            exc_handler (func/None): custom error handler; defaults to
                `helpers.default_exc_handler` (which raises the error)

        # Returns
            any[]: returns cast data row

        """
        exc_handler = helpers.default_exc_handler if exc_handler is None else \
            exc_handler

        # Prepare
        result = []
        errors = []
        if row_number is not None:
            row_number_info = ' for row "%s"' % row_number
        else:
            row_number_info = ''

        # Check row length
        if len(row) != len(self.fields):
            message = (
                'Row length %s doesn\'t match fields count %s' +
                row_number_info) % (len(row), len(self.fields))
            exc = exceptions.CastError(message)
            # Some preparations for error reporting, relevant if custom error
            # handling is in place.
            if len(row) < len(self.fields):
                # Treat missing col values as None
                keyed_row = OrderedDict(
                    zip_longest((field.name for field in self.fields), row))
                # Use added None values for further processing
                row = list(keyed_row.values())
            else:
                fields = self.fields
                keyed_row = OrderedDict(
                    # Use extra column number if value index exceeds fields
                    (fields[i].name if fields[i:]
                     else 'tableschema-cast-error-extra-col-{}'.format(i+1),
                     value)
                    for (i, value) in enumerate(row))
            exc_handler(exc, row_number=row_number, row_data=keyed_row,
                        error_data=keyed_row)

        # Cast row
        for field, value in zip(self.fields, row):
            try:
                result.append(field.cast_value(value))
            except exceptions.CastError as exception:
                if fail_fast:
                    raise
                # Wrap original value in a FailedCast object to be able to
                # further process/yield values and to distinguish uncasted
                # values on the consuming side.
                result.append(FailedCast(value))
                errors.append(exception)

        # Raise errors
        if errors:
            message = (
                'There are %s cast errors (see exception.errors)' +
                row_number_info) % len(errors)
            keyed_row = OrderedDict(zip(self.field_names, row))
            # Add the cast failure-causing fields only to error data.
            # Indexing results with the row field index should be ok at this
            # point due to the previous processing.
            error_data = OrderedDict(
                (name, value)
                for (i, (name, value)) in enumerate(keyed_row.items())
                if isinstance(result[i], FailedCast))
            exc_handler(
                exceptions.CastError(message, errors=errors),
                row_number=row_number, row_data=keyed_row,
                error_data=error_data)

        return result

    def infer(self, rows, headers=1, confidence=0.75,
              guesser_cls=None, resolver_cls=None):
        """Infer and set `schema.descriptor` based on data sample.

        # Arguments
            rows (list[]): array of arrays representing rows.
            headers (int/str[]): data sample headers (one of):
                - row number containing headers (`rows` should contain headers rows)
                - array of headers (`rows` should NOT contain headers rows)
            confidence (float): how many casting errors are allowed (as a ratio, between 0 and 1)
            guesser_cls (class): you can implement inferring strategies by
                providing type-guessing and type-resolving classes [experimental]
            resolver_cls (class): you can implement inferring strategies by
                providing type-guessing and type-resolving classes [experimental]

        # Returns
            dict: Table Schema descriptor

        """
        # Get headers
        if isinstance(headers, int):
            # Pop rows up to and including the headers row; the last pop wins
            headers_row = headers
            while True:
                headers_row -= 1
                headers = rows.pop(0)
                if not headers_row:
                    break
        elif isinstance(headers, list):
            # Deduplicate repeated header cells by appending a counter
            seen_cells = []
            headers = list(headers)
            for index, cell in enumerate(headers):
                count = seen_cells.count(cell) + 1
                headers[index] = '%s%s' % (cell, count) if count > 1 else cell
                seen_cells.append(cell)
        elif not isinstance(headers, list):
            headers = []

        # Get descriptor
        missing_values = self.__current_descriptor.get('missingValues', config.DEFAULT_MISSING_VALUES)
        guesser = guesser_cls() if guesser_cls else _TypeGuesser(missing_values)
        resolver = (resolver_cls or _TypeResolver)()
        descriptor = {'fields': [], 'missingValues': missing_values}
        type_matches = {}
        for number, header in enumerate(headers, start=1):
            descriptor['fields'].append({'name': header or 'field%s' % number})
        for index, row in enumerate(rows):
            # Normalize rows with invalid dimensions for sanity
            row_length = len(row)
            headers_length = len(headers)
            if row_length > headers_length:
                row = row[:len(headers)]
            if row_length < headers_length:
                diff = headers_length - row_length
                fill = [''] * diff
                row = row + fill
            # build a column-wise lookup of type matches
            for index, value in enumerate(row):
                rv = guesser.cast(value)
                if type_matches.get(index):
                    type_matches[index].extend(rv)
                else:
                    type_matches[index] = list(rv)
        # choose a type/format for each column based on the matches
        for index, results in type_matches.items():
            rv = resolver.get(results, confidence)
            descriptor['fields'][index].update(**rv)

        # Save descriptor
        self.__current_descriptor = descriptor
        self.__build()

        return descriptor

    def commit(self, strict=None):
        """Update schema instance if there are in-place changes in the descriptor.

        # Example

        ```python
        from tableschema import Schema
        descriptor = {'fields': [{'name': 'my_field', 'title': 'My Field', 'type': 'string'}]}
        schema = Schema(descriptor)
        print(schema.get_field('my_field').descriptor['type']) # string

        # Update descriptor by field position
        schema.descriptor['fields'][0]['type'] = 'number'
        # Update descriptor by field name
        schema.update_field('my_field', {'title': 'My Pretty Field'}) # True

        # Change are not committed
        print(schema.get_field('my_field').descriptor['type']) # string
        print(schema.get_field('my_field').descriptor['title']) # My Field

        # Commit change
        schema.commit()
        print(schema.get_field('my_field').descriptor['type']) # number
        print(schema.get_field('my_field').descriptor['title']) # My Pretty Field
        ```

        # Arguments
            strict (bool): alter `strict` mode for further work

        # Raises
            TableSchemaException: raises any error that occurs during the process

        # Returns
            bool: true on success and false if not modified

        """
        # When `strict` is given we always rebuild so the new mode applies,
        # even if the descriptor itself is unchanged
        if strict is not None:
            self.__strict = strict
        elif self.__current_descriptor == self.__next_descriptor:
            return False
        self.__current_descriptor = deepcopy(self.__next_descriptor)
        self.__build()
        return True

    def save(self, target, ensure_ascii=True):
        """Save schema descriptor to target destination.

        # Arguments
            target (str): path where to save a descriptor
            ensure_ascii (bool): passed through to `json.dump`; if true,
                non-ASCII characters are escaped in the output

        # Raises
            TableSchemaException: raises any error that occurs during the process

        # Returns
            bool: true on success

        """
        mode = 'w'
        encoding = 'utf-8'
        if six.PY2:
            mode = 'wb'
            encoding = None
        helpers.ensure_dir(target)
        with io.open(target, mode=mode, encoding=encoding) as file:
            json.dump(self.__current_descriptor, file, indent=4, ensure_ascii=ensure_ascii)
        # The documented contract is a boolean result; previously nothing
        # was returned (implicit None)
        return True

    # Internal

    def __build(self):
        # Process descriptor
        expand = helpers.expand_schema_descriptor
        self.__current_descriptor = expand(self.__current_descriptor)
        self.__next_descriptor = deepcopy(self.__current_descriptor)

        # Validate descriptor
        try:
            self.__profile.validate(self.__current_descriptor)
            self.__errors = []
        except exceptions.ValidationError as exception:
            self.__errors = exception.errors
            if self.__strict:
                raise exception

        # Populate fields
        self.__fields = []
        for field in self.__current_descriptor.get('fields', []):
            missing_values = self.__current_descriptor['missingValues']
            try:
                field = Field(field, missing_values=missing_values, schema=self)
            except exceptions.TableSchemaException as e:
                if self.__strict:
                    raise e
                else:
                    # In non-strict mode a broken field is represented by False
                    field = False
            self.__fields.append(field)

    # Deprecated

    headers = field_names
    has_field = get_field
class FailedCast(object):
    """Wrap an original data field value that failed to be properly casted.

    FailedCast allows for further processing/yielding values but still be able
    to distinguish uncasted values on the consuming side.

    Delegates attribute access and the basic rich comparison methods to the
    underlying object. Supports default user-defined classes hashability i.e.
    is hashable based on object identity (not based on the wrapped value).

    # Arguments
        value (any): value

    """

    # Keep instances "reasonably immutable": no extra attributes can be set
    # and the wrapped value cannot be rebound after construction.
    __slots__ = ('_value',)

    def __init__(self, value):
        self._value = value

    @property
    def value(self):
        """The original, uncasted value."""
        return self._value

    def __repr__(self):
        return 'FailedCast(%r)' % self._value

    def __getattr__(self, name):
        # Fall through to the wrapped value for any unknown attribute
        return getattr(self._value, name)

    # Rich comparisons compare the wrapped value against the operand directly

    def __eq__(self, rhs):
        return self._value == rhs

    def __ne__(self, rhs):
        return self._value != rhs

    def __lt__(self, rhs):
        return self._value < rhs

    def __le__(self, rhs):
        return self._value <= rhs

    def __gt__(self, rhs):
        return self._value > rhs

    def __ge__(self, rhs):
        return self._value >= rhs

    def __hash__(self):
        # Identity-based hashing, deliberately NOT derived from the value
        return object.__hash__(self)
# Internal
# Candidate types ordered from most to least specific. `_TypeGuesser.cast`
# tries them in this order and records each entry's index as its priority;
# `_TypeResolver.get` breaks ties by preferring the lowest priority index,
# so a value castable as several types is inferred as the most specific one.
_INFER_TYPE_ORDER = [
    'duration',
    'geojson',
    'geopoint',
    'object',
    'array',
    'datetime',
    'time',
    'date',
    'integer',
    'number',
    'boolean',
    'string',
    'any',
]
class _TypeGuesser(object):
    """Guess the type for a value.

    `cast` is a generator yielding one ('type', 'format', priority) tuple for
    every type in `_INFER_TYPE_ORDER` that the value casts to successfully.

    # Arguments
        missing_values (str[]): values to treat as missing (they match no type)

    """

    # Public

    def __init__(self, missing_values):
        self.missing_values = missing_values

    def cast(self, value):
        # The missing-value check does not depend on the candidate type, so
        # hoist it out of the loop: a missing value yields nothing at all
        # (previously the same check ran — and `getattr` was evaluated — on
        # every iteration with an identical outcome).
        if value in self.missing_values:
            return
        for priority, name in enumerate(_INFER_TYPE_ORDER):
            cast = getattr(types, 'cast_%s' % name)
            result = cast('default', value)
            if result != config.ERROR:
                yield (name, 'default', priority)
class _TypeResolver(object):
"""Get the best matching type/format from a list of possible ones.
"""
# Public
def get(self, results, confidence):
variants = set(results)
# only one candidate... that's easy.
if len(variants) == 1:
rv = {'type': results[0][0], 'format': results[0][1]}
else:
counts = {}
for result in results:
if counts.get(result):
counts[result] += 1
else:
counts[result] = 1
# tuple representation of `counts` dict sorted by values
sorted_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)
if not sorted_counts:
return {'type': 'string', 'format': 'default'}
# Allow also counts that are not the max, based on the confidence
max_count = sorted_counts[0][1]
sorted_counts = filter(lambda item: item[1] >= max_count * confidence,
sorted_counts)
# Choose the most specific data type
sorted_counts = sorted(sorted_counts,
key=lambda item: item[0][2])
rv = {'type': sorted_counts[0][0][0], 'format': sorted_counts[0][0][1]}
return rv