diff --git a/README.md b/README.md index 54fbe8a..3a30cf6 100644 --- a/README.md +++ b/README.md @@ -210,11 +210,12 @@ Read the whole table and returns as array of rows. Count of rows could be limite - `(exceptions.TableSchemaException)` - raises any error occured in this process - `(list[])` - returns array of rows (see `table.iter`) -#### `table.infer(limit=100)` +#### `table.infer(limit=100, confidence=0.75)` Infer a schema for the table. It will infer and set Table Schema to `table.schema` based on table data. -- `limit (int)` - limit rows samle size +- `limit (int)` - limit rows sample size +- `confidence (float)` - minimum ratio (between 0 and 1) of values that must cast to a type, relative to the best-matching type, for that type to be considered; e.g. `0.75` tolerates up to 25% casting errors - `(dict)` - returns Table Schema descriptor #### `table.save(target, storage=None, **options)` @@ -360,7 +361,7 @@ Cast row based on field types and formats. - `row (any[])` - data row as an array of values - `(any[])` - returns cast data row -#### `schema.infer(rows, headers=1)` +#### `schema.infer(rows, headers=1, confidence=0.75)` Infer and set `schema.descriptor` based on data sample. @@ -368,6 +369,7 @@ Infer and set `schema.descriptor` based on data sample. - `headers (int/str[])` - data sample headers (one of): - row number containing headers (`rows` should contain headers rows) - array of headers (`rows` should NOT contain headers rows) +- `confidence (float)` - minimum ratio (between 0 and 1) of values that must cast to a type, relative to the best-matching type, for that type to be considered; e.g. `0.75` tolerates up to 25% casting errors - `{dict}` - returns Table Schema descriptor #### `schema.commit(strict=None)` @@ -533,12 +535,13 @@ descriptor = infer('data_to_infer.csv') The number of rows used by `infer` can be limited with the `limit` argument. -#### `infer(source, headers=1, limit=100, **options)` +#### `infer(source, headers=1, limit=100, confidence=0.75, **options)` Infer source schema. 
- `source (any)` - source as path, url or inline data - `headers (int/str[])` - headers rows number or headers list +- `confidence (float)` - minimum ratio (between 0 and 1) of values that must cast to a type, relative to the best-matching type, for that type to be considered; e.g. `0.75` tolerates up to 25% casting errors - `(exceptions.TableSchemaException)` - raises any error occured in the process - `(dict)` - returns schema descriptor diff --git a/tableschema/cli.py b/tableschema/cli.py index a095b12..3772ca6 100644 --- a/tableschema/cli.py +++ b/tableschema/cli.py @@ -29,9 +29,10 @@ def info(): @main.command() @click.argument('data') @click.option('--row_limit', default=100, type=int) +@click.option('--confidence', default=0.75, type=float) @click.option('--encoding', default='utf-8') @click.option('--to_file') -def infer(data, row_limit, encoding, to_file): +def infer(data, row_limit, confidence, encoding, to_file): """Infer a schema from data. * data must be a local filepath @@ -41,7 +42,10 @@ def infer(data, row_limit, encoding, to_file): * the first line of data must be headers * these constraints are just for the CLI """ - descriptor = tableschema.infer(data, encoding=encoding, limit=row_limit) + descriptor = tableschema.infer(data, + encoding=encoding, + limit=row_limit, + confidence=confidence) if to_file: with io.open(to_file, mode='w+t', encoding='utf-8') as dest: dest.write(json.dumps(descriptor, ensure_ascii=False, indent=4)) diff --git a/tableschema/infer.py b/tableschema/infer.py index d3010ec..9531adc 100644 --- a/tableschema/infer.py +++ b/tableschema/infer.py @@ -11,7 +11,7 @@ # Module API -def infer(source, headers=1, limit=100, **options): +def infer(source, headers=1, limit=100, confidence=0.75, **options): """https://github.com/frictionlessdata/tableschema-py#schema """ @@ -22,5 +22,5 @@ def infer(source, headers=1, limit=100, **options): source, headers = headers, source table = Table(source, headers=headers, **options) - descriptor = table.infer(limit=limit) + descriptor = table.infer(limit=limit, confidence=confidence) return descriptor diff --git 
a/tableschema/schema.py b/tableschema/schema.py index 6e56ebd..c104ae0 100644 --- a/tableschema/schema.py +++ b/tableschema/schema.py @@ -153,7 +153,7 @@ def cast_row(self, row, fail_fast=False): return result - def infer(self, rows, headers=1): + def infer(self, rows, headers=1, confidence=0.75): """https://github.com/frictionlessdata/tableschema-py#schema """ @@ -187,12 +187,12 @@ def infer(self, rows, headers=1): for index, value in enumerate(row): rv = guesser.cast(value) if type_matches.get(index): - type_matches[index].append(rv) + type_matches[index].extend(rv) else: - type_matches[index] = [rv] + type_matches[index] = list(rv) # choose a type/format for each column based on the matches for index, results in type_matches.items(): - rv = resolver.get(results) + rv = resolver.get(results, confidence) descriptor['fields'][index].update(**rv) # Save descriptor @@ -284,11 +284,11 @@ class _TypeGuesser(object): # Public def cast(self, value): - for name in _INFER_TYPE_ORDER: + for priority, name in enumerate(_INFER_TYPE_ORDER): cast = getattr(types, 'cast_%s' % name) result = cast('default', value) if result != config.ERROR: - return (name, 'default') + yield (name, 'default', priority) class _TypeResolver(object): @@ -297,11 +297,7 @@ class _TypeResolver(object): # Public - @staticmethod - def _sort_key(item): - return (item[1], _INFER_TYPE_ORDER.index(item[0][0])) - - def get(self, results): + def get(self, results, confidence): variants = set(results) # only one candidate... that's easy. 
if len(variants) == 1: @@ -314,6 +310,15 @@ def get(self, results): else: counts[result] = 1 # tuple representation of `counts` dict sorted by values - sorted_counts = sorted(counts.items(), key=self._sort_key, reverse=True) + sorted_counts = sorted(counts.items(), + key=lambda item: item[1], + reverse=True) + # Allow also counts that are not the max, based on the confidence + max_count = sorted_counts[0][1] + sorted_counts = filter(lambda item: item[1] >= max_count * confidence, + sorted_counts) + # Choose the most specific data type + sorted_counts = sorted(sorted_counts, + key=lambda item: item[0][2]) rv = {'type': sorted_counts[0][0][0], 'format': sorted_counts[0][0][1]} return rv diff --git a/tableschema/table.py b/tableschema/table.py index a22b864..4b6de46 100644 --- a/tableschema/table.py +++ b/tableschema/table.py @@ -136,7 +136,7 @@ def read(self, keyed=False, extended=False, cast=True, relations=False, limit=No break return result - def infer(self, limit=100): + def infer(self, limit=100, confidence=0.75): """https://github.com/frictionlessdata/tableschema-py#schema """ if self.__schema is None or self.__headers is None: @@ -146,7 +146,9 @@ def infer(self, limit=100): with self.__stream as stream: if self.__schema is None: self.__schema = Schema() - self.__schema.infer(stream.sample[:limit], headers=stream.headers) + self.__schema.infer(stream.sample[:limit], + headers=stream.headers, + confidence=confidence) if self.__headers is None: self.__headers = stream.headers diff --git a/tests/test_schema.py b/tests/test_schema.py index a02924e..6e00bab 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -161,20 +161,36 @@ def test_save(tmpdir, apply_defaults): def test_infer(): - schema = Schema() - schema.infer([ + data = [ ['id', 'age', 'name'], ['1','39','Paul'], ['2','23','Jimmy'], ['3','36','Jane'], ['4','N/A','Judy'], - ]) + ] + schema = Schema() + schema.infer(data) assert schema.descriptor == { 'fields': [ {'format': 'default', 'name': 'id', 
'type': 'integer'}, {'format': 'default', 'name': 'age', 'type': 'integer'}, {'format': 'default', 'name': 'name', 'type': 'string'}], 'missingValues': ['']} + data = [ + ['id', 'age', 'name'], + ['1','39','Paul'], + ['2','23','Jimmy'], + ['3','36','Jane'], + ['4','N/A','Judy'], + ] + schema = Schema() + schema.infer(data, confidence=0.8) + assert schema.descriptor == { + 'fields': [ + {'format': 'default', 'name': 'id', 'type': 'integer'}, + {'format': 'default', 'name': 'age', 'type': 'string'}, + {'format': 'default', 'name': 'name', 'type': 'string'}], + 'missingValues': ['']} def test_add_remove_field():