Skip to content

Commit

Permalink
Added the confidence parameter to infer (#211)
Browse files Browse the repository at this point in the history
* Fix inferring algorithm, make sure to choose the lowest common denominator

* Add confidence measure to infer
  • Loading branch information
akariv authored and roll committed May 29, 2018
1 parent a3a81d7 commit 81f8edd
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 25 deletions.
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,11 +210,12 @@ Read the whole table and returns as array of rows. Count of rows could be limite
- `(exceptions.TableSchemaException)` - raises any error occurred in this process
- `(list[])` - returns array of rows (see `table.iter`)

#### `table.infer(limit=100)`
#### `table.infer(limit=100, confidence=0.75)`

Infer a schema for the table. It will infer and set Table Schema to `table.schema` based on table data.

- `limit (int)` - limit rows samle size
- `limit (int)` - limit rows sample size
- `confidence (float)` - how many casting errors are allowed (as a ratio, between 0 and 1)
- `(dict)` - returns Table Schema descriptor

#### `table.save(target, storage=None, **options)`
Expand Down Expand Up @@ -360,14 +361,15 @@ Cast row based on field types and formats.
- `row (any[])` - data row as an array of values
- `(any[])` - returns cast data row

#### `schema.infer(rows, headers=1)`
#### `schema.infer(rows, headers=1, confidence=0.75)`

Infer and set `schema.descriptor` based on data sample.

- `rows (list[])` - array of arrays representing rows.
- `headers (int/str[])` - data sample headers (one of):
- row number containing headers (`rows` should contain headers rows)
- array of headers (`rows` should NOT contain headers rows)
- `confidence (float)` - how many casting errors are allowed (as a ratio, between 0 and 1)
- `{dict}` - returns Table Schema descriptor

#### `schema.commit(strict=None)`
Expand Down Expand Up @@ -533,12 +535,13 @@ descriptor = infer('data_to_infer.csv')

The number of rows used by `infer` can be limited with the `limit` argument.

#### `infer(source, headers=1, limit=100, **options)`
#### `infer(source, headers=1, limit=100, confidence=0.75, **options)`

Infer source schema.

- `source (any)` - source as path, url or inline data
- `headers (int/str[])` - row number containing headers, or a list of headers
- `confidence (float)` - how many casting errors are allowed (as a ratio, between 0 and 1)
- `(exceptions.TableSchemaException)` - raises any error occurred in the process
- `(dict)` - returns schema descriptor

Expand Down
8 changes: 6 additions & 2 deletions tableschema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ def info():
@main.command()
@click.argument('data')
@click.option('--row_limit', default=100, type=int)
@click.option('--confidence', default=0.75, type=float)
@click.option('--encoding', default='utf-8')
@click.option('--to_file')
def infer(data, row_limit, encoding, to_file):
def infer(data, row_limit, confidence, encoding, to_file):
"""Infer a schema from data.
* data must be a local filepath
Expand All @@ -41,7 +42,10 @@ def infer(data, row_limit, encoding, to_file):
* the first line of data must be headers
* these constraints are just for the CLI
"""
descriptor = tableschema.infer(data, encoding=encoding, limit=row_limit)
descriptor = tableschema.infer(data,
encoding=encoding,
limit=row_limit,
confidence=confidence)
if to_file:
with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
dest.write(json.dumps(descriptor, ensure_ascii=False, indent=4))
Expand Down
4 changes: 2 additions & 2 deletions tableschema/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# Module API

def infer(source, headers=1, limit=100, **options):
def infer(source, headers=1, limit=100, confidence=0.75, **options):
"""https://github.com/frictionlessdata/tableschema-py#schema
"""

Expand All @@ -22,5 +22,5 @@ def infer(source, headers=1, limit=100, **options):
source, headers = headers, source

table = Table(source, headers=headers, **options)
descriptor = table.infer(limit=limit)
descriptor = table.infer(limit=limit, confidence=confidence)
return descriptor
29 changes: 17 additions & 12 deletions tableschema/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def cast_row(self, row, fail_fast=False):

return result

def infer(self, rows, headers=1):
def infer(self, rows, headers=1, confidence=0.75):
"""https://github.com/frictionlessdata/tableschema-py#schema
"""

Expand Down Expand Up @@ -187,12 +187,12 @@ def infer(self, rows, headers=1):
for index, value in enumerate(row):
rv = guesser.cast(value)
if type_matches.get(index):
type_matches[index].append(rv)
type_matches[index].extend(rv)
else:
type_matches[index] = [rv]
type_matches[index] = list(rv)
# choose a type/format for each column based on the matches
for index, results in type_matches.items():
rv = resolver.get(results)
rv = resolver.get(results, confidence)
descriptor['fields'][index].update(**rv)

# Save descriptor
Expand Down Expand Up @@ -284,11 +284,11 @@ class _TypeGuesser(object):
# Public

def cast(self, value):
for name in _INFER_TYPE_ORDER:
for priority, name in enumerate(_INFER_TYPE_ORDER):
cast = getattr(types, 'cast_%s' % name)
result = cast('default', value)
if result != config.ERROR:
return (name, 'default')
yield (name, 'default', priority)


class _TypeResolver(object):
Expand All @@ -297,11 +297,7 @@ class _TypeResolver(object):

# Public

@staticmethod
def _sort_key(item):
return (item[1], _INFER_TYPE_ORDER.index(item[0][0]))

def get(self, results):
def get(self, results, confidence):
variants = set(results)
# only one candidate... that's easy.
if len(variants) == 1:
Expand All @@ -314,6 +310,15 @@ def get(self, results):
else:
counts[result] = 1
# tuple representation of `counts` dict sorted by values
sorted_counts = sorted(counts.items(), key=self._sort_key, reverse=True)
sorted_counts = sorted(counts.items(),
key=lambda item: item[1],
reverse=True)
# Allow also counts that are not the max, based on the confidence
max_count = sorted_counts[0][1]
sorted_counts = filter(lambda item: item[1] >= max_count * confidence,
sorted_counts)
# Choose the most specific data type
sorted_counts = sorted(sorted_counts,
key=lambda item: item[0][2])
rv = {'type': sorted_counts[0][0][0], 'format': sorted_counts[0][0][1]}
return rv
6 changes: 4 additions & 2 deletions tableschema/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def read(self, keyed=False, extended=False, cast=True, relations=False, limit=No
break
return result

def infer(self, limit=100):
def infer(self, limit=100, confidence=0.75):
"""https://github.com/frictionlessdata/tableschema-py#schema
"""
if self.__schema is None or self.__headers is None:
Expand All @@ -146,7 +146,9 @@ def infer(self, limit=100):
with self.__stream as stream:
if self.__schema is None:
self.__schema = Schema()
self.__schema.infer(stream.sample[:limit], headers=stream.headers)
self.__schema.infer(stream.sample[:limit],
headers=stream.headers,
confidence=confidence)
if self.__headers is None:
self.__headers = stream.headers

Expand Down
22 changes: 19 additions & 3 deletions tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,20 +161,36 @@ def test_save(tmpdir, apply_defaults):


def test_infer():
schema = Schema()
schema.infer([
data = [
['id', 'age', 'name'],
['1','39','Paul'],
['2','23','Jimmy'],
['3','36','Jane'],
['4','N/A','Judy'],
])
]
schema = Schema()
schema.infer(data)
assert schema.descriptor == {
'fields': [
{'format': 'default', 'name': 'id', 'type': 'integer'},
{'format': 'default', 'name': 'age', 'type': 'integer'},
{'format': 'default', 'name': 'name', 'type': 'string'}],
'missingValues': ['']}
data = [
['id', 'age', 'name'],
['1','39','Paul'],
['2','23','Jimmy'],
['3','36','Jane'],
['4','N/A','Judy'],
]
schema = Schema()
schema.infer(data, confidence=0.8)
assert schema.descriptor == {
'fields': [
{'format': 'default', 'name': 'id', 'type': 'integer'},
{'format': 'default', 'name': 'age', 'type': 'string'},
{'format': 'default', 'name': 'name', 'type': 'string'}],
'missingValues': ['']}


def test_add_remove_field():
Expand Down

0 comments on commit 81f8edd

Please sign in to comment.