Skip to content

Commit

Permalink
Added the confidence parameter to infer (#211)
Browse files Browse the repository at this point in the history
* Fix inferring algorithm, make sure to choose the lowest common denominator

* Add confidence measure to infer
  • Loading branch information
akariv authored and roll committed May 29, 2018
1 parent a3a81d7 commit 81f8edd
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 25 deletions.
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,11 +210,12 @@ Read the whole table and returns as array of rows. Count of rows could be limite
- `(exceptions.TableSchemaException)` - raises any error occurred in this process
- `(list[])` - returns array of rows (see `table.iter`)

#### `table.infer(limit=100)`
#### `table.infer(limit=100, confidence=0.75)`

Infer a schema for the table. It will infer and set Table Schema to `table.schema` based on table data.

- `limit (int)` - limit rows samle size
- `limit (int)` - limit rows sample size
- `confidence (float)` - how many casting errors are allowed (as a ratio, between 0 and 1)
- `(dict)` - returns Table Schema descriptor

#### `table.save(target, storage=None, **options)`
Expand Down Expand Up @@ -360,14 +361,15 @@ Cast row based on field types and formats.
- `row (any[])` - data row as an array of values
- `(any[])` - returns cast data row

#### `schema.infer(rows, headers=1)`
#### `schema.infer(rows, headers=1, confidence=0.75)`

Infer and set `schema.descriptor` based on data sample.

- `rows (list[])` - array of arrays representing rows.
- `headers (int/str[])` - data sample headers (one of):
- row number containing headers (`rows` should contain headers rows)
- array of headers (`rows` should NOT contain headers rows)
- `confidence (float)` - how many casting errors are allowed (as a ratio, between 0 and 1)
- `{dict}` - returns Table Schema descriptor

#### `schema.commit(strict=None)`
Expand Down Expand Up @@ -533,12 +535,13 @@ descriptor = infer('data_to_infer.csv')

The number of rows used by `infer` can be limited with the `limit` argument.

#### `infer(source, headers=1, limit=100, **options)`
#### `infer(source, headers=1, limit=100, confidence=0.75, **options)`

Infer source schema.

- `source (any)` - source as path, url or inline data
- `headers (int/str[])` - row number containing headers, or a list of headers
- `confidence (float)` - how many casting errors are allowed (as a ratio, between 0 and 1)
- `(exceptions.TableSchemaException)` - raises any error occurred in the process
- `(dict)` - returns schema descriptor

Expand Down
8 changes: 6 additions & 2 deletions tableschema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ def info():
@main.command()
@click.argument('data')
@click.option('--row_limit', default=100, type=int)
@click.option('--confidence', default=0.75, type=float)
@click.option('--encoding', default='utf-8')
@click.option('--to_file')
def infer(data, row_limit, encoding, to_file):
def infer(data, row_limit, confidence, encoding, to_file):
"""Infer a schema from data.
* data must be a local filepath
Expand All @@ -41,7 +42,10 @@ def infer(data, row_limit, encoding, to_file):
* the first line of data must be headers
* these constraints are just for the CLI
"""
descriptor = tableschema.infer(data, encoding=encoding, limit=row_limit)
descriptor = tableschema.infer(data,
encoding=encoding,
limit=row_limit,
confidence=confidence)
if to_file:
with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
dest.write(json.dumps(descriptor, ensure_ascii=False, indent=4))
Expand Down
4 changes: 2 additions & 2 deletions tableschema/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# Module API

def infer(source, headers=1, limit=100, **options):
def infer(source, headers=1, limit=100, confidence=0.75, **options):
"""https://github.com/frictionlessdata/tableschema-py#schema
"""

Expand All @@ -22,5 +22,5 @@ def infer(source, headers=1, limit=100, **options):
source, headers = headers, source

table = Table(source, headers=headers, **options)
descriptor = table.infer(limit=limit)
descriptor = table.infer(limit=limit, confidence=confidence)
return descriptor
29 changes: 17 additions & 12 deletions tableschema/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def cast_row(self, row, fail_fast=False):

return result

def infer(self, rows, headers=1):
def infer(self, rows, headers=1, confidence=0.75):
"""https://github.com/frictionlessdata/tableschema-py#schema
"""

Expand Down Expand Up @@ -187,12 +187,12 @@ def infer(self, rows, headers=1):
for index, value in enumerate(row):
rv = guesser.cast(value)
if type_matches.get(index):
type_matches[index].append(rv)
type_matches[index].extend(rv)
else:
type_matches[index] = [rv]
type_matches[index] = list(rv)
# choose a type/format for each column based on the matches
for index, results in type_matches.items():
rv = resolver.get(results)
rv = resolver.get(results, confidence)
descriptor['fields'][index].update(**rv)

# Save descriptor
Expand Down Expand Up @@ -284,11 +284,11 @@ class _TypeGuesser(object):
# Public

def cast(self, value):
for name in _INFER_TYPE_ORDER:
for priority, name in enumerate(_INFER_TYPE_ORDER):
cast = getattr(types, 'cast_%s' % name)
result = cast('default', value)
if result != config.ERROR:
return (name, 'default')
yield (name, 'default', priority)


class _TypeResolver(object):
Expand All @@ -297,11 +297,7 @@ class _TypeResolver(object):

# Public

@staticmethod
def _sort_key(item):
return (item[1], _INFER_TYPE_ORDER.index(item[0][0]))

def get(self, results):
def get(self, results, confidence):
variants = set(results)
# only one candidate... that's easy.
if len(variants) == 1:
Expand All @@ -314,6 +310,15 @@ def get(self, results):
else:
counts[result] = 1
# tuple representation of `counts` dict sorted by values
sorted_counts = sorted(counts.items(), key=self._sort_key, reverse=True)
sorted_counts = sorted(counts.items(),
key=lambda item: item[1],
reverse=True)
# Allow also counts that are not the max, based on the confidence
max_count = sorted_counts[0][1]
sorted_counts = filter(lambda item: item[1] >= max_count * confidence,
sorted_counts)
# Choose the most specific data type
sorted_counts = sorted(sorted_counts,
key=lambda item: item[0][2])
rv = {'type': sorted_counts[0][0][0], 'format': sorted_counts[0][0][1]}
return rv
6 changes: 4 additions & 2 deletions tableschema/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def read(self, keyed=False, extended=False, cast=True, relations=False, limit=No
break
return result

def infer(self, limit=100):
def infer(self, limit=100, confidence=0.75):
"""https://github.com/frictionlessdata/tableschema-py#schema
"""
if self.__schema is None or self.__headers is None:
Expand All @@ -146,7 +146,9 @@ def infer(self, limit=100):
with self.__stream as stream:
if self.__schema is None:
self.__schema = Schema()
self.__schema.infer(stream.sample[:limit], headers=stream.headers)
self.__schema.infer(stream.sample[:limit],
headers=stream.headers,
confidence=confidence)
if self.__headers is None:
self.__headers = stream.headers

Expand Down
22 changes: 19 additions & 3 deletions tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,20 +161,36 @@ def test_save(tmpdir, apply_defaults):


def test_infer():
schema = Schema()
schema.infer([
data = [
['id', 'age', 'name'],
['1','39','Paul'],
['2','23','Jimmy'],
['3','36','Jane'],
['4','N/A','Judy'],
])
]
schema = Schema()
schema.infer(data)
assert schema.descriptor == {
'fields': [
{'format': 'default', 'name': 'id', 'type': 'integer'},
{'format': 'default', 'name': 'age', 'type': 'integer'},
{'format': 'default', 'name': 'name', 'type': 'string'}],
'missingValues': ['']}
data = [
['id', 'age', 'name'],
['1','39','Paul'],
['2','23','Jimmy'],
['3','36','Jane'],
['4','N/A','Judy'],
]
schema = Schema()
schema.infer(data, confidence=0.8)
assert schema.descriptor == {
'fields': [
{'format': 'default', 'name': 'id', 'type': 'integer'},
{'format': 'default', 'name': 'age', 'type': 'string'},
{'format': 'default', 'name': 'name', 'type': 'string'}],
'missingValues': ['']}


def test_add_remove_field():
Expand Down

0 comments on commit 81f8edd

Please sign in to comment.