
Commit b296f5e

Merge branch 'develop' into bugfix_in_calc_map_expectation_success
jcampbell committed Sep 8, 2018
2 parents 833b420 + 57b3b4d commit b296f5e
Showing 10 changed files with 367 additions and 71 deletions.
1 change: 1 addition & 0 deletions great_expectations/__init__.py
@@ -1,5 +1,6 @@
from .util import *
from great_expectations import dataset
from great_expectations.data_context import get_data_context
from .file_expectations import *

from .version import __version__
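
With this import in place, the new file expectations become callable from the package top level. A quick sketch (the fixture path comes from the tests in this commit; it assumes the package is installed and run from the repo root):

import great_expectations as ge

ge.expect_file_to_exist('./tests/test_sets/Titanic.csv')  # True if the fixture is present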
226 changes: 226 additions & 0 deletions great_expectations/file_expectations.py
@@ -0,0 +1,226 @@
# This code is early and experimental. Eventually it will live in a RawFileDataset class, per issue 321. In the
# meantime, please use, enjoy, and share feedback.

import csv
import hashlib
import json
import jsonschema
import os.path

def expect_file_hash_to_equal(filename, value, hash_alg='md5'):
    """
    Return True or False indicating whether the hash matches the specified value for the default (md5) or
    user-specified hash algorithm.
    Parameters
    ----------
    filename : string
        file on which the hash is computed
    value : string
        value to compare to computed hash
    hash_alg : string, default='md5'
        hash algorithm to use. See hashlib.algorithms_available for supported algorithms.
    Returns
    -------
    True if the computed hash matches the specified value; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading the specified file
    ValueError
        if the specified hash algorithm is not defined by hashlib
    """
    success = False
    try:
        hash = hashlib.new(hash_alg)
        # Limit file reads to 64 KB chunks at a time
        BLOCKSIZE = 65536
        try:
            with open(filename, 'rb') as file:
                file_buffer = file.read(BLOCKSIZE)
                while len(file_buffer) > 0:
                    hash.update(file_buffer)
                    file_buffer = file.read(BLOCKSIZE)
                success = hash.hexdigest() == value
        except IOError:
            raise
    except ValueError:
        raise
    return success
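
A usage sketch; the digests below are the fixture's known MD5 and SHA-256 values, taken from the tests later in this commit:

import great_expectations as ge

test_file = './tests/test_sets/Titanic.csv'
# MD5 is the default algorithm.
ge.expect_file_hash_to_equal(test_file, value='63188432302f3a6e8c9e9c500ff27c8a')
# Any algorithm in hashlib.algorithms_available is accepted, e.g. SHA-256.
ge.expect_file_hash_to_equal(test_file,
                             value='f89f46423b017a1fc6a4059d81bddb3ff64891e3c81250fafad6f3b3113ecc9b',
                             hash_alg='sha256')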

def expect_file_size_to_be_between(filename, minsize, maxsize):
    """
    Return True or False indicating whether the file size (in bytes) is (inclusively) between two values.
    Parameters
    ----------
    filename : string
        file to check file size
    minsize : integer
        minimum file size
    maxsize : integer
        maximum file size
    Returns
    -------
    True if the file size is between minsize and maxsize (inclusive); False otherwise
    Raises
    ------
    OSError
        if there is a problem reading the specified file
    TypeError
        if minsize or maxsize are not integers
    ValueError
        if there is a problem with the integer value of minsize or maxsize
    """
    try:
        size = os.path.getsize(filename)
    except OSError:
        raise
    if type(minsize) != int:
        raise TypeError('minsize must be an integer')
    if type(maxsize) != int:
        raise TypeError('maxsize must be an integer')
    if minsize < 0:
        raise ValueError('minsize must be greater than or equal to 0')
    if maxsize < 0:
        raise ValueError('maxsize must be greater than or equal to 0')
    if minsize > maxsize:
        raise ValueError('maxsize must be greater than or equal to minsize')
    if (size >= minsize) and (size <= maxsize):
        return True
    else:
        return False
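
A minimal sketch for the size check; the bounds here are illustrative assumptions, not values from this commit:

import great_expectations as ge

# True only if the file is between 1 byte and 10 MB, inclusive.
ge.expect_file_size_to_be_between('./tests/test_sets/Titanic.csv',
                                  minsize=1, maxsize=10000000)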

def expect_file_to_exist(filename):
    """
    Return True or False indicating whether the specified file exists.
    Parameters
    ----------
    filename : string
        file to check for existence
    Returns
    -------
    True if the specified file exists; False otherwise
    """
    if os.path.isfile(filename):
        return True
    else:
        return False

def expect_file_unique_column_names_csv(filename,
                                        skipLines=0,
                                        sep=',',
                                        quoteChar='"',
                                        quot=csv.QUOTE_MINIMAL,
                                        doubleQuote=True,
                                        skipInitialSpace=False,
                                        escapeChar=None):
    """
    Return True or False indicating whether the specified CSV file has unique column names.
    Parameters
    ----------
    filename : string
        CSV file to check for unique column names
    skipLines : integer
        number of rows or lines to skip before reading the header line
    sep : string
        delimiter used for parsing CSV lines
    quoteChar : string
        one-character string used to quote fields containing special characters
    quot : integer, default=csv.QUOTE_MINIMAL
        controls when quotes should be recognised by the reader
    doubleQuote : boolean
        controls how instances of quoteChar appearing inside a field should themselves be quoted
    skipInitialSpace : boolean
        when True, whitespace immediately following the delimiter is ignored
    escapeChar : string
        one-character string which removes any special meaning from the following character
    Returns
    -------
    True if the column names are unique; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading the specified file
    csv.Error
        if there is an error in the csv methods
    """
    success = False
    try:
        with open(filename, 'r') as f:
            reader = csv.reader(f,
                                delimiter=sep,
                                quotechar=quoteChar,
                                quoting=quot,
                                doublequote=doubleQuote,
                                skipinitialspace=skipInitialSpace,
                                escapechar=escapeChar,
                                strict=True)
            if skipLines > 0:
                for i in range(0, skipLines):
                    next(reader)
            colnames = next(reader)
            if len(set(colnames)) == len(colnames):
                success = True
    except IOError:
        raise
    except csv.Error:
        raise
    return success
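
A sketch of the header-uniqueness check; data/report.csv and its layout (two leading comment lines, ';' delimiter) are hypothetical:

import great_expectations as ge

# Defaults mirror the csv module: comma-delimited, minimal quoting.
ge.expect_file_unique_column_names_csv('./tests/test_sets/Titanic.csv')
# Skip two leading comment lines and use ';' as the delimiter (hypothetical file).
ge.expect_file_unique_column_names_csv('data/report.csv', skipLines=2, sep=';')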


def expect_file_valid_json(filename, schema=None):
    """
    Return True or False indicating whether the specified file is valid JSON.
    Parameters
    ----------
    filename : string
        file to test as valid JSON
    schema : string
        optional JSON schema file against which the JSON data file is validated
    Returns
    -------
    True if the file is valid JSON; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading a specified file
    ValueError
        if a file cannot be parsed as JSON when a schema is provided
    jsonschema.SchemaError
        if there is an error in the provided schema
    """
    success = False
    if schema is None:
        try:
            with open(filename, 'r') as f:
                json.load(f)
            success = True
        except ValueError:
            success = False
    else:
        try:
            with open(schema, 'r') as s:
                schema_data = s.read()
                sdata = json.loads(schema_data)
            with open(filename, 'r') as f:
                json_data = f.read()
                jdata = json.loads(json_data)
            jsonschema.validate(jdata, sdata)
            success = True
        except jsonschema.ValidationError:
            success = False
        except jsonschema.SchemaError:
            raise
        except:
            raise
    return success
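
A sketch of the JSON expectation; config.json and config_schema.json are hypothetical file names:

import great_expectations as ge

ge.expect_file_valid_json('config.json')                               # well-formedness only
ge.expect_file_valid_json('config.json', schema='config_schema.json')  # also validate against a JSON Schema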
44 changes: 0 additions & 44 deletions great_expectations/util.py
@@ -1,4 +1,3 @@
import hashlib
import pandas as pd
import json

@@ -144,46 +143,3 @@ def __getattr__(self, attr):
    __delattr__ = dict.__delitem__
    def __dir__(self):
        return self.keys()

def expect_file_hash_to_equal(filename, value, hash_alg='md5'):
    """
    Return True or False indicating whether the hash matches the specified value for the default (md5) or
    user-specified hash algorithm.
    Parameters
    ----------
    filename : string
        file on which the hash is computed
    value : string
        value to compare to computed hash
    hash_alg : string, default='md5'
        hash algorithm to use. See hashlib.algorithms_available for supported algorithms.
    Returns
    -------
    True if the computed hash matches the specified value; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading the specified file
    ValueError
        if the specified hash algorithm is not defined by hashlib
    """
    success = False
    try:
        hash = hashlib.new(hash_alg)
        # Limit file reads to 64 KB chunks at a time
        BLOCKSIZE = 65536
        try:
            with open(filename, 'rb') as file:
                file_buffer = file.read(BLOCKSIZE)
                while len(file_buffer) > 0:
                    hash.update(file_buffer)
                    file_buffer = file.read(BLOCKSIZE)
                success = hash.hexdigest() == value
        except IOError:
            raise
    except ValueError:
        raise
    return success
27 changes: 0 additions & 27 deletions tests/test_dataset_util.py
@@ -200,33 +200,6 @@ def test_recursively_convert_to_json_serializable(self):
        except NameError:
            pass



    def test_expect_file_hash_to_equal(self):
        test_file = './tests/test_sets/Titanic.csv'
        # Test for non-existent file
        try:
            ge.expect_file_hash_to_equal('abc', value='abc')
        except IOError:
            pass
        # Test for non-existent hash algorithm
        try:
            ge.expect_file_hash_to_equal(test_file,
                                         hash_alg='md51',
                                         value='abc')
        except ValueError:
            pass
        # Test non-matching hash value
        self.assertFalse(ge.expect_file_hash_to_equal(test_file,
                                                      value='abc'))
        # Test matching hash value with default algorithm
        self.assertTrue(ge.expect_file_hash_to_equal(test_file,
                                                     value='63188432302f3a6e8c9e9c500ff27c8a'))
        # Test matching hash value with specified algorithm
        self.assertTrue(ge.expect_file_hash_to_equal(test_file,
                                                     value='f89f46423b017a1fc6a4059d81bddb3ff64891e3c81250fafad6f3b3113ecc9b',
                                                     hash_alg='sha256'))

    def test_validate_distribution_parameters(self):
        D = ge.read_csv('./tests/test_sets/fixed_distributional_test_dataset.csv')

