
Commit b296f5e

Merge branch 'develop' into bugfix_in_calc_map_expectation_success
jcampbell committed Sep 8, 2018
2 parents 833b420 + 57b3b4d commit b296f5e
Showing 10 changed files with 367 additions and 71 deletions.
1 change: 1 addition & 0 deletions great_expectations/__init__.py
@@ -1,5 +1,6 @@
from .util import *
from great_expectations import dataset
from great_expectations.data_context import get_data_context
from .file_expectations import *

from .version import __version__
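
With this import in place, the new file expectations become callable from the package top level. A quick sketch (the fixture path comes from the tests in this commit; it assumes the package is installed and run from the repo root):

import great_expectations as ge

ge.expect_file_to_exist('./tests/test_sets/Titanic.csv')  # True if the fixture is present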
226 changes: 226 additions & 0 deletions great_expectations/file_expectations.py
@@ -0,0 +1,226 @@
# This code is early and experimental. Eventually it will live in a RawFileDataset class, per issue 321. In the
# meantime, please use, enjoy, and share feedback.

import csv
import hashlib
import json
import jsonschema
import os.path

def expect_file_hash_to_equal(filename, value, hash_alg='md5'):
    """
    Return True or False indicating whether the hash matches the specified value for the default (md5) or
    user-specified hash algorithm.
    Parameters
    ----------
    filename : string
        file on which the hash is computed
    value : string
        value to compare to computed hash
    hash_alg : string, default='md5'
        hash algorithm to use. See hashlib.algorithms_available for supported algorithms.
    Returns
    -------
    True if the computed hash matches the specified value; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading the specified file
    ValueError
        if the specified hash algorithm is not defined by hashlib
    """
    success = False
    try:
        hash = hashlib.new(hash_alg)
        # Limit file reads to 64 KB chunks at a time
        BLOCKSIZE = 65536
        try:
            with open(filename, 'rb') as file:
                file_buffer = file.read(BLOCKSIZE)
                while len(file_buffer) > 0:
                    hash.update(file_buffer)
                    file_buffer = file.read(BLOCKSIZE)
                success = hash.hexdigest() == value
        except IOError:
            raise
    except ValueError:
        raise
    return success
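
A usage sketch; the digests below are the fixture's known MD5 and SHA-256 values, taken from the tests later in this commit:

import great_expectations as ge

test_file = './tests/test_sets/Titanic.csv'
# MD5 is the default algorithm.
ge.expect_file_hash_to_equal(test_file, value='63188432302f3a6e8c9e9c500ff27c8a')
# Any algorithm in hashlib.algorithms_available is accepted, e.g. SHA-256.
ge.expect_file_hash_to_equal(test_file,
                             value='f89f46423b017a1fc6a4059d81bddb3ff64891e3c81250fafad6f3b3113ecc9b',
                             hash_alg='sha256')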

def expect_file_size_to_be_between(filename, minsize, maxsize):
    """
    Return True or False indicating whether the file size (in bytes) is (inclusively) between two values.
    Parameters
    ----------
    filename : string
        file to check file size
    minsize : integer
        minimum file size
    maxsize : integer
        maximum file size
    Returns
    -------
    True if the file size is between minsize and maxsize (inclusive); False otherwise
    Raises
    ------
    OSError
        if there is a problem reading the specified file
    TypeError
        if minsize or maxsize are not integers
    ValueError
        if there is a problem with the integer value of minsize or maxsize
    """
    try:
        size = os.path.getsize(filename)
    except OSError:
        raise
    if type(minsize) != int:
        raise TypeError('minsize must be an integer')
    if type(maxsize) != int:
        raise TypeError('maxsize must be an integer')
    if minsize < 0:
        raise ValueError('minsize must be greater than or equal to 0')
    if maxsize < 0:
        raise ValueError('maxsize must be greater than or equal to 0')
    if minsize > maxsize:
        raise ValueError('maxsize must be greater than or equal to minsize')
    if (size >= minsize) and (size <= maxsize):
        return True
    else:
        return False
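
A minimal sketch for the size check; the bounds here are illustrative assumptions, not values from this commit:

import great_expectations as ge

# True only if the file is between 1 byte and 10 MB, inclusive.
ge.expect_file_size_to_be_between('./tests/test_sets/Titanic.csv',
                                  minsize=1, maxsize=10000000)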

def expect_file_to_exist(filename):
    """
    Return True or False indicating whether the specified file exists.
    Parameters
    ----------
    filename : string
        file to check for existence
    Returns
    -------
    True if the specified file exists; False otherwise
    """
    if os.path.isfile(filename):
        return True
    else:
        return False

def expect_file_unique_column_names_csv(filename,
                                        skipLines=0,
                                        sep=',',
                                        quoteChar='"',
                                        quot=csv.QUOTE_MINIMAL,
                                        doubleQuote=True,
                                        skipInitialSpace=False,
                                        escapeChar=None):
    """
    Return True or False indicating whether the specified CSV file has unique column names.
    Parameters
    ----------
    filename : string
        CSV file to check for unique column names
    skipLines : integer
        number of rows or lines to skip before reading the header line
    sep : string
        delimiter used for parsing CSV lines
    quoteChar : string
        one-character string used to quote fields containing special characters
    quot : integer, default=csv.QUOTE_MINIMAL
        controls when quotes should be recognised by the reader
    doubleQuote : boolean
        controls how instances of quoteChar appearing inside a field should themselves be quoted
    skipInitialSpace : boolean
        when True, whitespace immediately following the delimiter is ignored
    escapeChar : string
        one-character string which removes any special meaning from the following character
    Returns
    -------
    True if the column names are unique; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading the specified file
    csv.Error
        if there is an error in the csv methods
    """
    success = False
    try:
        with open(filename, 'r') as f:
            reader = csv.reader(f,
                                delimiter=sep,
                                quotechar=quoteChar,
                                quoting=quot,
                                doublequote=doubleQuote,
                                skipinitialspace=skipInitialSpace,
                                escapechar=escapeChar,
                                strict=True)
            if skipLines > 0:
                for i in range(0, skipLines):
                    next(reader)
            colnames = next(reader)
            if len(set(colnames)) == len(colnames):
                success = True
    except IOError:
        raise
    except csv.Error:
        raise
    return success
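
A sketch of the header-uniqueness check; data/report.csv and its layout (two leading comment lines, ';' delimiter) are hypothetical:

import great_expectations as ge

# Defaults mirror the csv module: comma-delimited, minimal quoting.
ge.expect_file_unique_column_names_csv('./tests/test_sets/Titanic.csv')
# Skip two leading comment lines and use ';' as the delimiter (hypothetical file).
ge.expect_file_unique_column_names_csv('data/report.csv', skipLines=2, sep=';')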


def expect_file_valid_json(filename, schema=None):
    """
    Return True or False indicating whether the specified file is valid JSON.
    Parameters
    ----------
    filename : string
        file to test as valid JSON
    schema : string
        optional JSON schema file against which the JSON data file is validated
    Returns
    -------
    True if the file is valid JSON; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading a specified file
    ValueError
        if a file cannot be parsed as JSON when a schema is provided
    jsonschema.SchemaError
        if there is an error in the provided schema
    """
    success = False
    if schema is None:
        try:
            with open(filename, 'r') as f:
                json.load(f)
            success = True
        except ValueError:
            success = False
    else:
        try:
            with open(schema, 'r') as s:
                schema_data = s.read()
                sdata = json.loads(schema_data)
            with open(filename, 'r') as f:
                json_data = f.read()
                jdata = json.loads(json_data)
            jsonschema.validate(jdata, sdata)
            success = True
        except jsonschema.ValidationError:
            success = False
        except jsonschema.SchemaError:
            raise
        except:
            raise
    return success
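
A sketch of the JSON expectation; config.json and config_schema.json are hypothetical file names:

import great_expectations as ge

ge.expect_file_valid_json('config.json')                               # well-formedness only
ge.expect_file_valid_json('config.json', schema='config_schema.json')  # also validate against a JSON Schema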
44 changes: 0 additions & 44 deletions great_expectations/util.py
@@ -1,4 +1,3 @@
import hashlib
import pandas as pd
import json

@@ -144,46 +143,3 @@ def __getattr__(self, attr):
    __delattr__ = dict.__delitem__
    def __dir__(self):
        return self.keys()

def expect_file_hash_to_equal(filename, value, hash_alg='md5'):
    """
    Return True or False indicating whether the hash matches the specified value for the default (md5) or
    user-specified hash algorithm.
    Parameters
    ----------
    filename : string
        file on which the hash is computed
    value : string
        value to compare to computed hash
    hash_alg : string, default='md5'
        hash algorithm to use. See hashlib.algorithms_available for supported algorithms.
    Returns
    -------
    True if the computed hash matches the specified value; False otherwise
    Raises
    ------
    IOError
        if there is a problem reading the specified file
    ValueError
        if the specified hash algorithm is not defined by hashlib
    """
    success = False
    try:
        hash = hashlib.new(hash_alg)
        # Limit file reads to 64 KB chunks at a time
        BLOCKSIZE = 65536
        try:
            with open(filename, 'rb') as file:
                file_buffer = file.read(BLOCKSIZE)
                while len(file_buffer) > 0:
                    hash.update(file_buffer)
                    file_buffer = file.read(BLOCKSIZE)
                success = hash.hexdigest() == value
        except IOError:
            raise
    except ValueError:
        raise
    return success
27 changes: 0 additions & 27 deletions tests/test_dataset_util.py
@@ -200,33 +200,6 @@ def test_recursively_convert_to_json_serializable(self):
        except NameError:
            pass



    def test_expect_file_hash_to_equal(self):
        test_file = './tests/test_sets/Titanic.csv'
        # Test for non-existent file
        try:
            ge.expect_file_hash_to_equal('abc', value='abc')
        except IOError:
            pass
        # Test for non-existent hash algorithm
        try:
            ge.expect_file_hash_to_equal(test_file,
                                         hash_alg='md51',
                                         value='abc')
        except ValueError:
            pass
        # Test non-matching hash value
        self.assertFalse(ge.expect_file_hash_to_equal(test_file,
                                                      value='abc'))
        # Test matching hash value with default algorithm
        self.assertTrue(ge.expect_file_hash_to_equal(test_file,
                                                     value='63188432302f3a6e8c9e9c500ff27c8a'))
        # Test matching hash value with specified algorithm
        self.assertTrue(ge.expect_file_hash_to_equal(test_file,
                                                     value='f89f46423b017a1fc6a4059d81bddb3ff64891e3c81250fafad6f3b3113ecc9b',
                                                     hash_alg='sha256'))

    def test_validate_distribution_parameters(self):
        D = ge.read_csv('./tests/test_sets/fixed_distributional_test_dataset.csv')

