From 6c008899d803cd58301b0992f8a9b654564f9d22 Mon Sep 17 00:00:00 2001 From: James Campbell Date: Fri, 6 Jul 2018 15:28:40 -0400 Subject: [PATCH 1/3] Add explicit sort to pd.concat to future-proof change in behavior. --- great_expectations/dataset/pandas_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/great_expectations/dataset/pandas_dataset.py b/great_expectations/dataset/pandas_dataset.py index c6355d8bd63c..a5d113a3e2bc 100644 --- a/great_expectations/dataset/pandas_dataset.py +++ b/great_expectations/dataset/pandas_dataset.py @@ -1078,7 +1078,7 @@ def expect_column_chisquare_test_p_value_to_be_greater_than(self, column, partit # Convert to Series object to allow joining on index values expected_column = pd.Series(partition_object['weights'], index=partition_object['values'], name='expected') * len(column) # Join along the indices to allow proper comparison of both types of possible missing values - test_df = pd.concat([expected_column, observed_frequencies], axis = 1) + test_df = pd.concat([expected_column, observed_frequencies], axis=1, sort=True) na_counts = test_df.isnull().sum() @@ -1218,7 +1218,7 @@ def expect_column_kl_divergence_to_be_less_than(self, column, partition_object=N # Data are expected to be discrete, use value_counts observed_weights = column.value_counts() / len(column) expected_weights = pd.Series(partition_object['weights'], index=partition_object['values'], name='expected') - test_df = pd.concat([expected_weights, observed_weights], axis=1) + test_df = pd.concat([expected_weights, observed_weights], axis=1, sort=True) na_counts = test_df.isnull().sum() From 940e5fc4cb9fd27a85141df2a249e9ccdc2b10f2 Mon Sep 17 00:00:00 2001 From: James Campbell Date: Fri, 6 Jul 2018 15:30:41 -0400 Subject: [PATCH 2/3] Fix implicit column reference, and associated warning. --- great_expectations/dataset/sqlalchemy_dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/great_expectations/dataset/sqlalchemy_dataset.py b/great_expectations/dataset/sqlalchemy_dataset.py index ad2b1664a2a4..24cc215f3019 100644 --- a/great_expectations/dataset/sqlalchemy_dataset.py +++ b/great_expectations/dataset/sqlalchemy_dataset.py @@ -653,12 +653,11 @@ def expect_column_median_to_be_between(self, sa.func.sum( sa.case([(sa.column(column) == None, 1)], else_=0) ).label('null_count') - ]). - select_from(sa.table(self.table_name)) + ]).select_from(sa.table(self.table_name)) ) element_values = self.engine.execute( - sa.select(column).order_by(column).where( + sa.select([sa.column(column)]).order_by(sa.column(column)).where( sa.column(column) != None ).select_from(sa.table(self.table_name)) ) From 8bc034116914635f071b7d9891c2f2f65e310d0a Mon Sep 17 00:00:00 2001 From: James Campbell Date: Fri, 6 Jul 2018 15:41:42 -0400 Subject: [PATCH 3/3] Refactor median to pull back only candidate median values, using same logic as previously. --- .../dataset/sqlalchemy_dataset.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/great_expectations/dataset/sqlalchemy_dataset.py b/great_expectations/dataset/sqlalchemy_dataset.py index 24cc215f3019..b5b8c5479df4 100644 --- a/great_expectations/dataset/sqlalchemy_dataset.py +++ b/great_expectations/dataset/sqlalchemy_dataset.py @@ -656,28 +656,27 @@ def expect_column_median_to_be_between(self, ]).select_from(sa.table(self.table_name)) ) + elements = count_query.fetchone() + # The number of non-null/non-ignored values + nonnull_count = elements['element_count'] - elements['null_count'] + element_values = self.engine.execute( sa.select([sa.column(column)]).order_by(sa.column(column)).where( sa.column(column) != None - ).select_from(sa.table(self.table_name)) + ).offset(nonnull_count // 2 - 1).limit(2).select_from(sa.table(self.table_name)) ) - # Fetch the Element count, null count, and sorted/null dropped column values - elements = count_query.fetchone() column_values = list(element_values.fetchall()) - # The number of non-null/non-ignored values - nonnull_count = elements['element_count'] - elements['null_count'] - if nonnull_count % 2 == 0: # An even number of column values: take the average of the two center values column_median = ( - column_values[nonnull_count // 2 - 1][0] + # left center value - column_values[nonnull_count // 2][0] # right center value + column_values[0][0] + # left center value + column_values[1][0] # right center value ) / 2.0 # Average center values else: # An odd number of column values, we can just take the center value - column_median = column_values[nonnull_count // 2][0] # True center value + column_median = column_values[1][0] # True center value return { 'success':