Skip to content

Commit

Permalink
Merge 27f2a7d into b29348b
Browse files (browse the repository at this point in the history)
  • Loading branch information
jcampbell committed Aug 13, 2019
2 parents b29348b + 27f2a7d commit 764827c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 4 deletions.
2 changes: 1 addition & 1 deletion great_expectations/cli/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def profile_datasource(context, data_source_name, data_assets=None, profile_all_

else:
cli_message(
- "Okay, skipping HTML documentation for now.`."
+ "Okay, skipping HTML documentation for now."
)


Expand Down
12 changes: 10 additions & 2 deletions great_expectations/dataset/sparkdf_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,15 @@
logger = logging.getLogger(__name__)

try:
- from pyspark.sql.functions import udf, col, lit, stddev_samp, length as length_, when, year, count
+ from pyspark.sql.functions import (
+ udf, col, lit,
+ stddev_samp,
+ length as length_,
+ when,
+ year,
+ count,
+ countDistinct
+ )
import pyspark.sql.types as sparktypes
from pyspark.ml.feature import Bucketizer
from pyspark.sql import Window
Expand Down Expand Up @@ -268,7 +276,7 @@ def get_column_value_counts(self, column):
return series

def get_column_unique_count(self, column):
- return self.get_column_value_counts(column).shape[0]
+ return self.spark_df.agg(countDistinct(column)).collect()[0][0]

def get_column_modes(self, column):
"""leverages computation done in _get_column_value_counts"""
Expand Down
2 changes: 1 addition & 1 deletion great_expectations/profile/basic_dataset_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _get_column_cardinality(cls, df, column):
'result']['observed_value']
pct_unique = df.expect_column_proportion_of_unique_values_to_be_between(
column, None, None)['result']['observed_value']
- except KeyError: # if observed_value value is not set
+ except KeyError: # if observed_value value is not set
logger.exception("Failed to get cardinality of column {0:s} - continuing...".format(column))

if num_unique is None or num_unique == 0 or pct_unique is None:
Expand Down

0 comments on commit 764827c

Please sign in to comment.