In [3]:
import logging
import os

import great_expectations as gx
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, col
# Module-level logger; INFO level is enabled so the data-quality success
# message emitted via logger.info is actually shown in the notebook output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [2]:
!pip install great_expectations==0.16.13

Defaulting to user installation because normal site-packages is not writeable
Collecting great_expectations==0.16.13
  Obtaining dependency information for great_expectations==0.16.13 from https://files.pythonhosted.org/packages/62/92/ffd54e99c84cccbf1cd3562726e6c2330e0c1fbf593678719ac3ea212c11/great_expectations-0.16.13-py3-none-any.whl.metadata
  Downloading great_expectations-0.16.13-py3-none-any.whl.metadata (11 kB)


Downloading great_expectations-0.16.13-py3-none-any.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[33mDEPRECATION: distro-info 1.1build1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: great_expectations
  Attempting uninstall: great_expectations
    Found existing installation: great-expectations 0.17.12


    Uninstalling great-expectations-0.17.12:
      Successfully uninstalled great-expectations-0.17.12
Successfully installed great_expectations-0.16.13


In [4]:
def create_spark_session():
    """
    Build (or reuse) the Spark session used by this notebook.

    The session is registered under the application name "How-Desafio-02";
    no other configuration is applied.

    Returns:
    - the SparkSession returned by ``getOrCreate()``
    """
    session_builder = SparkSession.builder.appName("How-Desafio-02")
    return session_builder.getOrCreate()

In [5]:
def process_order(spark, input_data, output_data):
    """
    Perform ETL on the raw order files to create the orders dataset:
    - read the JSON order files,
    - add a ``date_partition`` column derived from ``order_created_at``,
    - run the data-quality checks,
    - write the result as parquet under ``output_data``.

    Parameters:
    - spark: spark session
    - input_data : path (or glob) to the input JSON files
    - output_data : directory under which ``orders.parquet`` is written

    Raises:
    - ValueError: propagated from ``data_quality`` when a blocking check
      fails; the write step is not reached in that case.
    """

    # Read the raw JSON order files.
    orders_raw = (
        spark.read
        .option("inferSchema", True)
        .json(input_data)
    )

    # Format order_created_at as yyyy-MM-dd into a new date_partition column.
    orders_partitioned = orders_raw.withColumn(
        "date_partition",
        date_format(col("order_created_at"), "yyyy-MM-dd"),
    )

    # Validate before writing so a failed check never overwrites existing output.
    data_quality(orders_partitioned)

    # `os` must be imported at the top of the notebook; the original cell
    # referenced it without an import and failed with NameError here.
    output_path = os.path.join(output_data, "orders.parquet")
    orders_partitioned.write.parquet(output_path, mode="overwrite")

    print("--- orders.parquet completed ---")

In [6]:
def data_quality(input_dataset):
    """
    Validate an orders DataFrame with Great Expectations.

    Checks performed:
    - ``order_id`` contains no nulls (blocking)
    - ``order_id`` values are unique (blocking)
    - ``date_partition`` matches the ``%Y-%m-%d`` format (blocking)
    - table row count is between 400 and 600 (non-blocking: a failure is
      only logged as a warning)

    Parameters:
    - input_dataset : Spark DataFrame registered as a dataframe asset and
      validated

    Raises:
    - ValueError: on the first blocking check that fails, in the order
      listed above.
    """

    gx_context = gx.get_context()
    datasource = gx_context.sources.add_spark("my_spark_datasource")

    batch_request = datasource.add_dataframe_asset(
        name="my_df_asset", dataframe=input_dataset
    ).build_batch_request()

    gx_context.add_or_update_expectation_suite("my_expectation_suite")

    validator = gx_context.get_validator(
        batch_request=batch_request,
        expectation_suite_name="my_expectation_suite",
    )

    # Arguments are passed by keyword so that the expectation_config.kwargs
    # lookups used in the messages below are guaranteed to be present.
    order_null = validator.expect_column_values_to_not_be_null(column="order_id")
    order_unique = validator.expect_column_values_to_be_unique(column="order_id")
    # Named partition_format (not date_format) to avoid shadowing the
    # pyspark.sql.functions.date_format import used elsewhere in the notebook.
    partition_format = validator.expect_column_values_to_match_strftime_format(
        column="date_partition", strftime_format="%Y-%m-%d"
    )
    rows_number = validator.expect_table_row_count_to_be_between(
        min_value=400, max_value=600
    )

    # Blocking checks: fail fast on the first one that does not pass.
    if not order_null.success:
        raise ValueError(f"Data quality check failed {order_null.expectation_config.kwargs['column']} is null.")

    if not order_unique.success:
        raise ValueError(f"Data quality check failed {order_unique.expectation_config.kwargs['column']} is not unique.")

    if not partition_format.success:
        raise ValueError(f"Data quality check failed {partition_format.expectation_config.kwargs['column']} is not in {partition_format.expectation_config.kwargs['strftime_format']} format.")

    # The row-count check stays non-blocking (it was disabled as a hard
    # failure), but its result is surfaced instead of being silently dropped.
    if not rows_number.success:
        logger.warning(
            "Data quality warning: number of rows is not between %s and %s.",
            rows_number.expectation_config.kwargs["min_value"],
            rows_number.expectation_config.kwargs["max_value"],
        )
    else:
        logger.info("All validators passed with success!")
    

In [7]:
# Glob matching every JSON order file, one directory level below ./order-data.
order_file_Path = "./order-data/*/*.json"

spark = create_spark_session()

# Load all matched JSON files into a single DataFrame.
orders_df = spark.read.option("inferSchema", True).json(order_file_Path)

23/09/01 14:22:51 WARN Utils: Your hostname, 14111-NB resolves to a loopback address: 127.0.1.1; using 172.26.45.45 instead (on interface eth0)
23/09/01 14:22:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/01 14:22:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [8]:
# Add a yyyy-MM-dd formatted date_partition column derived from order_created_at.
orders_df_partition = orders_df.withColumn(
    "date_partition",
    date_format(col("order_created_at"), "yyyy-MM-dd"),
)

In [9]:
# Run the Great Expectations checks on the partitioned orders; raises
# ValueError if a check fails.
data_quality(orders_df_partition)


INFO:great_expectations.util:Could not find local file-backed GX project
INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmppfcg1p0r' for ephemeral docs site
INFO:great_expectations.data_context.data_context.abstract_data_context:EphemeralDataContext has not implemented `_load_fluent_config()` returning empty `GxConfig`
INFO:great_expectations.datasource.fluent.config:Loading 'datasources' ->
[]
INFO:great_expectations.datasource.fluent.fluent_base_model:SparkDatasource.dict() - substituting config values
23/09/01 14:26:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
  



Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

                                                                                

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

                                                                                

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

                                                                                

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:All validators passed with success!
