In [2]:
import great_expectations as ge
# Obtain the project's Data Context through the `get_context()` factory, the
# same entry point the later `gx.get_context()` cell uses, instead of
# instantiating `DataContext` directly.
# NOTE(review): direct `DataContext()` construction is deprecated in later
# Great Expectations releases — confirm against the pinned version. Unlike
# `DataContext()`, `get_context()` may fall back to an in-memory context when
# no project directory is found.
context = ge.get_context()

In [3]:
from dotenv import load_dotenv
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
import os

# Load variables from a local .env file, then read the S3 credentials from the
# process environment (each one is None when its variable is not set).
load_dotenv()
s3_access_key, s3_secret_key = (
    os.getenv(var_name) for var_name in ("S3_ACCESS_KEY", "S3_SECRET_KEY")
)

# Number of local Spark cores: use 2 for laptop work and 4 for the local machine.
CORES = 2

- Common columns
- Different columns
- Compare data for common columns in both files
- Difference in the number of items stored in each parquet file
- Count and group the occurrences of the value “trustRate” in each file
- Count and group the occurrences of the value “humanId” in each file
- Determine, based on the contents of the compared files, whether there is a major or critical difference in the data
- Any additional statistical information considered useful is appreciated (within the context of an identity resolution process)

1. Compare A to B in a notebook with Great Expectations (ge)
2. See how to integrate it with CI/CD
3. Final work

In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session named "Loyalty" that runs in local mode on
# CORES cores, pulls in the hadoop-aws package and carries the S3A credentials
# read from the environment, so that `s3a://` paths can be read below.
spark = (
    SparkSession.builder
    .appName("Loyalty")
    .config("spark.master", f"local[{CORES}]")
    .config("spark.executor.cores", f"{CORES}")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .getOrCreate()
)

In [5]:
# Load the A, B and C result sets from their parquet files on S3, one
# DataFrame per file, in that order.
df_results_a, df_results_b, df_results_c = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "s3a://data-test-202302/adstra/resultsA.parquet",
        "s3a://data-test-202302/adstra/resultsB.parquet",
        "s3a://data-test-202302/adstra/resultsC.parquet",
    )
)

In [6]:
# Preview the first five rows of result set A.
df_results_a.show(n=5)

+--------+------+------+--------------------+--------+-------+-----+----------+----------+-------+--------------------+---------+
|   index| fname| lname|               email|socialid|country|state|    mobile|     phone|zipcode|             humanId|trustRate|
+--------+------+------+--------------------+--------+-------+-----+----------+----------+-------+--------------------+---------+
|r0000001| james|robles|    person1@mail.net|774121t9|     mx|   tj|9991200767|8881200767|  49162|ae7b5b63-da1c-493...|       80|
|r0000002|daniel| boyle|roddaniel445@comp...|998112a4|     ca|   on|9991200610|8881200610|  20883|89abf215-f658-4a0...|       50|
|r0000003|  xxxx| xxxxx|              xxxxxx|878445b2|     us|   fl|      null|      null|   null|                null|        0|
|r0000004| marie| velez|marie.rodarte123@...|798125x4|     ca|   ot|9991200114|8881200114|  61415|19ce37e6-2edf-41e...|       50|
|r0000005|MARTIN|TORRES|not.provided@abc.com|981210o2|     us|   ca|9991200276|8881200276|

In [7]:
# Preview the first five rows of result set B.
df_results_b.show(n=5)

+--------+------+------+--------------------+--------+-------+----------+----------+------+--------------------+---------+
|   index| fname| lname|               email|socialid|country|    mobile|     phone|gender|             humanId|trustRate|
+--------+------+------+--------------------+--------+-------+----------+----------+------+--------------------+---------+
|r0000001| james|robles|    person1@mail.net|774121t9|     mx|9991200767|8881200767|     f|ae7b5b63-da1c-493...|       99|
|r0000002|daniel| boyle|roddaniel445@comp...|998112a4|     ca|9991200610|8881200610|     f|89abf215-f658-4a0...|       50|
|r0000003|  xxxx| xxxxx|              xxxxxx|878445b2|     us|      null|      null|  null|                null|        0|
|r0000004| marie| velez|marie.rodarte123@...|798125x4|     ca|9991200114|8881200114|     f|19ce36e6-2edf-41e...|       50|
|r0000005|MARTIN|TORRES|not.provided@abc.com|981210o2|     us|9991200276|8881200276|     f|3b068d0c-3671-45c...|       60|
+--------+------

In [1]:
import great_expectations as gx
# Obtain a Great Expectations Data Context via the `get_context()` factory.
# NOTE(review): `context` is also assigned in the notebook's first cell, and the
# package is imported under two aliases (`ge` and `gx`); on a top-to-bottom run
# this cell re-binds `context`. Consider keeping a single context cell.
context = gx.get_context()



In [8]:
# Create an expectation suite named "example" in the Data Context.
# NOTE(review): presumably this raises if a suite with that name already
# exists, which would break a second run — confirm.
suite = context.create_expectation_suite(expectation_suite_name="example")

In [33]:
# Wrap result set A in Great Expectations' Spark dataset class so that it can
# be validated against an expectation suite further down.
ge_results_a = SparkDFDataset(df_results_a)

In [31]:
# Wrap result set B as a Great Expectations Spark dataset, then check that the
# `lname` column contains no nulls. The expectation is recorded on the dataset
# and its validation result is the cell's output.
ge_results_b = SparkDFDataset(df_results_b)
ge_results_b.expect_column_values_to_not_be_null(column="lname")

{
  "result": {
    "element_count": 14,
    "unexpected_count": 1,
    "unexpected_percent": 7.142857142857142,
    "unexpected_percent_total": 7.142857142857142,
    "partial_unexpected_list": [
      null
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": false,
  "meta": {}
}

In [32]:
# Collect the expectations recorded on `ge_results_b` into an expectation suite.
# `discard_failed_expectations=False` keeps expectations that did not pass: the
# `lname` not-null check failed on B and still appears in the suite shown below.
sui = ge_results_b.get_expectation_suite(discard_failed_expectations=False)
# Display the suite as the cell output.
sui

{
  "data_asset_type": "Dataset",
  "expectation_suite_name": "default",
  "expectations": [
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "lname"
      }
    }
  ],
  "ge_cloud_id": null,
  "meta": {
    "great_expectations_version": "0.16.10"
  }
}

In [34]:
# Validate result set A against the suite captured from result set B, i.e.
# apply B's recorded expectations to A. The validation report is the cell output.
ge_results_a.validate(expectation_suite=sui)

{
  "results": [
    {
      "result": {
        "element_count": 16,
        "unexpected_count": 1,
        "unexpected_percent": 6.25,
        "unexpected_percent_total": 6.25,
        "partial_unexpected_list": [
          null
        ]
      },
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": false,
      "meta": {},
      "expectation_config": {
        "meta": {},
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "lname"
        }
      }
    }
  ],
  "success": false,
  "meta": {
    "great_expectations_version": "0.16.10",
    "expectation_suite_name": "default",
    "run_id": {
      "run_name": null,
      "run_time": "2023-05-04T00:14:38.508900+00:00"
    },
    "batch_kwargs": {
      "ge_batch_id": "a10da198-ea10-11ed-b42a-0242ac110003"
    },
    "batch_markers": {},
    "batch_parameters": {},
    "validati

In [14]:
# Batch kwargs handed to `context.get_batch(...)` in the next cell: the name of
# a datasource plus the in-memory dataset wrapping result set B.
# NOTE(review): the recorded run of the next cell failed with
# "Unable to load datasource `here`", so "here" does not match any datasource
# configured in the Data Context — replace it with a configured name.
batch_kwargs = {
    "datasource": "here",
    "dataset": ge_results_b
}

In [15]:
# Ask the Data Context for a batch described by `batch_kwargs`, bound to the
# "example" suite created earlier.
# NOTE(review): the recorded run of this cell raised ValueError because the
# datasource named in `batch_kwargs` is not configured in the context.
# NOTE(review): `suite` is the object returned by `create_expectation_suite`,
# yet it is passed under a `*_name` keyword — confirm this is intended.
batch = context.get_batch(batch_kwargs=batch_kwargs, expectation_suite_name=suite)

ValueError: Unable to load datasource `here` -- no configuration found or invalid configuration.