# Install the Great Expectations Package

In [67]:
# Install the library (one-time setup; uncomment the line below to run it).

# !pip install -q great-expectations

# Instantiate Data Context

In [68]:
# Set up a file-backed Great Expectations Data Context whose project root is
# the current working directory.

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(
    project_root_dir='./',
)

# Connect to a DataSource

In [69]:
# Register the raw CSV file with the Data Context.
# Datasource names must be unique within a Data Context.
datasource_name = 'csv-data-jan'
# `add_or_update_pandas` (rather than `add_pandas`) keeps this cell re-runnable:
# the file-backed context persists the datasource, so `add_pandas` fails on a
# second run because the name is already registered.
# NOTE(review): confirm `add_or_update_pandas` is available in the installed
# Great Expectations version.
datasource = context.sources.add_or_update_pandas(name=datasource_name)

# Describe the CSV file as a data asset of that datasource.
asset_name = 'Smartphone Retail Outlet Sales Data'
path_to_data = 'P2M3_Fadhilah_data_raw.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator is created from later on.
batch_request = asset.build_batch_request()

Menghubungkan dataset raw sebagai data source

# Create an Expectation Suite

In [70]:
# Create (or overwrite) the expectation suite that will hold the rules below.
expectation_suite_name = 'expectation-smartphone-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Bind the raw-data batch to that suite through a validator.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows to confirm the batch loaded correctly.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Date,F.Y,QUARTER,P_NO,PAYMENT TYPE,TYPE OF PRODUCT,Quantity,Price,Amount,TYPE OF ACCESSORY/MOBILE
0,01-04-2018,2018-2019,1,P/A/36,CASH,ACCESSORY,1,2042.38,2542.38,COVER
1,01-04-2018,2018-2019,1,P/A/47,CASH,ACCESSORY,1,2042.38,2118.64,WIRELESS HEADSET
2,01-04-2018,2018-2019,1,P/M/A34,CREDIT,MOBILE,1,2042.38,13303.58,BUDGET PHONE
3,01-04-2018,2018-2019,1,P/M/A42,CREDIT,MOBILE,1,2042.38,51696.43,FLAGSHIP PHONE
4,01-04-2018,2018-2019,1,P/M/A42,DEBIT,MOBILE,1,2042.38,51696.42,FLAGSHIP PHONE


## Expectation

In [71]:
# Expectation 1 : Column `Quantity` must not contain missing values

validator.expect_column_values_to_not_be_null(column='Quantity')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 6421,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Expectation 1 : Column `Quantity` can not contain missing values == Success, unexpected percent (0.0%)

In [72]:
# Expectation 2 : Every value in column `Amount` must be unique

validator.expect_column_values_to_be_unique(column='Amount')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 6421,
    "unexpected_count": 6348,
    "unexpected_percent": 98.86310543529045,
    "partial_unexpected_list": [
      2542.38,
      2118.64,
      13303.58,
      51696.43,
      51696.42,
      57946.42,
      31241.08,
      13303.58,
      57946.42,
      2203.38,
      31241.08,
      51696.42,
      51696.43,
      57946.42,
      10084.74,
      1906.78,
      44633.93,
      1258.92,
      12410.72,
      46419.64
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 98.86310543529045,
    "unexpected_percent_nonmissing": 98.86310543529045
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Expectation 2 : Column `Amount` must be unique == Fail, unexpected percent (98.9%)

In [73]:
# Expectation 3 : Column `Price` must be between 100 and 20000 (inclusive)
# NOTE(review): the original comment named `Amount` and only an upper bound,
# but the check (and its recorded result) is on `Price` with both bounds.
# Confirm `Price` is the intended column.

validator.expect_column_values_to_be_between(
    column='Price',
    min_value=100,
    max_value=20000,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 6421,
    "unexpected_count": 1785,
    "unexpected_percent": 27.799408191870423,
    "partial_unexpected_list": [
      57446.42,
      60125.0,
      60125.0,
      51196.42,
      51196.42,
      51196.42,
      61017.86,
      61017.86,
      60125.0,
      51196.42,
      58339.28,
      51196.42,
      61017.86,
      57446.43,
      57446.42,
      30741.08,
      51196.42,
      51196.42,
      57446.42,
      51196.43
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 27.799408191870423,
    "unexpected_percent_nonmissing": 27.799408191870423
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Expectation 3 : Column `Price` must be between 100 and 20000 == Fail, unexpected percent (27.8%)


In [74]:
# Expectation 4 : Column `Amount` must exist, because it is needed to
# calculate the cost that must be paid for a product

validator.expect_column_to_exist(
    column='Amount',
)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Expectation 4 : Column `Amount` must exist so that the cost to be paid for a product can be calculated == Success (this expectation does not report an unexpected percent)

In [75]:
# Expectation 5 : Column `QUARTER` must contain one of the following 4 values :
# 1 = January, February, March
# 2 = April, May, June
# 3 = July, August, September
# 4 = October, November, December
# NOTE(review): the preview rows show Date 01-04-2018 with F.Y 2018-2019 and
# QUARTER 1, which suggests fiscal quarters that start in April. Confirm the
# month mapping above against the data.

validator.expect_column_values_to_be_in_set(column='QUARTER', value_set=[1, 2, 3, 4])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 6421,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Expectation 5 : Column `QUARTER` must contain one of the following 4 values :
- 1 = Januari, Februari, Maret
- 2 = April, Mei, Juni
- 3 = Juli, Agustus, September
- 4 = Oktober, November, Desember  
Expectation is Success, unexpected percent (0.0%) 

In [76]:
# Expectation 6 : Column `Price` must be stored as an integer or a float

# The pandas backend matches these names against NumPy dtypes / Python types.
# 'integer' is not a valid dtype name, so with the original list an integer
# column could never satisfy the expectation; use real dtype names instead.
validator.expect_column_values_to_be_in_type_list(
    column='Price',
    type_list=['int', 'int64', 'float', 'float64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Expectation 6 : Column `Price` must be of type integer or float == Success (observed type: float64)

In [77]:
# Expectation 7 : The mean of `Quantity` must lie in the range 1 - 2

validator.expect_column_mean_to_be_between(
    column='Quantity',
    min_value=1,
    max_value=2,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 1.058246379068681
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Expectation 7 : The average of `Quantity` must be in the range 1 - 2 == Success (observed mean ≈ 1.06)

In [78]:
# Save the validator's expectations into the Expectation Suite.
# `discard_failed_expectations=False` is passed explicitly, presumably so that
# expectations that failed on this batch stay in the suite (TODO confirm).
validator.save_expectation_suite(discard_failed_expectations=False)

## Checkpoint

In [79]:
# Register (or overwrite) a checkpoint that re-validates the raw-data batch
# through the validator built above.

checkpoint_1 = context.add_or_update_checkpoint(
    name='checkpoint_1',
    validator=validator,
)

In [80]:
# Run checkpoint_1 and keep the returned result object in `checkpoint_result`.

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/30 [00:00<?, ?it/s]

Checkpoint with data raw 100%

## Data Docs

In [81]:
# Build the Data Docs site; the returned value is shown as the cell output.

context.build_data_docs()

{'local_site': 'file://c:\\Users\\Fadhilah\\Desktop\\P2M3-Fadhilah\\dags\\gx\\uncommitted/data_docs/local_site/index.html'}

# Data Validation using Another File

In [82]:
# Load the Data Context from the existing `./gx/` project directory so the
# second file can be validated against the same project.

import great_expectations as gx

context_jan = gx.get_context(
    context_root_dir='./gx/',
)

In [83]:
# Register the second CSV file with the Data Context.
# Datasource names must be unique within a Data Context.
datasource_name = 'csv-data-feb'
# `add_or_update_pandas` (rather than `add_pandas`) keeps this cell re-runnable:
# the datasource is persisted in the project, so `add_pandas` fails on a second
# run because the name is already registered.
# NOTE(review): confirm `add_or_update_pandas` is available in the installed
# Great Expectations version.
datasource = context_jan.sources.add_or_update_pandas(name=datasource_name)

# Describe the CSV file as a data asset of that datasource.
asset_name = 'Outlet_Smartphone'
path_to_data = 'P2M3_Fadhilah_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request consumed by checkpoint_2.
batch_request_feb = asset.build_batch_request()

In [84]:
# Register (or overwrite) checkpoint_2, which validates the second file's
# batch against the expectation suite named by `expectation_suite_name`.
checkpoint_2 = context_jan.add_or_update_checkpoint(
    name='checkpoint_2',
    batch_request=batch_request_feb,
    expectation_suite_name=expectation_suite_name,
)

# Run it and keep the returned result object.
checkpoint_result = checkpoint_2.run()

Calculating Metrics:   0%|          | 0/30 [00:00<?, ?it/s]

Checkpoint with data clean (another data) result 60%

In [85]:
# Build data docs

# Use `context_jan`, the context that this section's datasource and
# checkpoint_2 were registered on, instead of the earlier `context` object, so
# the whole section works from a single context instance.
context_jan.build_data_docs()

{'local_site': 'file://c:\\Users\\Fadhilah\\Desktop\\P2M3-Fadhilah\\dags\\gx\\uncommitted/data_docs/local_site/index.html'}