# 1 - Introduction

Name : Gilang Wiradhyaksa

Data : [Ocean Cafe](https://www.kaggle.com/datasets/gladinvarghese/cafeocean)

Objective : The purpose of this task is to build data validation checks with Great Expectations that confirm the cleaned data is ready to use.

# 2 - Import Libraries

In [27]:
# Create a file-backed Great Expectations data context rooted at the current
# working directory ('./'). All datasources, suites and validators below are
# registered through this `context` object.
from great_expectations.data_context import FileDataContext
context = FileDataContext.create(project_root_dir='./')

# 3 - Data Loading

In [28]:
# Give a name to the Datasource. This name must be unique between Datasources.
# `add_or_update_pandas` is used instead of `add_pandas` so that this cell can
# be re-run: the context is file-backed, so a Datasource registered by a
# previous run may already exist under the same name.
datasource_name = 'CSV_Clean'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the cleaned CSV file as a data asset of that Datasource
asset_name = 'Restaurant_Order'
path_to_data = 'P2M3_gilang_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator uses to load the asset
batch_request = asset.build_batch_request()

In [29]:
# Create (or overwrite) the expectation suite that will hold the checks
expectation_suite_name = 'expectation-restaurant-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Bind the batch request to the suite above through a validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows to confirm the validator loaded the data
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,date,bill_number,item_desc,time,quantity,rate,tax,discount,total,category,year,month,month_name,day_name,is_weekend
0,0,2020-01-01,G0470115,MINERAL WATER(1000ML),01:15:11,1,50.0,11.88,0.0,61.88,Beverage,2020,1,January,Wednesday,No
1,1,2020-01-01,G0470115,MONSOON MALABAR (AULAIT),01:15:11,1,100.0,23.75,0.0,123.75,Beverage,2020,1,January,Wednesday,No
2,2,2020-01-01,G0470116,MASALA CHAI CUTTING,01:17:35,1,40.0,9.5,0.0,49.5,Beverage,2020,1,January,Wednesday,No
3,3,2020-01-01,G0470117,MINERAL WATER(1000ML),01:19:55,1,50.0,11.88,0.0,61.88,Beverage,2020,1,January,Wednesday,No
4,4,2020-01-01,G0470283,MOROCCAN MINT TEA,01:20:18,1,45.0,10.69,0.0,55.69,Beverage,2020,1,January,Wednesday,No


# 4 - Data Validation

## 4.1 - Validate Unique Data

In [30]:
# Expectation 1 : every value in column `id` must be unique

X = validator.expect_column_values_to_be_unique(column='id')
# Show only the pass/fail flag of the validation result
X.success

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

True

## 4.2 - Validate Minimum and Maximum Value

In [31]:
# Expectation 2 : values in column `rate` must lie between 0 and 2100

validator.expect_column_values_to_be_between(
    column='rate',
    min_value=0,
    max_value=2100,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 145150,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 4.3 - Validate Set Data

In [35]:
# Expectation 3 : every value in column `category` must be one of the
# category names listed in `catColumn`

catColumn = [
    'Beverage',
    'Food',
    'Liquor',
    'Merchandise',
    'Tobacco',
    'Misc',
    'Wines',
    'Liquor & Tobacco',
]
validator.expect_column_values_to_be_in_set(column='category', value_set=catColumn)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 145150,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 4.4 - Validate Data Type

In [36]:
# Expectation 4 : column `total` must be stored as an integer or a float
#
# The pandas backend matches type names against numpy dtype names and native
# Python type names. 'integer' is neither, so it would never match an integer
# column; 'int' / 'int64' are used instead, alongside 'float' / 'float64'.

validator.expect_column_values_to_be_in_type_list(
    'total', ['int', 'int64', 'float', 'float64']
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 4.5 - Validate Greater Than

In [40]:
# Expectation 5 : in every row, `total` must be greater than `tax`

validator.expect_column_pair_values_a_to_be_greater_than_b(
    column_A='total',
    column_B='tax',
)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 145150,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 4.6 - Validate Data Length

In [45]:
# Expectation 6 : the length of each value in column `quantity` must be
# between 1 and 2
# NOTE(review): `quantity` looks numeric in the data preview; presumably the
# length is measured on its string form (i.e. 1-2 digits) - confirm.

validator.expect_column_value_lengths_to_be_between(
    column='quantity',
    min_value=1,
    max_value=2,
)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 145150,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 4.7 - Validate Data Count

In [55]:
# Expectation 7 : the table must contain between 1 and 150000 rows

validator.expect_table_row_count_to_be_between(
    min_value=1,
    max_value=150000,
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 145150
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}