# Data Validation with Great Expectation

In [1]:
import great_expectations as ge
import pandas as pd

In [2]:
# membaca data hasil yang sudah di load dan clean
df = pd.read_csv("P2M3_Muhammad_Aldzahabi_data_clean.csv")
ge_df = ge.from_pandas(df)


In [3]:
# melihat dataset yang sudah di load 
df.head()

Unnamed: 0,id,date,product_name,product_type,brand,gender,category,country,quantity,unit_price,amount,payment_mode,@timestamp
0,347,2022-01-01,Nike Dunk Low,Sneakers,Nike,Women,Streetwear,Japan,2,294.89,589.78,Wallet,2022-01-01
1,258,2022-01-01,Yeezy Boost 350,Sneakers,Adidas,Unisex,Casual,USA,1,202.42,202.42,Wallet,2022-01-01
2,10,2022-01-02,Jordan 1 High,Sneakers,Nike,Unisex,Sportswear,Japan,4,298.22,1192.88,Wallet,2022-01-02
3,320,2022-01-03,Puma Joggers,Joggers,Puma,Women,Limited Edition,Japan,2,267.42,534.84,Cash on Delivery,2022-01-03
4,157,2022-01-03,Fear of God Essentials Tee,T-shirt,Essentials,Unisex,Casual,UK,5,73.58,367.9,UPI,2022-01-03


In [4]:
# memastikan kolom dari date
ge_df.expect_column_to_exist("date")

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# memastikan kolom product_name tidak kosong
ge_df.expect_column_values_to_not_be_null("product_name")

{
  "success": true,
  "result": {
    "element_count": 353,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Memastikan nilai maksimum quantity berada dalam batas wajar
ge_df.expect_column_max_to_be_between(
    column="quantity",
    min_value=1,
    max_value=100
)

{
  "success": true,
  "result": {
    "observed_value": 5,
    "element_count": 353,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Memastikan rata-rata jumlah pembelian berada pada rentang wajar
ge_df.expect_column_mean_to_be_between(
    column="quantity",
    min_value=1,
    max_value=10
)

{
  "success": true,
  "result": {
    "observed_value": 3.1076487252124645,
    "element_count": 353,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Memastikan dataset tidak kosong
ge_df.expect_table_row_count_to_be_between(
    min_value=1,
    max_value=1_000_000
)

{
  "success": true,
  "result": {
    "observed_value": 353
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Memastikan seluruh nilai quantity berada pada rentang wajar
ge_df.expect_column_values_to_be_between(
    column="quantity",
    min_value=0,
    max_value=100
)

{
  "success": true,
  "result": {
    "element_count": 353,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Memastikan kolom id berurutan secara meningkat
ge_df.expect_column_values_to_be_increasing(
    column="id",
    strictly=True
)

{
  "success": false,
  "result": {
    "element_count": 353,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 186,
    "unexpected_percent": 52.69121813031161,
    "unexpected_percent_total": 52.69121813031161,
    "unexpected_percent_nonmissing": 52.69121813031161,
    "partial_unexpected_list": [
      258,
      10,
      157,
      251,
      33,
      226,
      183,
      71,
      287,
      270,
      216,
      132,
      196,
      128,
      47,
      137,
      114,
      185,
      17,
      107
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}