# 1. Introduction

Nama : Erlangga Jayadipraja

Batch : SBY - 002

Objective : validate the cleaned dataset with Great Expectations

# 2. Import Libraries

In [1]:
import pandas as pd
from great_expectations.data_context import FileDataContext

# 3. Data Load

In [2]:
# Read the cleaned dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='P2M3_erlangga_jayadipraja_data_clean.csv')

# Preview the first rows of the frame.
df.head()

Unnamed: 0.1,Unnamed: 0,customer_id,churn,tenure,preferred_login_device,city_tier,warehouse_to_home,preferred_payment_mode,gender,hour_spend_on_app,...,prefered_order_cat,satisfaction_score,marital_status,number_of_address,complain,order_amount_hike_from_last_year,coupon_used,order_count,day_since_last_order,cashback_amount
0,0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,...,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,160
1,3,50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,...,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134
2,5,50006,1,0.0,Computer,1,22.0,Debit Card,Female,3.0,...,Mobile Phone,5,Single,2,1,22.0,4.0,6.0,7.0,139
3,11,50012,1,11.0,Mobile Phone,1,6.0,Debit Card,Male,3.0,...,Fashion,3,Single,10,1,13.0,0.0,1.0,0.0,154
4,12,50013,1,0.0,Phone,1,11.0,COD,Male,2.0,...,Mobile,3,Single,2,1,13.0,2.0,2.0,2.0,134


In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3774 entries, 0 to 3773
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        3774 non-null   int64  
 1   customer_id                       3774 non-null   int64  
 2   churn                             3774 non-null   int64  
 3   tenure                            3774 non-null   float64
 4   preferred_login_device            3774 non-null   object 
 5   city_tier                         3774 non-null   int64  
 6   warehouse_to_home                 3774 non-null   float64
 7   preferred_payment_mode            3774 non-null   object 
 8   gender                            3774 non-null   object 
 9   hour_spend_on_app                 3774 non-null   float64
 10  number_of_device_registered       3774 non-null   int64  
 11  prefered_order_cat                3774 non-null   object 
 12  satisf

# 4. Instantiate Data Context

In [4]:
# Create a file-backed Great Expectations data context rooted at the current directory.
context = FileDataContext.create(project_root_dir="./")

# 5. Connect to A Datasource

In [5]:
# Name of the Datasource; it must be unique among the Datasources of this context.
datasource_name = 'milestone_3_data_clean'

# Use add_or_update so that re-running the notebook against the persisted
# file context replaces the existing definition instead of raising because
# the name is already registered.
datasource = context.sources.add_or_update_pandas(name=datasource_name)

In [6]:
# Register the CSV file as a named data asset of the Datasource.
path_to_data = 'P2M3_erlangga_jayadipraja_data_clean.csv'
asset_name = 'data_clean'
asset = datasource.add_csv_asset(
    name=asset_name,
    filepath_or_buffer=path_to_data,
)


In [7]:
# Build a batch request from the CSV asset; it is passed to the validator created later.
batch_request = asset.build_batch_request()

# 6. Create an Expectation Suite

In [8]:
# Create (or overwrite) the expectation suite that will hold the rules below.
expectation_suite_name = 'expectation-clean-dataset'
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Bind the batch request to the suite through a validator.
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=expectation_suite_name)

# Sanity check: preview the data as seen by the validator.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,customer_id,churn,tenure,preferred_login_device,city_tier,warehouse_to_home,preferred_payment_mode,gender,hour_spend_on_app,...,prefered_order_cat,satisfaction_score,marital_status,number_of_address,complain,order_amount_hike_from_last_year,coupon_used,order_count,day_since_last_order,cashback_amount
0,0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,...,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,160
1,3,50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,...,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134
2,5,50006,1,0.0,Computer,1,22.0,Debit Card,Female,3.0,...,Mobile Phone,5,Single,2,1,22.0,4.0,6.0,7.0,139
3,11,50012,1,11.0,Mobile Phone,1,6.0,Debit Card,Male,3.0,...,Fashion,3,Single,10,1,13.0,0.0,1.0,0.0,154
4,12,50013,1,0.0,Phone,1,11.0,COD,Male,2.0,...,Mobile,3,Single,2,1,13.0,2.0,2.0,2.0,134


# 7. Expectations

## 7.1 - Expectation (to be unique)

In [9]:
# Expectation 1 : every value in column `customer_id` must be unique.

validator.expect_column_values_to_be_unique(column='customer_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3774,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 7.2 - Expectation (to be between min_value and max_value)

In [10]:
# Expectation 2 : values in column `satisfaction_score` must lie between 0 and 5.

validator.expect_column_values_to_be_between(column='satisfaction_score', min_value=0, max_value=5)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3774,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 7.3 - Expectation (to be in set)

In [11]:
# Expectation 3 : column `churn` may only contain one of these two codes:
#   1 -> Churn
#   0 -> No Churn

validator.expect_column_values_to_be_in_set(column='churn', value_set=[0, 1])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3774,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 7.4 - Expectation (to be in type list)

In [12]:
# Expectation 4 : Column `tenure` must be stored as an integer or a float.
# Type names are spelled the way the pandas backend resolves them (NumPy dtype
# names / Python type names); 'integer' is not one of those spellings, so it
# would never match an integer column.

validator.expect_column_values_to_be_in_type_list(
    column='tenure', type_list=['int', 'int64', 'float', 'float64']
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 7.5 - Expectation (to not be null)

In [13]:
# Expectation 5 : column `prefered_order_cat` must not contain missing values.

validator.expect_column_values_to_not_be_null(column='prefered_order_cat')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3774,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 7.6 - Expectation (max_to_be_between)

In [14]:
# Expectation 6 : the maximum of column `complain` must lie between 0 and 1.

validator.expect_column_max_to_be_between(column='complain', min_value=0, max_value=1)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 1
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 7.7 - Expectation (mean_to_be_between)

In [15]:
# Expectation 7 : the mean of column `hour_spend_on_app` must lie between 0 and 5
# (observed mean in this dataset: 2.981187).

validator.expect_column_mean_to_be_between(
    column='hour_spend_on_app',
    min_value=0,
    max_value=5,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 2.9811870694223637
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# 8. Save the Expectation Suite

In [16]:
# Save the expectations recorded on the validator into its expectation suite.
# discard_failed_expectations=False is passed so that failing expectations are not dropped on save.
validator.save_expectation_suite(discard_failed_expectations=False)