Nama: Muhammad Fariz Firdaus

Batch: SBY002

Objective: This notebook performs data validation on the cleaned dataset using the Great Expectations package.

# I. Libraries

In [1]:
from great_expectations.data_context import FileDataContext

# II. Instantiate Data Context

In [2]:
# Build the file-backed Great Expectations data context, rooted at the
# current working directory.
context = FileDataContext.create(
    project_root_dir='./',
)

# III. Connect to A `Datasource`

In [3]:
# Name of the Datasource; it must be unique among the Datasources in this context.
datasource_name = 'csv-data-emp'
# Use add_or_update (not add) so this cell can be re-run against an existing
# project without failing because the Datasource name is already registered.
# This also matches the add_or_update_* calls used for the suite and checkpoint.
datasource = context.sources.add_or_update_pandas(datasource_name)

# Name of the data asset and the CSV file it reads.
asset_name = 'attrition-data'
# Forward slashes resolve on both Windows and POSIX systems; the previous
# backslash-separated path only resolved on Windows.
path_to_data = 'airflow/data/P2M3_MuhammadFariz_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used to pull data from the asset.
batch_request = asset.build_batch_request()

# IV. Create an Expectation Suite

In [4]:
# Create the expectation suite (or update it if it already exists).
expectation_suite_name = 'expectation-employee-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Bind the suite to the batch request through a validator.
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the first rows to confirm the validator can read the data.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,attrition,business_travel,cf_age_band,cf_attrition_label,department,education_field,emp_no,employee_number,gender,job_role,...,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager
0,Yes,Travel_Rarely,35 - 44,Ex-Employees,Sales,Life Sciences,STAFF-1,1,Female,Sales Executive,...,11,3,1,0,8,1,6,4,0,5
1,No,Travel_Frequently,45 - 54,Current Employees,R&D,Life Sciences,STAFF-2,2,Male,Research Scientist,...,23,4,4,1,10,3,10,7,1,7
2,Yes,Travel_Rarely,35 - 44,Ex-Employees,R&D,Other,STAFF-4,4,Male,Laboratory Technician,...,15,3,2,0,7,3,0,0,0,0
3,No,Travel_Frequently,25 - 34,Current Employees,R&D,Life Sciences,STAFF-5,5,Female,Research Scientist,...,11,3,3,0,8,3,8,7,3,0
4,No,Travel_Rarely,25 - 34,Current Employees,R&D,Medical,STAFF-7,7,Male,Laboratory Technician,...,12,3,4,1,6,3,2,2,2,2


## 1. Expectations

### 1.1. To be Non-Null

In [5]:
# `attrition` must not contain any missing values.
validator.expect_column_values_to_not_be_null(
    column='attrition',
)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.2. To be Unique

In [6]:
# Every `emp_no` value must appear only once.
validator.expect_column_values_to_be_unique(
    column='emp_no',
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.3. To be Between Min and Max Values

In [7]:
# `work_life_balance` scores must fall within the range 1 to 5.
validator.expect_column_values_to_be_between(
    column='work_life_balance',
    min_value=1,
    max_value=5,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.4. To be In a Set

In [8]:
# `business_travel` may only take one of the listed categories.
validator.expect_column_values_to_be_in_set(
    column='business_travel',
    value_set=[
        'Travel_Rarely',
        'Travel_Frequently',
        'Non-Travel',
    ],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.5. To be in a Type List

In [9]:
# `monthly_rate` must be stored as one of the listed numeric types.
# Keyword arguments are used here to match the other expectation cells.
validator.expect_column_values_to_be_in_type_list(
    column='monthly_rate',
    type_list=['int64', 'float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.6. mean to be between

In [10]:
# The mean of `age` must lie between 30 and 40.
validator.expect_column_mean_to_be_between(
    column='age',
    min_value=30,
    max_value=40,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 36.923809523809524
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.7. most common value to be in set

In [11]:
# The most frequent `education` value must be one of the listed degrees.
# "Master's Degree" is capitalised to match the casing of the observed
# "Bachelor's Degree" label; the previous lower-case "master's Degree" would
# not have matched it. (Assumes the column uses that casing -- confirm against
# the cleaned data.)
validator.expect_column_most_common_value_to_be_in_set(
    column='education',
    value_set=["Bachelor's Degree", "Master's Degree"],
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "Bachelor's Degree"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.8. proportion of unique values to be between

In [12]:
# `department` is expected to be a low-cardinality column, so only a small
# fraction of its values should be distinct. The previous bounds of 0 and 1
# cover every possible proportion, so the expectation could never fail.
# The last recorded run observed roughly 0.002; 0.01 leaves headroom for a few
# new departments. NOTE(review): adjust the cap if more departments are expected.
validator.expect_column_proportion_of_unique_values_to_be_between(
    column='department',
    min_value=0,
    max_value=0.01,
)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.0020408163265306124
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.9. max to be between

In [13]:
# The maximum `hourly_rate` must lie between 80 and 100
# (author's note: "the standard is 80").
validator.expect_column_max_to_be_between(
    column='hourly_rate',
    min_value=80,
    max_value=100,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 100
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.10. stdev to be between

In [14]:
# The standard deviation of `relationship_satisfaction` must lie between 1 and 5.
validator.expect_column_stdev_to_be_between(
    column='relationship_satisfaction',
    min_value=1,
    max_value=5,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 1.081208886440361
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### 1.11. Save into Expectation Suite

In [15]:
# Persist the suite; expectations that failed on this batch are kept, not dropped.
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

## 2. Checkpoint

In [16]:
# Register a checkpoint (created, or updated if it already exists) that is
# bound to the validator.
checkpoint_1 = context.add_or_update_checkpoint(
    validator=validator,
    name='checkpoint_1',
)

In [17]:
# Run the checkpoint, which validates the batch against the saved suite.
checkpoint_result = checkpoint_1.run()

# Show the overall pass/fail flag as the cell output, so that a failed
# validation is visible in the notebook instead of being silently discarded.
checkpoint_result.success

Calculating Metrics:   0%|          | 0/37 [00:00<?, ?it/s]

## 3. Data Docs

In [18]:
# Render the Data Docs site. The call returns a mapping of site name to the
# URL of the generated index page, which is displayed as the cell output.
context.build_data_docs()

{'local_site': 'file://c:\\Users\\FARIS\\Documents\\GitHub\\milestone\\p2-ftds002-sby-m3-faris-afk\\gx\\uncommitted/data_docs/local_site/index.html'}