In [None]:
'''
=================================================
Milestone 3

Nama  : Gede Aldi
Batch : FTDS-RMT-029

Program ini dibuat untuk melakukan validasi data dengan menggunakan great expectations dimana terdapat 7 great expectations yang digunakan.
Data yang digunakan adalah data performa dan attrition karyawan yang sudah bersih hasil cleaning menggunakan scheduling apache airflow
=================================================
'''

In [1]:
from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

In [3]:
datasource_name = 'm3_clean_2'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'EmployeeAttrition'
path_to_data = r'C:\Users\User\github-classroom\FTDS-assignment-bay\p2-ftds029-rmt-m3-gedealdi28\dags\P2M3_GedeAldi_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

In [4]:
expectation_suite_name = 'expectation-employeeattrition-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,unnamed_0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,...,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager
0,0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,...,1,80,0,8,0,1,6,4,0,5
1,1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,...,4,80,1,10,3,3,10,7,1,7
2,2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,...,2,80,0,7,3,3,0,0,0,0
3,3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,...,3,80,0,8,3,3,8,7,3,0
4,4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,...,4,80,1,6,3,3,2,2,2,2


In [5]:
# Expectation 1 : Column `employeenumber` must be unique

validator.expect_column_values_to_be_unique('employeenumber')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Expectation 2 : Column `age` must be between 0-60 

validator.expect_column_values_to_be_between(
    column='age', min_value=0, max_value=60
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 3 : Column `montlyrate` must in form of integer or float

validator.expect_column_values_to_be_of_type("monthlyrate", "int64")

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 4 : Column 'employeecount' has 1470 rows
validator.expect_column_value_lengths_to_equal("employeecount",1)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [28]:
# Expectation 5 : column 'standardhours' has mean value of 80

validator.expect_column_mean_to_be_between("standardhours",80)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 80.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 6 : Column worklifebalance has a value between 0-4

validator.expect_column_proportion_of_unique_values_to_be_between("worklifebalance", min_value=0, max_value=4)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.0027210884353741495
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
#Expectation 7 : column attrition has 'yes' and 'no' value

validator.expect_column_values_to_be_in_set('attrition', ['Yes', 'No'])

Calculating Metrics:   0%|          | 0/8 [00:01<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1470,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}