In [2]:
# Create a data context
from great_expectations.data_context import FileDataContext
import pandas as pd

context = FileDataContext.create(project_root_dir='./')

In [4]:
# Karena ada kolom index maka dilakukan data cleaning, hal ini dilakukan karena waktu export data pada elasticsearch tanpa index tidak bisa dijalankan
# ditambah juga kolom index ini akan digunakan ID
df = pd.read_csv('dags/P2M3_fikri_data_clean.csv')
df = df.rename(columns={'Unnamed: 0':'ID'})
df.to_csv('P2M3_fikri_data_clean_gx.csv', index=False)

In [5]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'data-clean-dropout-succes-academic-csv'
datasource = context.sources.add_pandas(datasource_name)

asset_name = 'data-dropout-succes-academic'
path_to_data = 'P2M3_fikri_data_clean_gx.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()


In [6]:
# Creat an expectation suite
expectation_suite_name = 'expectation-dropout-succes-academic'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,ID,marital_status,application_mode,application_order,course,daytimeevening_attendancet,previous_qualification,previous_qualification_grade,nacionality,mothers_qualification,...,curricular_units_2nd_sem_credited,curricular_units_2nd_sem_enrolled,curricular_units_2nd_sem_evaluations,curricular_units_2nd_sem_approved,curricular_units_2nd_sem_grade,curricular_units_2nd_sem_without_evaluations,unemployment_rate,inflation_rate,gdp,target
0,0,1,17,5,171,1,1,122,1,19,...,0,0,0,0,0,0,10,1,1,Dropout
1,1,1,15,1,9254,1,1,160,1,1,...,0,6,6,6,13,0,13,0,0,Graduate
2,2,1,1,5,9070,1,1,122,1,37,...,0,6,0,0,0,0,10,1,1,Dropout
3,3,1,17,2,9773,1,1,122,1,38,...,0,6,10,5,12,0,9,0,-3,Graduate
4,4,2,39,1,8014,0,1,100,1,37,...,0,6,6,6,13,0,13,0,0,Graduate


In [7]:
df

Unnamed: 0,ID,marital_status,application_mode,application_order,course,daytimeevening_attendancet,previous_qualification,previous_qualification_grade,nacionality,mothers_qualification,...,curricular_units_2nd_sem_credited,curricular_units_2nd_sem_enrolled,curricular_units_2nd_sem_evaluations,curricular_units_2nd_sem_approved,curricular_units_2nd_sem_grade,curricular_units_2nd_sem_without_evaluations,unemployment_rate,inflation_rate,gdp,target
0,0,1,17,5,171,1,1,122,1,19,...,0,0,0,0,0,0,10,1,1,Dropout
1,1,1,15,1,9254,1,1,160,1,1,...,0,6,6,6,13,0,13,0,0,Graduate
2,2,1,1,5,9070,1,1,122,1,37,...,0,6,0,0,0,0,10,1,1,Dropout
3,3,1,17,2,9773,1,1,122,1,38,...,0,6,10,5,12,0,9,0,-3,Graduate
4,4,2,39,1,8014,0,1,100,1,37,...,0,6,6,6,13,0,13,0,0,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,4419,1,1,6,9773,1,1,125,1,1,...,0,6,8,5,12,0,15,2,-4,Graduate
4420,4420,1,1,2,9773,1,1,120,105,1,...,0,6,6,2,11,0,11,0,2,Dropout
4421,4421,1,1,1,9500,1,1,154,1,37,...,0,8,9,1,13,0,13,0,0,Dropout
4422,4422,1,1,1,9147,1,1,180,1,37,...,0,5,6,5,12,0,9,0,-3,Graduate


### Ekspektasi 1

Pada kolom `ID` dilakukan ekspektasi `to be unique` data dimana data harus uniq karena menandakan 1 ID 1 Siswa, jika ada yang double maka data duplikat dan harus dihandling (biasanya dihapus jika ada kesamaan data)

In [8]:
# Expectation 1 : Column `ID` must be unique

validator.expect_column_values_to_be_unique('ID')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4424,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Ekspektasi 2

Pada kolom `admission_grade` dilakukan ekspektasi menggunakan `min max (between)` dimana data nilai tidak boleh kurang dari 0 dan tidak boleh lebih dari 200, karena data tidak dapat dikategorikan jika diluar dari itu.

In [9]:
# Expectation 2 : Column `Admission Grade` must be in range 0 to 200

validator.expect_column_values_to_be_between(
    column='admission_grade', min_value=0, max_value=200
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4424,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Ekspektasi 3

Melakukan pengecekan ekspektasi menggunakan `set` pada kolom `marital_status` dan `target`:
- `marital_status`
    - `1 = single`
    - `2 = married`
    - `3 = widower`
    - `4 = divorced`
    - `5 = facto union`
    - `6 = legally separated`


Alasan menggunakan ekspektasi ini untuk memastikan bahwa setiap nilai dalam kolom ini adalah salah satu dari status perkawinan yang telah ditentukan.

- `target`
    - `dropout`
    - `enrolled`
    - `graduate`

Alasan menggunakan ekspektasi ini untuk memastikan bahwa setiap nilai dalam kolom ini adalah salah satu dari kategori target pendidikan yang telah ditetapkan


#### Marital Status

In [10]:
validator.expect_column_values_to_be_in_set('marital_status', [1, 2, 3, 4, 5, 6])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4424,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Target

In [11]:
validator.expect_column_values_to_be_in_set('target', ['dropout', 'enrolled', 'graduate'])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 4424,
    "unexpected_count": 4424,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "Dropout",
      "Graduate",
      "Dropout",
      "Graduate",
      "Graduate",
      "Graduate",
      "Graduate",
      "Dropout",
      "Graduate",
      "Dropout",
      "Graduate",
      "Graduate",
      "Dropout",
      "Graduate",
      "Graduate",
      "Dropout",
      "Enrolled",
      "Graduate",
      "Graduate",
      "Enrolled"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Ekspektasi 4

Pada kolom `inflation_rate` dilakukan ekspektasi `type_list` untuk memastikan bahwa kolom inflation_rate hanya berisi nilai bertipe integer atau float. Jika terdapat nilai dengan tipe data lain, akan dianggap sebagai pelanggaran ekspektasi.

In [12]:
# Expectation 4 : Column `inflation_rate` must in form of integer or float

validator.expect_column_values_to_be_in_type_list('inflation_rate', ['int64', 'float64'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Ekspektasi 5

Pada kolom `curricular_units_1st_sem_grade` dilakukan ekspektasi `mean` untuk mengetahui siswa yang nilainya diatas rata-rata pada semester 1.

In [13]:
min_cu1sg = df['curricular_units_1st_sem_grade'].min()
max_cu1sg = df['curricular_units_1st_sem_grade'].max()
validator.expect_column_mean_to_be_between("curricular_units_1st_sem_grade", min_value=min_cu1sg, max_value=max_cu1sg)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 10.297920433996383
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Ekspektasi 6

Pada kolom `nacionality` dilakukan pengecekan ekspetasi bahwa dataset ini didapatkan dari negara `portugal` sehingga bahasa yang digunakan oleh murid mayoritas menggunakan bahasa `portugis` dan kode kelima negara `(Portugis, Brazil, Spanyol, Santomean, Cape Verdean)` tersebut memiliki hubungan dekat secara bahasa yaitu bahasa `portugis`. Ekspetasi menggunakan `disctinct values to contain set`.

In [14]:
validator.expect_column_distinct_values_to_contain_set("nacionality", value_set=[1, 41, 26, 22, 6])

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      1,
      2,
      6,
      11,
      13,
      14,
      17,
      21,
      22,
      24,
      25,
      26,
      32,
      41,
      62,
      100,
      101,
      103,
      105,
      108,
      109
    ],
    "details": {
      "value_counts": [
        {
          "value": 1,
          "count": 4314
        },
        {
          "value": 2,
          "count": 2
        },
        {
          "value": 6,
          "count": 13
        },
        {
          "value": 11,
          "count": 3
        },
        {
          "value": 13,
          "count": 1
        },
        {
          "value": 14,
          "count": 1
        },
        {
          "value": 17,
          "count": 1
        },
        {
          "value": 21,
          "count": 2
        },
        {
          "value": 22,
          "count": 13
        },
        {
          "value": 24,
          "count": 5
        },
        {
          "value": 

### Ekspektasi 7

Melakukan ekspektasi bahwa usia pendaftar minimum harus `17` tahun, menggunakan ekspektasi `column_min_to_be_between`.

In [15]:
min_age = df['age_at_enrollment'].min()
max_age = df['age_at_enrollment'].max()
validator.expect_column_min_to_be_between("age_at_enrollment", min_values=min_age, max_values=max_age)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 17
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}