In [1]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext
import warnings

# Silence DeprecationWarning messages so they do not clutter the cell output.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Load the cleaned dataset that the expectations in this notebook validate.
df = pd.read_csv("project-m3/dags/P2M3_handwitanto_abraham_data_clean.csv")

# Obtain a Great Expectations data context. Note that no project_root_dir (or
# any other argument) is passed here.
# NOTE(review): which kind of context get_context() returns with no arguments
# depends on the environment — confirm this is the intended one.
context = ge.get_context()

# Only reached when get_context() returned without raising.
print("Great Expectations context initialized successfully")

Great Expectations context initialized successfully


In [3]:
# Register a pandas data source on the context, then attach a dataframe asset
# to it; both objects are reused by the cells that follow.
# NOTE(review): re-running this cell against a context that already holds a
# data source named "pandas_3" may fail on the duplicate name — confirm.
data_source = context.data_sources.add_pandas(name="pandas_3")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

In [4]:
# Define a batch that covers the whole dataframe of the asset.
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    "batch definition"
)

# Materialize the batch by supplying the loaded dataframe as its runtime parameter.
batch = batch_definition.get_batch(
    batch_parameters={"dataframe": df},
)

In [5]:
# Expectation: the user_id column holds no duplicate values.
expectation = ge.expectations.ExpectColumnValuesToBeUnique(column="user_id")

# Run the expectation against the batch and print the validation result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "batch_id": "pandas_3-pd dataframe asset",
      "column": "user_id"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1320,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [6]:
# Vehicle models that are accepted in the vehicle_model column.
valid_vehicle_models = [
    "BMW i3",
    "Tesla Model 3",
    "Nissan Leaf",
    "Chevy Bolt",
    "Hyundai Kona",
]

# Expectation: every vehicle_model value is one of the accepted models.
expectation = ge.expectations.ExpectColumnValuesToBeInSet(
    column="vehicle_model", value_set=valid_vehicle_models
)

# Run the expectation against the batch and print the validation result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "batch_id": "pandas_3-pd dataframe asset",
      "column": "vehicle_model",
      "value_set": [
        "BMW i3",
        "Tesla Model 3",
        "Nissan Leaf",
        "Chevy Bolt",
        "Hyundai Kona"
      ]
    },
    "meta": {}
  },
  "result": {
    "element_count": 1320,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [7]:
# Expectation: the temperature_c column has a type listed in type_list.
expectation = ge.expectations.ExpectColumnValuesToBeInTypeList(
    column="temperature_c", type_list=["FLOAT"]
)

# Run the expectation against the batch and print the validation result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_in_type_list",
    "kwargs": {
      "batch_id": "pandas_3-pd dataframe asset",
      "column": "temperature_c",
      "type_list": [
        "FLOAT"
      ]
    },
    "meta": {}
  },
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [8]:
# Expectation: charging_duration_hours values lie between 0 and 24.
expectation = ge.expectations.ExpectColumnValuesToBeBetween(
    column="charging_duration_hours", min_value=0, max_value=24
)

# Run the expectation against the batch and print the validation result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_between",
    "kwargs": {
      "batch_id": "pandas_3-pd dataframe asset",
      "column": "charging_duration_hours",
      "min_value": 0.0,
      "max_value": 24.0
    },
    "meta": {}
  },
  "result": {
    "element_count": 1320,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [9]:
# Expectation: charging_cost_usd contains no null values.
expectation = ge.expectations.ExpectColumnValuesToNotBeNull(column="charging_cost_usd")

# Run the expectation against the batch and print the validation result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_not_be_null",
    "kwargs": {
      "batch_id": "pandas_3-pd dataframe asset",
      "column": "charging_cost_usd"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1320,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [10]:
# Expectation: the maximum of charging_cost_usd lies between 0 and 100.
expectation = ge.expectations.ExpectColumnMaxToBeBetween(
    column="charging_cost_usd", min_value=0, max_value=100
)

# Run the expectation against the batch and print the validation result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_max_to_be_between",
    "kwargs": {
      "batch_id": "pandas_3-pd dataframe asset",
      "column": "charging_cost_usd",
      "min_value": 0.0,
      "max_value": 100.0
    },
    "meta": {}
  },
  "result": {
    "observed_value": 69.4077431939
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [11]:
# Expectation: the distinct values of day_of_week include all seven weekday names.
expectation = ge.expectations.ExpectColumnDistinctValuesToContainSet(
    column="day_of_week",
    value_set=[
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ],
)

# Run the expectation against the batch and print the validation result.
validation_result = batch.validate(expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_distinct_values_to_contain_set",
    "kwargs": {
      "batch_id": "pandas_3-pd dataframe asset",
      "column": "day_of_week",
      "value_set": [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday"
      ]
    },
    "meta": {}
  },
  "result": {
    "observed_value": [
      "Friday",
      "Monday",
      "Saturday",
      "Sunday",
      "Thursday",
      "Tuesday",
      "Wednesday"
    ],
    "details": {
      "value_counts": [
        {
          "value": "Friday",
          "count": 188
        },
        {
          "value": "Monday",
          "count": 185
        },
        {
          "value": "Saturday",
          "count": 205
        },
        {
          "value": "Sunday",
          "count": 191
        },
        {
          "value": "Thursday",
          "count": 154
        },
        {
          "value": "Tues