In [1]:
# Start from a clean slate: delete any dqm_repo directory left by an earlier run.
!rm -rf dqm_repo
# Scaffold a new Feast repository named dqm_repo.
!feast init dqm_repo
# Switch into the generated feature_repo directory; later cells use paths
# relative to it (e.g. "data/..." and repo_path=".").
%cd dqm_repo/feature_repo/


Creating a new Feast repository in [1m[32m/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo[0m.

/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo/feature_repo


In [2]:
import warnings
import pandas as pd
from IPython.display import display

# Ignore every warning issued through the warnings module from here on.
warnings.filterwarnings('ignore')

In [3]:
# Preview the first rows of the driver statistics parquet file.
driver_stats_path = "data/driver_stats.parquet"
driver_stats_preview = pd.read_parquet(driver_stats_path).head()
display(driver_stats_preview)

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2022-09-01 09:00:00+00:00,1005,0.266191,0.970802,212,2022-09-16 09:45:47.995
1,2022-09-01 10:00:00+00:00,1005,0.936703,0.259681,753,2022-09-16 09:45:47.995
2,2022-09-01 11:00:00+00:00,1005,0.697647,0.878257,422,2022-09-16 09:45:47.995
3,2022-09-01 12:00:00+00:00,1005,0.812464,0.142667,451,2022-09-16 09:45:47.995
4,2022-09-01 13:00:00+00:00,1005,0.452546,0.485283,404,2022-09-16 09:45:47.995


In [4]:
# Apply the repository definitions; the output lists the entity, feature views,
# on demand feature views, feature services and sqlite tables that get created.
!feast apply

Created entity [1m[32mdriver[0m
Created feature view [1m[32mdriver_hourly_stats_fresh[0m
Created feature view [1m[32mdriver_hourly_stats[0m
Created on demand feature view [1m[32mtransformed_conv_rate[0m
Created on demand feature view [1m[32mtransformed_conv_rate_fresh[0m
Created feature service [1m[32mdriver_activity_v2[0m
Created feature service [1m[32mdriver_activity_v3[0m
Created feature service [1m[32mdriver_activity_v1[0m

Created sqlite table [1m[32mdqm_repo_driver_hourly_stats_fresh[0m
Created sqlite table [1m[32mdqm_repo_driver_hourly_stats[0m



In [5]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.dataset import PandasDataset
from great_expectations.core.expectation_suite import ExpectationSuite

# Entity dataframe: the rows we want to enrich with feature values.
# Every event timestamp lies a fixed number of minutes before "now".
minutes_back = (11, 36, 73)
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3],
        "event_timestamp": [
            datetime.now() - timedelta(minutes=offset) for offset in minutes_back
        ],
    }
)

# Open the feature store rooted at the current working directory.
store = FeatureStore(repo_path=".")

# Features to join onto the entity dataframe, all from driver_hourly_stats.
requested_features = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "driver_hourly_stats:avg_daily_trips",
]
training_data_job = store.get_historical_features(
    entity_df=entity_df,
    features=requested_features,
)

  from urllib3.contrib.pyopenssl import orig_util_SSLContext as SSLContext


In [6]:
# Save the retrieval job's result as a named dataset stored in a parquet file.
reference_storage = SavedDatasetFileStorage(path='data/saved_reference_dataset.parquet')
reference_dataset = store.create_saved_dataset(
    from_=training_data_job,
    name="saved_reference_dataset",
    storage=reference_storage,
)

print(reference_dataset)

{
  "spec": {
    "name": "saved_reference_dataset",
    "features": [
      "driver_hourly_stats:conv_rate",
      "driver_hourly_stats:acc_rate",
      "driver_hourly_stats:avg_daily_trips"
    ],
    "joinKeys": [
      "driver_id",
      "label_driver_reported_satisfaction"
    ],
    "storage": {
      "fileStorage": {
        "fileFormat": {
          "parquetFormat": {}
        },
        "uri": "data/saved_reference_dataset.parquet"
      }
    }
  },
  "meta": {
    "createdTimestamp": "2022-09-16T15:46:34.876586Z",
    "minEventTimestamp": "2022-09-16T08:33:34.725403Z",
    "maxEventTimestamp": "2022-09-16T09:35:34.725392Z"
  }
}




In [7]:
# This should pass
@ge_profiler
def user_features_profiler(dataset: PandasDataset) -> ExpectationSuite:
    """Build a suite that checks driver_id exists and bounds the three features.

    avg_daily_trips is expected within [0, 1000]; conv_rate and acc_rate
    within [0, 1].
    """
    dataset.expect_column_to_exist("driver_id")
    bounded_columns = (
        ("avg_daily_trips", 1000),
        ("conv_rate", 1),
        ("acc_rate", 1),
    )
    for column, upper_bound in bounded_columns:
        dataset.expect_column_values_to_be_between(column, 0, upper_bound)
    suite = dataset.get_expectation_suite()
    return suite

# This should fail
@ge_profiler
def user_features_profiler_fail(dataset: PandasDataset) -> ExpectationSuite:
    """Build a suite whose only expectation is a column named "something_random"."""
    column_name = "something_random"
    dataset.expect_column_to_exist(column_name)
    suite = dataset.get_expectation_suite()
    return suite

# This should be mixed
@ge_profiler
def user_features_profiler_partialfail(dataset: PandasDataset) -> ExpectationSuite:
    """Build a suite expecting both "something_random" and "driver_id" to exist."""
    for column_name in ("something_random", "driver_id"):
        dataset.expect_column_to_exist(column_name)
    suite = dataset.get_expectation_suite()
    return suite

# Load the saved dataset and derive one validation reference per profiler.
# Each reference is named after the profiler it wraps so the three stay
# distinguishable from one another.
ds = store.get_saved_dataset('saved_reference_dataset')
validation_reference = ds.as_reference(name='user_features_profiler', profiler=user_features_profiler)
validation_reference_fail = ds.as_reference(name='user_features_profiler_fail', profiler=user_features_profiler_fail)
validation_reference_partialfail = ds.as_reference(
    name='user_features_profiler_partialfail',
    profiler=user_features_profiler_partialfail,
)



In [8]:
# This profiler yields 4 items
# Profile the saved dataset once and reuse the result for both prints instead
# of re-running the profiler for each one.
passing_profile = ds.get_profile(profiler=user_features_profiler)
print(f"\n***this profiler has {len(passing_profile.expectation_suite['expectations'])} items***\n")

print(passing_profile)


***this profiler has 4 items***

<GEProfile with expectations: [
  {
    "meta": {},
    "kwargs": {
      "column": "driver_id"
    },
    "expectation_type": "expect_column_to_exist"
  },
  {
    "meta": {},
    "kwargs": {
      "column": "avg_daily_trips",
      "min_value": 0,
      "max_value": 1000
    },
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "meta": {},
    "kwargs": {
      "column": "conv_rate",
      "min_value": 0,
      "max_value": 1
    },
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "meta": {},
    "kwargs": {
      "column": "acc_rate",
      "min_value": 0,
      "max_value": 1
    },
    "expectation_type": "expect_column_values_to_be_between"
  }
]>


In [9]:
# This profiler yields 0 items
# Profile the saved dataset once and reuse the result for both prints; a bare
# get_profile call in the middle of a cell displays nothing, so it is not needed.
failing_profile = ds.get_profile(profiler=user_features_profiler_fail)
print(f"\n***this profiler has {len(failing_profile.expectation_suite['expectations'])} items***\n")

print(failing_profile)


***this profiler has 0 items***

<GEProfile with expectations: []>


In [11]:
# This profiler yields 1 item
# Profile the saved dataset once and reuse the result for both prints instead
# of re-running the profiler for each one.
partialfail_profile = ds.get_profile(profiler=user_features_profiler_partialfail)
print(f"\n***this profiler has {len(partialfail_profile.expectation_suite['expectations'])} items***\n")

print(partialfail_profile)


***this profiler has 1 items***

<GEProfile with expectations: [
  {
    "meta": {},
    "kwargs": {
      "column": "driver_id"
    },
    "expectation_type": "expect_column_to_exist"
  }
]>


# Build the training data, validating it against each profiler

In [12]:
# This validation should pass

# Wrap the saved dataset and the passing profiler in a validation reference,
# then materialize the training data with that reference applied.
passing_reference = store.get_saved_dataset("saved_reference_dataset").as_reference(
    name='test',
    profiler=user_features_profiler,
)
new_data = training_data_job.to_df(validation_reference=passing_reference)
display(new_data)



Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-16 09:10:34.725401+00:00,0.304093,0.241745,275
1,1001,1,2022-09-16 09:35:34.725392+00:00,0.541217,0.849682,610
2,1003,3,2022-09-16 08:33:34.725403+00:00,0.403135,0.962808,471


In [13]:
# This validation should fail
# NOTE(review): the output recorded with this notebook shows a DataFrame for
# this cell rather than an error - confirm whether that is the intended outcome.

# Wrap the saved dataset and the failing profiler in a validation reference,
# then materialize the training data with that reference applied.
failing_reference = store.get_saved_dataset("saved_reference_dataset").as_reference(
    name='test_fail',
    profiler=user_features_profiler_fail,
)
new_data_fail = training_data_job.to_df(validation_reference=failing_reference)
display(new_data_fail)



Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-16 09:10:34.725401+00:00,0.304093,0.241745,275
1,1001,1,2022-09-16 09:35:34.725392+00:00,0.541217,0.849682,610
2,1003,3,2022-09-16 08:33:34.725403+00:00,0.403135,0.962808,471


In [14]:
# This validation should partially fail
# NOTE(review): the output recorded with this notebook shows a DataFrame for
# this cell rather than an error - confirm whether that is the intended outcome.

# Wrap the saved dataset and the mixed profiler in a validation reference,
# then materialize the training data with that reference applied.
partialfail_reference = store.get_saved_dataset("saved_reference_dataset").as_reference(
    name='test_partialfail',
    profiler=user_features_profiler_partialfail,
)
new_data_partialfail = training_data_job.to_df(validation_reference=partialfail_reference)
display(new_data_partialfail)



Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-16 09:10:34.725401+00:00,0.304093,0.241745,275
1,1001,1,2022-09-16 09:35:34.725392+00:00,0.541217,0.849682,610
2,1003,3,2022-09-16 08:33:34.725403+00:00,0.403135,0.962808,471


# END