In [1]:
!rm -rf dqm_repo
!feast init dqm_repo
%cd dqm_repo


Creating a new Feast repository in [1m[32m/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo[0m.

/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo


In [2]:
import pandas as pd

pd.read_parquet("data/driver_stats.parquet")

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2022-08-30 11:00:00+00:00,1005,0.452095,0.635198,696,2022-09-14 11:40:22.613
1,2022-08-30 12:00:00+00:00,1005,0.278842,0.491479,807,2022-09-14 11:40:22.613
2,2022-08-30 13:00:00+00:00,1005,0.199585,0.544063,44,2022-09-14 11:40:22.613
3,2022-08-30 14:00:00+00:00,1005,0.234893,0.304014,715,2022-09-14 11:40:22.613
4,2022-08-30 15:00:00+00:00,1005,0.462757,0.025234,425,2022-09-14 11:40:22.613
...,...,...,...,...,...,...
1802,2022-09-14 09:00:00+00:00,1001,0.034532,0.964205,902,2022-09-14 11:40:22.613
1803,2022-09-14 10:00:00+00:00,1001,0.045746,0.353958,86,2022-09-14 11:40:22.613
1804,2021-04-12 07:00:00+00:00,1001,0.238023,0.343046,1,2022-09-14 11:40:22.613
1805,2022-09-06 23:00:00+00:00,1003,0.155819,0.030764,263,2022-09-14 11:40:22.613


In [3]:
!feast apply

Created entity [1m[32mdriver[0m
Created feature view [1m[32mdriver_hourly_stats[0m
Created feature service [1m[32mdriver_activity[0m

Created sqlite table [1m[32mdqm_repo_driver_hourly_stats[0m



In [4]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.dataset import PandasDataset
from great_expectations.core.expectation_suite import ExpectationSuite

# The entity dataframe is the dataframe we want to enrich with feature values
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3], 
        "event_timestamp": [
            datetime.now() - timedelta(minutes=11),
            datetime.now() - timedelta(minutes=36),
            datetime.now() - timedelta(minutes=73),
        ],
        "created": datetime.now(),
    }
)

store = FeatureStore(repo_path=".")

training_data_job = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ], 
)

  __version_info__ = tuple(LooseVersion(__version__).version)


In [5]:
training_data_job.to_df()

Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,created__,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-14 11:04:58.587163+00:00,2022-09-14 11:40:22.613,0.825219,0.345714,569
1,1001,1,2022-09-14 11:29:58.587153+00:00,2022-09-14 11:40:22.613,0.045746,0.353958,86
2,1003,3,2022-09-14 10:27:58.587165+00:00,2022-09-14 11:40:22.613,0.531909,0.257,982


In [6]:
reference_dataset = store.create_saved_dataset(
    from_=training_data_job,
    name="reference_dataset",
    storage=SavedDatasetFileStorage(path='data/driver_stats.parquet')
)

print(reference_dataset)

{
  "spec": {
    "name": "reference_dataset",
    "features": [
      "driver_hourly_stats:conv_rate",
      "driver_hourly_stats:acc_rate",
      "driver_hourly_stats:avg_daily_trips"
    ],
    "joinKeys": [
      "created",
      "driver_id",
      "label_driver_reported_satisfaction"
    ],
    "storage": {
      "fileStorage": {
        "fileFormat": {
          "parquetFormat": {}
        },
        "uri": "data/driver_stats.parquet"
      }
    }
  },
  "meta": {
    "createdTimestamp": "2022-09-14T17:41:13.968677Z",
    "minEventTimestamp": "2022-09-14T10:27:58.587165Z",
    "maxEventTimestamp": "2022-09-14T11:29:58.587153Z"
  }
}




In [7]:
@ge_profiler
def user_features_profiler(dataset: PandasDataset) -> ExpectationSuite:
    print(dataset.columns)
    dataset.expect_column_to_exist("driver_id")
    dataset.expect_column_values_to_be_between("driver_hourly_stats__avg_daily_trips", 0, 1000)
    dataset.expect_column_values_to_be_between("driver_hourly_stats__conv_rate", 0, 1)
    dataset.expect_column_values_to_be_between("driver_hourly_stats__acc_rate", 0, 1)
    return dataset.get_expectation_suite()

ds = store.get_saved_dataset('reference_dataset')



In [8]:
validation_reference = ds.as_reference(name='user_features_profiler', profiler=user_features_profiler)
_ = training_data_job.to_df(validation_reference=validation_reference)

KeyError: "['created'] not in index"

In [None]:
training_data_job.to_df()