In [1]:
!rm -rf dqm_repo
!feast init dqm_repo
%cd dqm_repo/feature_repo/


Creating a new Feast repository in [1m[32m/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo[0m.

/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo/feature_repo


In [2]:
import pandas as pd

pd.read_parquet("data/driver_stats.parquet").head()

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2022-08-31 14:00:00+00:00,1005,0.93849,0.032503,371,2022-09-15 14:14:30.704
1,2022-08-31 15:00:00+00:00,1005,0.12832,0.361154,883,2022-09-15 14:14:30.704
2,2022-08-31 16:00:00+00:00,1005,0.617912,0.460527,406,2022-09-15 14:14:30.704
3,2022-08-31 17:00:00+00:00,1005,0.001669,0.976201,713,2022-09-15 14:14:30.704
4,2022-08-31 18:00:00+00:00,1005,0.063925,0.091771,860,2022-09-15 14:14:30.704


In [None]:
!feast apply

Created entity [1m[32mdriver[0m
Created feature view [1m[32mdriver_hourly_stats_fresh[0m
Created feature view [1m[32mdriver_hourly_stats[0m
Created on demand feature view [1m[32mtransformed_conv_rate[0m
Created on demand feature view [1m[32mtransformed_conv_rate_fresh[0m
Created feature service [1m[32mdriver_activity_v2[0m
Created feature service [1m[32mdriver_activity_v1[0m
Created feature service [1m[32mdriver_activity_v3[0m

Created sqlite table [1m[32mdqm_repo_driver_hourly_stats_fresh[0m
Created sqlite table [1m[32mdqm_repo_driver_hourly_stats[0m



In [4]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.dataset import PandasDataset
from great_expectations.core.expectation_suite import ExpectationSuite

# The entity dataframe is the dataframe we want to enrich with feature values
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3], 
        "event_timestamp": [
            datetime.now() - timedelta(minutes=11),
            datetime.now() - timedelta(minutes=36),
            datetime.now() - timedelta(minutes=73),
        ],
    }
)

store = FeatureStore(repo_path=".")

training_data_job = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ], 
)

  from urllib3.contrib.pyopenssl import orig_util_SSLContext as SSLContext


In [5]:
training_data_job.to_df()

Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-15 13:39:22.878585+00:00,0.134408,0.616837,548
1,1001,1,2022-09-15 14:04:22.878576+00:00,0.939694,0.44357,415
2,1003,3,2022-09-15 13:02:22.878587+00:00,0.052578,0.218352,935


In [6]:
training_data_job

<feast.infra.offline_stores.file.FileRetrievalJob at 0x133abe940>

In [7]:
reference_dataset = store.create_saved_dataset(
    from_=training_data_job,
    name="saved_reference_dataset",
    storage=SavedDatasetFileStorage(path='data/saved_reference_dataset.parquet')
)

print(reference_dataset)

{
  "spec": {
    "name": "saved_reference_dataset",
    "features": [
      "driver_hourly_stats:conv_rate",
      "driver_hourly_stats:acc_rate",
      "driver_hourly_stats:avg_daily_trips"
    ],
    "joinKeys": [
      "label_driver_reported_satisfaction",
      "driver_id"
    ],
    "storage": {
      "fileStorage": {
        "fileFormat": {
          "parquetFormat": {}
        },
        "uri": "data/saved_reference_dataset.parquet"
      }
    }
  },
  "meta": {
    "createdTimestamp": "2022-09-15T20:15:42.783687Z",
    "minEventTimestamp": "2022-09-15T13:02:22.878587Z",
    "maxEventTimestamp": "2022-09-15T14:04:22.878576Z"
  }
}




In [11]:
# This should pass
@ge_profiler
def user_features_profiler(dataset: PandasDataset) -> ExpectationSuite:
    print(dataset.columns)
    dataset.expect_column_to_exist("driver_id")
    dataset.expect_column_values_to_be_between("avg_daily_trips", 0, 1000)
    dataset.expect_column_values_to_be_between("conv_rate", 0, 1)
    dataset.expect_column_values_to_be_between("acc_rate", 0, 1)
    return dataset.get_expectation_suite()

# This should fail
@ge_profiler
def user_features_profiler_fail(dataset: PandasDataset) -> ExpectationSuite:
    print(dataset.columns)
    dataset.expect_column_to_exist("something_random")
    return dataset.get_expectation_suite()

ds = store.get_saved_dataset('saved_reference_dataset')
validation_reference = ds.as_reference(name='user_features_profiler', profiler=user_features_profiler)
validation_reference_fail = ds.as_reference(name='user_features_profiler_fail', profiler=user_features_profiler_fail)



In [40]:
# This profiler yields 4 items
print(f"\n***this profiler has {len(ds.get_profile(profiler=user_features_profiler).expectation_suite['expectations'])} items***\n")

print(ds.get_profile(profiler=user_features_profiler))

Index(['driver_id', 'acc_rate', 'avg_daily_trips',
       'label_driver_reported_satisfaction', 'event_timestamp', 'conv_rate'],
      dtype='object')

***this profiler has 4 items***

Index(['driver_id', 'acc_rate', 'avg_daily_trips',
       'label_driver_reported_satisfaction', 'event_timestamp', 'conv_rate'],
      dtype='object')
<GEProfile with expectations: [
  {
    "kwargs": {
      "column": "driver_id"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  {
    "kwargs": {
      "column": "avg_daily_trips",
      "min_value": 0,
      "max_value": 1000
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "kwargs": {
      "column": "conv_rate",
      "min_value": 0,
      "max_value": 1
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "kwargs": {
      "column": "acc_rate",
      "min_value": 0,
      "max_value": 1
    },
    "meta": {},
    "expectation_type"

In [41]:
ds.get_profile(profiler=user_features_profiler_fail)
# This profiler yields 0 items
print(f"\n***this profiler has {len(ds.get_profile(profiler=user_features_profiler_fail).expectation_suite['expectations'])} items***\n")

print(ds.get_profile(profiler=user_features_profiler_fail))

Index(['driver_id', 'acc_rate', 'avg_daily_trips',
       'label_driver_reported_satisfaction', 'event_timestamp', 'conv_rate'],
      dtype='object')
Index(['driver_id', 'acc_rate', 'avg_daily_trips',
       'label_driver_reported_satisfaction', 'event_timestamp', 'conv_rate'],
      dtype='object')

***this profiler has 0 items***

Index(['driver_id', 'acc_rate', 'avg_daily_trips',
       'label_driver_reported_satisfaction', 'event_timestamp', 'conv_rate'],
      dtype='object')
<GEProfile with expectations: []>


In [44]:
# This validation should pass
training_data_job.to_df(
    validation_reference=store.
    get_saved_dataset("saved_reference_dataset").
    as_reference(name='user_features_profiler', profiler=user_features_profiler)
)

Index(['driver_id', 'acc_rate', 'avg_daily_trips',
       'label_driver_reported_satisfaction', 'event_timestamp', 'conv_rate'],
      dtype='object')




Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-15 13:39:22.878585+00:00,0.134408,0.616837,548
1,1001,1,2022-09-15 14:04:22.878576+00:00,0.939694,0.44357,415
2,1003,3,2022-09-15 13:02:22.878587+00:00,0.052578,0.218352,935


In [45]:
# This validation should fail
training_data_job.to_df(
    validation_reference=store.
    get_saved_dataset("saved_reference_dataset").
    as_reference(name='user_features_profiler_fail', profiler=user_features_profiler_fail)
)

Index(['driver_id', 'acc_rate', 'avg_daily_trips',
       'label_driver_reported_satisfaction', 'event_timestamp', 'conv_rate'],
      dtype='object')




Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-15 13:39:22.878585+00:00,0.134408,0.616837,548
1,1001,1,2022-09-15 14:04:22.878576+00:00,0.939694,0.44357,415
2,1003,3,2022-09-15 13:02:22.878587+00:00,0.052578,0.218352,935
