In [None]:
%%sh
pip install Pygments -q
echo "Please restart your runtime now (Runtime -> Restart runtime). This ensures that the correct dependencies are loaded."

# https://github.com/feast-dev/feast-gcp-fraud-tutorial/blob/main/notebooks/Validating_Online_Features_While_Detecting_Fraud.ipynb

In [1]:
!rm -rf dqm_repo
!feast init dqm_repo


Creating a new Feast repository in [1m[32m/home/francisco/github/feast/examples/data-quality-monitoring/dqm_repo[0m.



In [2]:
%cd dqm_repo
!mkdir logs
!ls -R

/home/francisco/github/feast/examples/data-quality-monitoring/dqm_repo
.:
data  example.py  feature_store.yaml  __init__.py  logs

./data:
driver_stats.parquet

./logs:


In [3]:
!pygmentize feature_store.yaml

[94mproject[39;49;00m:[37m [39;49;00mdqm_repo[37m[39;49;00m
[94mregistry[39;49;00m:[37m [39;49;00mdata/registry.db[37m[39;49;00m
[94mprovider[39;49;00m:[37m [39;49;00mlocal[37m[39;49;00m
[94monline_store[39;49;00m:[37m[39;49;00m
[37m    [39;49;00m[94mpath[39;49;00m:[37m [39;49;00mdata/online_store.db[37m[39;49;00m
[94mentity_key_serialization_version[39;49;00m:[37m [39;49;00m2[37m[39;49;00m


# Taking a look at our data

In [4]:
import pandas as pd

pd.read_parquet("data/driver_stats.parquet").head()

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2022-07-29 05:00:00+00:00,1005,0.162131,0.161591,650,2022-08-13 05:58:10.728
1,2022-07-29 06:00:00+00:00,1005,0.981602,0.228001,460,2022-08-13 05:58:10.728
2,2022-07-29 07:00:00+00:00,1005,0.192713,0.658858,766,2022-08-13 05:58:10.728
3,2022-07-29 08:00:00+00:00,1005,0.730334,0.099552,277,2022-08-13 05:58:10.728
4,2022-07-29 09:00:00+00:00,1005,0.109518,0.519829,103,2022-08-13 05:58:10.728


In [5]:
!pygmentize -f terminal16m example.py

[38;2;61;123;123;03m# This is an example feature definition file[39;00m

[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mdatetime[39;00m [38;2;0;128;0;01mimport[39;00m timedelta

[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mfeast[39;00m [38;2;0;128;0;01mimport[39;00m Entity, FeatureService, FeatureView, Field, FileSource
[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mfeast[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mtypes[39;00m [38;2;0;128;0;01mimport[39;00m Float32, Int64

[38;2;61;123;123;03m# Read data from parquet files. Parquet is convenient for local development mode. For[39;00m
[38;2;61;123;123;03m# production, you can use your favorite DWH, such as BigQuery. See Feast documentation[39;00m
[38;2;61;123;123;03m# for more info.[39;00m
driver_hourly_stats [38;2;102;102;102m=[39m FileSource(
    name[38;2;102;102;102m=[39m[38;2;186;33;33m"[39m[38;2;186;33;33mdriver_hourly_stats_source[39m[38;2;186;33;33m"[39m,
    path[38;2;102;102;102m=[

# Configurations for online evaluation

In [6]:
feature_store = \
f"""project: dqm_repo
registry: data/registry.db
provider: local
offline_store:
    type: file
online_store:
    path: data/online_store.db
entity_key_serialization_version: 2
feature_server:
    enabled: True
    feature_logging:
        enabled: True
        flush_interval_secs: 60
        write_to_disk_interval_secs: 10
    
go_feature_retrieval: False
"""

with open('feature_store.yaml', "w") as feature_store_file:
    feature_store_file.write(feature_store)

dqm_services = f"""
from feast import FeatureService
from feast.feature_logging import LoggingConfig
from feast.infra.offline_stores.file_source import FileLoggingDestination

from example import driver_hourly_stats, driver, driver_hourly_stats_view

fs = FeatureService(
    name="driver_activity", 
    features=[driver_hourly_stats_view],
    logging_config=LoggingConfig(
        sample_rate=1.0,
        destination=FileLoggingDestination(path="logs/"),
    )
)
"""
    
with open('dqm_services.py', "w") as dqm_services_file:
    dqm_services_file.write(dqm_services)

In [7]:
!feast apply

Created entity [1m[32mdriver[0m
Created feature view [1m[32mdriver_hourly_stats[0m
Created feature service [1m[32mdriver_activity[0m

Created sqlite table [1m[32mdqm_repo_driver_hourly_stats[0m



In [8]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

# The entity dataframe is the dataframe we want to enrich with feature values
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3], 
        "event_timestamp": [
            datetime.now() - timedelta(minutes=11),
            datetime.now() - timedelta(minutes=36),
            datetime.now() - timedelta(minutes=73),
        ],
    }
)

store = FeatureStore(repo_path=".")

training_data_job = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
)

reference_dataset = store.create_saved_dataset(
    from_=training_data_job,
    name="reference_dataset",
    storage=SavedDatasetFileStorage(path='data/driver_stats_reference.parquet')
)

print(reference_dataset)

{
  "spec": {
    "name": "reference_dataset",
    "features": [
      "driver_hourly_stats:conv_rate",
      "driver_hourly_stats:acc_rate",
      "driver_hourly_stats:avg_daily_trips"
    ],
    "joinKeys": [
      "label_driver_reported_satisfaction",
      "driver_id"
    ],
    "storage": {
      "fileStorage": {
        "fileFormat": {
          "parquetFormat": {}
        },
        "uri": "data/driver_stats_reference.parquet"
      }
    }
  },
  "meta": {
    "createdTimestamp": "2022-08-13T11:58:33.296601Z",
    "minEventTimestamp": "2022-08-13T04:45:33.211718Z",
    "maxEventTimestamp": "2022-08-13T05:47:33.211712Z"
  }
}




# Creating our own Expectations Profiler

In [29]:
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.dataset import PandasDataset
from great_expectations.core.expectation_suite import ExpectationSuite

# this profiler should pass
@ge_profiler
def user_features_profiler(dataset: PandasDataset) -> ExpectationSuite:
    print(dataset.columns)
    dataset.expect_column_to_exist("driver_id")
    dataset.expect_column_values_to_be_between("avg_daily_trips", 0, 1000)
    dataset.expect_column_values_to_be_between("conv_rate", 0, 1)
    dataset.expect_column_values_to_be_between("acc_rate", 0, 1)
    return dataset.get_expectation_suite()

# this profiler should trigger failures
@ge_profiler
def user_features_profiler_fail(dataset: PandasDataset) -> ExpectationSuite:
    dataset.expect_column_to_exist("something random")
    # dataset.expect_column_values_to_be_between("avg_daily_trips", )
    return dataset.get_expectation_suite()

ds = store.get_saved_dataset('reference_dataset')
validation_reference = ds.as_reference(name='user_features_profiler', profiler=user_features_profiler)
validation_reference_fail = ds.as_reference(name='user_features_profiler_fail', profiler=user_features_profiler_fail)

# print(training_data_job.to_df(validation_reference=validation_reference))
# print(training_data_job.to_df(validation_reference=validation_reference_fail))



In [31]:
ds.get_profile(profiler=user_features_profiler_fail)

<GEProfile with expectations: []>

In [32]:
training_data_job.to_df(validation_reference=validation_reference_fail)



Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-08-13 05:22:33.211717+00:00,0.23835,0.84283,857
1,1001,1,2022-08-13 05:47:33.211712+00:00,0.350402,0.734558,910
2,1003,3,2022-08-13 04:45:33.211718+00:00,0.65448,0.534863,939


In [16]:
from datetime import datetime
!feast materialize-incremental {datetime.now().isoformat()}

Materializing [1m[32m1[0m feature views to [1m[32m2022-08-12 23:34:21-06:00[0m into the [1m[32msqlite[0m online store.

[1m[32mdriver_hourly_stats[0m from [1m[32m2022-08-12 11:34:23-06:00[0m to [1m[32m2022-08-12 23:34:21-06:00[0m:
100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 198.98it/s]


In [17]:
print("--- Data directory ---")
!ls data

import sqlite3
import pandas as pd
con = sqlite3.connect("data/online_store.db")
print("\n--- Schema of online store ---")
print(
    pd.read_sql_query("SELECT * FROM dqm_repo_driver_hourly_stats limit 10;", con)
)
con.close()

--- Data directory ---
driver_stats.parquet		online_store.db
driver_stats_reference.parquet	registry.db

--- Schema of online store ---
                                          entity_key     feature_name  \
0  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
1  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
2  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
3  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
4  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
5  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
6  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
7  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
8  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
9  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   

               value             event_ts                  c

In [18]:
from pprint import pprint
from feast import FeatureStore

store = FeatureStore(repo_path=".")

feature_vector = store.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
    entity_rows=[
        {"driver_id": 1004},
        {"driver_id": 1005},
    ],
).to_dict()

pprint(feature_vector)


{'acc_rate': [0.4692337214946747, 0.5439016222953796],
 'avg_daily_trips': [529, 114],
 'conv_rate': [0.8161515593528748, 0.20459125936031342],
 'driver_id': [1004, 1005]}


In [19]:
from feast import FeatureStore
feature_store = FeatureStore('.')  # Initialize the feature store

feature_service = feature_store.get_feature_service("driver_activity")
feature_vector = feature_store.get_online_features(
    features=feature_service,
    entity_rows=[
        # {join_key: entity_value}
        {"driver_id": 1004},
        {"driver_id": 1005},
    ],
).to_dict()
pprint(feature_vector)

{'acc_rate': [0.4692337214946747, 0.5439016222953796],
 'avg_daily_trips': [529, 114],
 'conv_rate': [0.8161515593528748, 0.20459125936031342],
 'driver_id': [1004, 1005]}


In [20]:
from feast.saved_dataset import ValidationReference

ref = ValidationReference(
    name='user_features_training_ref',
    dataset_name="reference_dataset",
    profiler=user_features_profiler,
)

In [21]:
store.apply(ref)

In [23]:
import pandas as pd
insert_df = pd.DataFrame({
    "driver_id": [1003],
    "conv_rate": [-1],
    "acc_rate": [2],
    "avg_daily_trips": [1500],
    "event_timestamp": [datetime.now()],
    "created": [datetime.now()],
})

store.write_to_online_store("driver_hourly_stats", insert_df)

In [None]:
end_ts = datetime.now()
start_ts = end_ts - timedelta(minutes=10)

! feast validate --feature-service driver_activity --reference user_features_training_ref {start_ts.isoformat()} {end_ts.isoformat()}