In [1]:
%%sh
pip install Pygments -q
echo "Please restart your runtime now (Runtime -> Restart runtime). This ensures that the correct dependencies are loaded."

Please restart your runtime now (Runtime -> Restart runtime). This ensures that the correct dependencies are loaded.


# Based on these tutorials:

- https://docs.feast.dev/reference/dqm
- https://docs.feast.dev/tutorials/validating-historical-features


In [1]:
import feast

In [2]:
feast.__version__

'0.23.1.dev35+g32d20395'

In [4]:
!rm -rf dqm_repo
!feast init dqm_repo


Creating a new Feast repository in [1m[32m/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo[0m.



In [6]:
!feast version

Feast SDK Version: "feast 0.23.1.dev35+g32d20395"


In [7]:
%cd dqm_repo
!mkdir logs
!ls -R

/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo
__init__.py        example.py         [34mlogs[m[m
[34mdata[m[m               feature_store.yaml

./data:
driver_stats.parquet

./logs:


In [8]:
!pygmentize feature_store.yaml

[94mproject[39;49;00m:[37m [39;49;00mdqm_repo[37m[39;49;00m
[94mregistry[39;49;00m:[37m [39;49;00mdata/registry.db[37m[39;49;00m
[94mprovider[39;49;00m:[37m [39;49;00mlocal[37m[39;49;00m
[94monline_store[39;49;00m:[37m[39;49;00m
[37m    [39;49;00m[94mpath[39;49;00m:[37m [39;49;00mdata/online_store.db[37m[39;49;00m
[94mentity_key_serialization_version[39;49;00m:[37m [39;49;00m2[37m[39;49;00m


In [9]:
import pandas as pd

pd.read_parquet("data/driver_stats.parquet")

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2022-07-25 20:00:00+00:00,1005,0.934775,0.645397,464,2022-08-09 20:11:20.159
1,2022-07-25 21:00:00+00:00,1005,0.288393,0.138461,389,2022-08-09 20:11:20.159
2,2022-07-25 22:00:00+00:00,1005,0.755318,0.711767,875,2022-08-09 20:11:20.159
3,2022-07-25 23:00:00+00:00,1005,0.664952,0.911580,609,2022-08-09 20:11:20.159
4,2022-07-26 00:00:00+00:00,1005,0.970190,0.840818,493,2022-08-09 20:11:20.159
...,...,...,...,...,...,...
1802,2022-08-09 18:00:00+00:00,1001,0.970100,0.362816,94,2022-08-09 20:11:20.159
1803,2022-08-09 19:00:00+00:00,1001,0.827786,0.719152,654,2022-08-09 20:11:20.159
1804,2021-04-12 07:00:00+00:00,1001,0.564200,0.843841,736,2022-08-09 20:11:20.159
1805,2022-08-02 08:00:00+00:00,1003,0.933908,0.863545,587,2022-08-09 20:11:20.159


In [10]:
!pygmentize -f terminal16m example.py

[38;2;61;123;123;03m# This is an example feature definition file[39;00m

[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mdatetime[39;00m [38;2;0;128;0;01mimport[39;00m timedelta

[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mfeast[39;00m [38;2;0;128;0;01mimport[39;00m Entity, FeatureService, FeatureView, Field, FileSource
[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mfeast[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mtypes[39;00m [38;2;0;128;0;01mimport[39;00m Float32, Int64

[38;2;61;123;123;03m# Read data from parquet files. Parquet is convenient for local development mode. For[39;00m
[38;2;61;123;123;03m# production, you can use your favorite DWH, such as BigQuery. See Feast documentation[39;00m
[38;2;61;123;123;03m# for more info.[39;00m
driver_hourly_stats [38;2;102;102;102m=[39m FileSource(
    name[38;2;102;102;102m=[39m[38;2;186;33;33m"[39m[38;2;186;33;33mdriver_hourly_stats_source[39m[38;2;186;33;33m"[39m,
    path[38;2;102;102;102m=[

In [11]:
from datetime import timedelta

from feast import Entity, FeatureService, FeatureView, Field, FileSource, BatchFeatureView
from feast.types import Float32, Int64
from feast.data_format import ParquetFormat


# Read data from parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
driver_hourly_stats = FileSource(
    name="driver_hourly_stats_source",
    path="/home/francisco/github/feast/examples/data-quality-monitoring/dqm_repo/data/driver_stats.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
    file_format=ParquetFormat(),
)

# Define an entity for the driver. You can think of entity as a primary key used to
# fetch features.
driver = Entity(name="driver", join_keys=["driver_id"])

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature column. Here we define a Feature View that will allow us to serve this
# data to our model online.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=[driver],
    ttl=timedelta(days=1),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    source=driver_hourly_stats,
    tags={},
)

In [13]:
driver_hourly_stats_batch_view = BatchFeatureView(
    name='driver_hourly_stats_batch',
    entities=[driver],
#     features=[
#         Field(name="conv_rate", dtype=Float32),
#         Field(name="acc_rate", dtype=Float32),
#         Field(name="avg_daily_trips", dtype=Int64),
#     ],
    ttl=timedelta(days=1),
    source=driver_hourly_stats,
)

In [14]:
from feast.on_demand_feature_view import on_demand_feature_view

@on_demand_feature_view(
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    sources=[
      driver_hourly_stats_batch_view,
    ]
)

def on_demand_stats(inp):
    out = pd.DataFrame()
    out["conv_rate"] = inp["conv_rate"] * 5.
    return out

In [15]:
from feast import FeatureStore

store = FeatureStore(repo_path=".")

store.apply([driver, driver_hourly_stats_batch_view, on_demand_stats])  # writing to the registry



KeyError: 'conv_rate'

In [16]:
driver_stats_fs = FeatureService(
    name="driver_activity", 
    features=[driver_hourly_stats_view],
)

In [17]:
!feast apply

Created entity [1m[32mdriver[0m
Created feature view [1m[32mdriver_hourly_stats[0m
Created feature service [1m[32mdriver_activity[0m

Created sqlite table [1m[32mdqm_repo_driver_hourly_stats[0m



In [18]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore

# The entity dataframe is the dataframe we want to enrich with feature values
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3], 
        "event_timestamp": [
            datetime.now() - timedelta(minutes=11),
            datetime.now() - timedelta(minutes=36),
            datetime.now() - timedelta(minutes=73),
        ],
    }
)

store = FeatureStore(repo_path=".")

training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

print("----- Feature schema -----\n")
print(training_df.info())

print()
print("----- Example features -----\n")
print(training_df.head())

----- Feature schema -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   driver_id                           3 non-null      int64              
 1   label_driver_reported_satisfaction  3 non-null      int64              
 2   event_timestamp                     3 non-null      datetime64[ns, UTC]
 3   conv_rate                           3 non-null      float32            
 4   acc_rate                            3 non-null      float32            
 5   avg_daily_trips                     3 non-null      int32              
dtypes: datetime64[ns, UTC](1), float32(2), int32(1), int64(2)
memory usage: 236.0 bytes
None

----- Example features -----

   driver_id  label_driver_reported_satisfaction  \
0       1002                                   5   
1       1001               

In [19]:
from datetime import datetime
!feast materialize-incremental {datetime.now().isoformat()}

Materializing [1m[32m1[0m feature views to [1m[32m2022-08-09 14:13:53-06:00[0m into the [1m[32msqlite[0m online store.

[1m[32mdriver_hourly_stats[0m from [1m[32m2022-08-09 02:13:55-06:00[0m to [1m[32m2022-08-09 14:13:53-06:00[0m:
100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 435.65it/s]


In [20]:
print("--- Data directory ---")
!ls data

import sqlite3
import pandas as pd
con = sqlite3.connect("data/online_store.db")
print("\n--- Schema of online store ---")
print(
    pd.read_sql_query(
        "SELECT * FROM dqm_repo_driver_hourly_stats limit 10;", con)#.columns.tolist()
)
con.close()

--- Data directory ---
driver_stats.parquet online_store.db      registry.db

--- Schema of online store ---
                                          entity_key     feature_name  \
0  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
1  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
2  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
3  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
4  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
5  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
6  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
7  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
8  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
9  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   

               value             event_ts                  created_ts  
0        b'5S\x

In [21]:
from pprint import pprint
from feast import FeatureStore

store = FeatureStore(repo_path=".")

feature_vector = store.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
    entity_rows=[
        {"driver_id": 1004},
        {"driver_id": 1005},
    ],
).to_dict()

pprint(feature_vector)


{'acc_rate': [0.12524475157260895, 0.7028396725654602],
 'avg_daily_trips': [290, 441],
 'conv_rate': [0.4703161418437958, 0.46816256642341614],
 'driver_id': [1004, 1005]}


In [22]:
from feast import FeatureStore
feature_store = FeatureStore('.')  # Initialize the feature store

feature_service = feature_store.get_feature_service("driver_activity")
feature_vector = feature_store.get_online_features(
    features=feature_service,
    entity_rows=[
        # {join_key: entity_value}
        {"driver_id": 1004},
        {"driver_id": 1005},
    ],
).to_dict()
pprint(feature_vector)

{'acc_rate': [0.12524475157260895, 0.7028396725654602],
 'avg_daily_trips': [290, 441],
 'conv_rate': [0.4703161418437958, 0.46816256642341614],
 'driver_id': [1004, 1005]}


# Great Expectations

In [23]:
feature_store = \
f"""project: dqm_repo
registry: data/registry.db
provider: local
offline_store:
    type: file
online_store:
    path: data/online_store.db
entity_key_serialization_version: 2
feature_server:
    enabled: True
    feature_logging:
        enabled: True
        flush_interval_secs: 60
        write_to_disk_interval_secs: 10
    
go_feature_retrieval: False
"""

with open('feature_store.yaml', "w") as feature_store_file:
    feature_store_file.write(feature_store)

dqm_services = f"""
from feast import FeatureService
from feast.feature_logging import LoggingConfig
from feast.infra.offline_stores.file_source import FileLoggingDestination

from example import driver_hourly_stats, driver, driver_hourly_stats_view

fs = FeatureService(
    name="driver_activity", 
    features=[driver_hourly_stats_view],
    logging_config=LoggingConfig(
        sample_rate=1.0,
        destination=FileLoggingDestination(path="logs/"),
    )
)
"""
    
with open('dqm_services.py', "w") as dqm_services_file:
    dqm_services_file.write(dqm_services)

In [24]:
!feast apply

Updated feature service [1m[33mdriver_activity[0m
	logging_config: [1m[33m[0m -> [1m[92msample_rate: 1.0
file_destination {
  path: "logs/"
}
[0m

[1m[94mNo changes to infrastructure


In [25]:
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

In [26]:
training_df.head()

Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-08-09 19:37:52.917735+00:00,0.054348,0.029196,526
1,1001,1,2022-08-09 20:02:52.917724+00:00,0.827786,0.719152,654
2,1003,3,2022-08-09 19:00:52.917738+00:00,0.95577,0.59637,970


In [27]:
feature_service

<FeatureService(name = driver_activity, _features = [], feature_view_projections = [FeatureViewProjection(name='driver_hourly_stats', name_alias='', desired_features=[], features=[conv_rate-Float32, acc_rate-Float32, avg_daily_trips-Int64], join_key_map={})], description = , tags = {}, owner = , created_timestamp = 2022-08-10 02:13:41.425728, last_updated_timestamp = 2022-08-10 02:13:41.425728, logging_config = None)>

In [28]:
training_data_job = store.get_historical_features(
    entity_df=entity_df,
    features=feature_service,
    full_feature_names=True, 
)

In [29]:
reference_dataset = store.create_saved_dataset(
    from_=training_data_job,
    name="reference_dataset",
    storage=SavedDatasetFileStorage(path='data/driver_stats.parquet')
)



In [30]:
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.dataset import PandasDataset
from great_expectations.core.expectation_suite import ExpectationSuite

In [31]:
reference_dataset.features

['driver_hourly_stats:conv_rate',
 'driver_hourly_stats:acc_rate',
 'driver_hourly_stats:avg_daily_trips']

In [32]:
@ge_profiler
def user_features_profiler(ds: PandasDataset) -> ExpectationSuite:
    print(ds.columns)
    ds.expect_column_to_exist("driver_id")
    ds.expect_column_values_to_be_between("driver_hourly_stats__avg_daily_trips", 0, 1000)
    ds.expect_column_values_to_be_between("driver_hourly_stats__conv_rate", 0, 1)
    ds.expect_column_values_to_be_between("driver_hourly_stats__acc_rate", 0, 1)
    return ds.get_expectation_suite()

In [33]:
reference_dataset.get_profile(profiler=user_features_profiler)

Index(['label_driver_reported_satisfaction', 'driver_id', 'event_timestamp',
       'driver_hourly_stats__acc_rate', 'driver_hourly_stats__avg_daily_trips',
       'driver_hourly_stats__conv_rate'],
      dtype='object')


<GEProfile with expectations: [
  {
    "kwargs": {
      "column": "driver_id"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  {
    "kwargs": {
      "column": "driver_hourly_stats__avg_daily_trips",
      "min_value": 0,
      "max_value": 1000
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "kwargs": {
      "column": "driver_hourly_stats__conv_rate",
      "min_value": 0,
      "max_value": 1
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "kwargs": {
      "column": "driver_hourly_stats__acc_rate",
      "min_value": 0,
      "max_value": 1
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  }
]>

In [34]:
ds = store.get_saved_dataset('reference_dataset')



In [35]:
validation_reference = ds.as_reference(name='user_features_profiler', profiler=user_features_profiler)

In [36]:
ds.to_df()

Unnamed: 0,label_driver_reported_satisfaction,driver_id,event_timestamp,driver_hourly_stats__acc_rate,driver_hourly_stats__avg_daily_trips,driver_hourly_stats__conv_rate
0,3,1003,2022-08-09 19:00:52.917738+00:00,0.59637,970,0.95577
1,5,1002,2022-08-09 19:37:52.917735+00:00,0.029196,526,0.054348
2,1,1001,2022-08-09 20:02:52.917724+00:00,0.719152,654,0.827786


In [37]:
ds.get_profile(profiler=user_features_profiler)

Index(['label_driver_reported_satisfaction', 'driver_id', 'event_timestamp',
       'driver_hourly_stats__acc_rate', 'driver_hourly_stats__avg_daily_trips',
       'driver_hourly_stats__conv_rate'],
      dtype='object')


<GEProfile with expectations: [
  {
    "kwargs": {
      "column": "driver_id"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  },
  {
    "kwargs": {
      "column": "driver_hourly_stats__avg_daily_trips",
      "min_value": 0,
      "max_value": 1000
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "kwargs": {
      "column": "driver_hourly_stats__conv_rate",
      "min_value": 0,
      "max_value": 1
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "kwargs": {
      "column": "driver_hourly_stats__acc_rate",
      "min_value": 0,
      "max_value": 1
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  }
]>

In [38]:
training_data_job.full_feature_names

True

In [39]:
training_data_job.evaluation_function().compute()

KeyError: "['created'] not in index"

In [40]:
training_data_job.to_df()

KeyError: "['created'] not in index"

In [41]:
_ = training_data_job.to_df(validation_reference=validation_reference)

KeyError: "['created'] not in index"

In [42]:
training_df

Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-08-09 19:37:52.917735+00:00,0.054348,0.029196,526
1,1001,1,2022-08-09 20:02:52.917724+00:00,0.827786,0.719152,654
2,1003,3,2022-08-09 19:00:52.917738+00:00,0.95577,0.59637,970


In [43]:
from feast.saved_dataset import ValidationReference

ref = ValidationReference(
    name='user_features_training_ref',
    dataset_name="reference_dataset",
    profiler=user_features_profiler,
)

In [44]:
store.apply(ref)

In [45]:
import pandas as pd
insert_df = pd.DataFrame({
    "driver_id": [1003],
    "conv_rate": [-1],
    "acc_rate": [2],
    "avg_daily_trips": [1500],
    "event_timestamp": [datetime.now()],
    "created": [datetime.now()],
})

store.write_to_online_store("driver_hourly_stats", insert_df)

In [46]:
end_ts = datetime.now()
start_ts = end_ts - timedelta(minutes=10)

! feast validate --feature-service driver_activity --reference user_features_training_ref {start_ts.isoformat()} {end_ts.isoformat()}

Traceback (most recent call last):
  File "/Users/franciscojavierarceo/GitHub/feast/venv/bin/feast", line 33, in <module>
    sys.exit(load_entry_point('feast', 'console_scripts', 'feast')())
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1130, in __call__
    return self.main(*args, **kwargs)
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1055, in main
    rv = self.invoke(ctx)
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1404, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 760, in invoke
    return __callback(*args, **kwargs)
