In [1]:
%%sh
pip install Pygments -q
echo "Please restart your runtime now (Runtime -> Restart runtime). This ensures that the correct dependencies are loaded."

You should consider upgrading via the '/Users/franciscojavierarceo/GitHub/feast/venv/bin/python3 -m pip install --upgrade pip' command.


Please restart your runtime now (Runtime -> Restart runtime). This ensures that the correct dependencies are loaded.


# https://github.com/feast-dev/feast-gcp-fraud-tutorial/blob/main/notebooks/Validating_Online_Features_While_Detecting_Fraud.ipynb

In [2]:
!rm -rf dqm_repo
!feast init dqm_repo


Creating a new Feast repository in [1m[32m/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo[0m.



In [3]:
%cd dqm_repo/feature_repo/
!mkdir logs
!ls -R

/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo/feature_repo
__init__.py        example_repo.py    [34mlogs[m[m
[34mdata[m[m               feature_store.yaml test_workflow.py

./data:
driver_stats.parquet

./logs:


In [4]:
!pygmentize feature_store.yaml

[94mproject[39;49;00m:[37m [39;49;00mdqm_repo[37m[39;49;00m
[37m# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry)[39;49;00m[37m[39;49;00m
[94mregistry[39;49;00m:[37m [39;49;00mdata/registry.db[37m[39;49;00m
[37m# The provider primarily specifies default offline / online stores & storing the registry in a given cloud[39;49;00m[37m[39;49;00m
[94mprovider[39;49;00m:[37m [39;49;00mlocal[37m[39;49;00m
[94monline_store[39;49;00m:[37m[39;49;00m
[37m    [39;49;00m[94mtype[39;49;00m:[37m [39;49;00msqlite[37m[39;49;00m
[37m    [39;49;00m[94mpath[39;49;00m:[37m [39;49;00mdata/online_store.db[37m[39;49;00m
[94mentity_key_serialization_version[39;49;00m:[37m [39;49;00m2[37m[39;49;00m


# Taking a look at our data

In [5]:
import pandas as pd

pd.read_parquet("data/driver_stats.parquet").head()

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2022-08-31 13:00:00+00:00,1005,0.272476,0.038139,904,2022-09-15 13:36:11.907
1,2022-08-31 14:00:00+00:00,1005,0.086198,0.773058,775,2022-09-15 13:36:11.907
2,2022-08-31 15:00:00+00:00,1005,0.137416,0.39063,863,2022-09-15 13:36:11.907
3,2022-08-31 16:00:00+00:00,1005,0.915498,0.605327,387,2022-09-15 13:36:11.907
4,2022-08-31 17:00:00+00:00,1005,0.778479,0.126495,656,2022-09-15 13:36:11.907


In [7]:
!pygmentize -f terminal16m example_repo.py

[38;2;61;123;123;03m# This is an example feature definition file[39;00m

[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mdatetime[39;00m [38;2;0;128;0;01mimport[39;00m timedelta

[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mpandas[39;00m [38;2;0;128;0;01mas[39;00m [38;2;0;0;255;01mpd[39;00m

[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mfeast[39;00m [38;2;0;128;0;01mimport[39;00m (
    Entity,
    FeatureService,
    FeatureView,
    Field,
    FileSource,
    PushSource,
    RequestSource,
)
[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mfeast[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mon_demand_feature_view[39;00m [38;2;0;128;0;01mimport[39;00m on_demand_feature_view
[38;2;0;128;0;01mfrom[39;00m [38;2;0;0;255;01mfeast[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mtypes[39;00m [38;2;0;128;0;01mimport[39;00m Float32, Float64, Int64

[38;2;61;123;123;03m# Define an entity for the driver. You can think of an entity as a primary key used t

# Configurations for online evaluation

In [16]:
feature_store = \
f"""project: dqm_repo
registry: data/registry.db
provider: local
offline_store:
    type: file
online_store:
    path: data/online_store.db
entity_key_serialization_version: 2
feature_server:
    enabled: True
    feature_logging:
        enabled: True
        flush_interval_secs: 60
        write_to_disk_interval_secs: 10
    
go_feature_retrieval: False
"""

with open('feature_store.yaml', "w") as feature_store_file:
    feature_store_file.write(feature_store)

dqm_services = f"""
from feast import FeatureService
from feast.feature_logging import LoggingConfig
from feast.infra.offline_stores.file_source import FileLoggingDestination

from example_repo import driver_stats_fv

fs = FeatureService(
    name="driver_activity", 
    features=[driver_stats_fv],
    logging_config=LoggingConfig(
        sample_rate=1.0,
        destination=FileLoggingDestination(path="logs/"),
    )
)
"""
    
with open('dqm_services.py', "w") as dqm_services_file:
    dqm_services_file.write(dqm_services)

In [17]:
!feast apply

Created entity [1m[32mdriver[0m
Created feature view [1m[32mdriver_hourly_stats_fresh[0m
Created feature view [1m[32mdriver_hourly_stats[0m
Created on demand feature view [1m[32mtransformed_conv_rate_fresh[0m
Created on demand feature view [1m[32mtransformed_conv_rate[0m
Created feature service [1m[32mdriver_activity_v2[0m
Created feature service [1m[32mdriver_activity[0m
Created feature service [1m[32mdriver_activity_v1[0m
Created feature service [1m[32mdriver_activity_v3[0m

Created sqlite table [1m[32mdqm_repo_driver_hourly_stats[0m
Created sqlite table [1m[32mdqm_repo_driver_hourly_stats_fresh[0m



In [18]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

# The entity dataframe is the dataframe we want to enrich with feature values
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3], 
        "event_timestamp": [
            datetime.now() - timedelta(minutes=11),
            datetime.now() - timedelta(minutes=36),
            datetime.now() - timedelta(minutes=73),
        ],
    }
)

store = FeatureStore(repo_path=".")

training_data_job = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
)

reference_dataset = store.create_saved_dataset(
    from_=training_data_job,
    name="reference_dataset",
    storage=SavedDatasetFileStorage(path='data/driver_stats_reference.parquet')
)

print(reference_dataset)

{
  "spec": {
    "name": "reference_dataset",
    "features": [
      "driver_hourly_stats:conv_rate",
      "driver_hourly_stats:acc_rate",
      "driver_hourly_stats:avg_daily_trips"
    ],
    "joinKeys": [
      "label_driver_reported_satisfaction",
      "driver_id"
    ],
    "storage": {
      "fileStorage": {
        "fileFormat": {
          "parquetFormat": {}
        },
        "uri": "data/driver_stats_reference.parquet"
      }
    }
  },
  "meta": {
    "createdTimestamp": "2022-09-15T19:44:09.571895Z",
    "minEventTimestamp": "2022-09-15T12:31:09.446320Z",
    "maxEventTimestamp": "2022-09-15T13:33:09.446306Z"
  }
}




# Creating our own Expectations Profiler

In [19]:
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.dataset import PandasDataset
from great_expectations.core.expectation_suite import ExpectationSuite

# this profiler should pass
@ge_profiler
def user_features_profiler(dataset: PandasDataset) -> ExpectationSuite:
    print(dataset.columns)
    dataset.expect_column_to_exist("driver_id")
    dataset.expect_column_values_to_be_between("avg_daily_trips", 0, 1000)
    dataset.expect_column_values_to_be_between("conv_rate", 0, 1)
    dataset.expect_column_values_to_be_between("acc_rate", 0, 1)
    return dataset.get_expectation_suite()

# this profiler should trigger failures
@ge_profiler
def user_features_profiler_fail(dataset: PandasDataset) -> ExpectationSuite:
    dataset.expect_column_to_exist("this should fail")
    # dataset.expect_column_values_to_be_between("avg_daily_trips", )
    return dataset.get_expectation_suite()

ds = store.get_saved_dataset('reference_dataset')
validation_reference = ds.as_reference(name='user_features_profiler', profiler=user_features_profiler)
validation_reference_fail = ds.as_reference(name='user_features_profiler_fail', profiler=user_features_profiler_fail)

# print(training_data_job.to_df(validation_reference=validation_reference))
# print(training_data_job.to_df(validation_reference=validation_reference_fail))

  from urllib3.contrib.pyopenssl import orig_util_SSLContext as SSLContext


In [20]:
@ge_profiler
def profiler_with_unrealistic_expectations(dataset: PandasDataset) -> ExpectationSuite:
    # need to create dataframe with corrupted data first
    df = pd.DataFrame()
    df["current_balance"] = [100]
    df["avg_passenger_count"] = [0]

    other_ds = PandasDataset(df)
    other_ds.expect_column_max_to_be_between("current_balance", -1000, -100)
    other_ds.expect_column_values_to_be_in_set("avg_passenger_count", value_set={0})

    # this should pass
    other_ds.expect_column_min_to_be_between("avg_passenger_count", 0, 1000)

    return other_ds.get_expectation_suite()

@ge_profiler
def profiler_with_unrealistic_expectations2(dataset: PandasDataset) -> ExpectationSuite:
    # need to create dataframe with corrupted data first
    df = pd.DataFrame()
    df["current_balance"] = [-100]
    df["avg_passenger_count"] = [0]

    other_ds = PandasDataset(df)
    other_ds.expect_column_max_to_be_between("current_balance", -1000, -100)
    other_ds.expect_column_values_to_be_in_set("avg_passenger_count", value_set={0})

    # this should pass
    other_ds.expect_column_min_to_be_between("avg_passenger_count", 0, 1000)

    return other_ds.get_expectation_suite()


In [21]:
ds.get_profile(profiler=profiler_with_unrealistic_expectations) ==  ds.get_profile(profiler=profiler_with_unrealistic_expectations2) 

False

In [22]:
ds.get_profile(profiler=user_features_profiler_fail)

<GEProfile with expectations: []>

In [23]:
type(user_features_profiler), type(ds), type(ds.to_df())

(feast.dqm.profilers.ge_profiler.GEProfiler,
 feast.saved_dataset.SavedDataset,
 pandas.core.frame.DataFrame)

In [24]:
from great_expectations.dataset import PandasDataset
from feast.dqm.profilers.ge_profiler import _prepare_dataset, _add_feature_metadata, GEProfile
import great_expectations as ge

In [25]:
dataset = _prepare_dataset(PandasDataset(ds.to_df()))
# this doesn't appear to do anything but it's in the `analyze_dataset` method
dataset = _add_feature_metadata(dataset)
# It looks like when it fails it doesn't return anything
gething = user_features_profiler_fail.user_defined_profiler(dataset)

Index(['label_driver_reported_satisfaction', 'event_timestamp', 'acc_rate',
       'conv_rate', 'driver_id', 'avg_daily_trips'],
      dtype='object')


In [36]:
gething

{
  "meta": {
    "great_expectations_version": "0.14.13"
  },
  "data_asset_type": "Dataset",
  "ge_cloud_id": null,
  "expectation_suite_name": "default",
  "expectations": [
    {
      "meta": {},
      "expectation_type": "expect_column_to_exist",
      "kwargs": {
        "column": "driver_id"
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "avg_daily_trips",
        "min_value": 0,
        "max_value": 1000
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "conv_rate",
        "min_value": 0,
        "max_value": 1
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "acc_rate",
        "min_value": 0,
        "max_value": 1
      }
    }
  ]
}

In [37]:
# and now we reproduce the empty expectation
GEProfile(expectation_suite=gething)

<GEProfile with expectations: [
  {
    "meta": {},
    "expectation_type": "expect_column_to_exist",
    "kwargs": {
      "column": "driver_id"
    }
  },
  {
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "avg_daily_trips",
      "min_value": 0,
      "max_value": 1000
    }
  },
  {
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "conv_rate",
      "min_value": 0,
      "max_value": 1
    }
  },
  {
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "acc_rate",
      "min_value": 0,
      "max_value": 1
    }
  }
]>

In [38]:
# so the root is the profiler and gething is returning nothing

In [42]:
user_features_profiler_fail.user_defined_profiler(dataset)

{
  "meta": {
    "great_expectations_version": "0.14.13"
  },
  "data_asset_type": "Dataset",
  "ge_cloud_id": null,
  "expectation_suite_name": "default",
  "expectations": [
    {
      "meta": {},
      "expectation_type": "expect_column_to_exist",
      "kwargs": {
        "column": "driver_id"
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "avg_daily_trips",
        "min_value": 0,
        "max_value": 1000
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "conv_rate",
        "min_value": 0,
        "max_value": 1
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "acc_rate",
        "min_value": 0,
        "max_value": 1
      }
    }
  ]
}

In [43]:
type(user_features_profiler_fail), 

(feast.dqm.profilers.ge_profiler.GEProfiler,)

In [44]:
ge.validate(dataset, expectation_suite=gething, result_format="COMPLETE")

{
  "statistics": {
    "evaluated_expectations": 4,
    "successful_expectations": 4,
    "unsuccessful_expectations": 0,
    "success_percent": 100.0
  },
  "meta": {
    "great_expectations_version": "0.14.13",
    "expectation_suite_name": "default",
    "run_id": {
      "run_time": "2022-09-15T19:46:37.430399+00:00",
      "run_name": null
    },
    "batch_kwargs": {
      "ge_batch_id": "0898779e-352f-11ed-9d0b-acde48001122"
    },
    "batch_markers": {},
    "batch_parameters": {},
    "validation_time": "20220915T194637.430340Z",
    "expectation_suite_meta": {
      "great_expectations_version": "0.14.13"
    }
  },
  "evaluation_parameters": {},
  "success": true,
  "results": [
    {
      "meta": {},
      "success": true,
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "expectation_config": {
        "meta": {},
        "expectation_type": "expect_column_to_exist",
       

In [45]:
ds.get_profile(profiler=user_features_profiler)

Index(['label_driver_reported_satisfaction', 'event_timestamp', 'acc_rate',
       'conv_rate', 'driver_id', 'avg_daily_trips'],
      dtype='object')


<GEProfile with expectations: [
  {
    "meta": {},
    "expectation_type": "expect_column_to_exist",
    "kwargs": {
      "column": "driver_id"
    }
  },
  {
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "avg_daily_trips",
      "min_value": 0,
      "max_value": 1000
    }
  },
  {
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "conv_rate",
      "min_value": 0,
      "max_value": 1
    }
  },
  {
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "acc_rate",
      "min_value": 0,
      "max_value": 1
    }
  }
]>

In [46]:
training_data_job.to_df(validation_reference=validation_reference_fail)



Unnamed: 0,driver_id,label_driver_reported_satisfaction,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1002,5,2022-09-15 13:08:09.446317+00:00,0.378602,0.472523,10
1,1001,1,2022-09-15 13:33:09.446306+00:00,0.473804,0.160682,308
2,1003,3,2022-09-15 12:31:09.446320+00:00,0.34252,0.539268,50


In [47]:
from datetime import datetime
!feast materialize-incremental {datetime.now().isoformat()}

Materializing [1m[32m2[0m feature views to [1m[32m2022-09-15 07:49:38-06:00[0m into the [1m[32msqlite[0m online store.

[1m[32mdriver_hourly_stats_fresh[0m from [1m[32m2022-09-14 19:49:40-06:00[0m to [1m[32m2022-09-15 07:49:38-06:00[0m:
100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 371.72it/s]
[1m[32mdriver_hourly_stats[0m from [1m[32m2022-09-14 19:49:40-06:00[0m to [1m[32m2022-09-15 07:49:38-06:00[0m:
100%|███████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1549.43it/s]


In [48]:
print("--- Data directory ---")
!ls data

import sqlite3
import pandas as pd
con = sqlite3.connect("data/online_store.db")
print("\n--- Schema of online store ---")
print(
    pd.read_sql_query("SELECT * FROM dqm_repo_driver_hourly_stats limit 10;", con)
)
con.close()

--- Data directory ---
driver_stats.parquet           online_store.db
driver_stats_reference.parquet registry.db

--- Schema of online store ---
                                          entity_key     feature_name  \
0  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
1  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
2  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
3  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
4  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
5  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
6  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   
7  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...         acc_rate   
8  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...  avg_daily_trips   
9  b'\x02\x00\x00\x00driver_id\x04\x00\x00\x00\x0...        conv_rate   

            value             event_ts             

In [49]:
from pprint import pprint
from feast import FeatureStore

store = FeatureStore(repo_path=".")

feature_vector = store.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
    entity_rows=[
        {"driver_id": 1004},
        {"driver_id": 1005},
    ],
).to_dict()

pprint(feature_vector)

{'acc_rate': [0.23019002377986908, 0.556277334690094],
 'avg_daily_trips': [117, 858],
 'conv_rate': [0.8403194546699524, 0.5078779458999634],
 'driver_id': [1004, 1005]}


In [50]:
from feast import FeatureStore
feature_store = FeatureStore('.')  # Initialize the feature store

feature_service = feature_store.get_feature_service("driver_activity")
feature_vector = feature_store.get_online_features(
    features=feature_service,
    entity_rows=[
        # {join_key: entity_value}
        {"driver_id": 1004},
        {"driver_id": 1005},
    ],
).to_dict()
pprint(feature_vector)

{'acc_rate': [0.23019002377986908, 0.556277334690094],
 'avg_daily_trips': [117, 858],
 'conv_rate': [0.8403194546699524, 0.5078779458999634],
 'driver_id': [1004, 1005]}


In [64]:
from feast.saved_dataset import ValidationReference

ref = ValidationReference(
    name='user_features_training_ref',
    dataset_name="reference_dataset",
    profiler=user_features_profiler,
)

In [82]:
store.apply(ref)

In [81]:
store.registry.to_dict(project='dqm_repo')

defaultdict(list,
            {'project': 'dqm_repo',
             'projectMetadata': [{'project': 'dqm_repo',
               'projectUuid': 'df9817a0-5ac3-45f0-95aa-3beb6d0caefe'}],
             'dataSources': [{'createdTimestampColumn': 'created',
               'fileOptions': {'uri': '/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo/feature_repo/data/driver_stats.parquet'},
               'name': 'driver_hourly_stats_source',
               'timestampField': 'event_timestamp',
               'type': 'BATCH_FILE'},
              {'batchSource': {'createdTimestampColumn': 'created',
                'fileOptions': {'uri': '/Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo/feature_repo/data/driver_stats.parquet'},
                'name': 'driver_hourly_stats_source',
                'timestampField': 'event_timestamp',
                'type': 'BATCH_FILE'},
               'name': 'driver_stats_push_source',
             

In [62]:
import pandas as pd
insert_df = pd.DataFrame({
    "driver_id": [1003],
    "conv_rate": [-1],
    "acc_rate": [2],
    "avg_daily_trips": [1500],
    "event_timestamp": [datetime.now()],
    "created": [datetime.now()],
})

store.write_to_online_store("driver_hourly_stats", insert_df)

In [63]:
end_ts = datetime.now()
start_ts = end_ts - timedelta(minutes=10)

! feast validate --feature-service driver_activity --reference user_features_training_ref {start_ts.isoformat()} {end_ts.isoformat()}

  from urllib3.contrib.pyopenssl import orig_util_SSLContext as SSLContext
  def add_data_context_id_to_url(self, jinja_context, url, add_datetime=True):

Traceback (most recent call last):
  File "/Users/franciscojavierarceo/GitHub/feast/venv/bin/feast", line 8, in <module>
    sys.exit(cli())
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1130, in __call__
    return self.main(*args, **kwargs)
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1055, in main
    rv = self.invoke(ctx)
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/franciscojavierarceo/GitHub/feast/venv/lib/python3.8/site-packages/click/core.py", line 1404, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/Users/franciscojavierarceo/GitHub/feast/venv