In [None]:
# Start from a clean slate: delete any `dqm_repo` directory left in the
# current working directory by a previous run.
!rm -rf dqm_repo
# Ask the Feast CLI to scaffold a new repository named `dqm_repo`.
!feast init dqm_repo
# Change the notebook's working directory into the new repository; the
# relative paths used in later cells (e.g. "data/...", repo_path=".") rely on it.
%cd dqm_repo


Creating a new Feast repository in /Users/franciscojavierarceo/GitHub/feast/examples/data-quality-monitoring/dqm_repo.



In [None]:
import pandas as pd

# Read the driver statistics parquet file (relative to the repository
# directory entered above) and leave the frame as the cell's last
# expression so the notebook renders it.
driver_stats_raw = pd.read_parquet("data/driver_stats.parquet")
driver_stats_raw

In [None]:
# Run the Feast CLI `apply` command from the current working directory
# (the repository directory entered with %cd in the first cell).
!feast apply

In [None]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.dataset import PandasDataset
from great_expectations.core.expectation_suite import ExpectationSuite

# Entity dataframe: the rows (driver id, label, event timestamp) that we
# want Feast to enrich with historical feature values.
minutes_ago = (11, 36, 73)
entity_rows = {
    "driver_id": [1001, 1002, 1003],
    "label_driver_reported_satisfaction": [1, 5, 3],
    "event_timestamp": [
        datetime.now() - timedelta(minutes=offset) for offset in minutes_ago
    ],
}
entity_df = pd.DataFrame.from_dict(entity_rows)

# Open the feature repository located in the current working directory.
store = FeatureStore(repo_path=".")

# Feature references in "<feature view>:<feature>" form.
requested_features = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "driver_hourly_stats:avg_daily_trips",
]

# Build the retrieval job; the data is materialised later via .to_df().
training_data_job = store.get_historical_features(
    entity_df=entity_df,
    features=requested_features,
)

In [None]:
# Materialise the retrieval job into a pandas DataFrame and display it.
training_df = training_data_job.to_df()
training_df

In [None]:
# Persist the retrieved training data as a named saved dataset so it can
# later be used as the reference for validation.
#
# The dataset is written to its own parquet file. The previous target,
# 'data/driver_stats.parquet', is the raw driver statistics file read
# earlier in this notebook; writing the saved dataset there would clobber
# the source data (or be rejected because the location already exists).
reference_dataset = store.create_saved_dataset(
    from_=training_data_job,
    name="reference_dataset",
    storage=SavedDatasetFileStorage(path="data/reference_dataset.parquet"),
)

print(reference_dataset)

In [None]:
@ge_profiler
def user_features_profiler(dataset: PandasDataset) -> ExpectationSuite:
    """Build the expectation suite used to validate retrieved driver features.

    Args:
        dataset: Great Expectations dataset wrapping the feature dataframe.

    Returns:
        The expectation suite accumulated on ``dataset``.
    """
    dataset.expect_column_to_exist("driver_id")
    # The retrieval job in this notebook does not pass
    # full_feature_names=True, so the columns carry the bare feature names
    # rather than the "driver_hourly_stats__<feature>" form.
    dataset.expect_column_values_to_be_between("avg_daily_trips", 0, 1000)
    dataset.expect_column_values_to_be_between("conv_rate", 0, 1)
    dataset.expect_column_values_to_be_between("acc_rate", 0, 1)
    return dataset.get_expectation_suite()

# Look the saved dataset up again by the name it was registered under.
ds = store.get_saved_dataset("reference_dataset")

In [None]:
# Pair the saved dataset with the profiler to form a validation reference.
validation_reference = ds.as_reference(
    name="user_features_profiler",
    profiler=user_features_profiler,
)

# Materialise the job with the reference attached; the returned frame is
# discarded because only the validation pass matters in this cell.
_ = training_data_job.to_df(validation_reference=validation_reference)

In [None]:
# Materialise the job once more (no validation reference this time) and
# display the resulting frame.
training_df_recheck = training_data_job.to_df()
training_df_recheck