In [1]:
import os

import feast
import pandas as pd

In [2]:
# Location of the Feast feature repository. Set the FEATURE_REPO_PATH
# environment variable to run this notebook against a different checkout;
# the default keeps the previously hard-coded location.
FEATURE_REPO_PATH = os.environ.get(
    "FEATURE_REPO_PATH",
    "/home/gianmaria/repos/airbnb-bc/src/feature_store/feature_repo",
)
fs = feast.FeatureStore(repo_path=FEATURE_REPO_PATH)

In [3]:
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
# Entity dataframe for the retrieval below. The parquet file lives in the
# feature repo's data/ folder, so its path is derived from the store's own
# repo_path instead of repeating the absolute repository path.
df = pd.read_parquet(fs.repo_path / "data" / "test_df.parquet")

# Request historical values of the listed features for the entities in `df`.
# `test` is the object returned by the store; it is turned into a DataFrame
# later via `.to_df(...)`.
test = fs.get_historical_features(
    entity_df=df,
    features=[
        # df1_feature_view
        "df1_feature_view:bedrooms",
        "df1_feature_view:bathrooms",
        # df2_feature_view
        "df2_feature_view:cleaning_fee",
        "df2_feature_view:available_days",
        "df2_feature_view:blocked_days",
        "df2_feature_view:occupancy_rate",
        "df2_feature_view:reservation_days",
        "df2_feature_view:adr_usd",
        "df2_feature_view:number_of_reservation",
        # on_demand_rates (currently disabled)
        # "on_demand_rates:rate_blocked_days",
        # "on_demand_rates:rate_available_days",
        # df3_feature_view
        "df3_feature_view:num_neighbours",
        "df3_feature_view:dist_from_bc",
    ],
)

In [5]:
import numpy as np
import feast

from feast.dqm.profilers.ge_profiler import ge_profiler

from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.dataset import PandasDataset

In [6]:
DELTA = 0.1  # controlling allowed window in fraction of the value on scale [0, 1]

@ge_profiler
def stats_profiler(ds: PandasDataset) -> ExpectationSuite:
    """Build the expectation suite used to validate retrieved features.

    Registers one ``expect_column_values_to_be_between`` range check per
    column listed in ``column_ranges`` (each with ``mostly=0.99``) and
    returns the suite obtained from ``ds`` with
    ``discard_failed_expectations=False``.

    Args:
        ds: Dataset on which the expectations are declared.

    Returns:
        The expectation suite collected from ``ds``.
    """
    # Simple consistency checks: (column, lowest allowed, highest allowed).
    column_ranges = (
        ("available_days", 1, 31),
        ("bathrooms", 1, 1),
        ("num_neighbours", 0, 100.),
    )
    for column, lower, upper in column_ranges:
        ds.expect_column_values_to_be_between(
            column,
            min_value=lower,
            max_value=upper,
            mostly=0.99,  # allow some outliers
        )

    # Not enabled: mean expectations (observed mean scaled by 1 -/+ DELTA) on
    # "trip_count" and "earned_per_hour", and quantile expectations
    # (qs = [0.5, 0.75, 0.9, 0.95]) on "avg_fare".

    return ds.get_expectation_suite(discard_failed_expectations=False)

In [7]:
# Store the retrieval result `test` as a named saved dataset backed by a
# parquet file, replacing any existing dataset with the same name.
saved_name = 'my_inference_ds'
parquet_storage = SavedDatasetFileStorage(path='my_inference_ds.parquet')
dataset = fs.create_saved_dataset(
    from_=test,
    name=saved_name,
    storage=parquet_storage,
    tags={'author': 'fsxz'},
    allow_overwrite=True,
)

# Fetch the saved dataset back from the store and wrap it, together with the
# profiler, as the validation reference used when materialising `test`.
ds = fs.get_saved_dataset(saved_name)
vr = ds.as_reference(
    name="validation_reference_dataset",
    profiler=stats_profiler,
)

# Disabled:
# ds.get_profile(profiler=stats_profiler)

In [9]:
# vr = ValidationReference.from_saved_dataset('check', dataset=fs.get_saved_dataset('my_training_ds'), profiler=stats_profiler)

In [10]:
from feast.saved_dataset import ValidationReference

In [63]:
# vr = ValidationReference('check',dataset_name = 'my_training_ds', profiler=stats_profiler)

In [13]:
from feast.dqm.errors import ValidationFailed
# Materialise the retrieval as a DataFrame while checking it against the
# validation reference `vr`; `validated_test` is only bound on success.
try:
    validated_test = test.to_df(validation_reference=vr)
except ValidationFailed as failure:
    # Report the failure and show the report attached to the exception
    # rather than letting the error propagate.
    print("VALIDATION FAILED! THERE'S SOME PROBLEM IN THE DATA!")
    print(failure.validation_report)

VALIDATION FAILED! THERE'S SOME PROBLEM IN THE DATA!
[
  {
    "success": false,
    "expectation_config": {
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "bathrooms",
        "min_value": 1,
        "max_value": 1,
        "mostly": 0.99,
        "result_format": "COMPLETE"
      },
      "meta": {}
    },
    "result": {
      "element_count": 1436,
      "missing_count": 0,
      "missing_percent": 0.0,
      "unexpected_count": 864,
      "unexpected_percent": 60.16713091922006,
      "unexpected_percent_total": 60.16713091922006,
      "unexpected_percent_nonmissing": 60.16713091922006,
      "partial_unexpected_list": [
        2,
        2,
        2,
        2,
        2,
        2,
        3,
        2,
        2,
        2,
        2,
        3,
        2,
        2,
        4,
        2,
        2,
        2,
        2,
        2
      ],
      "partial_unexpected_index_list": [
        1,
        2,
        3,
        5,

In [41]:
# Display the shape of `pippo` next to the shape of `job.to_df()`.
# NOTE(review): neither `pippo` nor `job` is defined in the cells shown here,
# so this cell presumably depends on leftover kernel state — confirm, then
# define both names above (or drop the cell) so Restart & Run All succeeds.
pippo.shape, job.to_df().shape

((24849, 8), (24849, 8))