In [21]:
pip install feast==0.5.0

Collecting feast==0.5.0
  Downloading feast-0.5.0-py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 1.5 MB/s eta 0:00:011
Collecting fastavro<0.23,>=0.22.11
  Downloading fastavro-0.22.13-cp37-cp37m-manylinux2010_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 11.7 MB/s eta 0:00:01
Installing collected packages: fastavro, feast
  Attempting uninstall: fastavro
    Found existing installation: fastavro 0.23.3
    Uninstalling fastavro-0.23.3:
      Successfully uninstalled fastavro-0.23.3
  Attempting uninstall: feast
    Found existing installation: feast 0.1.dev1+g8ab5620.d20200108
    Uninstalling feast-0.1.dev1+g8ab5620.d20200108:
      Successfully uninstalled feast-0.1.dev1+g8ab5620.d20200108
Successfully installed fastavro-0.22.13 feast-0.5.0
You should consider upgrading via the '/feast-venv/bin/python3.7 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from pytz import utc
from feast import Client, FeatureSet, Entity, ValueType, Feature, KafkaSource
from google.protobuf.duration_pb2 import Duration
from datetime import datetime, timedelta
from random import randrange, randint
import os

In [3]:
# Read the clock once so every generated day derives from the same "today";
# re-reading it per element could straddle midnight UTC and yield duplicate days.
today_utc = datetime.now(utc).replace(hour=0, minute=0, second=0, microsecond=0)

# Midnight-UTC timestamps for today and the two preceding days, newest first.
days = [today_utc - timedelta(days=offset) for offset in range(3)]

# Customer ids used for the `cust_id` column of the generated dataframes.
customers = [1001, 1002, 1003, 1004, 1005, 1006, 1007]

In [5]:
def _make_customer_features(version, days, customers, rng):
    """Build one synthetic feature row per (day, customer) pair for ``version``.

    Args:
        version: Value written to every row's ``version`` column.
        days: Timestamps for the ``datetime`` column; each one is repeated
            once per customer.
        customers: Customer ids for the ``cust_id`` column; the full list is
            repeated once per day.
        rng: ``numpy.random.RandomState`` supplying the random feature values.

    Returns:
        DataFrame with columns ``datetime``, ``version``, ``cust_id``,
        ``daily_txns`` (float in [0, 10)) and ``total_txns`` (int64 in [0, 100)).
    """
    n_rows = len(days) * len(customers)
    return pd.DataFrame(
        {
            "datetime": [day for day in days for _ in customers],
            "version": [version] * n_rows,
            "cust_id": [customer for _ in days for customer in customers],
            "daily_txns": rng.rand(n_rows) * 10,
            # Explicit int64 keeps the column dtype independent of the platform's default int.
            "total_txns": rng.randint(100, size=n_rows, dtype=np.int64),
        }
    )


# One seeded generator shared by both versions: a re-run reproduces the same
# values, while v1 and v2 still receive different draws.
feature_rng = np.random.RandomState(42)

customer_features_v1 = _make_customer_features("v1", days, customers, feature_rng)
print(customer_features_v1.head(10))

customer_features_v2 = _make_customer_features("v2", days, customers, feature_rng)
print(customer_features_v2.head(10))

                   datetime version  cust_id  daily_txns  total_txns
0 2020-06-11 00:00:00+00:00      v1     1001    7.663303          79
1 2020-06-11 00:00:00+00:00      v1     1002    2.290965          52
2 2020-06-11 00:00:00+00:00      v1     1003    7.382909          52
3 2020-06-11 00:00:00+00:00      v1     1004    4.390253          23
4 2020-06-11 00:00:00+00:00      v1     1005    6.689010           7
5 2020-06-11 00:00:00+00:00      v1     1006    7.473775          81
6 2020-06-11 00:00:00+00:00      v1     1007    5.180179          52
7 2020-06-10 00:00:00+00:00      v1     1001    7.703890           1
8 2020-06-10 00:00:00+00:00      v1     1002    1.289145          70
9 2020-06-10 00:00:00+00:00      v1     1003    5.611463          48
                   datetime version  cust_id  daily_txns  total_txns
0 2020-06-11 00:00:00+00:00      v2     1001    4.830068          35
1 2020-06-11 00:00:00+00:00      v2     1002    8.588636           3
2 2020-06-11 00:00:00+00:00      v

In [19]:
# Declare the feature set with two entities, `version` and `cust_id`.
# max_age is 86400 seconds, i.e. 24 hours.
customer_fs = FeatureSet(
    "customer_txns",
    max_age=Duration(seconds=24 * 60 * 60),
    entities=[
        Entity(name="version", dtype=ValueType.STRING),
        Entity(name="cust_id", dtype=ValueType.INT64),
    ],
)

# Let Feast derive the feature fields from the v1 dataframe; the recorded
# output shows `daily_txns` and `total_txns` being added.
customer_fs.infer_fields_from_df(
    customer_features_v1,
    replace_existing_features=True,
)

Feature daily_txns (ValueType.DOUBLE) added from dataframe.
Feature total_txns (ValueType.INT64) added from dataframe.



In [22]:
# Connection settings. The endpoints can be overridden through the environment
# so the notebook can target another deployment without edits; the fallbacks
# are the addresses this notebook was originally written with.
CORE_URL = os.getenv("FEAST_CORE_URL", "feast-feast-core.deep:6565")
BATCH_SERVING_URL = os.getenv("FEAST_BATCH_SERVING_URL", "feast-feast-batch-serving.deep:6566")
PROJECT = "version_demo"
print(PROJECT)

client = Client(core_url=CORE_URL, serving_url=BATCH_SERVING_URL, project=PROJECT)

# Create the project only when it is not already listed, so re-running the
# cell does not attempt to create it a second time.
if PROJECT not in client.list_projects():
    client.create_project(PROJECT)

version_demo


In [23]:
# Register the `customer_txns` feature set through the client; the recorded
# output confirms that the feature set was created.
client.apply(customer_fs)

Feature set created: "customer_txns"


In [27]:
# Re-fetch the feature set by name, rebinding `customer_fs` to the copy
# returned by the client. The recorded output shows that this copy carries a
# source, a project and a status in addition to the entities and features.
customer_fs = client.get_feature_set("customer_txns")
print(customer_fs)

{
  "spec": {
    "name": "customer_txns",
    "entities": [
      {
        "name": "version",
        "valueType": "STRING"
      },
      {
        "name": "cust_id",
        "valueType": "INT64"
      }
    ],
    "features": [
      {
        "name": "daily_txns",
        "valueType": "DOUBLE"
      },
      {
        "name": "total_txns",
        "valueType": "INT64"
      }
    ],
    "maxAge": "86400s",
    "source": {
      "type": "KAFKA",
      "kafkaSourceConfig": {
        "bootstrapServers": "10.163.12.6:9092",
        "topic": "feast-features"
      }
    },
    "project": "version_demo"
  },
  "meta": {
    "createdTimestamp": "2020-06-11T14:09:21Z",
    "status": "STATUS_READY"
  }
}


In [28]:
# Ingest the v1 rows into the `customer_txns` feature set. The recorded output
# reports 21/21 rows succeeded and an id-like string as the cell's result
# (presumably the ingestion id — confirm against the Feast SDK).
# NOTE(review): the KeyboardInterrupt traceback in the recorded output comes
# from a ForkPoolWorker process and appears after "Ingestion complete!".
client.ingest("customer_txns", customer_features_v1)



  0%|          | 0/21 [00:00<?, ?rows/s]

Waiting for feature set to be ready for ingestion...




  5%|▍         | 1/21 [00:01<00:20,  1.00s/rows]

100%|██████████| 21/21 [00:01<00:00, 20.78rows/s]

Ingestion complete!

Ingestion statistics:
Success: 21/21
Removing temporary file(s)...





'bea2bfc9-4788-3231-beba-84c2c9ba6bb6'

Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


In [29]:
# Ingest the v2 rows into the same `customer_txns` feature set. The recorded
# output reports 21/21 rows succeeded and an id-like string as the cell's
# result (presumably the ingestion id — confirm against the Feast SDK).
# NOTE(review): the KeyboardInterrupt traceback in the recorded output comes
# from a ForkPoolWorker process and appears after "Ingestion complete!".
client.ingest("customer_txns", customer_features_v2)



  0%|          | 0/21 [00:00<?, ?rows/s]

Waiting for feature set to be ready for ingestion...




  5%|▍         | 1/21 [00:01<00:20,  1.00s/rows]

100%|██████████| 21/21 [00:01<00:00, 20.78rows/s]

Ingestion complete!

Ingestion statistics:
Success: 21/21
Removing temporary file(s)...





'52b311dc-e11b-3404-8653-a324afb05eb3'

Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


In [30]:
def _make_entity_rows(version, event_timestamps, customers):
    """Build the entity dataframe used to look up features for ``version``.

    Args:
        version: Value written to every row's ``version`` column.
        event_timestamps: Lookup timestamps, one per output row.
        customers: Customer ids; cycled if there are more timestamps than ids.

    Returns:
        DataFrame with columns ``datetime``, ``version`` and ``cust_id``.
    """
    n_rows = len(event_timestamps)
    return pd.DataFrame(
        {
            "datetime": event_timestamps,
            "version": [version] * n_rows,
            "cust_id": [customers[idx % len(customers)] for idx in range(n_rows)],
        }
    )


# Read the clock once so every random offset is applied to the same instant.
query_time = datetime.now(utc)

# One lookup timestamp per customer, each a random offset of just under three
# days at most into the past.
# NOTE(review): an offset can land before the oldest generated day, leaving no
# feature row to match (the recorded batch output shows NaN for such a row) —
# confirm this is intended for the demo.
event_timestamps = [
    query_time - timedelta(days=randrange(3), hours=randrange(24), minutes=randrange(60))
    for _ in range(len(customers))
]

entity_rows_v1 = _make_entity_rows("v1", event_timestamps, customers)
print(entity_rows_v1.head(10))

entity_rows_v2 = _make_entity_rows("v2", event_timestamps, customers)
print(entity_rows_v2.head(10))

                          datetime version  cust_id
0 2020-06-10 05:30:12.851982+00:00      v1     1001
1 2020-06-10 23:39:12.852040+00:00      v1     1002
2 2020-06-08 19:27:12.852068+00:00      v1     1003
3 2020-06-11 13:51:12.852078+00:00      v1     1004
4 2020-06-10 11:21:12.852086+00:00      v1     1005
5 2020-06-09 09:06:12.852094+00:00      v1     1006
6 2020-06-09 01:56:12.852102+00:00      v1     1007
                          datetime version  cust_id
0 2020-06-10 05:30:12.851982+00:00      v2     1001
1 2020-06-10 23:39:12.852040+00:00      v2     1002
2 2020-06-08 19:27:12.852068+00:00      v2     1003
3 2020-06-11 13:51:12.852078+00:00      v2     1004
4 2020-06-10 11:21:12.852086+00:00      v2     1005
5 2020-06-09 09:06:12.852094+00:00      v2     1006
6 2020-06-09 01:56:12.852102+00:00      v2     1007


In [32]:
# Request both features for the v1 entity rows, then materialise the job
# result as a dataframe and preview it.
job = client.get_batch_features(
    feature_refs=["daily_txns", "total_txns"],
    entity_rows=entity_rows_v1,
)
df = job.to_dataframe()
print(df.head(10))

                   event_timestamp version  cust_id  daily_txns  total_txns
0 2020-06-08 19:27:12.852068+00:00      v1     1003         NaN         NaN
1 2020-06-10 05:30:12.851982+00:00      v1     1001    7.703890         1.0
2 2020-06-09 09:06:12.852094+00:00      v1     1006    2.969220        23.0
3 2020-06-11 13:51:12.852078+00:00      v1     1004    4.390253        23.0
4 2020-06-10 11:21:12.852086+00:00      v1     1005    6.836232        60.0
5 2020-06-10 23:39:12.852040+00:00      v1     1002    1.289145        70.0
6 2020-06-09 01:56:12.852102+00:00      v1     1007    6.958955        92.0


In [34]:
# Request both features for the v2 entity rows, then materialise the job
# result as a dataframe and preview it.
job = client.get_batch_features(
    feature_refs=["daily_txns", "total_txns"],
    entity_rows=entity_rows_v2,
)
df = job.to_dataframe()
print(df.head(10))

                   event_timestamp version  cust_id  daily_txns  total_txns
0 2020-06-08 19:27:12.852068+00:00      v2     1003         NaN         NaN
1 2020-06-09 09:06:12.852094+00:00      v2     1006    8.203336        46.0
2 2020-06-10 23:39:12.852040+00:00      v2     1002    0.250941        92.0
3 2020-06-11 13:51:12.852078+00:00      v2     1004    8.294043        74.0
4 2020-06-10 11:21:12.852086+00:00      v2     1005    0.285854        15.0
5 2020-06-09 01:56:12.852102+00:00      v2     1007    7.016564        19.0
6 2020-06-10 05:30:12.851982+00:00      v2     1001    8.855293        40.0
