In [3]:
import pandas as pd
from generated import (
    house_area_pb2,
    house_prices_pb2,
)
from google.cloud import pubsub_v1
import google.auth
import google.auth.transport.requests
import httpx
import functools
import time
import datetime
from google.cloud import bigquery

In [4]:
df = pd.read_csv("home-data-for-ml-course/train.csv")
df.shape

(1460, 81)

In [5]:
client = bigquery.Client()
dataset_reference = bigquery.dataset.DatasetReference(
    project=client.project,
    dataset_id="house_pricing")
for tab in client.list_tables(dataset=dataset_reference):
    print(tab.table_id)

ai_house_area
ai_house_prices
house_area
house_prices


In [6]:
from google.cloud import aiplatform_v1beta1

# doesn't work properly
fs_client = aiplatform_v1beta1.FeatureRegistryServiceClient()

In [7]:
publisher = pubsub_v1.PublisherClient()
feature_topic = "projects/ml-lab-324709/topics/house_area"
price_topic = "projects/ml-lab-324709/topics/house_prices"
df = pd.read_csv("home-data-for-ml-course/train.csv")
for idx, row in df.iterrows():
    utc_ts = int(time.mktime(
        datetime.datetime.utcnow().timetuple()
    ) * 1e6)  # python returns ts in seconds, bigquery expects micro-sec.
    # fill HouseFeatures
    feature_row = house_area_pb2.HouseArea()
    feature_row.house_id = str(row.Id)
    feature_row.house_valuation_timestamp = utc_ts
    feature_row.flr_one_sq_feet = row["1stFlrSF"]
    feature_row.flr_two_sq_feet = row["2ndFlrSF"]
    # fill HousePrice
    sales_row = house_prices_pb2.HousePrice()
    sales_row.house_id = str(row.Id)
    sales_row.sale_timestamp = utc_ts
    sales_row.sale_price = row.SalePrice
    # Publish messages
    _ = publisher.publish(feature_topic, 
                          feature_row.SerializeToString())
    _ = publisher.publish(price_topic, 
                          sales_row.SerializeToString())

    if idx > 100:
        break

In [9]:
client = bigquery.Client()
query = """
select *
from house_pricing.ai_house_area
limit 10
"""
query_job = client.query(query)
query_job.result().to_dataframe()

Unnamed: 0,entity_id,feature_timestamp,flr_one_sq_feet,flr_two_sq_feet,house_sq_feet
0,3,2024-01-28 14:01:01+00:00,920,866,1786
1,32,2024-01-28 14:01:01+00:00,1228,0,1228
2,33,2024-01-28 14:01:01+00:00,1234,0,1234
3,96,2024-01-28 14:01:01+00:00,680,790,1470
4,97,2024-01-28 14:01:01+00:00,1588,0,1588
5,98,2024-01-28 14:01:01+00:00,960,0,960
6,99,2024-01-28 14:01:01+00:00,835,0,835
7,13,2024-01-28 14:01:01+00:00,912,0,912
8,40,2024-01-28 14:01:01+00:00,1152,0,1152
9,41,2024-01-28 14:01:01+00:00,1324,0,1324


In [10]:
class GToken:
    _cached_token: str = None

    @classmethod
    @property
    @functools.lru_cache
    def location_id(cls) -> str:
        #TODO: replace with an API call, hardcode for now
        return "europe-west1"

    @classmethod
    @property
    @functools.lru_cache
    def project_id(cls) -> str:
        creds, project_id = google.auth.default()
        _ = creds
        return project_id
    
    @classmethod
    @property
    def token(cls) -> str:
        creds, project_id = google.auth.default()
        _ = project_id
        if (not creds.valid) or (not creds.token):
            auth_req = google.auth.transport.requests.Request()
            creds.refresh(auth_req)
            cls._cached_token = creds.token
        return cls._cached_token

In [49]:
class FeatureManager:

    def register_feature(self,
                         feature_group_name: str,
                         name: str, 
                         column_name: str | None = None) -> dict:
        if column_name is None:
           column_name = name
        url = (
            f"https://{GToken.location_id}-aiplatform.googleapis.com/v1"
            f"/projects/{GToken.project_id}"
            f"/locations/{GToken.location_id}"
            f"/featureGroups/{feature_group_name}"
            "/features"
        )
        headers = {
            "Authorization": f"Bearer {GToken.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        params = dict(feature_id=name)
        payload = dict(version_column_name=column_name)
        with httpx.Client() as client:
            response = client.post(
                url=url, 
                headers=headers, 
                params=params,
                json=payload,
            )
            return response.json()
        
    def remove_feature(self, feature_group_name: str, name: str) -> dict:
        url = (
            f"https://{GToken.location_id}-aiplatform.googleapis.com/v1"
            f"/projects/{GToken.project_id}"
            f"/locations/{GToken.location_id}"
            f"/featureGroups/{feature_group_name}"
            f"/features/{name}"
        )
        headers = {
            "Authorization": f"Bearer {GToken.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        with httpx.Client() as client:
            response = client.delete(url=url, headers=headers)
            return response.json()
   
    def remove_feature_group(self, name: str) -> dict:
        headers = {
            "Authorization": f"Bearer {GToken.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        url = (
            f"https://{GToken.location_id}-aiplatform.googleapis.com/v1"
            f"/projects/{GToken.project_id}"
            f"/locations/{GToken.location_id}"
            f"/featureGroups/{name}"
        )
        params = dict(force=True)
        with httpx.Client() as client:
            response = client.delete(
                url=url, 
                headers=headers,
                params=params,
            )
            return response.json()


    def create_feature_group(self, name: str, table_uri: str) -> dict:
        headers = {
            "Authorization": f"Bearer {GToken.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        payload = dict(
            big_query=dict(
                big_query_source=dict(input_uri=table_uri)
            ),
            run_sync_immediately=True,
        )
        url = (
            f"https://{GToken.location_id}-aiplatform.googleapis.com/v1"
            f"/projects/{GToken.project_id}"
            f"/locations/{GToken.location_id}"
            f"/featureGroups"
        )
        params = dict(feature_group_id=name)
        with httpx.Client() as client:
            response = client.post(
                url=url, 
                headers=headers,
                params=params,
                json=payload
            )
            return response.json()

    def create_feature_view(self, 
                            feature_online_store: str,
                            view_name: str,
                            features: list[dict]) -> dict:
        url = (
            f"https://{GToken.location_id}-aiplatform.googleapis.com/v1"
            f"/projects/{GToken.project_id}"
            f"/locations/{GToken.location_id}"
            f"/featureOnlineStores/{feature_online_store}"
            f"/featureViews"
        )
        headers = {
            "Authorization": f"Bearer {GToken.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        params = dict(
            feature_view_id=view_name,
            run_sync_immediately=True,
        )
        payload = dict(
            # name=view_name,
            feature_registry_source=dict(
                feature_groups=features
            ),
            sync_config={
                "cron": "0 * * * *"  # every hour at 0 minute
            }
        )
        print(payload)
        with httpx.Client() as client:
            response = client.post(
                url=url,
                params=params,
                headers=headers,
                json=payload,
            )
            return response

    def remove_feature_view(self,
                            feature_online_store: str,
                            view_name: str) -> dict:
        headers = {
            "Authorization": f"Bearer {GToken.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        url = (
            f"https://{GToken.location_id}-aiplatform.googleapis.com/v1"
            f"/projects/{GToken.project_id}"
            f"/locations/{GToken.location_id}"
            f"/featureOnlineStores/{feature_online_store}"
            f"/featureViews/{view_name}"
        )
        with httpx.Client() as client:
            response = client.delete(
                url=url, 
                headers=headers,
            )
            return response.json()
        
    def list_sync_ops(self, feature_online_store: str, view_name: str) -> dict:
        headers = {
            "Authorization": f"Bearer {GToken.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        url = (
            f"https://{GToken.location_id}-aiplatform.googleapis.com/v1beta1"
            f"/projects/{GToken.project_id}"
            f"/locations/{GToken.location_id}"
            f"/featureOnlineStores/{feature_online_store}"
            f"/featureViews/{view_name}"
            "/featureViewSyncs"
        )
        with httpx.Client() as client:
            response = client.get(
                url=url, 
                headers=headers,
            )
            return response.json()



In [50]:
feature_manager = FeatureManager()
feature_manager.list_sync_ops("house_pricing", "price_by_area_model")

{'featureViewSyncs': [{'name': 'projects/ml-lab-324709/locations/europe-west1/featureOnlineStores/house_pricing/featureViews/price_by_area_model/featureViewSyncs/1321892115101253632',
   'createTime': '2024-01-28T15:05:58.462694Z',
   'finalStatus': {},
   'runTime': {'startTime': '2024-01-28T15:05:58.462694Z',
    'endTime': '2024-01-28T15:16:34.590531Z'}}]}

In [12]:
feature_manager = FeatureManager()

feature_manager.register_feature("house_area", "flr_one_sq_feet")
feature_manager.register_feature("house_area", "flr_two_sq_feet")
feature_manager.register_feature("house_area", "house_sq_feet")

feature_manager.register_feature("house_price", "sale_price")

{'name': 'projects/670967409083/locations/europe-west1/featureGroups/house_price/features/sale_price/operations/3147416355939024896',
 'metadata': {'@type': 'type.googleapis.com/google.cloud.aiplatform.v1.CreateFeatureOperationMetadata',
  'genericMetadata': {'createTime': '2024-01-28T15:01:27.893584Z',
   'updateTime': '2024-01-28T15:01:27.893584Z'}},
 'done': True,
 'response': {'@type': 'type.googleapis.com/google.cloud.aiplatform.v1.Feature',
  'name': 'projects/670967409083/locations/europe-west1/featureGroups/house_price/features/sale_price'}}

In [8]:
feature_manager = FeatureManager()
feature_manager.remove_feature_view(
    feature_online_store="house_pricing",
    view_name="price_by_area_model",
)

{'error': {'code': 404,
  'message': 'The FeatureView projects/670967409083/locations/europe-west1/featureOnlineStores/house_pricing/featureViews/price_by_area_model does not exist.',
  'status': 'NOT_FOUND'}}

In [14]:
feature_manager = FeatureManager()

feature_manager.create_feature_view(
    feature_online_store="house_pricing",
    view_name="price_by_area_model",
    features=[
        dict(
            feature_group_id="house_area",
            feature_ids=["flr_one_sq_feet", "flr_two_sq_feet", "house_sq_feet"],
        ),
    ]
)

{'feature_registry_source': {'feature_groups': [{'feature_group_id': 'house_area', 'feature_ids': ['flr_one_sq_feet', 'flr_two_sq_feet', 'house_sq_feet']}]}, 'sync_config': {'cron': '0 * * * *'}}


<Response [200 OK]>

In [19]:
from google.cloud.aiplatform_v1beta1 import FeatureOnlineStoreServiceClient
from google.cloud.aiplatform_v1beta1 import (
    FeatureViewDataKey,
    FetchFeatureValuesRequest,
)
from google.cloud.aiplatform_v1beta1.services.feature_online_store_service.transports.grpc import FeatureOnlineStoreServiceGrpcTransport
import grpc

In [42]:
from google.cloud.aiplatform_v1beta1.types import feature_online_store_service as feature_online_store_service_pb2
feature_online_store_service_pb2.FetchFeatureValuesRequest.Format.KEY_VALUE

<Format.KEY_VALUE: 1>

In [70]:
feature_online_store = "house_pricing"
view_name = "price_by_area_model"
headers = {
    "Authorization": f"Bearer {GToken.token}",
    "Content-Type": "application/json; charset=utf-8",
}
url = (
    f"https://{GToken.location_id}-aiplatform.googleapis.com/v1"
    f"/projects/{GToken.project_id}"
    f"/locations/{GToken.location_id}"
    f"/featureOnlineStores/{feature_online_store}"
    f"/featureViews/{view_name}:fetchFeatureValues"
)
payload = {
  "data_key": {
      "key": "14",
  },
  # "data_format": "KEY_VALUE",
}
features = dict()
print(url)
with httpx.Client() as client:
    response = client.post(
        url=url, 
        headers=headers,
        json=payload,
    )
    features = response.json()
features    

https://europe-west1-aiplatform.googleapis.com/v1/projects/ml-lab-324709/locations/europe-west1/featureOnlineStores/house_pricing/featureViews/price_by_area_model:fetchFeatureValues


{'error': {'code': 501,
  'message': 'Operation is not implemented, or supported, or enabled.',
  'status': 'UNIMPLEMENTED'}}

In [None]:
POST https://LOCATION_ID-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/LOCATION_ID/featureOnlineStores/FEATUREONLINESTORE_NAME/featureViews/FEATUREVIEW_NAME:fetchFeatureValues


In [46]:
from google.cloud.aiplatform_v1beta1 import FeatureOnlineStoreServiceClient
from google.cloud.aiplatform_v1beta1.types import feature_online_store_service as feature_online_store_service_pb2

data_client = FeatureOnlineStoreServiceClient(
  client_options={"api_endpoint": f"{GToken.location_id}-aiplatform.googleapis.com"}
)
data_client.fetch_feature_values(
  request=feature_online_store_service_pb2.FetchFeatureValuesRequest(
    feature_view=(
        f"projects/{GToken.project_id}/locations/{GToken.location_id}"
        f"/featureOnlineStores/house_pricing/featureViews/price_by_area_model"
    ),
    data_key=feature_online_store_service_pb2.FeatureViewDataKey(key="1"),
    format=feature_online_store_service_pb2.FetchFeatureValuesRequest.Format.KEY_VALUE,
  )
)

MethodNotImplemented: 501 Operation is not implemented, or supported, or enabled.

In [20]:
fs_client = FeatureOnlineStoreServiceClient(
    transport = FeatureOnlineStoreServiceGrpcTransport(
        channel = grpc.insecure_channel("ENDPOINT_IP:10002")
    )
)

data_key = FeatureViewDataKey(
    key = "12",
)
view = (
    f"projects/{GToken.project_id}"
    f"/locations/{GToken.location_id}"
    "/featureOnlineStores/house_pricing"
    "/featureViews/price_by_area_model"
)
fs_client.fetch_feature_values(
    request=FetchFeatureValuesRequest(
        feature_view=view,
        data_key=data_key,
    )
)

E0128 16:08:05.645778000 8029278208 hpack_parser.cc:999]               Error parsing 'content-type' metadata: invalid value


Unknown: None Stream removed

In [None]:
{
    'feature_registry_source': {
        'feature_groups': [
            {'feature_group_id': 'house_area', 
             'feature_ids': ['flr_one_sq_feet', 'flr_two_sq_feet', 'house_sq_feet']
            }
            ]
        }
    }


In [26]:
query_train_dataset = """
with sale_prices as (
    select entity_id house_id,
           feature_timestamp sale_ts,
           sale_price,
           row_number() over (partition by entity_id order by feature_timestamp desc) _rk
    from house_pricing.ai_house_prices
),
spine as (
    select *
    from sale_prices
    where _rk = 1
),
area as (
    select area.*,
           row_number() over (partition by area.entity_id order by area.feature_timestamp desc) _rk
    from house_pricing.ai_house_area area
         inner join spine on (
            area.entity_id = spine.house_id
            and area.feature_timestamp <= spine.sale_ts
         )
)
select spine.house_id,
       spine.sale_price,
       area.flr_one_sq_feet,
       area.flr_two_sq_feet,
       area.house_sq_feet
from spine
     inner join area on (area.entity_id = spine.house_id)
where area._rk = 1
"""

client = bigquery.Client()
tarin_df = client.query(query_train_dataset).result().to_dataframe()

In [27]:
tarin_df

Unnamed: 0,house_id,sale_price,flr_one_sq_feet,flr_two_sq_feet,house_sq_feet
0,60,124900,780,0,780
1,28,306000,1704,0,1704
2,21,325300,1158,1218,2376
3,46,319900,1752,0,1752
4,11,129500,1040,0,1040
...,...,...,...,...,...
97,17,149000,1004,0,1004
98,34,165500,1700,0,1700
99,37,145000,1097,0,1097
100,88,164500,612,612,1224


In [71]:
feature_manager = FeatureManager()

feature_manager.remove_feature("house_area", "flr_one_sq_feet")
feature_manager.remove_feature("house_area", "flr_two_sq_feet")
feature_manager.remove_feature("house_area", "house_sq_feet")

feature_manager.remove_feature("house_price", "sale_price")

{'name': 'projects/670967409083/locations/europe-west1/operations/5509554360494850048',
 'metadata': {'@type': 'type.googleapis.com/google.cloud.aiplatform.v1.DeleteOperationMetadata',
  'genericMetadata': {'createTime': '2024-01-28T16:18:29.264668Z',
   'updateTime': '2024-01-28T16:18:29.264668Z'}},
 'done': True,
 'response': {'@type': 'type.googleapis.com/google.protobuf.Empty'}}

In [None]:
client = bigquery.Client()
query = """
select *
from house_pricing.ai_house_features
limit 10
"""
query_job = client.query(query)
rows = query_job.result()
list(rows)

In [5]:
feature_manager = FeatureManager()
respose = feature_manager.create_feature_group(
    "realestate",
    "bq://ml-lab-324709.house_pricing.house_features")
respose

{'big_query': {'big_query_source': {'input_uri': 'bq://ml-lab-324709.house_pricing.house_features'}}}


{'name': 'projects/670967409083/locations/europe-west1/featureGroups/realestate/operations/4277520795246264320',
 'metadata': {'@type': 'type.googleapis.com/google.cloud.aiplatform.v1.CreateFeatureGroupOperationMetadata',
  'genericMetadata': {'createTime': '2024-01-14T15:00:20.429607Z',
   'updateTime': '2024-01-14T15:00:20.429607Z'}},
 'done': True,
 'response': {'@type': 'type.googleapis.com/google.cloud.aiplatform.v1.FeatureGroup',
  'name': 'projects/670967409083/locations/europe-west1/featureGroups/realestate'}}

In [None]:
{
  "big_query": {
    "big_query_source": {
      "input_uri": "BIGQUERY_SOURCE_URI"
    }
  }
}

In [None]:
curl -X POST \
    -H "Authorization: Bearer $(gcloud auth print-access-token)" \
    -H "Content-Type: application/json; charset=utf-8" \
    -d @request.json \
    "https://LOCATION_ID-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/LOCATION_ID/featureGroups?feature_group_id=FEATUREGROUP_NAME"

In [21]:
topic = publisher.get_topic(request={"topic": topic_name})

In [67]:
publisher = pubsub_v1.PublisherClient()
topic_name = "projects/ml-lab-324709/topics/house_prices"
df = pd.read_csv("home-data-for-ml-course/train.csv")
for idx, row in df.iterrows():
    feature_row = house_features_pb2.HouseFeatures()
    feature_row.Id = row.Id
    feature_row.Flr1SF = row["1stFlrSF"]
    feature_row.Flr2SF = row["2ndFlrSF"]
    feature_row.SalePrice = row.SalePrice

    future = publisher.publish(topic_name, 
                               feature_row.SerializeToString())

    if idx > 10:
        break
