In [2]:
# Prerequisite: in a terminal, run `aws configure` and leave the credentials empty.

In [4]:
# Create the Pachyderm repo `feast`; the parquet files are uploaded into it in the next cell.
!pachctl create repo feast

In [9]:
# Upload both source parquet files from the local data/ directory to the
# master branch of the `feast` repo, keeping the same file names.
for dataset_name in ("house_dataset_lat_lon", "house_dataset_main"):
    !pachctl put file feast@master:/{dataset_name}.parquet -f data/{dataset_name}.parquet



In [2]:
# Confirm that both parquet files are present on the master branch of the `feast` repo.
!pachctl list file feast@master

NAME                           TYPE SIZE     
/house_dataset_lat_lon.parquet file 354.4KiB 
/house_dataset_main.parquet    file 1.012MiB 


Now change `main.py` and the `.yaml` file so that they use Pachyderm paths.

In [3]:
# Switch the kernel's working directory to the Feast repository; later cells
# (`feast apply`, FeatureStore(repo_path=".")) rely on it.
# The explicit %cd magic does not depend on IPython's automagic setting.
# The path is relative, so run this cell once per kernel session: running it
# again from inside feature_store/ would look for a nested feature_store/.
%cd feature_store/

/home/ubuntu/mlops_workshops/feast/feature_store


### Unfortunately, the S3 endpoint cannot be specified directly in the config file, so we need to set it through an environment variable

In [7]:
import os

# Feast's S3 endpoint is supplied through the process environment, because
# (per this notebook's note) it cannot be set in the config file.
# http://localhost:30600 is the S3 endpoint used throughout this notebook.
os.environ.update({'FEAST_S3_ENDPOINT_URL': 'http://localhost:30600'})

In [34]:
# Run `feast apply` from the feature repository directory, with the S3 endpoint
# set in the command's own environment (the same value assigned to os.environ
# in the previous cell).
!FEAST_S3_ENDPOINT_URL=http://localhost:30600 feast apply

In [12]:
# List the branch again after `feast apply`; the saved output now also shows /registry.db.
!pachctl list file feast@master

NAME                           TYPE SIZE     
/house_dataset_lat_lon.parquet file 354.4KiB 
/house_dataset_main.parquet    file 1.012MiB 
/registry.db                   file 1.53KiB  


### Now let's make some changes to main.py

In [None]:
# Re-run `feast apply` after editing main.py, again passing the S3 endpoint
# through the command's environment.
!FEAST_S3_ENDPOINT_URL=http://localhost:30600 feast apply

In [35]:
# List the commits recorded in the `feast` repo.
!pachctl list commit feast

In [24]:
# Show the files at the current head of the master branch.
!pachctl list file feast@master

NAME                           TYPE SIZE     
/house_dataset_lat_lon.parquet file 354.4KiB 
/house_dataset_main.parquet    file 1.012MiB 
/registry.db                   file 1.53KiB  


In [25]:
# Same listing for master^1, the commit one step before the head of master,
# so that /registry.db can be compared between the two commits.
!pachctl list file feast@master^1

NAME                           TYPE SIZE     
/house_dataset_lat_lon.parquet file 354.4KiB 
/house_dataset_main.parquet    file 1.012MiB 
/registry.db                   file 1.945KiB 


In [26]:
from feast import FeatureStore

# repo_path="." is relative to the kernel's working directory, so this cell
# depends on the earlier cell that changes into feature_store/ having run.
store = FeatureStore(repo_path=".")

In [27]:
import feast
import pandas as pd
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

# Entity frame: one row per HouseId from 1 to 999, each stamped with the same
# current UTC time in the `event_timestamp` column.
house_ids = list(range(1, 1000))
entity_df = pd.DataFrame({"HouseId": house_ids}).assign(
    event_timestamp=pd.to_datetime('now', utc=True)
)

# Request historical features for those entities, using the feature list
# defined by the `house_service` feature service.
house_service = store.get_feature_service('house_service')
retrieval_job = store.get_historical_features(
    entity_df=entity_df,
    features=house_service,
)

In [30]:
# Destination of the saved dataset: an S3-style path served by the local
# endpoint on port 30600. The file later appears as /merged_dataset_v1.parquet
# in `pachctl list file feast@master`, i.e. bucket `master.feast` corresponds
# to branch master of repo feast.
merged_storage = SavedDatasetFileStorage(
    path='s3://master.feast/merged_dataset_v1.parquet',
    s3_endpoint_override='http://localhost:30600',
)

# Persist the retrieval job's result under the name 'merged_dataset_v1'.
# NOTE(review): re-running this cell may fail if the dataset name or path
# already exists; confirm against the installed Feast version.
dataset = store.create_saved_dataset(
    from_=retrieval_job,
    name='merged_dataset_v1',
    storage=merged_storage,
)

# Materialise the same retrieval job as a DataFrame and preview it.
training_df = retrieval_job.to_df()
print(len(training_df))
training_df.head()



999


Unnamed: 0,HouseId,event_timestamp,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,MedHouseVal,Latitude,Longitude
0,1,2022-06-16 07:12:50.098883+00:00,2.4792,24.0,3.454704,1.134146,2251.0,3.921603,2.0,34.18,-118.38
1,2,2022-06-16 07:12:50.098883+00:00,3.463,8.0,6.363636,1.166297,1307.0,2.898004,2.017,39.08,-121.04
2,3,2022-06-16 07:12:50.098883+00:00,3.75,16.0,5.768719,1.023295,1478.0,2.459235,1.473,38.68,-121.28
3,4,2022-06-16 07:12:50.098883+00:00,2.8542,34.0,3.858779,1.045802,1164.0,4.442748,1.469,34.04,-118.19
4,5,2022-06-16 07:12:50.098883+00:00,1.3375,18.0,4.567625,1.087327,2707.0,2.882854,0.596,39.13,-121.54


In [31]:
# Check that the saved dataset was written: the saved output lists
# /merged_dataset_v1.parquet on feast@master.
!pachctl list file feast@master

NAME                           TYPE SIZE     
/house_dataset_lat_lon.parquet file 354.4KiB 
/house_dataset_main.parquet    file 1.012MiB 
/merged_dataset_v1.parquet     file 68.71KiB 
/registry.db                   file 1.933KiB 
