In [42]:
%pip install -U --quiet feast 'feast[gcp]' protobuf==6.31.1

Note: you may need to restart the kernel to use updated packages.


In [43]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [44]:
!feast version

Feast SDK Version: "0.49.0"


## Authentication into GCP  

* Not needed  

In [None]:
# from google.colab import auth
# auth.authenticate_user()

In [45]:
PROJECT_ID = "polar-pillar-461115-g2"  # @param {type:"string"}
BUCKET_NAME = "mlops-course-polar-pillar-461115-g2-week3"  # @param {type:"string"} custom
BIGQUERY_DATASET_NAME = "feast_iris"  # @param {type:"string"} custom
AI_PLATFORM_MODEL_NAME = "feast_iris_model"  # @param {type:"string"} 

!gcloud config set project $PROJECT_ID
%env GOOGLE_CLOUD_PROJECT=$PROJECT_ID
!echo project_id = $PROJECT_ID > ~/.bigqueryrc

Updated property [core/project].
env: GOOGLE_CLOUD_PROJECT=polar-pillar-461115-g2


In [46]:
# Only run if your bucket does not already exist  

!gsutil mb gs://$BUCKET_NAME

Creating gs://mlops-course-polar-pillar-461115-g2-week3/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-course-polar-pillar-461115-g2-week3' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


## Apply and deploy feature definitions  

In [47]:
import os

# Change directory
os.chdir('/home/jupyter/iris_pipeline')  # replace with your aimed path

# Verify change
print("Current working directory:", os.getcwd())

Current working directory: /home/jupyter/iris_pipeline


In [48]:
!pwd

/home/jupyter/iris_pipeline


In [50]:
import pandas as pd

# Load, convert timestamps, and overwrite parquet
for fname in ["iris_sepal.csv", "iris_petal.csv"]:
    df = pd.read_csv(f"data/{fname}")
    df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True)
    df.to_parquet(f"data/{fname.replace('.csv', '.parquet')}", index=False)


## Set up and Apply Feast  

- Essential files in `feast_iris` folder  

In [52]:
!cd feast_iris/ && feast apply

No project found in the repository. Using project name feast_iris_project defined in feature_store.yaml
Applying changes for project feast_iris_project
[1m[94mNo changes to registry
[1m[94mNo changes to infrastructure


In [53]:
# Inspect the files created under the local folder

! ls -larth data

# Two new files are created under the data folder: online_store.db and registry.db

total 164K
-rw-r--r-- 1 jupyter jupyter 5.3K Jun  1 18:18 iris.csv
-rw-r--r-- 1 jupyter jupyter 9.7K Jun 20 20:35 iris_entity.csv
drwxr-xr-x 8 jupyter jupyter 4.0K Jun 20 21:16 ..
-rw-r--r-- 1 jupyter jupyter  14K Jun 20 22:43 iris_sepal.csv
-rw-r--r-- 1 jupyter jupyter  14K Jun 20 22:43 iris_petal.csv
-rw-r--r-- 1 jupyter jupyter  60K Jun 20 22:46 online_store.db
drwxr-xr-x 3 jupyter jupyter 4.0K Jun 22 04:22 .
drwxr-xr-x 2 jupyter jupyter 4.0K Jun 22 04:22 .ipynb_checkpoints
-rw-r--r-- 1 jupyter jupyter 5.2K Jun 22 04:23 online_features_iris.csv
-rw-r--r-- 1 jupyter jupyter  11K Jun 22 04:58 iris_sepal.parquet
-rw-r--r-- 1 jupyter jupyter  11K Jun 22 04:58 iris_petal.parquet
-rw-r--r-- 1 jupyter jupyter 1.6K Jun 22 05:01 registry.db


# Feast in action    

### STEPS  
#### Step 1: Get Historical Features 
    - get_historical_features()  # with 2023–2024 data
#### Step 2: Train and save a ML model on historical train 
    - train + joblib.save()
#### Step 3: Materialize 2025 data 
    - materialize(from, to) or materialize_incremental()  # for 2025 data
#### Step 4: Get Online Features, use trained model to predict 
    - get_online_features() + joblib.load() + predict

## Step 1: get_historical_features()  # with 2023–2024 data

In [None]:
# List all feature views in your registry
# store.list_feature_views()

In [55]:
from feast import FeatureStore
import pandas as pd

# Initialize FeatureStore
store = FeatureStore(repo_path="feast_iris")
# store.apply()  # Feast apply already ran in the cell above

# Load entity_df (used to retrieve historical features)
entity_df = pd.read_csv("data/iris_entity.csv")
entity_df["event_timestamp"] = pd.to_datetime(entity_df["event_timestamp"], utc=True)

entity_df_train = entity_df[
    (entity_df["event_timestamp"].dt.year >= 2023) &
    (entity_df["event_timestamp"].dt.year <= 2024)
]

# Optionally inspect entity_df
print(f"Entity DataFrame for Training:\n{entity_df_train.head()}")

# Define list of features to retrieve from both feature views
feature_refs = [
    "sepal_features:sepal_length",
    "sepal_features:sepal_width",
    "petal_features:petal_length",
    "petal_features:petal_width"
]

# Retrieve historical features
training_df = store.get_historical_features(
    entity_df=entity_df_train,
    features=feature_refs
).to_df()

# Save for reference
training_df.to_csv("feast_iris/training_df.csv", index=False)

Entity DataFrame for Training:
   flower_id           event_timestamp species
0          1 2023-12-31 23:59:59+00:00  setosa
1          2 2023-12-31 23:59:59+00:00  setosa
2          3 2023-12-31 23:59:59+00:00  setosa
3          4 2023-12-31 23:59:59+00:00  setosa
4          5 2023-12-31 23:59:59+00:00  setosa


In [56]:
print("----- Feature schema -----\n")
print(training_df.info())

print()
print("----- Example features -----\n")
print(training_df.head())

# training_df will have only 240 rows out of 300 from the original entity_df

----- Feature schema -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   flower_id        240 non-null    int64              
 1   event_timestamp  240 non-null    datetime64[ns, UTC]
 2   species          240 non-null    object             
 3   sepal_length     240 non-null    float64            
 4   sepal_width      240 non-null    float64            
 5   petal_length     240 non-null    float64            
 6   petal_width      240 non-null    float64            
dtypes: datetime64[ns, UTC](1), float64(4), int64(1), object(1)
memory usage: 13.2+ KB
None

----- Example features -----

   flower_id           event_timestamp    species  sepal_length  sepal_width  \
0        110 2023-12-31 23:59:59+00:00  virginica           6.8          3.0   
1        105 2023-12-31 23:59:59+00:00  virginica           6.7

 ### Step 2: Train model  

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib
import pandas as pd

# Load training dataframe
training_df = pd.read_csv("feast_iris/training_df.csv")

# Features and target
X = training_df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
y = training_df["species"]

# Encode species labels if needed
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Save the label encoder
joblib.dump(label_encoder, "artifacts/feast_iris_label_encoder.joblib")

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Initialize classifier
clf = RandomForestClassifier(random_state=42)

# Cross-validation on training set
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(f"CV (5-fold) Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Train on full training set
clf.fit(X_train, y_train)

# Evaluate on validation set
y_pred = clf.predict(X_val)
print("\nValidation Classification Report:")
print(classification_report(y_val, y_pred, target_names=label_encoder.classes_))

# Save the trained model
joblib.dump(clf, "artifacts/feast_iris_model.joblib")


CV (5-fold) Accuracy: 0.9791 ± 0.0197

Validation Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        16
  versicolor       0.83      0.94      0.88        16
   virginica       0.93      0.81      0.87        16

    accuracy                           0.92        48
   macro avg       0.92      0.92      0.92        48
weighted avg       0.92      0.92      0.92        48



['artifacts/feast_iris_model.joblib']

### Step 3: Materialize 2025 data to Online Store  

In [58]:
from feast import FeatureStore
from datetime import datetime

# # Initialize Feast feature store
# store = FeatureStore(repo_path="feast_iris")

# Materialize only 2025 data to online store
store.materialize(
    start_date=datetime(2025, 1, 1),
    end_date=datetime(2025, 12, 31)
)


# Alternatively, materialize incrementally
# This loads feature values into the online store up to end of 2025

# store.materialize_incremental(end_date=datetime(2025, 12, 31))

Materializing [1m[32m2[0m feature views from [1m[32m2025-01-01 00:00:00+00:00[0m to [1m[32m2025-12-31 00:00:00+00:00[0m into the [1m[32msqlite[0m online store.

[1m[32mpetal_features[0m:


100%|████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 10129.95it/s]


[1m[32msepal_features[0m:


100%|█████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 8939.58it/s]


### Step 4: Online prediction for 2025 data  

In [59]:
# Load model
clf = joblib.load("artifacts/feast_iris_model.joblib")
label_encoder = joblib.load("artifacts/feast_iris_label_encoder.joblib")

# Prepare 2025 entity rows
entity_df_2025 = entity_df[entity_df["event_timestamp"].dt.year == 2025]
entity_rows = entity_df_2025[["flower_id"]].drop_duplicates().to_dict(orient="records")

# Get online features
online_features = store.get_online_features(
    features=feature_refs,
    entity_rows=entity_rows
).to_df()

# Predict
X_online = online_features[[
    "sepal_length", "sepal_width", "petal_length", "petal_width"
]]
preds = clf.predict(X_online)
decoded_preds = label_encoder.inverse_transform(preds)

# Attach predictions
online_features["predicted_label"] = decoded_preds
print(online_features.info(), online_features.head())

# Export to csv
online_features.to_csv("data/online_features_iris.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   flower_id        60 non-null     int64  
 1   sepal_length     60 non-null     float64
 2   sepal_width      60 non-null     float64
 3   petal_width      60 non-null     float64
 4   petal_length     60 non-null     float64
 5   predicted_label  60 non-null     object 
dtypes: float64(4), int64(1), object(1)
memory usage: 2.9+ KB
None    flower_id  sepal_length  sepal_width  petal_width  petal_length  \
0        241      5.124007     3.414618     0.297925      1.413890   
1        242      5.091912     3.495279     0.338587      1.533552   
2        243      4.897901     2.974365     0.098859      1.579757   
3        244      5.227264     4.075287     0.030783      1.420465   
4        245      4.652927     3.555841     0.235786      0.948662   

  predicted_label  
0          setosa  
1 

### Step 5: Evaluate predictions (Optional - Sanity check)

In [60]:
# Merge true labels for evaluation
true_labels = entity_df_2025[["flower_id", "species"]].drop_duplicates()
merged = online_features.merge(true_labels, on="flower_id", how="inner")

# Encode true labels
y_true = label_encoder.transform(merged["species"])
y_pred = label_encoder.transform(merged["predicted_label"])

# Print classification report
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

# Optionally print or return merged DataFrame
print(merged[["flower_id", "species", "predicted_label"]].head(), merged[["flower_id", "species", "predicted_label"]].tail(), sep="\n\n")

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        20
  versicolor       0.95      0.90      0.92        20
   virginica       0.90      0.95      0.93        20

    accuracy                           0.95        60
   macro avg       0.95      0.95      0.95        60
weighted avg       0.95      0.95      0.95        60

   flower_id species predicted_label
0        241  setosa          setosa
1        242  setosa          setosa
2        243  setosa          setosa
3        244  setosa          setosa
4        245  setosa          setosa

    flower_id    species predicted_label
55        296  virginica       virginica
56        297  virginica       virginica
57        298  virginica       virginica
58        299  virginica       virginica
59        300  virginica       virginica
