In [None]:
# --- Configuration Variables ---
import os 

# Namespace where your resources exist, read from the NAMESPACE environment variable.
# os.environ.get returns None when the variable is unset, so `namespace` may be None here.
namespace = os.environ.get("NAMESPACE")

# Name of the ConfigMap that a later cell creates from the parquet files under data/
# and that is mounted into the Ray pods as a volume.
fsconfigmap = "cm-fs-data"

# Fetch token and server directly from oc CLI
import subprocess

def oc(cmd):
    """Run ``cmd`` through the shell and return its standard output as text.

    The captured bytes are decoded as UTF-8 and surrounding whitespace
    (including the trailing newline) is stripped.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        check=True,
    )
    raw_stdout = completed.stdout
    return raw_stdout.decode("utf-8").strip()

# Ask the oc CLI for the current session's bearer token and API server URL
token = oc("oc whoami -t")
server = oc("oc whoami --show-server")

# RayCluster name
raycluster = "feastraytest"

# Export the connection details and cluster name as environment variables
# (presumably read by the feature repo configuration — confirm against the repo)
os.environ.update(
    {
        "CLUSTER_TOKEN": token,
        "CLUSTER_SERVER": server,
        "RAY_CLUSTER": raycluster,
    }
)

# Show configured values; the token is masked instead of being printed
print("Configuration Variables:")
print(f"  Namespace: {namespace}")
print(f"  Server: {server}")
print(f"  Token: {'*' * 20}")   # hide actual token
print(f"  Ray Cluster: {raycluster}")

In [None]:
# Fetch the example feature repository into the current working directory.
# NOTE(review): no branch, tag or commit is specified, so this checks out whatever
# the remote's default branch holds at run time.
! git clone https://github.com/Srihari1192/feast-rag-ray.git

In [None]:
# Switch the notebook's working directory to the cloned repo's feature_repo folder;
# later cells use paths relative to it (data/..., repo_path=".").
%cd feast-rag-ray/feature_repo

In [None]:
# Log the oc CLI in with the token and server captured in the configuration cell
# ($token and $server are expanded from the Python variables of the same name).
# NOTE(review): the token is passed as a command-line argument — confirm that is
# acceptable in the environment where this notebook runs.
!oc login --token=$token --server=$server

In [None]:
# Package the two parquet files as a ConfigMap named by `fsconfigmap` in `namespace`;
# a later cell mounts that ConfigMap into the Ray pods.
# NOTE(review): `namespace` comes from os.environ.get("NAMESPACE") and is None when
# the variable is unset — confirm NAMESPACE is always exported before running this.
!oc create configmap $fsconfigmap --from-file=data/customer_daily_profile.parquet --from-file=data/driver_stats.parquet -n $namespace

In [None]:
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication

# Build token-based credentials from the values obtained via the oc CLI.
# NOTE(review): skip_tls=True is passed here — confirm that skipping TLS
# verification is intended for the target cluster.
auth = TokenAuthentication(token=token, server=server, skip_tls=True)

# Log in before any cluster resources are created
auth.login()
print("✓ Authentication successful")


In [None]:
from kubernetes.client import (
    V1Volume,
    V1ConfigMapVolumeSource,
    V1VolumeMount,
) 

# Volume backed by the ConfigMap that holds the parquet feature data
data_volume = V1Volume(
    name="data",
    config_map=V1ConfigMapVolumeSource(name=fsconfigmap)
)

# Mount that volume read-only at the feature repo's data directory inside the Ray pods
data_mount = V1VolumeMount(
    name="data",
    mount_path="/opt/app-root/src/feast-rag-ray/feature_repo/data",
    read_only=True
)

cluster = Cluster(ClusterConfiguration(
    name=raycluster,
    # Create the RayCluster in the same namespace the ConfigMap was created in, so the
    # volume above can be resolved. When NAMESPACE is unset this is None, which leaves
    # the SDK's own namespace resolution in effect.
    namespace=namespace,
    head_cpu_requests=1,
    head_cpu_limits=1,
    head_memory_requests=4,
    head_memory_limits=4,
    head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests
    worker_extended_resource_requests={'nvidia.com/gpu':0},
    num_workers=2,
    worker_cpu_requests='250m',
    worker_cpu_limits=1,
    worker_memory_requests=4,
    worker_memory_limits=4,
    # image="", # Optional Field
    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources
    local_queue="fs-user-queue", # Specify the local queue manually
    # Attach the ConfigMap-backed data volume and its mount to the cluster pods
    volumes=[data_volume],
    volume_mounts=[data_mount],
))

In [None]:
# Submit the RayCluster definition built in the previous cell.
cluster.apply()
# cluster.wait_ready() is left disabled; the next cell polls readiness with its own timeout.

In [None]:
import time

MAX_WAIT = 180        # total time budget in seconds (3 minutes)
INTERVAL = 5          # seconds to sleep between status checks
elapsed = 0           # seconds spent waiting so far (kept for inspection after the cell runs)

print("⏳ Waiting up to 3 minutes for RayCluster to be READY...\n")

# Measure against a monotonic clock so the time spent inside cluster.details()
# counts against the budget as well, not only the sleeps.
wait_started = time.monotonic()

while elapsed < MAX_WAIT:
    details = cluster.details()
    status = details.status.value

    print(details)
    print("Cluster Status:", status)

    if status == "ready":
        print("✅ RayCluster is READY!")
        break

    print(f"⏳ RayCluster is NOT ready yet: {status} ... checking again in {INTERVAL}s\n")
    time.sleep(INTERVAL)
    elapsed = time.monotonic() - wait_started

else:
    # The loop ended without `break`: the cluster never reported "ready".
    # Fail the cell so the following cells do not run against an unready cluster.
    timeout_message = "❌ Timeout: RayCluster did NOT become READY within 3 minutes."
    print(timeout_message)
    raise TimeoutError(timeout_message)


In [None]:
# Run `feast apply` from the current working directory (the feature_repo folder
# entered by the earlier %cd cell).
! feast apply

In [None]:
import sys
from pathlib import Path
from feast import FeatureStore

# Make modules that live in the feature repo importable from this notebook
repo_path = Path(".")
sys.path.append(str(repo_path))

# Open the feature store rooted at the current directory
print("Initializing Feast with Ray configuration...")
store = FeatureStore(repo_path=".")

# Sanity checks on the freshly created store and its configuration
assert store is not None, "FeatureStore should be initialized"
assert store.config is not None, "Store config should be available"
assert store.config.offline_store is not None, "Offline store should be configured"

print(f"✓ Offline store: {store.config.offline_store.type}")

# The batch engine is optional: report it when configured, warn otherwise
configured_engine = getattr(store.config, "batch_engine", None)
if not configured_engine:
    print("⚠ No compute engine configured")
else:
    print(f"✓ Compute engine: {store.config.batch_engine.type}")
    # A configured batch engine must declare its type
    assert store.config.batch_engine.type is not None, "Batch engine type should be set"


## 2. Create Entity DataFrame

Create an entity DataFrame for historical feature retrieval with point-in-time timestamps.


In [None]:
from datetime import datetime, timedelta, timezone
import pandas as pd

# --- Create time window ---
# Take "now" as a timezone-aware UTC instant. A naive datetime.now() is local
# wall-clock time; labelling it as UTC afterwards would shift every timestamp by
# the machine's UTC offset.
end_date = datetime.now(timezone.utc).replace(microsecond=0, second=0, minute=0)
start_date = end_date - timedelta(days=2)

# Hours before end_date at which each entity row is observed
event_offsets_hours = [24, 12, 6]

entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1002, 1003],
        "customer_id": [2001, 2002, 2003],
        # The datetimes are already UTC-aware, so they are converted (not localized)
        # to keep the column's timezone reported as "UTC".
        "event_timestamp": [
            pd.Timestamp(end_date - timedelta(hours=offset)).tz_convert("UTC")
            for offset in event_offsets_hours
        ],
    }
)

# Assertions: Verify entity DataFrame is created correctly
assert len(entity_df) == 3, f"Expected 3 rows, got {len(entity_df)}"
assert "driver_id" in entity_df.columns, "driver_id column should be present"
assert "customer_id" in entity_df.columns, "customer_id column should be present"
assert "event_timestamp" in entity_df.columns, "event_timestamp column should be present"
assert all(entity_df["driver_id"].isin([1001, 1002, 1003])), "driver_id values should match expected"
assert all(entity_df["customer_id"].isin([2001, 2002, 2003])), "customer_id values should match expected"
assert entity_df["event_timestamp"].notna().all(), "All event_timestamp values should be non-null"

print(f"✓ Created entity DataFrame with {len(entity_df)} rows")
print(f"✓ Time range: {start_date} to {end_date}")
print("\nEntity DataFrame:")
print(entity_df)


## 3. Retrieve Historical Features

Retrieve historical features using the Ray compute engine for distributed point-in-time joins.


In [None]:
# Historical retrieval for the entity rows built above
print("Retrieving historical features with Ray compute engine...")
print("(This demonstrates distributed point-in-time joins)")

# Feature references to join onto the entity DataFrame
historical_feature_refs = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "driver_hourly_stats:avg_daily_trips",
    "customer_daily_profile:current_balance",
    "customer_daily_profile:avg_passenger_count",
    "customer_daily_profile:lifetime_trip_count",
]

try:
    # Request the point-in-time join for every entity row
    historical_features = store.get_historical_features(
        entity_df=entity_df,
        features=historical_feature_refs,
    )

    # Materialise the retrieval job's result as a pandas DataFrame
    historical_df = historical_features.to_df()

    # The result must exist, be non-empty and carry both entity keys
    assert historical_df is not None, "Historical features DataFrame should not be None"
    assert len(historical_df) > 0, "Should retrieve at least one row of historical features"
    assert "driver_id" in historical_df.columns, "driver_id should be in the result"
    assert "customer_id" in historical_df.columns, "customer_id should be in the result"

    # At least one of the requested feature columns has to be present
    expected_features = [
        "conv_rate", "acc_rate", "avg_daily_trips",
        "current_balance", "avg_passenger_count", "lifetime_trip_count"
    ]
    feature_columns = [name for name in historical_df.columns if name in expected_features]
    assert len(feature_columns) > 0, f"Should have at least one feature column, got: {historical_df.columns.tolist()}"

    print(f"✓ Retrieved {len(historical_df)} historical feature rows")
    print(f"✓ Features: {list(historical_df.columns)}")

    # Show the first rows of the joined result
    print("\nHistorical Features DataFrame:")
    display(historical_df.head(10))

except Exception as e:
    # Report the failure with a hint, then re-raise so the cell still fails
    print(f"⚠ Historical features retrieval failed: {e}")
    print("This might be due to missing Ray dependencies or data")
    raise


## 4. Test On-Demand Feature Transformations

Demonstrate on-demand feature transformations that are computed at request time.


In [None]:
# On-demand feature view check, using only the first entity row
print("Testing on-demand feature transformations...")

# Stored driver features plus the two on-demand transformations derived from them
odfv_feature_refs = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "driver_hourly_stats:avg_daily_trips",
    "driver_activity_v2:conv_rate_plus_acc_rate",
    "driver_activity_v2:trips_per_day_normalized",
]

try:
    features_with_odfv = store.get_historical_features(
        entity_df=entity_df.head(1),
        features=odfv_feature_refs,
    )

    odfv_df = features_with_odfv.to_df()

    # The result must exist, be non-empty and carry the driver key
    assert odfv_df is not None, "On-demand features DataFrame should not be None"
    assert len(odfv_df) > 0, "Should retrieve at least one row with on-demand features"
    assert "driver_id" in odfv_df.columns, "driver_id should be in the result"

    # Each on-demand column is only checked when it is present in the result
    has_rate_sum = "conv_rate_plus_acc_rate" in odfv_df.columns
    has_normalized_trips = "trips_per_day_normalized" in odfv_df.columns

    if has_rate_sum:
        assert odfv_df["conv_rate_plus_acc_rate"].notna().any(), "conv_rate_plus_acc_rate should have non-null values"
        print("✓ On-demand feature 'conv_rate_plus_acc_rate' is computed")

    if has_normalized_trips:
        assert odfv_df["trips_per_day_normalized"].notna().any(), "trips_per_day_normalized should have non-null values"
        print("✓ On-demand feature 'trips_per_day_normalized' is computed")

    print(f"✓ Retrieved {len(odfv_df)} rows with on-demand transformations")

    # Full result first
    print("\nFeatures with On-Demand Transformations:")
    display(odfv_df)

    # Then the inputs next to the derived sum, when that column exists
    if has_rate_sum:
        print("\nSample with on-demand features:")
        display(
            odfv_df[["driver_id", "conv_rate", "acc_rate", "conv_rate_plus_acc_rate"]]
        )

except Exception as e:
    # Report the failure, then re-raise so the cell still fails
    print(f"⚠ On-demand features failed: {e}")
    raise


## 5. Materialize Features to Online Store

Materialize features to the online store using the Ray compute engine for efficient batch processing.


In [None]:
from datetime import datetime, timezone

# Lower bound of the materialization window (UTC)
MATERIALIZE_START = datetime(2025, 1, 1, tzinfo=timezone.utc)

# Upper bound: `end_date` from the entity DataFrame cell. If it carries no timezone,
# tag it as UTC so both bounds passed to Feast are timezone-aware and comparable.
materialize_end = (
    end_date if end_date.tzinfo is not None else end_date.replace(tzinfo=timezone.utc)
)

print("Materializing features to online store...")
store.materialize(
    start_date=MATERIALIZE_START,
    end_date=materialize_end,
)

# Reaching this line means materialize() returned without raising
print("✓ Initial materialization successful")

## 6. Test Online Feature Serving

Retrieve features from the online store for low-latency serving.


In [None]:
# Online store lookup for two driver/customer pairs
print("Testing online feature serving...")

# Feature references requested from the online store
online_feature_refs = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "customer_daily_profile:current_balance",
]

try:
    entity_rows = [
        {"driver_id": 1001, "customer_id": 2001},
        {"driver_id": 1002, "customer_id": 2002},
    ]

    # Guard the request payload: two rows, each carrying both entity keys
    assert len(entity_rows) == 2, "Should have 2 entity rows"
    assert all("driver_id" in row for row in entity_rows), "All entity rows should have driver_id"
    assert all("customer_id" in row for row in entity_rows), "All entity rows should have customer_id"

    online_features = store.get_online_features(
        features=online_feature_refs,
        entity_rows=entity_rows,
    )

    online_df = online_features.to_df()

    # One result row per requested entity row, with both keys present
    assert online_df is not None, "Online features DataFrame should not be None"
    assert len(online_df) == len(entity_rows), f"Should retrieve {len(entity_rows)} rows, got {len(online_df)}"
    assert "driver_id" in online_df.columns, "driver_id should be in the result"
    assert "customer_id" in online_df.columns, "customer_id should be in the result"

    # At least one of the requested feature columns has to be present
    expected_features = ["conv_rate", "acc_rate", "current_balance"]
    feature_columns = [name for name in online_df.columns if name in expected_features]
    assert len(feature_columns) > 0, f"Should have at least one feature column, got: {online_df.columns.tolist()}"

    # Returned entity IDs must be among the ones that were requested
    requested_driver_ids = [row["driver_id"] for row in entity_rows]
    requested_customer_ids = [row["customer_id"] for row in entity_rows]
    assert all(online_df["driver_id"].isin(requested_driver_ids)), "driver_id values should match entity rows"
    assert all(online_df["customer_id"].isin(requested_customer_ids)), "customer_id values should match entity rows"

    print(f"✓ Retrieved {len(online_df)} online feature rows")
    print(f"✓ Features retrieved: {feature_columns}")

    print("\nOnline Features DataFrame:")
    display(online_df)

except Exception as e:
    # Report the failure, then re-raise so the cell still fails
    print(f"⚠ Online serving failed: {e}")
    raise


In [None]:
# Tear down the RayCluster that was applied earlier in this notebook.
# NOTE(review): the ConfigMap created with `oc create configmap` is not removed in
# any cell visible here — confirm whether it should be cleaned up as well.
cluster.down()