# Develop a Managed Feature Store
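This notebook demonstrates how to develop a managed feature store with Azure ML and Spark: it connects to the feature store, explores the source data, and registers a feature store entity and a feature set.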

In [None]:
from pyspark.sql import SparkSession
import os
from azure.ai.ml import MLClient
from azureml.featurestore import FeatureStoreClient
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
from azure.ai.ml.entities import FeatureStoreEntity, DataColumn, DataColumnType
from azure.ai.ml.entities import FeatureSet, FeatureSetSpecification

## Project Setup
Initialize the Spark session and check if the root directory exists.

In [None]:
# Emit a short status message as the first executed cell of the notebook.
# NOTE(review): the message says the session "has been started", yet the
# SparkSession object is only obtained in a later cell. Presumably this relies
# on the managed Spark runtime starting a session when the first cell runs;
# confirm, or reword the message.
print("Spark session has been started")

In [None]:
# Obtain the SparkSession used for all distributed reads in this notebook.
# getOrCreate() hands back the active session when one exists and builds a
# new one (named via appName) otherwise.
spark = (
    SparkSession.builder
    .appName("AzureML Spark Example")
    .getOrCreate()
)

In [None]:
# Base folder against which the feature set specification path is built.
root_dir = "."

# Report whether the base folder is present. This only prints a status line;
# it does not stop the notebook when the folder is missing.
root_dir_status = (
    "The folder exists."
    if os.path.isdir(root_dir)
    else "The folder does not exist. Please create or fix the path"
)
print(root_dir_status)

## Create Feature Store
Define the feature store name, location, subscription ID, and resource group, then create a client for the feature store.

In [None]:
# Settings that identify the feature store, listed from the broadest Azure
# scope down to the feature store itself.

# Azure subscription and resource group that contain the feature store.
featurestore_subscription_id = "796313f9-881f-4bee-bd46-ba6ad10afbb4"
featurestore_resource_group_name = "rg-ml-2024-ch"

# Name and Azure region of the feature store.
featurestore_name = "fstore-2024-ch"
featurestore_location = "West Europe"

In [None]:
# Build the feature store SDK client. Authentication is delegated to the
# On-Behalf-Of credential; the subscription, resource group, and name settings
# select which feature store the client talks to.
featurestore_credential = AzureMLOnBehalfOfCredential()
featurestore = FeatureStoreClient(
    credential=featurestore_credential,
    subscription_id=featurestore_subscription_id,
    resource_group_name=featurestore_resource_group_name,
    name=featurestore_name,
)

## Explore Data
Load and display taxi trips data stored in Azure Data Lake Storage Gen2 (accessed through an `abfss://` path).

In [None]:
# Location of the taxi trip parquet files. The puYear=* / puMonth=* wildcards
# match every partition folder under the "yellow" directory.
trips_source_data_path = (
    "abfss://taxi@sadataplatform2024ch.dfs.core.windows.net"
    "/yellow/puYear=*/puMonth=*/*.parquet"
)

# Read all matching parquet files into a single Spark DataFrame.
trips_src_df = spark.read.parquet(trips_source_data_path)

# Preview the data without truncating long cell values; this triggers a Spark
# job, so it may take a while on large inputs.
trips_src_df.show(truncate=False)

## Register a Feature Store Entity
Create a feature store entity representing the `trip` data and register it with Azure ML.

In [None]:
# Create the Azure ML client used to register feature store assets.
# The feature store name is supplied as the workspace name, and authentication
# uses the On-Behalf-Of credential.
taxi_client = MLClient(
    credential=AzureMLOnBehalfOfCredential(),
    subscription_id=featurestore_subscription_id,
    resource_group_name=featurestore_resource_group_name,
    workspace_name=featurestore_name,
)

In [None]:
# Index columns that together identify a trip: the vendor, the pickup
# timestamp, and the drop-off location.
trip_index_columns = [
    DataColumn(name="vendorID", type=DataColumnType.STRING),
    DataColumn(name="tpepPickupDateTime", type=DataColumnType.DATETIME),
    DataColumn(name="doLocationId", type=DataColumnType.STRING),
]

# Version 1 of the "trip" entity, marked as being in the Development stage and
# tagged as containing no personally identifiable information.
trip_entity_config = FeatureStoreEntity(
    name="trip",
    version="1",
    index_columns=trip_index_columns,
    stage="Development",
    description="Entity representing trips.",
    tags={"data_type": "nonPII"},
)

# Submit the create-or-update request; result() waits for the operation to
# finish, and its return value is printed.
poller = taxi_client.feature_store_entities.begin_create_or_update(trip_entity_config)
print(poller.result())

## Register the Feature Set with the Feature Store
Register a feature set that includes aggregated taxi trip features.

In [None]:
# Specification for the feature set, located relative to root_dir.
# NOTE(review): the path points at a transformation script (mtransform.py).
# Azure ML feature set examples usually pass a spec folder containing
# FeatureSetSpec.yaml; confirm that a script path is accepted here.
trip_fset_spec = FeatureSetSpecification(path=f"{root_dir}/mtransform.py")

# Version 1 of the "taxi_trips" feature set, linked to version 1 of the
# "trip" entity through the azureml:trip:1 reference.
trip_fset_config = FeatureSet(
    name="taxi_trips",
    version="1",
    description="7-day and 3-day rolling aggregation of taxi trips",
    entities=["azureml:trip:1"],
    stage="Development",
    specification=trip_fset_spec,
    tags={"data_type": "nonPII"},
)

# Submit the create-or-update request; result() waits for the operation to
# finish, and its return value is printed.
poller = taxi_client.feature_sets.begin_create_or_update(trip_fset_config)
print(poller.result())