In [2]:
import pandas as pd
import sagemaker

import boto3


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
# Read the raw ride records straight from the CSV object stored in S3.
rides_csv_uri = "s3://sagemaker-eu-north-1-755182613526/rides500.csv"
df = pd.read_csv(rides_csv_uri)

In [4]:
# Bucket name reused further down when building the offline feature store S3 URI.
default_s3_bucket_name = "sagemaker-eu-north-1-755182613526"

In [5]:
import boto3
import sagemaker
from sagemaker.session import Session


# Resolve the region from the default boto3 configuration and echo it.
region = boto3.Session().region_name
print(region)

# A region-pinned boto3 session that backs both service clients.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client("sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    "sagemaker-featurestore-runtime", region_name=region
)

# SageMaker session wired to those clients; it is handed to FeatureGroup later on.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

eu-north-1
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [6]:
import time

# Timestamp (whole seconds since the epoch) shared by every record in this batch.
# round() with a single argument already returns an int, so no extra int() is needed.
current_time_sec = round(time.time())

# Append the EventTime feature. The Series is built on df.index so the
# assignment lines up with the frame's own row labels; a Series carrying a
# fresh 0..n-1 index is aligned by label on assignment and would leave NaN in
# any row whose label falls outside that range (e.g. after rows were dropped).
df["eventTime"] = pd.Series(current_time_sec, index=df.index, dtype="float64")

In [8]:
def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to the pandas string dtype.

    The frame is modified in place: each matching column is first cast to
    ``"str"`` and then to ``"string"``, and the result is assigned back under
    the same label. Columns of any other dtype are left untouched.

    Args:
        data_frame: The pandas DataFrame whose object columns should be cast.

    Returns:
        None. The change is visible through ``data_frame`` itself.
    """
    # Work out which columns need converting before touching any of them.
    object_labels = [
        label for label in data_frame.columns if data_frame.dtypes[label] == "object"
    ]
    for label in object_labels:
        as_text = data_frame[label].astype("str")
        data_frame[label] = as_text.astype("string")


# Cast object-dtype columns to the pandas string dtype (the helper modifies df in place).
# The SageMaker Feature Store Python SDK will then map the string dtype to the String feature type.
cast_object_to_string(df)

In [9]:
# Total number of missing cells across the whole frame (column sums, then summed again).
df.isna().sum().sum()

0

In [22]:
# Discard every row that has at least one missing value (modifies df in place).
df.dropna(axis=0, how="any", inplace=True)

In [10]:
# Summarize the column dtypes and non-null counts of the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       500 non-null    int64  
 1   ride_id                  500 non-null    string 
 2   pickup_datetime          500 non-null    int64  
 3   pickup_longitude         500 non-null    float64
 4   dropoff_longitude        500 non-null    float64
 5   pickup_latitude          500 non-null    float64
 6   dropoff_latitude         500 non-null    float64
 7   passenger_count          500 non-null    int64  
 8   taxi_id                  500 non-null    int64  
 9   driver_id                500 non-null    int64  
 10  distance                 500 non-null    float64
 11  pickup_distance_to_jfk   500 non-null    float64
 12  dropoff_distance_to_jfk  500 non-null    float64
 13  pickup_distance_to_ewr   500 non-null    float64
 14  dropoff_distance_to_ewr  5

### Create feature group

In [11]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Build the FeatureGroup object for "ride500"; the group itself is created
# later through feature_group.create(...).
fg_name = "ride500"
feature_group = FeatureGroup(name=fg_name, sagemaker_session=feature_store_session)

In [12]:
# Names of the columns that serve as the record identifier and the event time feature.
# "eventTime" is the column appended to df earlier in the notebook.
record_identifier_feature_name = "id"
event_time_feature_name = "eventTime"

In [13]:
# Derive the feature definitions (feature name plus feature type per column) from df.
feature_group.load_feature_definitions(data_frame=df)

[FeatureDefinition(feature_name='id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='ride_id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='pickup_datetime', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='pickup_longitude', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='dropoff_longitude', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='pickup_latitude', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='dropoff_latitude', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='passenger_count', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='taxi_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='driver_id', feature_type=<FeatureTy

In [14]:
from sagemaker import get_execution_role

# IAM role that is passed as role_arn when the feature group is created.
# You can replace it with a role of your choosing; see the SageMaker documentation for how to create one.
role = get_execution_role()
print(role)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
arn:aws:iam::755182613526:role/service-role/AmazonSageMaker-ExecutionRole-20230923T233439


In [15]:
# Key prefix appended to the bucket name to form the offline feature store S3 URI.
prefix = "feature-store-benchmark"

def wait_for_feature_group_creation_complete(
    feature_group, poll_interval_sec=5, timeout_sec=None
):
    """Block until ``feature_group`` leaves the "Creating" status.

    The group's description is polled until its "FeatureGroupStatus" is no
    longer "Creating". Any final status other than "Created" is treated as a
    failure.

    Args:
        feature_group: Object exposing ``describe()`` (returning a mapping that
            carries "FeatureGroupStatus") and a ``name`` attribute.
        poll_interval_sec: Seconds to sleep between two status checks.
            Defaults to 5.
        timeout_sec: Optional upper bound, in seconds, on the total time spent
            waiting. ``None`` (the default) waits for as long as the status
            stays "Creating".

    Raises:
        TimeoutError: If ``timeout_sec`` is set and elapses while the status is
            still "Creating".
        RuntimeError: If the final status is anything other than "Created";
            the message includes that status and, when the description
            provides one, its "FailureReason".
    """
    # monotonic() is unaffected by wall-clock adjustments during the wait.
    deadline = None if timeout_sec is None else time.monotonic() + timeout_sec

    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout_sec}s waiting for feature group "
                f"{feature_group.name} to be created"
            )
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_sec)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")

    if status != "Created":
        # Surface what the service reported so the failure can be diagnosed
        # without issuing a separate describe call.
        details = f"status: {status}"
        failure_reason = description.get("FailureReason")
        if failure_reason:
            details += f", reason: {failure_reason}"
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} ({details})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")


# Offline feature store location: bucket name plus key prefix.
offline_store_s3_uri = f"s3://{default_s3_bucket_name}/{prefix}"

feature_group.create(
    s3_uri=offline_store_s3_uri,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)
# Poll until the group is no longer being created; raises if creation failed.
wait_for_feature_group_creation_complete(feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup ride500 successfully created.


In [16]:
# Write the rows of df into the feature group using a single worker.
# NOTE(review): wait=True is expected to make the call return only once ingestion has finished — confirm against the SDK docs.
feature_group.ingest(data_frame=df, max_workers=1, wait=True)

IngestionManagerPandas(feature_group_name='ride500', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7fd491936200>, sagemaker_session=<sagemaker.session.Session object at 0x7fd492471f60>, max_workers=1, max_processes=1, profile_name=None, _async_result=None, _processing_pool=None, _failed_indices=[])