## SageMaker Feature Store Setup

In [None]:
import boto3
import sagemaker

In [None]:
from sagemaker.session import Session

# Read the region from the default boto3 configuration and pin one boto3
# session to it; both service clients below are created from that session.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# Client for the "sagemaker" service (used further down to list feature groups).
sagemaker_client = boto_session.client(
    service_name="sagemaker",
    region_name=region,
)

# Client for the "sagemaker-featurestore-runtime" service (used further down to
# fetch a single record).
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime",
    region_name=region,
)

# SageMaker SDK session wired to the boto3 session and both clients above; the
# FeatureGroup objects in this notebook are constructed with it.
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

In [None]:
# Key prefix under which this notebook writes its objects in the bucket.
prefix = "sagemaker-featurestore-demo"

# The session's default bucket is used here; assign a different bucket name
# to `default_s3_bucket_name` if you prefer a bucket of your choosing.
default_s3_bucket_name = feature_store_session.default_bucket()

print(default_s3_bucket_name)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; running it again
# only re-assigns the same two variables.
prefix = "sagemaker-featurestore-demo"

# Swap in another bucket name here to use a bucket of your choosing.
default_s3_bucket_name = feature_store_session.default_bucket()

print(default_s3_bucket_name)

## Inspect Dataset 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

s3_client = boto3.client("s3", region_name=region)

# NOTE(review): the bucket name follows the "sagemaker-example-files-prod-<region>"
# pattern while the object keys start with "melissafinalbucket/" — confirm which
# bucket actually holds the flight data CSV files.
fraud_detection_bucket_name = f"sagemaker-example-files-prod-{region}"

# Object keys of the three dataset splits. The keys carry no leading whitespace:
# S3 keys are matched literally, so a stray space would address a different object.
valid_file_key = "melissafinalbucket/csv/valid_flight_data"
training_file_key = "melissafinalbucket/csv/training_flight_data"
test_file_key = "melissafinalbucket/csv/test_flight_data"

# Download each split using its own key.
valid_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=valid_file_key
)
training_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=training_file_key
)
test_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=test_file_key
)

# Read each response body fully into memory and parse it as CSV.
valid_data = pd.read_csv(io.BytesIO(valid_data_object["Body"].read()))
training_data = pd.read_csv(io.BytesIO(training_data_object["Body"].read()))
test_data = pd.read_csv(io.BytesIO(test_data_object["Body"].read()))


## Feature Engineering

In [None]:
from time import gmtime, strftime, sleep

# Each name is a fixed prefix followed by the current UTC day-of-month and time
# (DD-HH-MM-SS), so names differ between runs.
valid_feature_group_name = f"valid-feature-group-{strftime('%d-%H-%M-%S', gmtime())}"
training_feature_group_name = f"training-feature-group-{strftime('%d-%H-%M-%S', gmtime())}"
test_feature_group_name = f"test-feature-group-{strftime('%d-%H-%M-%S', gmtime())}"

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Build one FeatureGroup handle per dataset split, all bound to the same
# SageMaker session. The names come from the *_feature_group_name variables.
valid_feature_group, training_feature_group, test_feature_group = (
    FeatureGroup(name=group_name, sagemaker_session=feature_store_session)
    for group_name in (
        valid_feature_group_name,
        training_feature_group_name,
        test_feature_group_name,
    )
)

In [None]:
import time

# Current Unix time rounded to whole seconds; round() on a float with no
# ndigits argument already yields an int.
current_time_sec = round(time.time())


def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas "string" dtype.

    The frame is modified in place and nothing is returned. Columns of any
    other dtype are left untouched.
    """
    object_columns = [
        name for name in data_frame.columns if data_frame.dtypes[name] == "object"
    ]
    for name in object_columns:
        # Go through "str" first, then to the dedicated "string" dtype.
        data_frame[name] = data_frame[name].astype("str").astype("string")


# Cast object dtype to string. The SageMaker FeatureStore Python SDK will then
# map the string dtype to the String feature type.
cast_object_to_string(valid_data)
cast_object_to_string(training_data)
cast_object_to_string(test_data)

# Record identifier and event time feature names.
# NOTE(review): the record identifier is left empty here. It presumably has to
# name a column that identifies a row in each data frame — set it before the
# feature groups are created.
record_identifier_feature_name = ""
event_time_feature_name = "EventTime"

# Append the EventTime feature: one float64 timestamp per row. Each series is
# sized from the frame it is assigned to, so no row is left without a value.
valid_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(valid_data), dtype="float64"
)
training_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(training_data), dtype="float64"
)
test_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(test_data), dtype="float64"
)

# Load feature definitions into each feature group from the data frame that will
# be ingested into it. The SageMaker FeatureStore Python SDK auto-detects the
# data schema from the input data.
valid_feature_group.load_feature_definitions(data_frame=valid_data)
# output is suppressed
training_feature_group.load_feature_definitions(data_frame=training_data)
# output is suppressed
test_feature_group.load_feature_definitions(data_frame=test_data)
# output is suppressed

In [None]:
# Preview the first rows of valid_data (pandas head() shows 5 rows by default).
valid_data.head()

In [None]:
# Preview the first rows of training_data (pandas head() shows 5 rows by default).
training_data.head()

In [None]:
# Preview the first rows of test_data (pandas head() shows 5 rows by default).
test_data.head()

## Feature Engineering

In [2]:
# Feature Engineering

# Round every split to 5 decimal places.
valid_data = valid_data.round(5)
training_data = training_data.round(5)
test_data = test_data.round(5)

# Replace missing values with 0 in the same three splits that were loaded above.
valid_data = valid_data.fillna(0)
training_data = training_data.fillna(0)
test_data = test_data.fillna(0)

# Feature transformations for this dataset are applied before ingestion into FeatureStore.
# No categorical encoding is applied here. Feature names must not contain blank
# spaces, so rename any such column before ingestion.

NameError: name 'valid_data' is not defined

In [None]:
# Preview valid_data again after the feature engineering cell.
valid_data.head()

In [None]:
# Preview training_data again after the feature engineering cell.
training_data.head()

In [None]:
# Preview test_data again after the feature engineering cell.
test_data.head()

## Ingest Data into FeatureStore

In [None]:
from time import gmtime, strftime, sleep

# Each feature group name is a split-specific prefix plus the current UTC
# day-of-month and time (DD-HH-MM-SS). The prefixes must differ per split:
# identical prefixes generated in the same second would yield the same name
# for two feature groups.
valid_feature_group_name = "valid-feature-group-" + strftime("%d-%H-%M-%S", gmtime())
training_feature_group_name = "training-feature-group-" + strftime("%d-%H-%M-%S", gmtime())
test_feature_group_name = "test-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

# One FeatureGroup handle per dataset split; every handle shares the same
# SageMaker session and takes its name from the matching *_name variable.
valid_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=valid_feature_group_name,
)
training_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=training_feature_group_name,
)
test_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=test_feature_group_name,
)

In [None]:
import time

# Whole-second Unix timestamp; round() without ndigits returns an int for a float.
current_time_sec = round(time.time())


def cast_object_to_string(data_frame):
    """Re-type object columns of ``data_frame`` as pandas "string", in place.

    Returns nothing; columns whose dtype is not "object" are skipped.
    """
    column_dtypes = data_frame.dtypes
    for column in data_frame.columns:
        if column_dtypes[column] != "object":
            continue
        # Convert to "str" first, then to the dedicated "string" dtype.
        as_text = data_frame[column].astype("str")
        data_frame[column] = as_text.astype("string")


# Cast object dtype to string. The SageMaker FeatureStore Python SDK will then
# map the string dtype to the String feature type.
cast_object_to_string(valid_data)
cast_object_to_string(training_data)
cast_object_to_string(test_data)

# Record identifier and event time feature names.
# NOTE(review): the record identifier is left empty here. It presumably has to
# name a column that identifies a row in each data frame — set it before the
# feature groups are created.
record_identifier_feature_name = ""
event_time_feature_name = "EventTime"

# Append the EventTime feature: one float64 timestamp per row of each frame.
valid_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(valid_data), dtype="float64"
)
training_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(training_data), dtype="float64"
)
test_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(test_data), dtype="float64"
)

# Load feature definitions into each feature group from the data frame that will
# be ingested into it. The SageMaker FeatureStore Python SDK auto-detects the
# data schema from the input data.
valid_feature_group.load_feature_definitions(data_frame=valid_data)
# output is suppressed
training_feature_group.load_feature_definitions(data_frame=training_data)
# output is suppressed
test_feature_group.load_feature_definitions(data_frame=test_data)
# output is suppressed

## Create FeatureGroups in SageMaker FeatureStore

In [None]:
def wait_for_feature_group_creation_complete(feature_group):
    """Block until ``feature_group`` is no longer in the "Creating" status.

    The status is read from ``feature_group.describe()`` every 5 seconds.

    Raises:
        RuntimeError: if the status after polling is anything other than "Created".
    """
    while True:
        status = feature_group.describe().get("FeatureGroupStatus")
        if status != "Creating":
            break
        print("Waiting for Feature Group Creation")
        time.sleep(5)

    if status == "Created":
        print(f"FeatureGroup {feature_group.name} successfully created.")
        return
    raise RuntimeError(f"Failed to create feature group {feature_group.name}")


# `role` is not assigned anywhere earlier in this notebook, so resolve it here.
# NOTE(review): get_execution_role() presumably returns the execution role of
# the current SageMaker environment — replace it with an explicit role ARN if
# the notebook runs elsewhere.
role = sagemaker.get_execution_role()

# Create the three feature groups with identical settings: same S3 location,
# same record identifier and event time feature names, online store enabled.
for group in (valid_feature_group, training_feature_group, test_feature_group):
    group.create(
        s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
        record_identifier_name=record_identifier_feature_name,
        event_time_feature_name=event_time_feature_name,
        role_arn=role,
        enable_online_store=True,
    )

# Block until every feature group has left the "Creating" status; the helper
# raises RuntimeError if a group does not end up "Created".
for group in (valid_feature_group, training_feature_group, test_feature_group):
    wait_for_feature_group_creation_complete(feature_group=group)

In [None]:
# Display the description of the validation feature group; this is the same
# call whose "FeatureGroupStatus" entry the wait helper polls.
valid_feature_group.describe()

In [None]:
# Display the description of the training feature group; this is the same
# call whose "FeatureGroupStatus" entry the wait helper polls.
training_feature_group.describe()

In [None]:
# Display the description of the test feature group; this is the same
# call whose "FeatureGroupStatus" entry the wait helper polls.
test_feature_group.describe()

In [None]:
# The raw response is shown as the cell output.
# NOTE(review): the listing may be paginated — confirm if many groups exist.
sagemaker_client.list_feature_groups()  # use boto client to list FeatureGroups

## Put records into feature groups

In [None]:
# Ingest valid_data into the validation feature group with 3 workers.
# wait=True presumably blocks until ingestion finishes — see the SDK reference.
valid_feature_group.ingest(data_frame=valid_data, max_workers=3, wait=True)

In [None]:
# Ingest training_data into the training feature group with 5 workers.
# wait=True presumably blocks until ingestion finishes — see the SDK reference.
training_feature_group.ingest(data_frame=training_data, max_workers=5, wait=True)

In [None]:
# Ingest test_data into the test feature group with 5 workers.
# wait=True presumably blocks until ingestion finishes — see the SDK reference.
test_feature_group.ingest(data_frame=test_data, max_workers=5, wait=True)

In [None]:
# Look up a single record of the training feature group by its identifier.
# NOTE(review): 2990130 is presumably an identifier present in the training
# data — verify against the dataset.
record_identifier_value = "2990130"

featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName=training_feature_group_name,
)

In [None]:
# Print the string returned by as_hive_ddl() for the validation feature group
# (presumably the Hive DDL of its offline-store table — see the SDK reference).
print(valid_feature_group.as_hive_ddl())

In [None]:
# Print the string returned by as_hive_ddl() for the training feature group
# (presumably the Hive DDL of its offline-store table — see the SDK reference).
print(training_feature_group.as_hive_ddl())

In [None]:
# Print the string returned by as_hive_ddl() for the test feature group
# (presumably the Hive DDL of its offline-store table — see the SDK reference).
print(test_feature_group.as_hive_ddl())

In [None]:
# Show the AWS account id of the caller, as reported by STS.
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)


def _resolved_output_s3_uri(feature_group):
    """Return the ResolvedOutputS3Uri found in the feature group's description."""
    offline_store_config = feature_group.describe().get("OfflineStoreConfig")
    s3_storage_config = offline_store_config.get("S3StorageConfig")
    return s3_storage_config.get("ResolvedOutputS3Uri")


# Resolved offline-store S3 location of each feature group.
valid_feature_group_resolved_output_s3_uri = _resolved_output_s3_uri(valid_feature_group)
training_feature_group_resolved_output_s3_uri = _resolved_output_s3_uri(
    training_feature_group
)
test_feature_group_resolved_output_s3_uri = _resolved_output_s3_uri(test_feature_group)

# Remove the "s3://<bucket>/" part so only the key prefix inside the bucket remains.
valid_feature_group_s3_prefix = valid_feature_group_resolved_output_s3_uri.replace(
    "s3://" + default_s3_bucket_name + "/", ""
)
training_feature_group_s3_prefix = training_feature_group_resolved_output_s3_uri.replace(
    "s3://" + default_s3_bucket_name + "/", ""
)
test_feature_group_s3_prefix = test_feature_group_resolved_output_s3_uri.replace(
    "s3://" + default_s3_bucket_name + "/", ""
)

# Poll S3 once a minute until more than one object is listed under the training
# feature group's offline-store prefix. Only the training prefix is checked.
offline_store_contents = None
while True:
    objects_in_bucket = s3_client.list_objects(
        Prefix=training_feature_group_s3_prefix,
        Bucket=default_s3_bucket_name,
    )
    listed_objects = objects_in_bucket.get("Contents", [])
    if len(listed_objects) > 1:
        offline_store_contents = listed_objects
        break
    print("Waiting for data in offline store...\n")
    sleep(60)

print("Data available.")

In [None]:
# One Athena query helper per feature group.
valid_query = valid_feature_group.athena_query()
training_query = training_feature_group.athena_query()
test_query = test_feature_group.athena_query()

# Retrieve table names
valid_table = valid_query.table_name
training_table = training_query.table_name
test_table = test_query.table_name

# Left-join the validation and test tables onto the training table.
# NOTE(review): the join key `transactionid` is presumably a column shared by
# all three tables — confirm it matches the record identifier of this dataset.
query_string = (
    f'SELECT * FROM "{training_table}" '
    f'LEFT JOIN "{valid_table}" '
    f'ON "{training_table}".transactionid = "{valid_table}".transactionid '
    f'LEFT JOIN "{test_table}" '
    f'ON "{training_table}".transactionid = "{test_table}".transactionid'
)

print("Running " + query_string)

# Run the Athena query and store its results in S3.
valid_query.run(
    query_string=query_string,
    output_location=f"s3://{default_s3_bucket_name}/{prefix}/query_results/",
)

# Wait on the same query object that was run above.
valid_query.wait()

# Load the results into a pandas DataFrame.
dataset = valid_query.as_dataframe()

dataset