## SageMaker Feature Store Setup

In [None]:
import boto3
import sagemaker

In [None]:
from sagemaker.session import Session

# Take the region from the default boto3 configuration and pin every
# client created below to that same region.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# One client for the "sagemaker" service and one for the
# "sagemaker-featurestore-runtime" service.
sagemaker_client = boto_session.client("sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    "sagemaker-featurestore-runtime", region_name=region
)

# SageMaker SDK session wired to the boto session and both clients above.
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

In [None]:
# Replace the bucket name and/or key prefix below to store data in a
# bucket of your own instead of the session's default bucket.
prefix = "sagemaker-featurestore-demo"
default_s3_bucket_name = feature_store_session.default_bucket()

print(default_s3_bucket_name)

In [None]:
# NOTE: this cell repeats the previous cell; it re-assigns the same two
# names and prints the bucket name again.
# Replace the bucket name and/or key prefix below to store data in a
# bucket of your own instead of the session's default bucket.
prefix = "sagemaker-featurestore-demo"
default_s3_bucket_name = feature_store_session.default_bucket()

print(default_s3_bucket_name)

## Inspect Dataset 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

s3_client = boto3.client("s3", region_name=region)

# NOTE(review): the bucket name follows the "sagemaker-example-files-prod-<region>"
# pattern while the keys live under "melissafinalbucket/" — confirm that this is
# the bucket that actually holds the flight data.
fraud_detection_bucket_name = f"sagemaker-example-files-prod-{region}"

# Object keys of the three CSV splits. S3 keys are matched exactly, so they
# must not carry stray leading whitespace.
valid_file_key = "melissafinalbucket/csv/valid_flight_data"
training_file_key = "melissafinalbucket/csv/training_flight_data"
test_file_key = "melissafinalbucket/csv/test_flight_data"

# Fetch each split with its own key.
valid_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=valid_file_key
)
training_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=training_file_key
)
test_data_object = s3_client.get_object(
    Bucket=fraud_detection_bucket_name, Key=test_file_key
)

# Read each response body fully into memory and parse it as CSV.
valid_data = pd.read_csv(io.BytesIO(valid_data_object["Body"].read()))
training_data = pd.read_csv(io.BytesIO(training_data_object["Body"].read()))
test_data = pd.read_csv(io.BytesIO(test_data_object["Body"].read()))


## Feature Engineering

In [None]:
from time import gmtime, strftime, sleep

# Suffix every feature group name with the current UTC day-hour-minute-second
# so that repeated runs produce distinct names.
valid_feature_group_name = f"valid-feature-group-{strftime('%d-%H-%M-%S', gmtime())}"
training_feature_group_name = (
    f"training-feature-group-{strftime('%d-%H-%M-%S', gmtime())}"
)
test_feature_group_name = f"test-feature-group-{strftime('%d-%H-%M-%S', gmtime())}"

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

# One FeatureGroup handle per data split, all bound to the same session.
valid_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=valid_feature_group_name,
)
training_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=training_feature_group_name,
)
test_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=test_feature_group_name,
)

In [None]:
import time

# Current Unix time rounded to the nearest whole second
# (round() on a float without ndigits already returns an int).
current_time_sec = round(time.time())


def cast_object_to_string(data_frame):
    """Convert every object-dtype column of *data_frame* to pandas "string" dtype.

    The frame is modified in place and nothing is returned. Columns whose
    dtype is not ``object`` are left untouched.
    """
    object_columns = [
        column for column in data_frame.columns if data_frame.dtypes[column] == "object"
    ]
    for column in object_columns:
        as_python_str = data_frame[column].astype("str")
        data_frame[column] = as_python_str.astype("string")


# Cast object dtype to string. The SageMaker FeatureStore Python SDK will then
# map the string dtype to the String feature type.
cast_object_to_string(valid_data)
cast_object_to_string(training_data)
cast_object_to_string(test_data)

# Record identifier and event time feature names.
# NOTE(review): the record identifier is left empty here; presumably it has to
# name a unique-id column of the data before the feature groups are created —
# confirm and fill in.
record_identifier_feature_name = ""
event_time_feature_name = "EventTime"

# Append the EventTime feature. Each column is sized and indexed by its own
# frame so the values line up even when the three splits differ in length.
for frame in (valid_data, training_data, test_data):
    frame[event_time_feature_name] = pd.Series(
        [current_time_sec] * len(frame), index=frame.index, dtype="float64"
    )

# Load feature definitions into each feature group from its own frame. The
# SageMaker FeatureStore Python SDK auto-detects the schema from the input data.
valid_feature_group.load_feature_definitions(data_frame=valid_data)
# output is suppressed
training_feature_group.load_feature_definitions(data_frame=training_data)
# output is suppressed
test_feature_group.load_feature_definitions(data_frame=test_data)
# output is suppressed

In [None]:
# Show the first rows of valid_data (head() returns 5 rows by default).
valid_data.head()

In [None]:
# Show the first rows of training_data (head() returns 5 rows by default).
training_data.head()

In [None]:
# Show the first rows of test_data (head() returns 5 rows by default).
test_data.head()