Docs: https://sagemaker.readthedocs.io/en/stable/amazon_sagemaker_featurestore.html

In [2]:
import sagemaker
import boto3

# SageMaker SDK session handed to the FeatureGroup object further down.
sess = sagemaker.Session()

# S3 bucket and key prefix that hold the raw dataset.
bucket = 'kaggle-writing-student'
key_dataset = 'dataset'

# Execution role; passed as role_arn when the feature group is created.
role = sagemaker.get_execution_role()

# Build the boto3 session once and reuse it for the region lookup and for both
# clients, instead of constructing a new Session for every call.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name="sagemaker", region_name=region)
s3 = boto_session.client(service_name="s3", region_name=region)

In [3]:
# We need a record identifier name and an event time feature name. 
# This will match the column of the corresponding features in our data.
from datetime import datetime
from time import strftime
import time

# Event time written to every ingested row. The trailing "Z" marks the value as
# UTC, so it has to be built from UTC time (gmtime) rather than the local clock;
# otherwise the string is wrong on any machine whose timezone is not UTC.
timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
print(timestamp)

2022-02-22T14:19:43Z


In [4]:
# Feature Store runtime client; the commented-out get_record lookup near the
# end of the notebook is the only place that would use it.
featurestore_runtime = boto3.Session().client(
    service_name="sagemaker-featurestore-runtime",
    region_name=region,
)

## Define Feature group

In [5]:
# Create feature group
from time import gmtime, strftime, sleep

# Suffix is day-of-month plus time of day (UTC), e.g. "22-14-20-06".
name_suffix = strftime("%d-%H-%M-%S", gmtime())
feature_group_name = f"students-discourse-feature-group-{name_suffix}"
print(feature_group_name)

students-discourse-feature-group-22-14-20-06


In [6]:
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)

# Names of the features in the group. Every one of them is declared as a
# string feature, and the definitions are produced in this order.
string_feature_names = [
    "id",
    "discourse_id",
    "discourse_text",
    "discourse_type",
    "predictionstring",
    # "EventTime" can be appended to your data when no timestamp is available.
    "EventTime",
    # df["split_type"] is "train", "validation" or "test".
    "split_type",
]

feature_definitions = [
    FeatureDefinition(feature_name=feature_name, feature_type=FeatureTypeEnum.STRING)
    for feature_name in string_feature_names
]

In [7]:
# Record identifier and event time feature names for the feature group.
# Both are passed to feature_group.create() and both appear in
# feature_definitions and as columns of the ingested dataframes.

# Column used as the record identifier.
record_identifier_feature_name = "discourse_id"
# Column holding the timestamp string added to every row before ingestion.
event_time_feature_name = "EventTime"

In [8]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Handle for the feature group; the create() call happens in a later cell.
feature_group = FeatureGroup(
    name=feature_group_name,
    feature_definitions=feature_definitions,
    sagemaker_session=sess,
)
print(feature_group)

FeatureGroup(name='students-discourse-feature-group-22-14-20-06', sagemaker_session=<sagemaker.session.Session object at 0x7ffaa30d1550>, feature_definitions=[FeatureDefinition(feature_name='id', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='discourse_id', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='discourse_text', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='discourse_type', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='predictionstring', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEnum.STRING: 'String'>), FeatureDefinition(feature_name='split_type', feature_type=<FeatureTypeEnum.STRING: 'String'>)])


Cast DataFrame `Object` to Supported Feature Store Data Type `String`

In [9]:
def cast_object_to_string(data_frame):
    """Convert every ``object``-dtype column of ``data_frame`` to the pandas ``string`` dtype.

    The frame is modified in place and nothing is returned. Columns with any
    other dtype are left as they are.
    """
    object_columns = [
        column for column in data_frame.columns if data_frame.dtypes[column] == "object"
    ]
    for column in object_columns:
        data_frame[column] = data_frame[column].astype("str").astype("string")
            
# def cast_discourse_id_to_int(data_frame):
#     data_frame.discourse_id = data_frame.discourse_id.astype(int)

## Create feature group

In [10]:
# Key prefix for the offline store; used to build the s3_uri given to
# feature_group.create().
prefix_offline_store = f"students-discourse-feature-store-{timestamp}"
print(prefix_offline_store)

students-discourse-feature-store-2022-02-22T14:19:43Z


In [None]:
# Create the feature group with an offline store only.
feature_group.create(
    # S3 location for the offline store: the bucket plus the timestamped prefix.
    s3_uri=f"s3://{bucket}/{prefix_offline_store}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    # Online store is off, which is why the get_record lookup further down
    # stays commented out.
    enable_online_store=False,
)

In [None]:
feature_group.describe()

In [13]:
# Is created?
import time


def wait_for_feature_group_creation_complete(feature_group, poll_interval_seconds=5):
    """Block until ``feature_group`` is no longer in the ``Creating`` state.

    Args:
        feature_group: Object exposing ``describe()``, which returns a dict
            with a ``FeatureGroupStatus`` entry, and a ``name`` attribute.
        poll_interval_seconds: Seconds to sleep between two ``describe()``
            calls. Defaults to 5, the value that used to be hard-coded.

    Raises:
        RuntimeError: If the final status is anything other than ``Created``.
            The message carries the final status and, when the describe
            response has one, its ``FailureReason``.
    """
    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_seconds)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")
    if status != "Created":
        # Surface what the service reported so the failure can be diagnosed
        # without a separate describe() call.
        details = f"status: {status}"
        failure_reason = description.get("FailureReason")
        if failure_reason:
            details += f", reason: {failure_reason}"
        raise RuntimeError(f"Failed to create feature group {feature_group.name} ({details})")
    print(f"FeatureGroup {feature_group.name} successfully created.")
    
wait_for_feature_group_creation_complete(feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup students-discourse-feature-group-22-14-20-06 successfully created.


# Split and prepare datasets

In [14]:
path = 's3://{}/{}/train.csv'.format(bucket, key_dataset)
!aws s3 cp $path ./dataset/

download: s3://kaggle-writing-student/dataset/train.csv to dataset/train.csv


In [46]:
import pandas as pd

In [47]:
# Load the CSV downloaded by the previous cell. discourse_id comes back as
# float64 here and is turned into a string a few cells further down.
train_df = pd.read_csv('./dataset/train.csv')

In [48]:
train_df.dtypes

id                     object
discourse_id          float64
discourse_start       float64
discourse_end         float64
discourse_text         object
discourse_type         object
discourse_type_num     object
predictionstring       object
dtype: object

In [49]:
# discourse_id was read as float64. Go through a 64-bit integer first so the
# resulting string has no trailing ".0". "int64" is spelled out because plain
# `int` can resolve to a 32-bit type on some platforms, which cannot hold these
# 13-digit ids.
# NOTE: the integer cast raises if the column contains missing values.
train_df["discourse_id"] = train_df["discourse_id"].astype("int64").astype(str).astype("string")

In [50]:
# This dataframe has no event time column, so we have to create one.
# Accepted date formats: yyyy-MM-dd'T'HH:mm:ssZ, yyyy-MM-dd'T'HH:mm:ss.SSSZ
# The Series is built on train_df's own index: column assignment aligns on
# index labels, so a Series carrying a fresh 0..n-1 index would produce missing
# values if train_df did not have the default index.
train_df[event_time_feature_name] = pd.Series(timestamp, index=train_df.index, dtype="string")

## Feature engineering

Some special characters must be filtered out of `discourse_text`; otherwise ingestion fails with:

> Failed to ingest row 76964: An error occurred (ValidationException) when calling the PutRecord operation: 1 validation error detected: Value 'content of discourse text' at 'record.3.member.valueAsString' failed to satisfy constraint: Member must satisfy regular expression pattern: .*

In [51]:
import re

# Replace every run of characters other than ASCII letters and digits with a
# single space. The vectorised .str accessor leaves missing values untouched,
# whereas re.sub called through apply() raises TypeError on a missing entry.
train_df.discourse_text = train_df.discourse_text.str.replace(r'[^A-Za-z0-9]+', ' ', regex=True)

In [52]:
train_df.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,EventTime
0,423A1CA112E2,1622627660524,8.0,229.0,Modern humans today are always on their phone ...,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,2022-02-22T14:19:43Z
1,423A1CA112E2,1622627653021,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,2022-02-22T14:19:43Z
2,423A1CA112E2,1622627671020,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,2022-02-22T14:19:43Z
3,423A1CA112E2,1622627696365,402.0,758.0,When people have phones they know about certai...,Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,2022-02-22T14:19:43Z
4,423A1CA112E2,1622627759780,759.0,886.0,Driving is one of the way how to get around Pe...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,2022-02-22T14:19:43Z


In [53]:
# Validation split: roughly the rows from 80% to 90% of the frame.
# Start at the 80% row and move forward until the `id` value changes, so all
# rows sharing the id found at the cutoff stay on the training side.
cutoff = int(len(train_df) * 0.8)
id_at_position = train_df.id.iloc[cutoff]
count = 0
# Positional access (.iloc) plus an explicit bound: the scan stops at the end
# of the frame instead of failing when the last group reaches the final row.
while cutoff + count < len(train_df) and train_df.id.iloc[cutoff + count] == id_at_position:
    count += 1
index_valid_df = cutoff + count
cutoff, count, index_valid_df

(115434, 7, 115441)

In [54]:
# Test split: roughly the last 10% of the frame (validation ends where it starts).
# Start at the 90% row and move forward until the `id` value changes, so all
# rows sharing the id found at the cutoff stay on the validation side.
cutoff = int(len(train_df) * 0.9)
id_at_position = train_df.id.iloc[cutoff]
count = 0
# Positional access (.iloc) plus an explicit bound: the scan stops at the end
# of the frame instead of failing when the last group reaches the final row.
while cutoff + count < len(train_df) and train_df.id.iloc[cutoff + count] == id_at_position:
    count += 1
index_test_df = cutoff + count
cutoff, count, index_test_df

(129863, 11, 129874)

In [55]:
# Validation rows: from the validation boundary up to (not including) the test boundary.
valid_df = train_df.iloc[index_valid_df:index_test_df].copy()
len(valid_df)

14433

In [56]:
# Test rows: everything from the test boundary to the end of the frame.
test_df = train_df.iloc[index_test_df:].copy()
len(test_df)

14419

In [57]:
# Training rows: everything before the validation boundary.
# NOTE: this rebinds train_df to the shortened frame, so the cells that slice
# valid_df and test_df out of it must have run before this one.
train_df = train_df.iloc[:index_valid_df].copy()
train_df.tail(2)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,EventTime
115439,25FC80C2D25E,1621872008064,1680.0,2131.0,She was really upset and hopeless and I was f...,Evidence,Evidence 2,326 327 328 329 330 331 332 333 334 335 336 33...,2022-02-22T14:19:43Z
115440,25FC80C2D25E,1621872020866,2132.0,2421.0,So I would say that taking the online course w...,Concluding Statement,Concluding Statement 1,410 411 412 413 414 415 416 417 418 419 420 42...,2022-02-22T14:19:43Z


In [70]:
# Tag every row with the split it belongs to; this becomes the "split_type" feature.
for split_frame, split_label in (
    (train_df, 'train'),
    (valid_df, 'validation'),
    (test_df, 'test'),
):
    split_frame['split_type'] = split_label

In [59]:
train_df.tail(1)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,EventTime,split_type
115440,25FC80C2D25E,1621872020866,2132.0,2421.0,So I would say that taking the online course w...,Concluding Statement,Concluding Statement 1,410 411 412 413 414 415 416 417 418 419 420 42...,2022-02-22T14:19:43Z,train


In [71]:
valid_df.tail(1)

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,EventTime,split_type
129873,12988594D953,1617719319305,2676.0,3286.0,This complied understanding from different ide...,Concluding Statement,Concluding Statement 1,463 464 465 466 467 468 469 470 471 472 473 47...,2022-02-22T14:19:43Z,validation


# Ingest Data

In [61]:
# Convert the remaining object columns of each split to the string dtype (in place).
for split_frame in (train_df, valid_df, test_df):
    cast_object_to_string(split_frame)

In [62]:
train_df.dtypes

id                     string
discourse_id           string
discourse_start       float64
discourse_end         float64
discourse_text         string
discourse_type         string
discourse_type_num     string
predictionstring       string
EventTime              string
split_type             string
dtype: object

In [64]:
# Columns sent to the feature group: the same names, in the same order, as the
# entries of feature_definitions.
columns_to_include = [
    'id',
    'discourse_id',
    'discourse_text',
    'discourse_type',
    'predictionstring',
    'EventTime',
    'split_type',
]

In [65]:
columns_to_include

['id',
 'discourse_id',
 'discourse_text',
 'discourse_type',
 'predictionstring',
 'EventTime',
 'split_type']

In [66]:
train_df[columns_to_include].head()

Unnamed: 0,id,discourse_id,discourse_text,discourse_type,predictionstring,EventTime,split_type
0,423A1CA112E2,1622627660524,Modern humans today are always on their phone ...,Lead,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,2022-02-22T14:19:43Z,train
1,423A1CA112E2,1622627653021,They are some really bad consequences when stu...,Position,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,2022-02-22T14:19:43Z,train
2,423A1CA112E2,1622627671020,Some certain areas in the United States ban ph...,Evidence,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,2022-02-22T14:19:43Z,train
3,423A1CA112E2,1622627696365,When people have phones they know about certai...,Evidence,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,2022-02-22T14:19:43Z,train
4,423A1CA112E2,1622627759780,Driving is one of the way how to get around Pe...,Claim,139 140 141 142 143 144 145 146 147 148 149 15...,2022-02-22T14:19:43Z,train


In [67]:
# Ingest the training split, restricted to the feature columns.
# The recorded output shows _failed_indices=[] for this call.
feature_group.ingest(data_frame=train_df[columns_to_include], max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='students-discourse-feature-group-22-14-20-06', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7ffaa2185910>, max_workers=3, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7ffa96369510>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [69]:
# Only if OnlineStore enabled for FeatureGroup

# record_identifier_value = "1622627660524"

# featurestore_runtime.get_record(
#     FeatureGroupName=feature_group_name, RecordIdentifierValueAsString=record_identifier_value
# )

In [72]:
# Ingest the validation split into the same feature group, restricted to the feature columns.
feature_group.ingest(data_frame=valid_df[columns_to_include], max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='students-discourse-feature-group-22-14-20-06', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7ffaa2185910>, max_workers=3, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7ffaa1818f50>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [73]:
# Ingest the test split into the same feature group, restricted to the feature columns.
feature_group.ingest(data_frame=test_df[columns_to_include], max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='students-discourse-feature-group-22-14-20-06', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7ffaa2185910>, max_workers=3, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7ffa9366dcd0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])