# Saving Features into a SageMaker Feature Store


![](img/prepare_dataset_bert.png)

In [1]:
import sagemaker
import boto3

# SageMaker session, execution role and default S3 bucket used by the rest of the notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# A single boto3 session supplies the region and both low-level service clients.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)
s3 = boto_session.client(service_name='s3', region_name=region)

In [2]:
!pip list | grep pandas

pandas                             1.1.5


# Convert Raw Text to BERT Features using Hugging Face and TensorFlow

In [3]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Column names of the raw review data read by the feature-engineering code.
REVIEW_BODY_COLUMN = 'review_body'
REVIEW_ID_COLUMN = 'review_id'
# DATE_COLUMN = 'date'

# Classification target column and the star ratings it can take.
LABEL_COLUMN = 'star_rating'
LABEL_VALUES = [1, 2, 3, 4, 5]

# Map each star rating to its zero-based class index (1 -> 0, ..., 5 -> 4).
# A comprehension keeps the loop variables out of the notebook's global namespace.
label_map = {label: index for index, label in enumerate(LABEL_VALUES)}

    
class InputFeatures:
    """BERT feature vectors for one review, plus the fields they came from.

    Every constructor argument is stored unchanged on the attribute of the
    same name.
    """

    def __init__(
        self,
        input_ids,
        input_mask,
        segment_ids,
        label_id,
        review_id,
        date,
        label,
        review_body,
    ):
        # Model inputs.
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids

        # Numeric training target.
        self.label_id = label_id

        # Source fields carried along with the vectors.
        self.review_id = review_id
        self.date = date
        self.label = label
        self.review_body = review_body

    
class Input:
    """A single training/test input for sequence classification."""

    def __init__(self, text, review_id, date, label=None):
        """Store the raw fields of one input.

        Args:
          text: string. The untokenized text of the sequence.
          review_id: identifier kept alongside the text.
          date: value kept as this input's date.
          label: (Optional) the label of the example. Defaults to None.
        """
        self.text, self.review_id = text, review_id
        self.date, self.label = date, label
    

def convert_input(the_input, max_seq_length):
    """Convert one input into fixed-length BERT feature vectors.

    Args:
      the_input: object exposing ``text``, ``review_id``, ``date`` and ``label``.
      max_seq_length: length that the returned vectors are padded or truncated to.

    Returns:
      An ``InputFeatures`` holding ``input_ids``, ``input_mask``, ``segment_ids``,
      ``label_id`` and the source fields copied from ``the_input``.

    Raises:
      KeyError: if ``the_input.label`` is not a key of ``label_map``.
    """
    # First, we need to preprocess our data so that it matches the data BERT was trained on:
    # 1. Lowercase our text (if we're using a BERT lowercase model)
    # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
    # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
    #
    # Fortunately, the Transformers tokenizer does this for us!

    # The tokens are computed only so they can be shown in the notebook output.
    tokens = tokenizer.tokenize(the_input.text)
    print('**tokens**\n{}\n'.format(tokens))

    # Request padding and truncation explicitly (instead of the deprecated
    # `pad_to_max_length` flag with implicit truncation) so that every vector has
    # exactly `max_seq_length` entries, matching `segment_ids` below.
    encode_plus_tokens = tokenizer.encode_plus(the_input.text,
                                               padding='max_length',
                                               truncation=True,
                                               max_length=max_seq_length)

    # The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
    input_ids = encode_plus_tokens['input_ids']

    # Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.
    input_mask = encode_plus_tokens['attention_mask']

    # Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.
    segment_ids = [0] * max_seq_length

    # Label for each training row (`star_rating` 1 through 5)
    label_id = label_map[the_input.label]

    features = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        review_id=the_input.review_id,
        date=the_input.date,
        label=the_input.label,
        review_body=the_input.text)

    # Echo every field so the notebook shows what was produced for each input.
    print('**input_ids**\n{}\n'.format(features.input_ids))
    print('**input_mask**\n{}\n'.format(features.input_mask))
    print('**segment_ids**\n{}\n'.format(features.segment_ids))
    print('**label_id**\n{}\n'.format(features.label_id))
    print('**review_id**\n{}\n'.format(features.review_id))
    print('**date**\n{}\n'.format(features.date))
    print('**label**\n{}\n'.format(features.label))
    print('**review_body**\n{}\n'.format(features.review_body))

    return features


# We'll need to transform our data into a format that BERT understands.
# - `text` is the text we want to classify, which in this case is the `review_body` column in our DataFrame.
# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data
def transform_inputs_to_tfrecord(inputs, output_file, max_seq_length):
    """Write one TFRecord per input and return the same data as plain dicts.

    Args:
      inputs: sized iterable of objects accepted by ``convert_input``.
      output_file: path of the TFRecord file to write.
      max_seq_length: sequence length passed through to ``convert_input``.

    Returns:
      A list with one dict per input, keyed by ``input_ids``, ``input_mask``,
      ``segment_ids``, ``label_id``, ``review_id``, ``date``, ``label`` and
      ``review_body``.
    """
    records = []

    # The context manager closes the writer even when converting a row raises,
    # so the file handle is not leaked on failure.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for (input_idx, the_input) in enumerate(inputs):
            if input_idx % 10000 == 0:
                print('Writing input {} of {}\n'.format(input_idx, len(inputs)))

            features = convert_input(the_input, max_seq_length)

            # Only the three vectors and the label id go into the TFRecord.
            all_features = collections.OrderedDict()
            all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
            all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
            all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
            all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))

            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

            # The returned record also carries the identifying and raw fields.
            records.append({#'tf_record': tf_record.SerializeToString(),
                            'input_ids': features.input_ids,
                            'input_mask': features.input_mask,
                            'segment_ids': features.segment_ids,
                            'label_id': features.label_id,
                            'review_id': the_input.review_id,
                            'date': the_input.date,
                            'label': features.label,
                            'review_body': features.review_body
                           })

    return records

Three(3) feature vectors are created from each raw review (`review_body`) during the feature engineering phase to prepare for BERT processing:

* **`input_ids`**:  The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
    
* **`input_mask`**:  Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.

* **`segment_ids`**:  Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.

And one(1) label is created from each raw review (`star_rating`)  :

* **`label_id`**:  Label for each training row (`star_rating` 1 through 5)

# Demonstrate the BERT-specific Feature Engineering Step
While we are demonstrating this code with a small amount of data here in the notebook, we will soon scale this to much more data on a powerful SageMaker cluster.

### Note: The event time feature (`date`) must be of type Fractional (Unix timestamp in seconds) or String (ISO-8601 format); Feature Store rejects an Integral event time.

In [4]:
from datetime import datetime, timezone
from time import strftime

# The trailing "Z" in the format marks the value as UTC, so take the current
# time in UTC rather than the machine's local time.
#timestamp = datetime.now().replace(microsecond=0).isoformat()
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2020-12-27T01:22:25Z


In [5]:
import pandas as pd

# Three hand-written sample reviews, each as [star_rating, review_id, review_body].
data = [
    [5, 'ABCD12345', """I needed an "antivirus" application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get."""],
    [3, 'EFGH12345', """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos."""],
    [1, 'IJKL2345', """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses"""],
]

df = pd.DataFrame(data, columns=['star_rating', 'review_id', 'review_body'])


def _row_to_input(row):
    """Wrap one DataFrame row in an ``Input`` stamped with the shared ``timestamp``."""
    return Input(
        text=row[REVIEW_BODY_COLUMN],
        review_id=row[REVIEW_ID_COLUMN],
        date=timestamp,
        label=row[LABEL_COLUMN],
    )


# Build one Input per DataFrame row.
inputs = df.apply(_row_to_input, axis=1)


In [6]:
print(inputs[0].date)

2020-12-27T01:22:25Z


In [7]:
# df_train_embeddings = convert_features_to_tfrecord(train_inputs,
#                                                    '{}/part-{}-{}.tfrecord'.format(train_data, args.current_host, filename_without_extension),
#                                                    max_seq_length)

In [8]:
output_file='./data-tfrecord-featurestore/data.tfrecord'

In [9]:
max_seq_length = 64
records = transform_inputs_to_tfrecord(inputs, output_file, max_seq_length)

Writing input 0 of 3

**tokens**
['i', 'needed', 'an', '"', 'anti', '##virus', '"', 'application', 'and', 'know', 'the', 'quality', 'of', 'norton', 'products', '.', 'this', 'was', 'a', 'no', 'brain', '##er', 'for', 'me', 'and', 'i', 'am', 'glad', 'it', 'was', 'so', 'simple', 'to', 'get', '.']

**input_ids**
[101, 1045, 2734, 2019, 1000, 3424, 23350, 1000, 4646, 1998, 2113, 1996, 3737, 1997, 10770, 3688, 1012, 2023, 2001, 1037, 2053, 4167, 2121, 2005, 2033, 1998, 1045, 2572, 5580, 2009, 2001, 2061, 3722, 2000, 2131, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**segment_ids**
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

The three (3) feature vectors and one (1) label are converted into a list of `TFRecord` instances (one per row of training data):
* **`tf_records`**:  Binary representation of each row of training data (3 features + 1 label)

These `TFRecord`s are the engineered features that we will use throughout the rest of the pipeline.

In [None]:
# print('**records**')

# for record in records:
#     print(record['tf_record'])

In [None]:
# my_tf_record = records[0]['tf_record']
# print(my_tf_record)

In [None]:
# deserialized_tf_record_example = tf.train.Example.FromString(my_tf_record)
# print(type(deserialized_tf_record_example))
# print(deserialized_tf_record_example)


In [11]:
# Create a description of the features.
# Each of the three BERT vectors is written with `max_seq_length` int64 values per
# record, so the parser must be given that length; only `label_ids` holds a single value.
feature_description = {
    'input_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
    'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
    'label_ids': tf.io.FixedLenFeature([], tf.int64),
    'segment_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
}



In [12]:
#tf.io.parse_single_example(deserialized_tf_record_example, feature_description)

In [13]:
#tf.data.Dataset.from_tensor_slices([deserialized_tf_record_example])

# Add BERT Embeddings to Feature Store

In [14]:
featurestore_runtime = boto3.Session().client(service_name='sagemaker-featurestore-runtime', region_name=region)

# Define FeatureGroups

In [15]:
from time import gmtime, strftime, sleep

reviews_feature_group_name = 'reviews-feature-group-' + strftime('%d-%H-%M-%S', gmtime())
print(reviews_feature_group_name)

reviews-feature-group-27-01-23-16


In [16]:
from sagemaker.feature_store.feature_group import FeatureGroup

reviews_feature_group = FeatureGroup(name=reviews_feature_group_name, sagemaker_session=sagemaker_session)
print(reviews_feature_group)

FeatureGroup(name='reviews-feature-group-27-01-23-16', sagemaker_session=<sagemaker.session.Session object at 0x7fd5b2a3da20>, feature_definitions=[])


In [17]:
# record identifier and event time feature names
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"

In [18]:
df_records = pd.DataFrame.from_dict(records)
df_records

Unnamed: 0,input_ids,input_mask,segment_ids,label_id,review_id,date,label,review_body
0,"[101, 1045, 2734, 2019, 1000, 3424, 23350, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,ABCD12345,2020-12-27T01:22:25Z,5,"I needed an ""antivirus"" application and know t..."
1,"[101, 1996, 3291, 2007, 10777, 23663, 2003, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,EFGH12345,2020-12-27T01:22:25Z,3,The problem with ElephantDrive is that it requ...
2,"[101, 6659, 1010, 3904, 1997, 2026, 9537, 2499...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,IJKL2345,2020-12-27T01:22:25Z,1,"Terrible, none of my codes worked, and I can't..."


In [19]:
# df_feature_store = df_records.rename(columns={"label": "star_rating"})

In [20]:
# df_feature_store[['review_id', 'date', 'review_body', 'star_rating']]

# Cast to Supported Feature Store Data Types

In [21]:
def cast_object_to_string(data_frame):
    """Convert every ``object``-dtype column of ``data_frame`` to pandas ``string`` dtype.

    The frame is modified in place and nothing is returned. Values are first
    rendered with ``astype("str")``, so non-string objects such as lists become
    their text representation.
    """
    object_columns = [
        name for name in data_frame.columns
        if data_frame.dtypes[name] == 'object'
    ]
    for name in object_columns:
        as_text = data_frame[name].astype("str")
        data_frame[name] = as_text.astype("string")

In [22]:
cast_object_to_string(df_records)

In [23]:
df_records

Unnamed: 0,input_ids,input_mask,segment_ids,label_id,review_id,date,label,review_body
0,"[101, 1045, 2734, 2019, 1000, 3424, 23350, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,ABCD12345,2020-12-27T01:22:25Z,5,"I needed an ""antivirus"" application and know t..."
1,"[101, 1996, 3291, 2007, 10777, 23663, 2003, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,EFGH12345,2020-12-27T01:22:25Z,3,The problem with ElephantDrive is that it requ...
2,"[101, 6659, 1010, 3904, 1997, 2026, 9537, 2499...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,IJKL2345,2020-12-27T01:22:25Z,1,"Terrible, none of my codes worked, and I can't..."


In [24]:
# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
reviews_feature_group.load_feature_definitions(data_frame=df_records) # output is suppressed

[FeatureDefinition(feature_name='input_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='input_mask', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='segment_ids', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='label_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='review_id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='date', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='label', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='review_body', feature_type=<FeatureTypeEnum.STRING: 'String'>)]

In [25]:
prefix = 'reviews-feature-store-' + timestamp
print(prefix)

reviews-feature-store-2020-12-27T01:22:25Z


In [26]:
reviews_feature_group.create(
    s3_uri=f"s3://{bucket}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:231218423789:feature-group/reviews-feature-group-27-01-23-16',
 'ResponseMetadata': {'RequestId': '30fc21b1-9d35-4afc-bb41-a83fb40f45ce',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '30fc21b1-9d35-4afc-bb41-a83fb40f45ce',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '110',
   'date': 'Sun, 27 Dec 2020 01:23:16 GMT'},
  'RetryAttempts': 0}}

In [27]:
reviews_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:231218423789:feature-group/reviews-feature-group-27-01-23-16',
 'FeatureGroupName': 'reviews-feature-group-27-01-23-16',
 'RecordIdentifierFeatureName': 'review_id',
 'EventTimeFeatureName': 'date',
 'FeatureDefinitions': [{'FeatureName': 'input_ids', 'FeatureType': 'String'},
  {'FeatureName': 'input_mask', 'FeatureType': 'String'},
  {'FeatureName': 'segment_ids', 'FeatureType': 'String'},
  {'FeatureName': 'label_id', 'FeatureType': 'Integral'},
  {'FeatureName': 'review_id', 'FeatureType': 'String'},
  {'FeatureName': 'date', 'FeatureType': 'String'},
  {'FeatureName': 'label', 'FeatureType': 'Integral'},
  {'FeatureName': 'review_body', 'FeatureType': 'String'}],
 'CreationTime': datetime.datetime(2020, 12, 27, 1, 23, 16, 397000, tzinfo=tzlocal()),
 'OnlineStoreConfig': {'EnableOnlineStore': True},
 'OfflineStoreConfig': {'S3StorageConfig': {'S3Uri': 's3://sagemaker-us-east-1-231218423789/reviews-feature-store-2020-12-27T01:22:25Z'}

In [28]:
sm.list_feature_groups() # use boto client to list FeatureGroups

{'FeatureGroupSummaries': [{'FeatureGroupName': 'reviews-feature-group-27-01-23-16',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:231218423789:feature-group/reviews-feature-group-27-01-23-16',
   'CreationTime': datetime.datetime(2020, 12, 27, 1, 23, 16, 397000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Creating'},
  {'FeatureGroupName': 'reviews-feature-group-26-02-37-47',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:231218423789:feature-group/reviews-feature-group-26-02-37-47',
   'CreationTime': datetime.datetime(2020, 12, 26, 2, 37, 47, 501000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'reviews-feature-group-25-00-47-38',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:231218423789:feature-group/reviews-feature-group-25-00-47-38',
   'CreationTime': datetime.datetime(2020, 12, 25, 0, 47, 41, 815000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'S

#### PutRecords into FeatureGroup

After the FeatureGroup has been created, we can put data into it by using the PutRecord API. This API can handle high TPS and is designed to be called by different streams. The data from all of these Put requests is buffered and written to S3 in chunks. The files will be written to the offline store within a few minutes of ingestion. For this example, to accelerate the ingestion process, we specify multiple workers to do the job simultaneously. Ingesting the data into the FeatureGroup takes about one minute.

In [29]:
import time

def wait_for_feature_group_creation_complete(feature_group):
    """Block until ``feature_group`` is no longer reported as "Creating".

    Polls ``feature_group.describe()`` every 5 seconds while the
    ``FeatureGroupStatus`` is "Creating".

    Raises:
      RuntimeError: if the final status is anything other than "Created".
    """
    while True:
        status = feature_group.describe().get("FeatureGroupStatus")
        if status != "Creating":
            break
        print("Waiting for Feature Group Creation")
        time.sleep(5)

    if status == "Created":
        print(f"FeatureGroup {feature_group.name} successfully created.")
        return

    raise RuntimeError(f"Failed to create feature group {feature_group.name}")


In [30]:
wait_for_feature_group_creation_complete(feature_group=reviews_feature_group)


Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup reviews-feature-group-27-01-23-16 successfully created.


In [31]:
reviews_feature_group.ingest(
    data_frame=df_records, max_workers=3, wait=True
)

IngestionManagerPandas(feature_group_name='reviews-feature-group-27-01-23-16', sagemaker_session=<sagemaker.session.Session object at 0x7fd5b2a3da20>, data_frame=                                           input_ids  \
0  [101, 1045, 2734, 2019, 1000, 3424, 23350, 100...   
1  [101, 1996, 3291, 2007, 10777, 23663, 2003, 20...   
2  [101, 6659, 1010, 3904, 1997, 2026, 9537, 2499...   

                                          input_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

                                         segment_ids  label_id  review_id  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         4  ABCD12345   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         2  EFGH12345   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...         0   IJKL2345   

                   date  label  \
0  2020-12-27T01:22:25Z      5   
1  2020-12-

In [32]:
record_identifier_value = 'IJKL2345'

featurestore_runtime.get_record(FeatureGroupName=reviews_feature_group_name, RecordIdentifierValueAsString=record_identifier_value)

{'ResponseMetadata': {'RequestId': 'c3bf18e4-4951-4d53-ab1b-2d92f75edad5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c3bf18e4-4951-4d53-ab1b-2d92f75edad5',
   'content-type': 'application/json',
   'content-length': '1184',
   'date': 'Sun, 27 Dec 2020 01:23:27 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'input_ids',
   'ValueAsString': '[101, 6659, 1010, 3904, 1997, 2026, 9537, 2499, 1010, 1998, 1045, 2064, 1005, 1056, 4895, 7076, 9080, 2140, 2009, 1012, 1045, 2228, 2023, 4031, 2003, 15451, 8059, 1998, 18191, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'},
  {'FeatureName': 'input_mask',
   'ValueAsString': '[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'},
  {'FeatureName': 'segment_ids',
   'ValueAsString': '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [33]:
print(reviews_feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.reviews-feature-group-27-01-23-16 (
  input_ids STRING
  input_mask STRING
  segment_ids STRING
  label_id INT
  review_id STRING
  date STRING
  label INT
  review_body STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://sagemaker-us-east-1-231218423789/reviews-feature-store-2020-12-27T01:22:25Z/231218423789/sagemaker/us-east-1/offline-store/reviews-feature-group-27-01-23-16'


In [34]:
print(reviews_feature_group_name)

reviews-feature-group-27-01-23-16


In [35]:
account_id = boto3.client('sts').get_caller_identity()["Account"]

# Key prefix built from the account, region and feature group name.
# NOTE(review): this value is not used by the polling loop below, which lists
# objects under the broader `prefix` instead.
reviews_feature_group_s3_prefix = '/'.join([
    prefix,
    account_id,
    'sagemaker',
    region,
    'offline-store',
    reviews_feature_group_name,
    'data',
])

# Poll S3 once a minute until more than one object is listed under `prefix`.
offline_store_contents = None
while True:
    objects_in_bucket = s3.list_objects(Bucket=bucket, Prefix=prefix)
    listed_objects = objects_in_bucket.get('Contents', [])
    if len(listed_objects) > 1:
        offline_store_contents = listed_objects
        break
    print('Waiting for data in offline store...\n')
    sleep(60)

print('Data available.')

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Data available.


## Build Training Dataset

SageMaker FeatureStore automatically builds the Glue Data Catalog for FeatureGroups (you can optionally turn it on/off while creating the FeatureGroup). In this example, we create a training dataset from the FeatureValues in the reviews FeatureGroup by utilizing the auto-built Catalog: we run an Athena query over the data stored in the offline store in S3.

In [51]:
# Athena query object for the reviews feature group, and the table it reads.
reviews_feature_store_query = reviews_feature_group.athena_query()
reviews_feature_store_table = reviews_feature_store_query.table_name

# Select the review text, the three BERT vectors and the label id.
query_string = f'SELECT review_body, input_ids, input_mask, segment_ids, label_id FROM "{reviews_feature_store_table}" LIMIT 5'
print('Running ' + query_string)

# Start from an empty frame; it is replaced with the query result below.
dataset = pd.DataFrame()

# Run the Athena query, wait for it to finish, then load the result into pandas.
query_output_location = f's3://{bucket}/{prefix}/query_results/'
reviews_feature_store_query.run(query_string=query_string, output_location=query_output_location)
reviews_feature_store_query.wait()
dataset = reviews_feature_store_query.as_dataframe()

dataset

Running SELECT review_body, input_ids, input_mask, segment_ids, label_id FROM "reviews-feature-group-27-01-23-16-1609032196" LIMIT 5


Unnamed: 0,review_body,input_ids,input_mask,segment_ids,label_id
0,The problem with ElephantDrive is that it requ...,"[101, 1996, 3291, 2007, 10777, 23663, 2003, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
1,"Terrible, none of my codes worked, and I can't...","[101, 6659, 1010, 3904, 1997, 2026, 9537, 2499...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,"I needed an ""antivirus"" application and know t...","[101, 1045, 2734, 2019, 1000, 3424, 23350, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4


In [49]:
print(type(dataset))

<class 'pandas.core.frame.DataFrame'>


In [38]:
# # Prepare query results for training.
# query_execution = reviews_query.get_query_execution()
# query_result = 's3://'+bucket+'/'+prefix+'/query_results/'+query_execution['QueryExecution']['QueryExecutionId']+'.csv'
# print(query_result)

# # Select useful columns for training with target column as the first.
# dataset = dataset[["embedding"]]
# dataset

In [39]:
file_name = './data-tfrecord-featurestore/reviews-embeddings.tfrecord'

In [40]:
# `file_name` is read back below with tf.data.TFRecordDataset, so the rows must be
# written as serialized tf.train.Example records; CSV text produced by
# DataFrame.to_csv() is not a valid TFRecord file.
# The vector columns are stored in the feature store as strings such as
# "[101, 1045, ...]", so they are parsed back into lists of ints here.
with tf.io.TFRecordWriter(file_name) as tf_record_writer:
    for row in dataset.itertuples(index=False):
        example = tf.train.Example(features=tf.train.Features(feature={
            'input_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=json.loads(row.input_ids))),
            'input_mask': tf.train.Feature(int64_list=tf.train.Int64List(value=json.loads(row.input_mask))),
            'segment_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=json.loads(row.segment_ids))),
            'label_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(row.label_id)])),
            'review_body': tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(row.review_body).encode('utf-8')])),
        }))
        tf_record_writer.write(example.SerializeToString())

In [41]:
restored_tfrecord_dataset = tf.data.TFRecordDataset(file_name)
restored_tfrecord_dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [42]:
restored_tfrecord_dataset.as_numpy_iterator()

<tensorflow.python.data.ops.dataset_ops._NumpyIterator at 0x7fd511711198>

In [52]:
# list(restored_tfrecord_dataset.as_numpy_iterator())

In [53]:
#!head $file_name

In [None]:
%%javascript
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();