# Feature Transformation with Scikit-Learn In This Notebook


**Presentation Deep-Dive on BERT:** 
* [Slides - https://speakerdeck.com/antje/visualize-bert-attention](https://speakerdeck.com/antje/visualize-bert-attention)
* [Video - https://youtu.be/4PQyRJd9d_E](https://youtu.be/4PQyRJd9d_E)


![](img/prepare_dataset_bert.png)

In [29]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)
s3 = boto3.Session().client(service_name='s3', region_name=region)

In [2]:
print(role)

arn:aws:iam::806570384721:role/TeamRole


# Convert Raw Text to BERT Features using Hugging Face and TensorFlow

In [63]:
import tensorflow as tf
import collections
import json
import os
import pandas as pd
import csv
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

REVIEW_BODY_COLUMN = 'review_body'
REVIEW_ID_COLUMN = 'review_id'
# DATE_COLUMN = 'date'

LABEL_COLUMN = 'star_rating'
LABEL_VALUES = [1, 2, 3, 4, 5]

label_map = {}
for (i, label) in enumerate(LABEL_VALUES):
    label_map[label] = i

    
class InputFeatures(object):
  """BERT feature vectors."""

  def __init__(self,
               input_ids,
               input_mask,
               segment_ids,
               label_id,
               review_id,
               date):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_id = label_id
    self.review_id = review_id
    self.date = date
    
    
class Input(object):
  """A single training/test input for sequence classification."""

  def __init__(self, text, review_id, date, label=None):
    """Constructs an Input.
    Args:
      text: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
    self.text = text
    self.review_id = review_id
    self.date = date
    self.label = label
    

def convert_input(the_input, max_seq_length):
    # First, we need to preprocess our data so that it matches the data BERT was trained on:
    # 1. Lowercase our text (if we're using a BERT lowercase model)
    # 2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
    # 3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
    # 
    # Fortunately, the Transformers tokenizer does this for us!

    tokens = tokenizer.tokenize(the_input.text)
    print('**tokens**\n{}\n'.format(tokens))

    encode_plus_tokens = tokenizer.encode_plus(the_input.text,
                                               pad_to_max_length=True,
                                               max_length=max_seq_length,
#                                               truncation=True
                                              )

    # The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
    input_ids = encode_plus_tokens['input_ids']
    
    # Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.    
    input_mask = encode_plus_tokens['attention_mask']

    # Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.
    segment_ids = [0] * max_seq_length

    # Label for each training row (`star_rating` 1 through 5)
    label_id = label_map[the_input.label]

    features = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        review_id=the_input.review_id,
        date=the_input.date)

    print('**input_ids**\n{}\n'.format(features.input_ids))
    print('**input_mask**\n{}\n'.format(features.input_mask))
    print('**segment_ids**\n{}\n'.format(features.segment_ids))
    print('**label_id**\n{}\n'.format(features.label_id))
    print('**review_id**\n{}\n'.format(features.review_id))
    print('**date**\n{}\n'.format(features.date))

    return features


# We'll need to transform our data into a format that BERT understands.
# - `text` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
# - `label` is the star_rating label (1, 2, 3, 4, 5) for our training input data
def transform_inputs_to_tfrecord(inputs, max_seq_length):
    records = []
    for (input_idx, the_input) in enumerate(inputs):
        if input_idx % 10000 == 0:
            print('Writing input {} of {}\n'.format(input_idx, len(inputs)))

    features = convert_input(the_input, max_seq_length)
        
    all_features = collections.OrderedDict()
    all_features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_ids))
    all_features['input_mask'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.input_mask))
    all_features['segment_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features.segment_ids))
    all_features['label_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features.label_id]))

    tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
    
    print(tf_record.SerializeToString())
    
    records.append({'review_id': the_input.review_id,
                    'date': the_input.date,
                    'embedding': tf_record.SerializeToString()
                   })

    return records

Three(3) feature vectors are created from each raw review (`review_body`) during the feature engineering phase to prepare for BERT processing:

* **`input_ids`**:  The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
    
* **`input_mask`**:  Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.

* **`segment_ids`**:  Segment ids are always 0 for single-sequence tasks such as text classification.  1 is used for two-sequence tasks such as question/answer and next sentence prediction.

And one(1) label is created from each raw review (`star_rating`)  :

* **`label_id`**:  Label for each training row (`star_rating` 1 through 5)

# Demonstrate the BERT-specific Feature Engineering Step
While we are demonstrating this code with a small amount of data here in the notebook, we will soon scale this to much more data on a powerful SageMaker cluster.

### Note: Event time date feature type provided Integral. Event time type should be either Fractional(Unix timestamp in seconds) or String (ISO-8601 format) type

In [64]:
from datetime import datetime
from time import strftime

#timestamp = datetime.now().replace(microsecond=0).isoformat()
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

2020-12-14T07:45:19Z


In [65]:
import pandas as pd

data = [
        [5, 'ABCD12345', """I needed an antivirus application and know the quality of Norton products.  This was a no brainer for me and I am glad it was so simple to get."""],
        [3, 'EFGH12345', """The problem with ElephantDrive is that it requires the use of Java. Since Java is notorious for security problems I haveit removed from all of my computers. What files I do have stored are photos."""],
        [1, 'IJKL2345', """Terrible, none of my codes worked, and I can't uninstall it.  I think this product IS malware and viruses"""]
       ]

df = pd.DataFrame(data, columns=['star_rating', 'review_id',  'review_body'])

# Use the InputExample class from BERT's run_classifier code to create examples from the data
inputs = df.apply(lambda x: Input(
                                label = x[LABEL_COLUMN],
                                text = x[REVIEW_BODY_COLUMN],
                                review_id = x[REVIEW_ID_COLUMN],
                                date = timestamp
                            ),
                  axis = 1)


In [66]:
print(inputs[0].date)

2020-12-14T07:45:19Z


In [67]:
max_seq_length = 64
records = transform_inputs_to_tfrecord(inputs, max_seq_length)

Writing input 0 of 3

**tokens**
['terrible', ',', 'none', 'of', 'my', 'codes', 'worked', ',', 'and', 'i', 'can', "'", 't', 'un', '##ins', '##tal', '##l', 'it', '.', 'i', 'think', 'this', 'product', 'is', 'mal', '##ware', 'and', 'viruses']

**input_ids**
[101, 6659, 1010, 3904, 1997, 2026, 9537, 2499, 1010, 1998, 1045, 2064, 1005, 1056, 4895, 7076, 9080, 2140, 2009, 1012, 1045, 2228, 2023, 4031, 2003, 15451, 8059, 1998, 18191, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**input_mask**
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**segment_ids**
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

**label_id**
0

**review_id**
IJKL2345

**

The three(3) features vectors and one(1) label are converted into a list of `TFRecord` instances (1 per each row of training data):
* **`tf_records`**:  Binary representation of each row of training data (3 features + 1 label)

These `TFRecord`s are the engineered features that we will use throughout the rest of the pipeline.

In [8]:
print('**records**')

for record in records:
    print(record)

**records**
{'review_id': 'IJKL2345', 'date': '2020-12-14T06:16:34Z', 'embedding': b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

# Add BERT Embeddings to Feature Store

In [9]:
featurestore_runtime = boto3.Session().client(service_name='sagemaker-featurestore-runtime', region_name=region)

In [10]:

# feature_store_session = Session(
#     boto_session=boto_session,
#     sagemaker_client=sm,
#     sagemaker_featurestore_runtime_client=featurestore_runtime
# )

#### Define FeatureGroups

In [11]:
from time import gmtime, strftime, sleep

reviews_feature_group_name = 'reviews-feature-group-' + strftime('%d-%H-%M-%S', gmtime())


In [12]:
from sagemaker.feature_store.feature_group import FeatureGroup

reviews_feature_group = FeatureGroup(name=reviews_feature_group_name, sagemaker_session=sagemaker_session)



In [13]:
# record identifier and event time feature names
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"

In [14]:
df_records = pd.DataFrame.from_dict(records)
df_records

Unnamed: 0,review_id,date,embedding
0,IJKL2345,2020-12-14T06:16:34Z,b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\...


## _Cast needed for Feature Store_

In [15]:
def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == 'object':
            data_frame[label] = data_frame[label].astype("str").astype("string")

In [16]:
# cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(df_records)

In [17]:
# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
reviews_feature_group.load_feature_definitions(data_frame=df_records); # output is suppressed

In [18]:
prefix = 'reviews_feature_store'

In [19]:
reviews_feature_group.create(
    s3_uri=f"s3://{bucket}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:806570384721:feature-group/reviews-feature-group-14-06-16-36',
 'ResponseMetadata': {'RequestId': '84744925-b8e6-4e3e-a141-24f872f27c7f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '84744925-b8e6-4e3e-a141-24f872f27c7f',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '110',
   'date': 'Mon, 14 Dec 2020 06:16:35 GMT'},
  'RetryAttempts': 0}}

In [20]:
reviews_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:806570384721:feature-group/reviews-feature-group-14-06-16-36',
 'FeatureGroupName': 'reviews-feature-group-14-06-16-36',
 'RecordIdentifierFeatureName': 'review_id',
 'EventTimeFeatureName': 'date',
 'FeatureDefinitions': [{'FeatureName': 'review_id', 'FeatureType': 'String'},
  {'FeatureName': 'date', 'FeatureType': 'String'},
  {'FeatureName': 'embedding', 'FeatureType': 'String'}],
 'CreationTime': datetime.datetime(2020, 12, 14, 6, 16, 36, 150000, tzinfo=tzlocal()),
 'OnlineStoreConfig': {'EnableOnlineStore': True},
 'OfflineStoreConfig': {'S3StorageConfig': {'S3Uri': 's3://sagemaker-us-east-1-806570384721/reviews_feature_store'},
  'DisableGlueTableCreation': False},
 'RoleArn': 'arn:aws:iam::806570384721:role/TeamRole',
 'FeatureGroupStatus': 'Creating',
 'ResponseMetadata': {'RequestId': '47be4d64-992a-4e54-bfda-96722e46b873',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '47be4d64-992a-4e54-bfda-96722e46b873',
  

In [21]:
sm.list_feature_groups() # use boto client to list FeatureGroups

{'FeatureGroupSummaries': [{'FeatureGroupName': 'reviews-feature-group-14-06-16-36',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:806570384721:feature-group/reviews-feature-group-14-06-16-36',
   'CreationTime': datetime.datetime(2020, 12, 14, 6, 16, 36, 150000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Creating'},
  {'FeatureGroupName': 'reviews-feature-group-14-06-14-43',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:806570384721:feature-group/reviews-feature-group-14-06-14-43',
   'CreationTime': datetime.datetime(2020, 12, 14, 6, 14, 43, 762000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'reviews-feature-group-14-06-12-19',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:806570384721:feature-group/reviews-feature-group-14-06-12-19',
   'CreationTime': datetime.datetime(2020, 12, 14, 6, 12, 19, 501000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'reviews-feature-group-14-06-03-04',
   'FeatureGro

#### PutRecords into FeatureGroup

After the FeatureGroups have been created, we can put data into the FeatureGroups by using the PutRecord API. This API can handle high TPS and is designed to be called by different streams. The data from all of these Put requests is buffered and written to S3 in chunks. The files will be written to the offline store within a few minutes of ingestion. For this example, to accelerate the ingestion process, we are specifying multiple workers to do the job simultaneously. It will take ~1min to ingest data to the 2 FeatureGroups, respectively.

In [22]:
import time

def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")


In [23]:
wait_for_feature_group_creation_complete(feature_group=reviews_feature_group)


Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup reviews-feature-group-14-06-16-36 successfully created.


In [24]:
reviews_feature_group.ingest(
    data_frame=df_records, max_workers=3, wait=True
)

IngestionManagerPandas(feature_group_name='reviews-feature-group-14-06-16-36', sagemaker_session=<sagemaker.session.Session object at 0x7f5d902f9a20>, data_frame=  review_id                  date  \
0  IJKL2345  2020-12-14T06:16:34Z   

                                           embedding  
0  b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\...  , max_workers=3, _futures={<Future at 0x7f5cf00e4b70 state=finished returned NoneType>: (0, 1), <Future at 0x7f5cf00e9b70 state=finished returned NoneType>: (1, 1), <Future at 0x7f5cf00e4ac8 state=finished returned NoneType>: (1, 1)})

In [26]:
record_identifier_value = 'IJKL2345'

featurestore_runtime.get_record(FeatureGroupName=reviews_feature_group_name, RecordIdentifierValueAsString=record_identifier_value)

{'ResponseMetadata': {'RequestId': 'f564cf20-758c-4b79-879f-b8c74a71d127',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f564cf20-758c-4b79-879f-b8c74a71d127',
   'content-type': 'application/json',
   'content-length': '1427',
   'date': 'Mon, 14 Dec 2020 06:18:20 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'review_id', 'ValueAsString': 'IJKL2345'},
  {'FeatureName': 'date', 'ValueAsString': '2020-12-14T06:16:34Z'},
  {'FeatureName': 'embedding',
   'ValueAsString': "b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\nn\\n\\tinput_ids\\x12a\\x1a_\\n]e\\x834\\xf2\\x07\\xc0\\x1e\\xcd\\x0f\\xea\\x0f\\xc1J\\xc3\\x13\\xf2\\x07\\xce\\x0f\\x

In [27]:
print(reviews_feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.reviews-feature-group-14-06-16-36 (
  review_id STRING
  date STRING
  embedding STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://sagemaker-us-east-1-806570384721/reviews_feature_store/806570384721/sagemaker/us-east-1/offline-store/reviews-feature-group-14-06-16-36'


In [30]:
account_id = boto3.client('sts').get_caller_identity()["Account"]

reviews_feature_group_s3_prefix = prefix + '/' + account_id + '/sagemaker/' + region + '/offline-store/' + reviews_feature_group_name + '/data'

offline_store_contents = None
while (offline_store_contents is None):
    objects_in_bucket = s3.list_objects(Bucket=bucket,Prefix=prefix)
    if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):
        offline_store_contents = objects_in_bucket['Contents']
    else:
        print('Waiting for data in offline store...\n')
        sleep(60)
    
print('Data available.')

Data available.


## Build Training Dataset

SageMaker FeatureStore automatically builds the Glue Data Catalog for FeatureGroups (you can optionally turn it on/off while creating the FeatureGroup). In this example, we want to create one training dataset with FeatureValues from both identity and transaction FeatureGroups. This is done by utilizing the auto-built Catalog. We run an Athena query that joins the data stored in the offline store in S3 from the 2 FeatureGroups. 

In [44]:
reviews_query = reviews_feature_group.athena_query()

reviews_table = reviews_query.table_name
# sagemaker_featurestore.reviews-feature-group-14-06-16-36

query_string = 'SELECT embedding FROM "'+reviews_table+'" LIMIT 1'
print('Running ' + query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
dataset = pd.DataFrame()
reviews_query.run(query_string=query_string, output_location='s3://'+bucket+'/'+prefix+'/query_results/')
reviews_query.wait()
dataset = reviews_query.as_dataframe()

dataset

Running SELECT embedding FROM "reviews-feature-group-14-06-16-36-1607926596" LIMIT 1


Unnamed: 0,embedding
0,b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\...


In [None]:
# # Prepare query results for training.
# query_execution = reviews_query.get_query_execution()
# query_result = 's3://'+bucket+'/'+prefix+'/query_results/'+query_execution['QueryExecution']['QueryExecutionId']+'.csv'
# print(query_result)

# # Select useful columns for training with target column as the first.
# dataset = dataset[["embedding"]]
# dataset

# TODO: Check if to_csv is ok
## Possibly change to sth. else (might be ok for now, as we have only 1 column)

In [48]:
file_name = './data-tfrecord-featurestore/reviews-embeddings.tfrecord'

In [56]:
# Write to csv in S3 without headers and index column.
dataset.to_csv(file_name, header=False, index=False)

# TODO:  Run these next 2 lines to confirm if .tfrecord is still corrupt or not

In [57]:
restored_tfrecord_dataset = tf.data.TFRecordDataset(file_name)
restored_tfrecord_dataset


<TFRecordDatasetV2 shapes: (), types: tf.string>

In [61]:
restored_tfrecord_dataset.as_numpy_iterator()

<tensorflow.python.data.ops.dataset_ops._NumpyIterator at 0x7f5ce9430a58>

In [62]:
list(restored_tfrecord_dataset.as_numpy_iterator())

DataLossError: corrupted record at 0

In [52]:
# from io import TextIOWrapper

# with open(file_name, mode="wb") as fd:
#   dataset.to_csv(TextIOWrapper(fd), header=False, index=False)

In [None]:
#s3.upload_file('reviews-embeddings.csv', bucket, prefix+'/training_input/reviews_embeddings.csv')
#dataset_uri_prefix = 's3://'+bucket+'/'+prefix+'/training_input/';

In [55]:
!head $file_name

b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x

## Train and Deploy the Model

Now it's time to launch a Training job to fit our model.

In [42]:
tf_record_dataset = tf.data.TFRecordDataset(file_name)


In [43]:
print(tf_record_dataset)

<TFRecordDatasetV2 shapes: (), types: tf.string>


In [None]:
training_output_path = 's3://'+bucket+'/'+prefix+'/training_output'
training_image = '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3'

In [None]:
epochs=1
learning_rate=0.00001
epsilon=0.00000001
train_batch_size=128
validation_batch_size=128
test_batch_size=128
train_steps_per_epoch=50
validation_steps=50
test_steps=50
train_instance_count=1
train_instance_type='ml.c5.9xlarge'
train_volume_size=1024
use_xla=True
use_amp=True
freeze_bert_layer=False
enable_sagemaker_debugger=True
enable_checkpointing=False
enable_tensorboard=False
input_mode='File'
run_validation=True
run_test=True
run_sample_predictions=True

In [None]:
from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=role,
                       instance_count=train_instance_count,
                       instance_type=train_instance_type,
                       volume_size=train_volume_size,
#                        use_spot_instances=True,
#                        max_wait=7200, # Seconds to wait for spot instances to become available
                       checkpoint_s3_uri=checkpoint_s3_uri,
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={'epochs': epochs,
                                        'learning_rate': learning_rate,
                                        'epsilon': epsilon,
                                        'train_batch_size': train_batch_size,
                                        'validation_batch_size': validation_batch_size,
                                        'test_batch_size': test_batch_size,                                             
                                        'train_steps_per_epoch': train_steps_per_epoch,
                                        'validation_steps': validation_steps,
                                        'test_steps': test_steps,
                                        'use_xla': use_xla,
                                        'use_amp': use_amp,                                             
                                        'max_seq_length': max_seq_length,
                                        'freeze_bert_layer': freeze_bert_layer,
                                        'enable_sagemaker_debugger': enable_sagemaker_debugger,
                                        'enable_checkpointing': enable_checkpointing,
                                        'enable_tensorboard': enable_tensorboard,                                        
                                        'run_validation': run_validation,
                                        'run_test': run_test,
                                        'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
                       metric_definitions=metrics_definitions,
                       rules=rules,
                       debugger_hook_config=hook_config,                       
#                       max_run=7200, # number of seconds
                      )

In [None]:
from sagemaker.estimator import Estimator

training_model = Estimator(training_image,
                           role, 
                           instance_count=1, 
                           instance_type='ml.m5.2xlarge',
                           volume_size = 5,
                           max_run = 3600,
                           input_mode= 'File',
                           output_path=training_output_path,
                           sagemaker_session=feature_store_session)

In [None]:
import sagemaker.inputs

train_data = sagemaker.inputs.TrainingInput(dataset_uri_prefix, distribution='FullyReplicated', 
                                            content_type='text/csv', s3_data_type='S3Prefix')
data_channels = {'train': train_data}

In [None]:
training_model.fit(inputs=data_channels, logs=True)

In [25]:
%%javascript
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();

<IPython.core.display.Javascript object>