In [2]:
# please ignore warning messages during the installation
!pip install --disable-pip-version-check -q sagemaker==2.135.0
!conda install -q -y pytorch==1.6.0 -c pytorch
!pip install --disable-pip-version-check -q transformers==3.5.1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest-astropy 0.8.0 requires pytest-cov>=2.0, which is not installed.
pytest-astropy 0.8.0 requires pytest-filter-subpackage>=0.1, which is not installed.
docker-compose 1.29.2 requires PyYAML<6,>=3.10, but you have pyyaml 6.0 which is incompatible.[0m[31m
[0mCollecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - pytorch==1.6.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2023.01.10 |

In [3]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import botocore

# Attach a custom user-agent suffix to every request made by the clients below.
config = botocore.config.Config(user_agent_extra='dlai-pds/c2/w2')

# Low-level boto3 clients for the 'sagemaker' and 'sagemaker-runtime' services.
sm = boto3.client('sagemaker', config=config)
sm_runtime = boto3.client('sagemaker-runtime', config=config)

# SageMaker SDK session that reuses the two clients created above.
sess = sagemaker.Session(sagemaker_client=sm, sagemaker_runtime_client=sm_runtime)

# Values reused by the later cells of the notebook.
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

In [4]:
# Show the name of the session's default S3 bucket as a quick sanity check
print(bucket)

sagemaker-ap-southeast-2-288344227581


In [5]:
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Ask the inline backend for the high-resolution 'retina' figure format
%config InlineBackend.figure_format='retina'

### 1. Configure dataset, hyper-parameters and evaluation metrics

#### 1.1. Configure dataset
You have already transformed and balanced the data into a format that the model expects. Let's copy this data to S3. You will use the training and validation datasets to train the model; the test dataset will be used for tuning later. Set up the paths:

In [18]:
# determine s3 bucket
processed_train_data_s3_uri = 's3://{}/data/sentiment-train/'.format(bucket)
processed_validation_data_s3_uri = 's3://{}/data/sentiment-validation/'.format(bucket)

# copy balanced data from local to s3
!aws s3 cp --recursive ./balanced/sentiment-train $processed_train_data_s3_uri
!aws s3 cp --recursive ./balanced/sentiment-validation $processed_validation_data_s3_uri

# checking existence
!aws s3 ls --recursive $processed_train_data_s3_uri
!aws s3 ls --recursive $processed_validation_data_s3_uri

upload: balanced/sentiment-train/part-algo-1-womens_clothing_ecommerce_reviews.tsv to s3://sagemaker-ap-southeast-2-288344227581/data/sentiment-train/part-algo-1-womens_clothing_ecommerce_reviews.tsv
upload: balanced/sentiment-validation/part-algo-1-womens_clothing_ecommerce_reviews.tsv to s3://sagemaker-ap-southeast-2-288344227581/data/sentiment-validation/part-algo-1-womens_clothing_ecommerce_reviews.tsv
2023-03-30 11:19:52    4900454 data/sentiment-train/part-algo-1-womens_clothing_ecommerce_reviews.tsv
2023-03-30 11:19:53     270656 data/sentiment-validation/part-algo-1-womens_clothing_ecommerce_reviews.tsv


You will need to set up the input data channels, wrapping the S3 locations in a TrainingInput object to use with the SageMaker Training Job. The channels can be organized as a dictionary:

In [19]:
# Map each channel name to the S3 prefix that holds its data.
data_channels = dict(
    train=processed_train_data_s3_uri,
    validation=processed_validation_data_s3_uri,
)

### Exercise 1
Create data channel.

Instructions: Pass the S3 input path for training data into the sagemaker.inputs.TrainingInput function.

In [24]:
# Wrap each S3 prefix in a TrainingInput object.
s3_input_train_data = sagemaker.inputs.TrainingInput(s3_data=processed_train_data_s3_uri)
s3_input_validation_data = sagemaker.inputs.TrainingInput(s3_data=processed_validation_data_s3_uri)

# Channel name -> TrainingInput; this replaces the plain-URI dictionary defined earlier.
data_channels = dict(
    train=s3_input_train_data,
    validation=s3_input_validation_data,
)

### 1.2 Configure model hyper-parameters
Set the Training Job parameters including the instance type, instance count, learning rate, batch size etc. For the purposes of this lab, you will use a relatively small instance type. Please refer to this link for additional instance types that may work for your use cases outside of this lab.

In [27]:
# --- Model / training-loop settings ---
max_seq_length = 128       # maximum number of input tokens passed to BERT model
freeze_bert_layer = False  # specifies the depth of training within the network
epochs = 3
learning_rate = 2e-5
seed = 42

# --- Batching ---
train_batch_size = 256
train_steps_per_epoch = 50
validation_batch_size = 256
validation_steps_per_epoch = 50
run_validation = True

# --- Training Job infrastructure ---
train_instance_count = 1
train_instance_type = 'ml.c5.9xlarge'
train_volume_size = 256
input_mode = 'File'


Some of them will be passed into the PyTorch estimator in the hyperparameters argument. Let's set up the dictionary for that:

In [28]:
# Hyperparameters passed to the estimator; each key carries the value of the
# same-named variable defined in the parameters cell.
hyperparameters = dict(
    max_seq_length=max_seq_length,
    freeze_bert_layer=freeze_bert_layer,
    epochs=epochs,
    learning_rate=learning_rate,
    train_batch_size=train_batch_size,
    train_steps_per_epoch=train_steps_per_epoch,
    validation_batch_size=validation_batch_size,
    validation_steps_per_epoch=validation_steps_per_epoch,
    seed=seed,
    run_validation=run_validation,
)

### 1.3. Set up evaluation metrics
Choose loss and accuracy as the evaluation metrics. The regular expressions Regex will capture the values of metrics that the algorithm will emit.

In [29]:
# Each entry names a metric and gives the regular expression whose first
# capture group extracts that metric's value.
metric_definitions = [
    dict(Name='validation:loss', Regex='val_loss: ([0-9.]+)'),
    dict(Name='validation:accuracy', Regex='val_acc: ([0-9.]+)'),
]

### 1.4. Set up Debugger and Profiler
Amazon SageMaker Debugger can be used to profile machine learning models, helping to identify and fix training issues caused by hardware resource usage. Setting some parameters in the SageMaker estimator, without any change to the training code, you can enable the collection of infrastructure and model metrics such as: CPU and GPU, RAM and GPU RAM, data loading time, time spent in ML operators running on CPU and GPU, distributed training metrics and many more. In addition, you can visualize how much time is spent in different phases, such as preprocessing, training loop, and postprocessing. If needed, you can drill down on each training epoch, and even on each function in your training script.

Define Debugger Rules as described here: https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html

In [30]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.debugger import DebuggerHookConfig
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

DebuggerHookConfig provides options to customize how debugging information is emitted and saved. s3_output_path argument value defines the location in Amazon S3 to store the output.

In [31]:
# Debugger output is written to the root of the session's default bucket.
debugger_output_s3_uri = f's3://{bucket}'
debugger_hook_config = DebuggerHookConfig(s3_output_path=debugger_output_s3_uri)

ProfilerConfig sets the configuration for collecting system and framework metrics of SageMaker Training Jobs. Parameter system_monitor_interval_millis sets the time interval to collect system metrics (in milliseconds). Parameter framework_profile_params is the object for framework metrics profiling. Here you will set its local path, the step at which to start profiling, start_step, and the number of steps to profile, num_steps.

In [32]:
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

# Framework-level profiling: start at step 5 and profile 10 steps.
framework_profile = FrameworkProfile(
    local_path="/opt/ml/output/profiler/",
    start_step=5,
    num_steps=10,
)

# System metrics are collected every 500 ms.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

To monitor and profile the Training Job with the built-in rules, you can use ProfilerReport. It creates a profiling report and updates it when the individual rules are triggered. If you trigger the ProfilerReport rule without any customized parameters, as in the cell below, it triggers all of the built-in rules for monitoring and profiling with their default parameter values.

The profiling report can be downloaded while the Training Job is running or after the job has finished.

In [33]:
# A single rule: the built-in ProfilerReport with its default configuration.
rules = [
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

### 2. Train model

### 2.1. Set up the RoBERTa and PyTorch script to run on SageMaker
You will prepare the PyTorch model to run as a SageMaker Training Job in a separate Python file, which will be called during the training.

Here you will be using the pre-trained model roberta-base. The information about the available models can be found in the Hugging Face website.

### Exercise 4
Open the file src/train.py. Go through the comments to understand its content.
Find and review the configure_model() function, which contains the RoBERTa model configuration.
In the following function investigate given mapping label2id of a 0-indexed list of classes used by BERT [0, 1, 2] to the list of the sentiment values [-1, 0, 1]:


```python
    config = RobertaConfig.from_pretrained(
        PRE_TRAINED_MODEL_NAME, 
        num_labels=len(classes),
        id2label={
            ...: ...,
            ...: ...,
            ...: ...,
        },
        label2id={
            -1: 0,
            0: 1,
            1: 2,
        }
    )

``` 

In [None]:
import importlib
import sys

# Make src/ importable. The membership guard keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if 'src/' not in sys.path:
    sys.path.append('src/')

import train

# Reload so that edits made to src/train.py after the first import are picked up.
# (After `import train` the module is always in sys.modules, so no check is needed.)
importlib.reload(train)

config = train.configure_model()

# Labels the model configuration assigns to class ids 0, 1 and 2.
label_0 = config.id2label[0]
label_1 = config.id2label[1]
label_2 = config.id2label[2]

# Flag read by the estimator cell; it stays False if the check below raises.
updated_correctly = False

# Class ids 0, 1, 2 must map to the sentiment values -1, 0, 1 respectively.
if (label_0, label_1, label_2) != (-1, 0, 1):
    print('#######################################################################################')
    print('Please check that the function \'configure_model\' in the file src/train.py is complete.')
    print('########################################################################################')
    raise Exception('Please check that the function \'configure_model\' in the file src/train.py is complete.')

print('##################')
print('Updated correctly!')
print('##################')

updated_correctly = True

In [None]:
from sagemaker.pytorch import PyTorch as PyTorchEstimator

# Build the estimator only when the configure_model() check has passed.
# The `if` has to start at column 0: an indented statement directly after the
# import above raises IndentationError.
if updated_correctly:
    estimator = PyTorchEstimator(
        entry_point='train.py',
        source_dir='src',
        role=role,
        instance_count=train_instance_count,
        instance_type=train_instance_type,
        volume_size=train_volume_size,
        py_version='py3', # dynamically retrieves the correct training image (Python 3)
        framework_version='1.6.0', # dynamically retrieves the correct training image (PyTorch)
        hyperparameters=hyperparameters,
        metric_definitions=metric_definitions,
        input_mode=input_mode,
        debugger_hook_config=debugger_hook_config,
        profiler_config=profiler_config,
        rules=rules
    )

### Exercise 5
Launch the SageMaker Training Job which will be fitting the model to the dataset.

Instructions: Use the estimator.fit function, passing the configured train and validation inputs (data channels).

In [None]:
# data channels inlcude both trianing and validation
estimator.fit(
    inputs=data_channels,
    wait=False
)

You can refer to the last Training Job using the estimator attribute latest_training_job. The Training Job name can then be read from its name attribute:

In [None]:
# Name of the most recently launched Training Job.
training_job_name = estimator.latest_training_job.name
# The format string needs its closing quote; without it the cell is a SyntaxError.
print('Training Job Name: {}'.format(training_job_name))

### Exercise 6
Pull the Training Job status from the Training Job description.

Instructions: Print the keys of the Training Job description dictionary, choose the one related to the primary status of the Training Job and print the value of it.

In [None]:
# List the keys available in the Training Job description dictionary
print(estimator.latest_training_job.describe().keys())

In [None]:
# 'TrainingJobStatus' is the key of the description dictionary that holds the
# primary status of the Training Job. (Indexing with None raises KeyError.)
training_job_status_primary = estimator.latest_training_job.describe()['TrainingJobStatus']
print('Training Job status: {}'.format(training_job_status_primary))

In [5]:
# Check out this cell magic function, it is interesting
%%time

estimator.latest_training_job.wait(logs=False)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


In [None]:
### Review the training metrics
# Load the Training Job's analytics into a DataFrame and display it
df_metrics = estimator.training_job_analytics.dataframe()
df_metrics

In [None]:
# Plot validation accuracy over time. Title and axis labels are set so the
# figure stands on its own; the trailing ';' suppresses the text repr.
ax = df_metrics.query("metric_name=='validation:accuracy'").plot(x='timestamp', y='value')
ax.set(title='Validation accuracy', xlabel='timestamp', ylabel='validation:accuracy');