# Description

This notebook runs a SageMaker Processing job that prepares the data for a transformer model, and then launches a SageMaker training job on the processed output.

In [2]:
import sagemaker
import boto3

# SageMaker session and the name of its default bucket (training artifacts are
# later copied out of this bucket).
sess = sagemaker.Session()
sm_bucket = sess.default_bucket()

# Project bucket plus the key prefixes of the raw and the processed dataset.
bucket = 'kaggle-writing-student'
key_dataset = 'dataset'
output_processed_data = 'processed_dataset-transformer'

# S3 URIs used as processing-job input and output.
raw_input_data_s3_uri = 's3://{}/{}/'.format(bucket, key_dataset)
output_data_s3_uri = 's3://{}/{}/'.format(bucket, output_processed_data)

# IAM role passed to the processing and training jobs.
role = sagemaker.get_execution_role()

# Build a single boto3 session and reuse it, so the region lookup and both
# clients are guaranteed to come from the same configuration instead of three
# independently constructed sessions.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name="sagemaker", region_name=region)
s3 = boto_session.client(service_name="s3", region_name=region)

In [6]:
# Display both S3 URIs to confirm they were built as intended.
raw_input_data_s3_uri, output_data_s3_uri

('s3://kaggle-writing-student/dataset/',
 's3://kaggle-writing-student/processed_dataset-transformer/')

In [3]:
from sagemaker.local import LocalSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

In [4]:
# Instance type and instance count handed to the SKLearnProcessor.
processing_instance_type = "ml.m5.large"
processing_instance_count = 1

In [5]:
# Settings for the scikit-learn processor that runs the data-preparation script.
processor_settings = dict(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    # Expose the region to the code running inside the container.
    env={"AWS_DEFAULT_REGION": region},
    max_runtime_in_seconds=1800,  # 1800 s = 30 min
)

processor = SKLearnProcessor(**processor_settings)

In [8]:
# Train/validation split (expressed as fractions, not percent) and the random
# seed; all three are forwarded to the processing script as job arguments.
train_split_percentage = 0.9
validation_split_percentage = 0.1
random_seed = 42

In [None]:
# Raw dataset in S3 -> input directory inside the processing container.
processing_inputs = [
    ProcessingInput(
        source=raw_input_data_s3_uri,
        destination='/opt/ml/processing/input_data/',
    ),
]

# Output directory inside the container -> processed-dataset prefix in S3.
processing_outputs = [
    ProcessingOutput(
        output_name='processed_data',
        source='/opt/ml/processing/processed_data/',
        destination=output_data_s3_uri,
        s3_upload_mode='EndOfJob',
    ),
]

# NOTE(review): the argument names carry no leading "--"; confirm this matches
# how processing_transformer_script.py parses its command line.
script_arguments = [
    "train-split-percentage", str(train_split_percentage),
    "validation-split-percentage", str(validation_split_percentage),
    "random-seed", str(random_seed),
]

# wait=False: start the job and return without blocking on its completion.
processor.run(
    code='processing_transformer_script.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=script_arguments,
    logs=True,
    wait=False,
)

In [12]:
# Take the last job registered on the processor and read its name from the
# job description.
latest_processing_job = processor.jobs[-1]
scikit_processing_job_name = latest_processing_job.describe()["ProcessingJobName"]
print(scikit_processing_job_name)

sagemaker-scikit-learn-2022-03-03-18-19-25-041


In [13]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the processing job in the SageMaker console. target="_blank" (with
# the underscore) is the keyword that opens a new tab; "blank" is just a
# window name.
processing_job_link = (
    '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}">Processing Job</a></b>'
).format(region, scikit_processing_job_name)

display(HTML(processing_job_link))

In [14]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the CloudWatch log streams whose prefix is the processing job name.
# target="_blank" (with the underscore) is the keyword that opens a new tab.
processing_logs_link = (
    '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'
).format(region, scikit_processing_job_name)

display(HTML(processing_logs_link))

In [15]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the processed-dataset prefix in the project bucket, where the
# processing job uploads its output.
# target="_blank" (with the underscore) is the keyword that opens a new tab.
processing_output_link = (
    '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Processing Job Has Completed</b>'
).format(bucket, output_processed_data, region)

display(HTML(processing_output_link))

In [None]:
# Re-attach to the processing job by its name, then print its description.
attach_by_name = sagemaker.processing.ProcessingJob.from_processing_name
running_processor = attach_by_name(
    sagemaker_session=sess,
    processing_job_name=scikit_processing_job_name,
)

processing_job_description = running_processor.describe()
print(processing_job_description)

In [17]:
# Wait for the processing job to finish before moving on to training;
# logs=False keeps the job logs out of the cell output.
running_processor.wait(logs=False)

!

# Train the model

In [23]:
from sagemaker.pytorch import PyTorch
import os

Instance types and pricing reference: https://aws.amazon.com/sagemaker/pricing/

In [26]:
# Instance type and count for the training jobs.
# At least 16 GB of RAM
train_instance_type = "ml.m5.xlarge"
train_instance_count = 1

In [44]:
# Hyperparameters forwarded to the training script.
training_hyperparameters = {'epochs': 1}

# PyTorch estimator running train_transformer_script.py from ./src_transformers
# on the CPU instance configured above.
estimator = PyTorch(
    entry_point='train_transformer_script.py',
    source_dir='./src_transformers',
    role=role,
    framework_version='1.9',
    py_version='py38',
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    hyperparameters=training_hyperparameters,
)

In [45]:
# Training reads from the S3 prefix the processing job wrote its output to.
s3_input_train_data = output_data_s3_uri
s3_input_train_data

's3://kaggle-writing-student/processed_dataset-transformer/'

In [46]:
# not the best way to do this. TODO: specify channels separately
training_channels = {"train": s3_input_train_data}

# wait=False: submit the training job and return without blocking on it.
estimator.fit(inputs=training_channels, wait=False)

In [47]:
# Name of the training job most recently started by this estimator.
latest_training_job = estimator.latest_training_job
training_job_name = latest_training_job.name
print("Training Job Name:  {}".format(training_job_name))

Training Job Name:  pytorch-training-2022-03-03-19-19-58-747


In [48]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the training job in the SageMaker console.
# target="_blank" (with the underscore) is the keyword that opens a new tab.
training_job_link = (
    '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'
).format(region, training_job_name)

display(HTML(training_job_link))

In [49]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the CloudWatch log streams whose prefix is the training job name.
# target="_blank" (with the underscore) is the keyword that opens a new tab.
training_logs_link = (
    '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'
).format(region, training_job_name)

display(HTML(training_logs_link))

In [50]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the training job's output in S3. The artifacts are stored under
# s3://<sm_bucket>/<training_job_name>/ (the same location the model archive is
# copied from later in this notebook), so the link must point at the SageMaker
# default bucket `sm_bucket`, not at the project bucket `bucket`.
# target="_blank" (with the underscore) is the keyword that opens a new tab.
training_output_link = (
    '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'
).format(sm_bucket, training_job_name, region)

display(HTML(training_output_link))

In [None]:
# Display the two values interpolated into the `aws s3 cp` command that follows.
training_job_name, sm_bucket

In [None]:
# Copy the training job's model.tar.gz from the SageMaker default bucket to
# models/bigbird_model_1.tar.gz in the project bucket.
!aws s3 cp s3://$sm_bucket/$training_job_name/output/model.tar.gz s3://$bucket/models/bigbird_model_1.tar.gz

**Can I train with a GPU?**

In [55]:
# Same training setup as the CPU run, but requesting a GPU instance type.
# Note: this rebinds `estimator`, replacing the CPU estimator defined earlier.
gpu_estimator_settings = dict(
    entry_point='train_transformer_script.py',
    source_dir='./src_transformers',
    role=role,
    framework_version='1.9',
    py_version='py38',
    instance_count=train_instance_count,
    instance_type='ml.g4dn.xlarge',
    hyperparameters={'epochs': 1},
)

estimator = PyTorch(**gpu_estimator_settings)

In [56]:
# not the best way to do this. TODO: specify channels separately
gpu_training_channels = {"train": s3_input_train_data}

# NOTE: in the recorded run this call failed with ResourceLimitExceeded because
# the account's limit for 'ml.g4dn.xlarge for training job usage' was 0
# instances; in that case the lines after fit() are never reached.
estimator.fit(inputs=gpu_training_channels, wait=False)

latest_gpu_training_job = estimator.latest_training_job
training_job_name = latest_gpu_training_job.name
print("Training Job Name:  {}".format(training_job_name))

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.g4dn.xlarge for training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.