In [2]:
%%time
import os
import json

import sagemaker
import boto3
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.utils import unique_name_from_base

# Key prefix under which this project's artifacts are stored in the bucket.
prefix = 'wenet'

# Resolve the SageMaker session, its default bucket and the execution role
# that the rest of the notebook relies on.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = get_execution_role()
output_path = f"s3://{bucket}/{prefix}"

# Echo the SDK versions and the resolved resources so that a saved copy of
# the notebook records the environment it was run in.
print(f"boto3.__version__:{boto3.__version__}")
print(f"sagemaker.__version__:{sagemaker.__version__}")
print(f"bucket:{bucket}")
print(f"role:{role}")

boto3.__version__:1.26.8
sagemaker.__version__:2.116.0
bucket:sagemaker-us-east-1-348052051973
role:arn:aws:iam::348052051973:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole
CPU times: user 1.22 s, sys: 900 ms, total: 2.12 s
Wall time: 1.05 s


In [3]:
%%markdown
Change the /root/wenet to /opt/ml/input in all data.list files (especially for train_960 and dev)
The "Librispeech" in data.list file in Github has the wrong captalization because it's wrong when I upload it. Please change it yourself!

Copy the wenet/examples/librispeech/s0/*.sh and wenet/examples/librispeech/s0/local to wenet/ as requested by Sagemaker
Overwrite the wenet/wenet/bin/train.py with the given one
Change the /root/wenet to /opt/ml/input in all data.list files (especially for train_960 and dev)
The "Librispeech" in data.list file in Github has the wrong captalization because it's wrong when I upload it. Please change it yourself!


In [4]:
from sagemaker.inputs import TrainingInput
# S3 key prefix of the exported dataset, and the full S3 URI built from it.
prefix_dataset = "wenet/export"
loc = f"s3://{bucket}/{prefix_dataset}"

# Input channel definition that is later passed to the estimator's `fit` call.
training = TrainingInput(
    s3_data=loc,
    s3_data_type="S3Prefix",  # alternatives: ManifestFile, AugmentedManifestFile
    input_mode="FastFile",
    distribution="FullyReplicated",  # alternative: ShardedByS3Key
)

In [5]:
# Instance type for the training job; set this to "local" instead to run the
# job in SageMaker local mode.
instance_type = "ml.p3.16xlarge"

# Maximum runtime of the training job in seconds (5 days).
max_run = 5 * 24 * 60 * 60
checkpoint_s3_uri = f"s3://{bucket}/{prefix}/checkpoints"

# Hyperparameters handed to the entry-point script: the pipeline stage range
# to run, the training config, and the container directories to use.
hyperparameters = {
    "datadir": "/opt/ml/input/data/training",
    "stage": "4",
    "stop_stage": "4",
    "train_config": "conf/train_conformer.yaml",
    "model_dir": "/opt/ml/model",
    "checkpoint_dir": "/opt/ml/checkpoints",
    "output_dir": "/opt/ml/output/data",
}

# Resource tags attached to the training job, expressed as key -> value.
job_tags = {"team": "asr", "person": "andrew", "project": "abc"}

est = PyTorch(
    # What to run.
    entry_point="run-librispeech.sh",
    source_dir="./wenet",
    hyperparameters=hyperparameters,
    # Container selection.
    framework_version="1.11.0",
    py_version="py38",
    # Compute resources and limits.
    role=role,
    instance_type=instance_type,
    instance_count=1,
    volume_size=200,
    max_run=max_run,
    # Where checkpoints and final outputs are written.
    checkpoint_s3_uri=checkpoint_s3_uri,
    output_path=f"s3://{bucket}/{prefix}/",
    # Profiler and debugger hooks are switched off for this job.
    disable_profiler=True,
    debugger_hook_config=False,
    # Job naming and bookkeeping.
    base_job_name=prefix,
    tags=[{"Key": key, "Value": value} for key, value in job_tags.items()],
    # Optional warm-pool setting, currently not enabled:
    # keep_alive_period_in_seconds=1800,
)

In [6]:
%%time
# Launch the training job with the "training" input channel.
# `fit` returns None, so its result must not be used as the job name; the name
# is read back from the estimator once the call has completed.
est.fit({"training": training})
job_name = est.latest_training_job.name

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.p3.16xlarge for training job usage' is 1 Instances, with current utilization of 1 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

In [7]:
# Look up where the estimator stores the model archive and report it.
model_data = est.model_data
artifact_report = ("Model artifact saved at:\n", model_data)
print(*artifact_report)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


Model artifact saved at:
 s3://sagemaker-us-east-1-348052051973/wenet/wenet-2022-12-08-05-36-25-104/output/model.tar.gz
