In [None]:
import datetime
import json
from pathlib import Path

import boto3

In [None]:
# Named AWS profile that supplies credentials for every client created below.
AWS_PROFILE_NAME = 'dev2.DevAdministratorAccess'
session = boto3.session.Session(profile_name=AWS_PROFILE_NAME)

In [None]:
# Region comes from the session's configuration; the account ID is whatever
# STS reports for the credentials in use.
region_name = session.region_name
sts_client = session.client('sts')
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity['Account']
# Displayed as the cell output (note: the rendered output contains the account ID).
(region_name, account_id)

In [None]:
# One client per AWS service used in this notebook, all built from the same session.
sm_runtime, s3_client, sage = (
    session.client(service_name)
    for service_name in ('sagemaker-runtime', 's3', 'sagemaker')
)

In [None]:
# Use the first model returned by SageMaker for the batch transform job.
# NOTE(review): this relies on the service's default listing order — confirm
# that the first entry is the model you intend to run.
available_models = sage.list_models()['Models']
if not available_models:
    # Same exception type as the original bare index would raise, but with a
    # message that explains what is actually missing.
    raise IndexError(
        'No SageMaker models found for this account/region; '
        'deploy a model before starting a batch transform job.'
    )
model_name = available_models[0]['ModelName']

In [None]:
# Bucket name is built from the account ID and region looked up earlier in the notebook.
bucket_name = f'pod-transcription-{account_id}-{region_name}'

In [None]:
# Collect every object key under the 'audio/' prefix.
# A single list_objects_v2 call returns at most one page of results, so walk
# all pages; a page without matches has no 'Contents' entry, hence the .get().
audio_pages = s3_client.get_paginator('list_objects_v2').paginate(
    Bucket=bucket_name, Prefix='audio/'
)
all_audio_keys = [
    obj['Key']
    for page in audio_pages
    for obj in page.get('Contents', [])
]

In [None]:
# Keep only the audio keys whose file stem parses as an integer
# (e.g. 'audio/12.m4a'); every other key is skipped.
episode_audio_keys = []
for key in all_audio_keys:
    try:
        int(Path(key).stem)
    except ValueError:
        # Non-numeric stem: not an episode file. Only ValueError is caught so
        # that unrelated failures and KeyboardInterrupt are not silently swallowed.
        continue
    episode_audio_keys.append(key)

In [None]:
# Show which keys were recognised as episode audio files.
print(episode_audio_keys)

In [None]:
# Collect the numeric prefixes of objects already present under
# 'whisper-batch-output/' so those episodes can be skipped later.
existing_nums = []
output_pages = s3_client.get_paginator('list_objects_v2').paginate(
    Bucket=bucket_name, Prefix='whisper-batch-output/'
)
for page in output_pages:
    # A page with no matching objects (e.g. before the first job has produced
    # any output) has no 'Contents' entry.
    for obj in page.get('Contents', []):
        # Path.stem drops only the final suffix, so '<n>.json.out' -> '<n>.json'.
        # presumably outputs are named '<input file>.out' — TODO confirm
        basename = Path(obj['Key']).stem
        pref = basename.split('.')[0]
        if basename.endswith('.json') and pref.isnumeric():
            existing_nums.append(pref)
existing_nums

In [None]:
# Episodes whose number does not yet appear in existing_nums still need processing.
filtered_keys = []
for key in episode_audio_keys:
    episode_num = Path(key).stem.split('.')[0]
    if episode_num in existing_nums:
        continue
    filtered_keys.append(key)
print(sorted(filtered_keys), len(filtered_keys))

In [None]:
# Build the manifest: the first entry holds the shared S3 prefix, and each
# following entry is an input file name appended in the loop below.
manifest = [{"prefix": f"s3://{bucket_name}/whisper-batch-input/"}]

# Write one small JSON request file per episode (first six pending keys only).
for key in filtered_keys[:6]:
    input_file = f'{Path(key).stem}.json'
    input_data_key = f'whisper-batch-input/{input_file}'
    input_location = f's3://{bucket_name}/{input_data_key}'
    # The request body names the bucket and object key of the audio file.
    request_fields = {
        'bucket_name': bucket_name,
        'object_key': key,
    }
    input_data = json.dumps(request_fields)
    manifest.append(input_file)
    s3_client.put_object(Bucket=bucket_name, Key=input_data_key, Body=input_data)
    print(f'Wrote {input_data} to {input_location}')

In [None]:
# Load the shared tag list and convert each entry into the
# {'Key': ..., 'Value': ...} shape passed as Tags= to the AWS calls below.
# assumes the file holds a JSON list of 'key=value' strings — TODO confirm
with open('../common-tags.json') as f:
    tags = json.load(f)

tag_dicts = []
for tag in tags:
    # Split on the first '=' only, so values that themselves contain '='
    # are kept intact; an entry without any '=' still raises ValueError.
    k, v = tag.split('=', 1)
    tag_dicts.append({'Key': k, 'Value': v})
tag_dicts

In [None]:
# Upload the manifest and start a SageMaker batch transform job over it.
# The job name is a second-resolution timestamp and is reused for the
# manifest file name and the output prefix.
job_name = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
manifest_key = f'whisper-batch-input/{job_name}.manifest'
manifest_location = f's3://{bucket_name}/{manifest_key}'
s3_client.put_object(Bucket=bucket_name, Key=manifest_key, Body=json.dumps(manifest))
# Report the upload only after it has succeeded, so the log never claims a
# write that did not happen.
print(f'Wrote {manifest} to {manifest_location}')

output_location = f's3://{bucket_name}/whisper-batch-output/{job_name}'
response = sage.create_transform_job(
    TransformJobName=job_name,
    ModelName=model_name,
    # One request at a time, one record per request.
    MaxConcurrentTransforms=1,
    BatchStrategy='SingleRecord',
    ModelClientConfig={
        # Allow up to an hour per invocation and do not retry failed invocations.
        'InvocationsTimeoutInSeconds': 3600,
        'InvocationsMaxRetries': 0
    },
    TransformInput={
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'ManifestFile',
                'S3Uri': manifest_location
            }
        },
        'ContentType': 'application/json'
    },
    TransformOutput={
        'S3OutputPath': output_location,
        'Accept': 'application/json'
    },
    TransformResources={
        'InstanceType': 'ml.p3.2xlarge',    # cheapest/slow: 'ml.m4.xlarge' (crashed), recommended: 'ml.p3.2xlarge'
        'InstanceCount': 1,
    },
    Tags=tag_dicts
)
response

In [None]:
# Display the S3 prefix that was passed as the transform job's output path.
output_location

In [None]:
# Amazon Transcribe client plus the input/output locations for a sample file.
transcribe = session.client('transcribe')
transcribe_audio_input_key = f's3://{bucket_name}/audio/sample1.mp3'
# Define the output key here rather than relying on a value left over from the
# per-episode loop cell; it uses the same 'transcribe-output/<stem>' convention
# as that loop.
transcribe_output_key = f'transcribe-output/{Path(transcribe_audio_input_key).stem}'
print(f'{transcribe_audio_input_key} -> {transcribe_output_key}')

In [None]:
# Start one Amazon Transcribe job per episode, skipping the first entry of
# episode_audio_keys. Optional request parameters that are not in use
# (LanguageCode, MediaSampleRateHertz, output KMS encryption, custom
# vocabularies and filters, ModelSettings, JobExecutionSettings,
# ContentRedaction, LanguageIdSettings) are left out of the request.
for episode_audio_key in episode_audio_keys[1:]:
    # NOTE(review): this replaces every 'm4a' substring in the key, not just
    # the file extension — confirm that is intended.
    # presumably an .mp3 rendition exists at the resulting key — verify
    mp3_audio_key = episode_audio_key.replace('m4a', 'mp3')
    prefix = Path(mp3_audio_key).stem
    transcribe_output_key = f'transcribe-output/{prefix}'
    # Job name: current timestamp followed by the episode's file stem.
    transcribe_job_name = datetime.datetime.now().strftime(f'%Y%m%d%H%M%S_{prefix}')

    job_request = {
        'TranscriptionJobName': transcribe_job_name,
        'MediaFormat': 'mp3',
        'Media': {
            'MediaFileUri': f's3://{bucket_name}/{mp3_audio_key}',
        },
        'OutputBucketName': bucket_name,
        'OutputKey': transcribe_output_key,
        # Speaker labelling enabled, capped at three speakers.
        'Settings': {
            'ShowSpeakerLabels': True,
            'MaxSpeakerLabels': 3,
        },
        # Single-language identification restricted to these English variants.
        'IdentifyLanguage': True,
        'IdentifyMultipleLanguages': False,
        'LanguageOptions': [
            'en-IE', 'en-GB', 'en-US',
        ],
        # Request WebVTT subtitles as well.
        'Subtitles': {
            'Formats': [
                'vtt',
            ],
        },
        'Tags': tag_dicts,
    }
    transcribe.start_transcription_job(**job_request)