In [1]:
!pip install boto3

Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/2f/08/f1ff665147a5d75b871bbe5ba76916f6490419c52a33e588385c4b69281b/boto3-1.15.18-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 2.7MB/s 
[?25hCollecting botocore<1.19.0,>=1.18.18
[?25l  Downloading https://files.pythonhosted.org/packages/2d/72/984ac8f33b5c8df5ff63f323a8724f65b4d0f8956968b942b77d35d3a1ef/botocore-1.18.18-py2.py3-none-any.whl (6.7MB)
[K     |████████████████████████████████| 6.7MB 6.8MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading https://files.pythonhosted.org/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl
Collecting s3transfer<0.4.0,>=0.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/69/79/e6afb3d8b0b4e96cefbdc690f741d7dd24547ff1f94240c997a26fa908d3/s3transfer-0.3.3-py2.py3-none-any.whl (69kB)
[K     |████████████████████████████████| 71kB 6.6MB/s 
Installing collect

In [2]:
class config:
    """Static settings read by the S3 / Transcribe helpers of this notebook."""

    # Credentials passed to every boto3 client; empty placeholders to fill in.
    AWS_ACCESS_KEY_ID = ''
    AWS_SECRET_ACCESS_KEY = ''

    # Region used for the boto3 clients and when building S3 object URLs.
    region_name = 'sa-east-1'

    # Name prefix used to select which buckets are processed.
    bucket_name = 'amazon-transcribe'

    # Audio file extension to look for; also sent as the Transcribe media format.
    audio_format = 'wav'

    # Directory where the resulting CSV file is written.
    output_path = '/content'

In [22]:
from boto3 import client
from time import sleep
from urllib.request import urlopen
from json import loads
import pandas as pd
from os.path import join
from tqdm import tqdm

def get_transcription_from_job(transcribe, job_name):
    """Download and return the transcript text of a transcription job.

    Args:
        transcribe: Transcribe client exposing ``get_transcription_job``.
        job_name: Name of the transcription job to read.

    Returns:
        The ``transcript`` string of the first entry in the downloaded
        document's ``results.transcripts`` list.

    Raises:
        KeyError: If the job description carries no transcript URI
            (presumably the case for jobs that are not completed - confirm).
    """
    status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    transcript_uri = status['TranscriptionJob']['Transcript']['TranscriptFileUri']

    # Use a context manager so the HTTP response is always closed.
    with urlopen(transcript_uri) as response:
        data = loads(response.read())

    return data['results']['transcripts'][0]['transcript']

def get_bucket_names():
    """Return the names of all buckets whose name starts with ``config.bucket_name``.

    Returns:
        A list of bucket names, in the order S3 lists them.
    """
    s3 = client('s3',
                aws_access_key_id=config.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY,
                region_name=config.region_name)

    response = s3.list_buckets()

    # Iterate over the bucket entries themselves. Enumerating the response
    # dict would only count its top-level keys, not the buckets.
    return [bucket['Name']
            for bucket in response.get('Buckets', [])
            if bucket['Name'].startswith(config.bucket_name)]

def get_audio_files_url(bucket_name=None):
    """Return the URLs of the audio files stored in a bucket.

    Only objects whose key ends with ``'.' + config.audio_format`` are kept.

    Args:
        bucket_name: Bucket to inspect. When omitted, the most recently
            created bucket (highest ``CreationDate``) is used.

    Returns:
        A list of ``https://<bucket>.s3-<region>.amazonaws.com/<key>`` URLs.
        The list is empty when the bucket has no matching objects, or when no
        bucket name was given and the account has no buckets.
    """
    s3 = client('s3',
                aws_access_key_id=config.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY,
                region_name=config.region_name)

    if bucket_name is None:
        buckets = s3.list_buckets().get('Buckets', [])
        if not buckets:
            return []
        # Select by creation date rather than list position, since the
        # listing order is not the creation order.
        newest = max(buckets, key=lambda bucket: bucket['CreationDate'])
        bucket_name = newest['Name']

    url_prefix = 'https://' + bucket_name + '.s3' + '-' + config.region_name + '.amazonaws.com'
    wanted_suffix = '.' + config.audio_format

    URLS = []

    # Paginate so buckets with more objects than one response holds are fully
    # listed. An empty bucket yields pages without a 'Contents' key.
    paginator = s3.get_paginator('list_objects')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            if obj['Key'].endswith(wanted_suffix):
                URLS.append(url_prefix + '/' + obj['Key'])

    return URLS


def transcribe_audio_files(URLS):
    """Transcribe every audio file referenced by ``URLS`` with Amazon Transcribe.

    One transcription job is started per URL, named after the file name up to
    its first dot. If a job with that name already exists, the existing job is
    reused. The function polls each job every two seconds until it reports
    ``COMPLETED`` or ``FAILED``.

    Args:
        URLS: Iterable of URLs of the audio files stored in a bucket.

    Returns:
        A ``(file_names, transcribed_texts)`` tuple of two lists of equal
        length. Entry ``i`` of the second list is the transcript of file ``i``,
        or an empty string when its job failed.
    """
    file_names = []
    transcribed_texts = []

    transcribe = client('transcribe',
                        aws_access_key_id=config.AWS_ACCESS_KEY_ID,
                        aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY,
                        region_name=config.region_name)

    for url in tqdm(URLS):
        file_name = str(url).split('/')[-1]
        # A distinct job name per file.
        job_name = file_name.split('.')[0]

        try:
            transcribe.start_transcription_job(TranscriptionJobName=job_name,
                                               Media={'MediaFileUri': url},
                                               MediaFormat=config.audio_format,
                                               LanguageCode='pt-BR')
        except transcribe.exceptions.ConflictException:
            # The job name is taken. Fall through to the polling loop instead
            # of assuming the existing job has already completed.
            print(f"\tO arquivo '{file_name}' já foi transcrito, indo para o próximo arquivo...")

        while True:
            status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
            job_status = status['TranscriptionJob']['TranscriptionJobStatus']
            if job_status in ('COMPLETED', 'FAILED'):
                break
            sleep(2)

        if job_status == 'COMPLETED':
            transcript_uri = status['TranscriptionJob']['Transcript']['TranscriptFileUri']
            with urlopen(transcript_uri) as response:
                data = loads(response.read())
            text = data['results']['transcripts'][0]['transcript']
        else:
            reason = status['TranscriptionJob'].get('FailureReason', '')
            print(f"\tA transcrição do arquivo '{file_name}' falhou. {reason}")
            text = ''

        # Append both values together so the two lists stay index-aligned even
        # when a job fails.
        file_names.append(file_name)
        transcribed_texts.append(text)

    return file_names, transcribed_texts

def get_completed_job_names():
    """Collect the names of every transcription job whose status is COMPLETED.

    Results are requested 100 at a time, following ``NextToken`` until the
    service stops returning one.

    Returns:
        A list with the ``TranscriptionJobName`` of each completed job.
    """
    transcribe_client = client('transcribe',
                               aws_access_key_id=config.AWS_ACCESS_KEY_ID,
                               aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY,
                               region_name=config.region_name)

    base_request = {'Status': 'COMPLETED', 'MaxResults': 100}
    collected = []

    page = transcribe_client.list_transcription_jobs(**base_request)
    while True:
        collected.extend(summary['TranscriptionJobName']
                         for summary in page['TranscriptionJobSummaries'])

        # A missing token means this was the last page.
        if 'NextToken' not in page.keys():
            return collected

        page = transcribe_client.list_transcription_jobs(NextToken=page['NextToken'],
                                                         **base_request)


def delete_completed_jobs(completed_job_names):
    """Delete the given transcription jobs, one request per job name.

    This is needed when the same job name is going to be reused by a later run
    of the notebook, because transcription job names must be unique.

    Args:
        completed_job_names: Iterable with the names of the jobs to delete.
    """
    transcribe_client = client('transcribe',
                               aws_access_key_id=config.AWS_ACCESS_KEY_ID,
                               aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY,
                               region_name=config.region_name)

    # tqdm wraps the iterable to display progress while the jobs are removed.
    with_progress = tqdm(completed_job_names)
    for finished_job in with_progress:
        transcribe_client.delete_transcription_job(TranscriptionJobName=finished_job)

def make_matadata(file_names, transcribed_texts):
    """Write the ``file name | transcript`` pairs to ``transcribed_text.csv``.

    The file is created inside ``config.output_path``, uses ``|`` as the field
    separator and ``'`` as the quote character, and has no header or index
    column.

    Args:
        file_names: Audio file names, one per transcript.
        transcribed_texts: Transcripts, in the same order as ``file_names``.
            If the two sequences differ in length, the extra items of the
            longer one are ignored.
    """
    # Build the frame in a single step. DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every row.
    rows = list(zip(file_names, transcribed_texts))
    df = pd.DataFrame(rows, columns=['A', 'B'])

    df.to_csv(join(config.output_path, 'transcribed_text.csv'), sep='|', index=False, header=False, quotechar="'")

def run_transcribe():
    """Transcribe the audio files of every matching bucket into one CSV file.

    Buckets are found with ``get_bucket_names``. The audio files of each bucket
    are transcribed, and the results of all buckets are written with a single
    ``make_matadata`` call. Nothing is written when no bucket matches.
    """
    bucket_names = get_bucket_names()

    all_file_names = []
    all_texts = []

    for idx, bucket in enumerate(bucket_names):
        print("\nAcessando bucket {0} -> {1} de {2}".format(bucket, idx+1, len(bucket_names)))

        URLS = get_audio_files_url(bucket)
        file_names, transcribed_texts = transcribe_audio_files(URLS)

        # Pair the names and texts bucket by bucket, so a length mismatch in
        # one bucket cannot shift the rows of the following buckets.
        for file_name, text in zip(file_names, transcribed_texts):
            all_file_names.append(file_name)
            all_texts.append(text)

    # Write once at the end. Writing inside the loop would overwrite the same
    # CSV file for every bucket and keep only the last bucket's rows.
    if bucket_names:
        make_matadata(all_file_names, all_texts)

In [24]:
run_transcribe()


0it [00:00, ?it/s]

[]
[]





In [None]:
from os.path import join

from google.colab import files

# Download the CSV from the directory make_matadata writes it to, rather than
# relying on the current working directory being config.output_path.
files.download(join(config.output_path, 'transcribed_text.csv'))

In [None]:
import logging
from boto3 import client
from botocore.exceptions import ClientError

import os
import sys
import threading

import ntpath

class ProgressPercentage(object):
    """Progress callback that prints how much of a file has been transferred.

    An instance is called with the number of bytes just transferred. It keeps a
    running total and rewrites one status line on stdout with the file name,
    the bytes seen so far, the file size and the percentage.
    """

    def __init__(self, filename):
        """Record the size of ``filename`` and reset the byte counter.

        Raises:
            OSError: If ``os.path.getsize`` cannot read the file.
        """
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        # The lock serializes counter updates and writes across calls.
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        """Add ``bytes_amount`` to the running total and print the progress."""
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            if self._size:
                percentage = (self._seen_so_far / self._size) * 100
            else:
                # A zero-byte file has nothing left to send; report it as done
                # instead of dividing by zero.
                percentage = 100.0
            sys.stdout.write(
                "\r%s  %s / %s  (%.2f%%)" % (
                    self._filename, self._seen_so_far, self._size,
                    percentage))
            sys.stdout.flush()

def upload_file(file_path, bucket, object_name=None):
    """Upload one local file to an S3 bucket, printing the progress.

    Args:
        file_path: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Key to store the file under. Defaults to the base name of
            ``file_path`` (both ``/`` and ``\\`` are treated as separators).

    Returns:
        True when the upload succeeded. False when S3 reported an error, which
        is logged with ``logging.error``.
    """
    # boto3 re-raises a failed transfer as S3UploadFailedError, which is not a
    # ClientError, so both have to be caught for the False return to happen.
    from boto3.exceptions import S3UploadFailedError

    if object_name is None:
        object_name = ntpath.basename(file_path)

    # Upload the file
    s3 = client('s3',
                aws_access_key_id=config.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY,
                region_name=config.region_name)
    try:
        s3.upload_file(file_path,
                       bucket,
                       object_name,
                       Callback=ProgressPercentage(file_path))
    except (ClientError, S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True

def upload_multiple_files(files_path, bucket_name):
    """Upload every regular file found directly inside a directory.

    Args:
        files_path: Path of the directory that holds the audio files.
        bucket_name: Name of the bucket the files are uploaded to.

    Sub-directories and other non-file entries are skipped. Entries are
    processed in sorted order so repeated runs behave the same way.
    """
    for entry in sorted(os.listdir(files_path)):
        full_path = os.path.join(files_path, entry)

        # os.listdir also returns directories, which cannot be uploaded as a
        # single object.
        if not os.path.isfile(full_path):
            continue

        upload_file(full_path, bucket_name)

def create_bucket(bucket_name, region=None):
    """Create a private bucket and block all public access to it.

    Args:
        bucket_name: Name of the bucket to create.
        region: Region code (region_name) to create the bucket in. When
            omitted, no region is passed to the client and no location
            constraint is sent, so S3 creates the bucket in its default
            region, us-east-1.

    Returns:
        True when the bucket was created and its public access block applied.
        False when S3 reported a ``ClientError``, which is logged.
    """
    client_kwargs = {
        'aws_access_key_id': config.AWS_ACCESS_KEY_ID,
        'aws_secret_access_key': config.AWS_SECRET_ACCESS_KEY,
    }
    bucket_kwargs = {'Bucket': bucket_name, 'ACL': 'private'}

    if region is not None:
        client_kwargs['region_name'] = region
        # us-east-1 is the default location and is rejected when sent as an
        # explicit LocationConstraint, so the constraint is only sent for
        # other regions.
        if region != 'us-east-1':
            bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        s3 = client('s3', **client_kwargs)
        s3.create_bucket(**bucket_kwargs)

        s3.put_public_access_block(
            Bucket=bucket_name,
            PublicAccessBlockConfiguration={
                'BlockPublicAcls': True,
                'IgnorePublicAcls': True,
                'BlockPublicPolicy': True,
                'RestrictPublicBuckets': True
            },
        )

    except ClientError as e:
        logging.error(e)
        return False
    return True

# Usage examples

In [None]:
create_bucket('cbtest0', config.region_name)

True

In [None]:
upload_multiple_files('/content/audio', 'cbtest0')

/content/a/015.wav  62044 / 62044.0  (100.00%)