### Whisper Inference on Vertex AI
The following notebook demonstrates running Automatic Speech Recognition via the HuggingFace Transformers library on Vertex AI Prediction.


A selection of amicorpus mixed headset audio files were downloaded and then copied to cloud storage.  These audio files contain mixed audio from various AMI meetings ranging from 10 min to over an hour.

See [here](https://groups.inf.ed.ac.uk/ami/download/) for more details

In [65]:
# Pre-reqs
import sys
!{sys.executable} -m pip install transformers datasets torch==1.13.* torchaudio

Collecting torch==1.13.*
  Downloading torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m530.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchaudio
  Downloading torchaudio-0.13.1-cp37-cp37m-manylinux1_x86_64.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m101.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting nvidia-cudnn-cu11==8.5.0.96
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0

In [5]:
!mkdir CPR

mkdir: cannot create directory ‘CPR’: File exists


### Create Custom Prediction Routine
Next, we will create a custom prediction routine using the HuggingFace ASR Pipeline libraries to take in a GCS path to an audio file and render a transcription either inline or back out to a file in cloud storage.

In [72]:
%%writefile CPR/predictor.py

from google.cloud.aiplatform.prediction.predictor import Predictor
from google.cloud.aiplatform.utils import prediction_utils
from google.cloud import storage

from transformers import pipeline
from datasets import Dataset, Audio, load_dataset

from pathlib import Path
class CprPredictor(Predictor):
    """Custom prediction routine that transcribes audio files stored in GCS with Whisper.

    Each request instance is a dict with an ``input_file`` (a ``gs://bucket/object``
    URI of the audio) and an optional ``output_format``. With ``"file"`` the
    transcription text is uploaded next to the audio as ``<filename>.txt`` and
    the URI of that object is returned; any other value (the default is
    ``"inline"``) returns the pipeline output directly in the response.
    """

    _GCS_SCHEME = 'gs://'
    _LOCAL_DIR = '/tmp'

    def __init__(self):
        # Populated by load(); defined up front so that using the predictor
        # before load() fails with a clear message instead of an AttributeError.
        self._model_name = None
        self._storage_client = None
        self._transcriber = None
        # Local copies created by _download_file_from_gcs that predict() may delete.
        self._downloaded_files = set()

        # Bring in libsndfile1 if it is not there
        import ctypes.util
        import subprocess

        if ctypes.util.find_library('sndfile') is None:
            completed = subprocess.run(
                "apt update && apt install libsndfile1 -y", shell=True, capture_output=True
            )
            if completed.returncode != 0:
                # Best effort: keep going, but do not hide the failure.
                print('WARNING: installing libsndfile1 failed:',
                      completed.stderr.decode(errors='replace').strip())

    def _require_loaded(self) -> None:
        """Raise RuntimeError unless load() has created the storage client and the pipeline."""
        if self._storage_client is None or self._transcriber is None:
            raise RuntimeError("CprPredictor.load() must be called before preprocess() or predict()")

    def _decompose_remote_file(self, remote_file) -> dict:
        """Split a ``gs://bucket/dir/.../name`` URI into its parts.

        Returns:
            dict with ``filename`` (last path segment), ``bucket`` and
            ``file_path`` (the object prefix between bucket and filename;
            an empty string when the object sits at the bucket root).

        Raises:
            ValueError: if ``remote_file`` is not a ``gs://`` URI naming an object.
        """
        if not isinstance(remote_file, str) or not remote_file.startswith(self._GCS_SCHEME):
            raise ValueError(f"Expected a gs://bucket/object URI, got: {remote_file!r}")

        bucket, _, object_name = remote_file[len(self._GCS_SCHEME):].partition('/')
        file_path, _, filename = object_name.rpartition('/')
        if not bucket or not filename:
            raise ValueError(f"Expected a gs://bucket/object URI, got: {remote_file!r}")

        return {
            'filename': filename,
            'bucket': bucket,
            'file_path': file_path
        }

    @staticmethod
    def _blob_name(file_path, filename) -> str:
        """Join prefix and filename into an object name, without a leading '/' for root-level objects."""
        return f"{file_path}/{filename}" if file_path else filename

    def _map_gcs_file_to_local_file(self, local_dir, remote_file) -> str:
        """Return the local path under ``local_dir`` that mirrors the bucket/prefix/filename layout."""
        gcs_file_dict = self._decompose_remote_file(remote_file)
        segments = (
            local_dir,
            gcs_file_dict['bucket'],
            gcs_file_dict['file_path'],
            gcs_file_dict['filename'],
        )
        # Skip the empty prefix of root-level objects so no "//" appears in the path.
        return '/'.join(segment for segment in segments if segment)

    def _get_transcription(self, file_path):
        """Run the ASR pipeline on a local audio file and return its output."""
        return self._transcriber(file_path)

    def _upload_transcription(self, file, contents):
        """Upload ``contents['text']`` as ``<file>.txt`` beside the source audio and return its gs:// URI."""
        gcs_file_dict = self._decompose_remote_file(file)
        output_blob_name = self._blob_name(
            gcs_file_dict['file_path'], f"{gcs_file_dict['filename']}.txt"
        )

        bucket = self._storage_client.bucket(gcs_file_dict['bucket'])
        bucket.blob(output_blob_name).upload_from_string(contents['text'])
        return f"gs://{gcs_file_dict['bucket']}/{output_blob_name}"

    def _download_file_from_gcs(self, remote_file) -> str:
        """Download ``remote_file`` below the local scratch directory and return the local path."""
        gcs_file_dict = self._decompose_remote_file(remote_file)
        local_file = self._map_gcs_file_to_local_file(self._LOCAL_DIR, remote_file)

        # Create local
        local_parent = Path(local_file).parent
        print('Creating local dir', local_parent)
        local_parent.mkdir(parents=True, exist_ok=True)

        bucket = self._storage_client.bucket(gcs_file_dict['bucket'])
        blob = bucket.blob(self._blob_name(gcs_file_dict['file_path'], gcs_file_dict['filename']))
        blob.download_to_filename(local_file)

        # Remember the copy so predict() can remove it once it has been transcribed.
        self._downloaded_files.add(local_file)
        return local_file

    def load(self, artifacts_uri: str) -> None:
        """Create the GCS client and the Whisper ASR pipeline.

        Args:
            artifacts_uri: Not used by this predictor; the model is referenced
                by its name (``openai/whisper-base``) instead.
        """
        import torch

        self._model_name = 'openai/whisper-base'
        self._storage_client = storage.Client()
        self._transcriber = pipeline(
            # Name the task explicitly rather than relying on it being inferred from the model.
            task='automatic-speech-recognition',
            model=self._model_name,
            # Long recordings are processed in 30 s chunks with 5 s of stride on each side.
            chunk_length_s=30,
            stride_length_s=(5, 5),
            # First GPU when CUDA is available, otherwise CPU (-1) instead of failing.
            device=0 if torch.cuda.is_available() else -1,
        )

    def preprocess(self, prediction_input: dict) -> Dataset:
        """Download every instance's audio file and wrap the request in a Dataset.

        Args:
            prediction_input: Request body of the form
                ``{"instances": [{"input_file": "gs://...", "output_format": "inline"|"file"}, ...]}``.
                ``output_format`` defaults to ``"inline"``.

        Returns:
            Dataset with columns ``audio`` (local path cast to ``Audio`` at 16 kHz),
            ``source_file`` (the original gs:// URI) and ``output_format``.

        Raises:
            RuntimeError: if load() has not been called.
            ValueError: if an ``input_file`` is not a gs:// object URI.
        """
        self._require_loaded()

        local_files, source_files, output_formats = [], [], []
        for instance in prediction_input['instances']:
            source_file = instance['input_file']
            local_files.append(self._download_file_from_gcs(source_file))
            source_files.append(source_file)
            output_formats.append(instance.get('output_format', 'inline'))

        instances_ds = {"audio": local_files, "source_file": source_files, "output_format": output_formats}
        print(instances_ds)

        return Dataset.from_dict(instances_ds).cast_column("audio", Audio(sampling_rate=16000))

    def predict(self, audio_dataset):
        """Transcribe every row and return ``{"predictions": [...]}``.

        Each prediction echoes ``input_file`` and ``output_format``; ``output``
        is the pipeline result, or for ``output_format == "file"`` the gs:// URI
        of the uploaded transcription. Local copies that this predictor
        downloaded for the processed rows are deleted afterwards so repeated or
        batch requests do not accumulate audio files on local disk.

        Raises:
            RuntimeError: if load() has not been called.
        """
        self._require_loaded()

        predictions = []
        processed_paths = set()
        try:
            for file in audio_dataset:
                local_path = file['audio']['path']
                processed_paths.add(local_path)

                output = self._get_transcription(local_path)
                if file['output_format'] == 'file':
                    output = self._upload_transcription(file['source_file'], output)

                predictions.append({
                    "input_file": file['source_file'],
                    "output_format": file['output_format'],
                    "output": output,
                })
        finally:
            # Only remove files this predictor downloaded itself, and only after
            # the loop, because two rows may refer to the same local copy.
            for local_path in processed_paths & self._downloaded_files:
                self._downloaded_files.discard(local_path)
                try:
                    Path(local_path).unlink()
                except OSError:
                    pass  # cleanup is best effort; the prediction result is unaffected

        return {"predictions": predictions}

Overwriting CPR/predictor.py


### Test Custom Prediction Routine `predictor` locally
First, we will import our custom prediction routine into the current kernel and verify it properly handles our instance payload and returns the results we expect

Note: when changing the CPR above, restart the kernel to see changes

In [73]:
from CPR.predictor import CprPredictor

predictor = CprPredictor()
# load() creates the storage client and the Whisper pipeline that preprocess()
# and predict() rely on; this predictor does not use the artifacts URI.
predictor.load(artifacts_uri="")

Define a few test instances

In [4]:
# Request payload in the shape the predictor's preprocess() reads: each instance names a
# gs:// audio file and whether the transcription is returned "inline" or written to a "file".
instances = {"instances":[
                {"input_file":"gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav","output_format":"inline"},
                {"input_file":"gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002b/audio/ES2002b.Mix-Headset.wav","output_format":"file"}
            ]
        }

Next, let's call our `preprocess` and `predict` methods directly to test out our model

In [75]:
predictions = predictor.predict(predictor.preprocess(instances))
predictions

Creating local dir /tmp/gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio
Creating local dir /tmp/gcp-ml-sandbox-whisper/audio/amicorpus/ES2002b/audio
{'audio': ['/tmp/gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav', '/tmp/gcp-ml-sandbox-whisper/audio/amicorpus/ES2002b/audio/ES2002b.Mix-Headset.wav'], 'source_file': ['gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav', 'gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002b/audio/ES2002b.Mix-Headset.wav'], 'output_format': ['inline', 'file']}




{'predictions': [{'input_file': 'gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav',
   'output_format': 'inline',
   'output': {'text': " Like, gosh, she's already produced a PowerPoint. I think it's already back. Okay. Right? Well, this is the kickoff meeting for our project. And this is just what we're going to be doing over the next 25 minutes. So first of all, just to kind of make sure that we all know each other. I'm Laura and I'm the project manager. Do you want to introduce yourself again? I'm David and I'm supposed to be an industrial designer. Okay. I'm Andrew and I'm a marketing expert. I'm Greg and I'm a user interface. Great. So we're designing a new remote control. Oh, I have to record he's here actually. David, Andrea and Craig? And you all arrive don't time. Yes, we designed a new remote control. As you can see, it's supposed to be original trendy and user friendly. So that's kind of our brief, so we're, and so there are three different s

For our test instances, we specified to return the response inline for the first input file, and to write the results back to Cloud Storage for the second file.  Let's quickly check the output of the second file, just grabbing the first 1000 characters.

In [76]:
# For the instance requested with output_format "file", `output` is the gs:// URI
# of the uploaded transcription rather than the text itself.
output_file = predictions['predictions'][1]['output']
print(f'Output file path: {output_file}')
!gsutil cat $output_file | head -c 1000

Output file path: gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002b/audio/ES2002b.Mix-Headset.wav.txt
 Oh yes, I forgot about that. Okay. So, right now. Okay. That's cool. Someone turn these on. Sorry? Did someone turn these on for us? Great. Okay, everybody, I'll start the meeting. Okay. We've got half an hour for this one to discuss the functional design Thanks Already to go okay, okay, so hopefully if I was in working away and I've put the minutes of the last meeting in the project folder. So I guess just to recap on what we did last time. I got to know each other a little bit and got familiar with all the equipment and started to discuss a bit about the project, you know, cost wise, how much money we have three new requirements, which is the first one is that The companies decided that teletext is outdated because of how popular the internet is nobody uses teletext very much anymore So we don't really need to consider that in the functionality of the remote control and they've al

### Building the CPR Image

Now that we've tested our CPR predictor routine locally with some sample instances and verified the output, let's build the CPR container, test the container locally, and then make it available in Vertex

First, let's capture our library requirements for installation into our CPR prediction container

In [77]:
%%writefile CPR/requirements.txt
datasets==2.10.0
google-cloud-storage==2.7.0
transformers==4.27.3
torchaudio==0.13.*
librosa==0.10.0
gcsfs==2023.1.0
protobuf==3.20.*

Overwriting CPR/requirements.txt


Double-checking the requirements file

In [78]:
cat CPR/requirements.txt

datasets==2.10.0
google-cloud-storage==2.7.0
transformers==4.27.3
torchaudio==0.13.*
librosa==0.10.0
gcsfs==2023.1.0
protobuf==3.20.*


Next, lets build our CPR predictor image. 

In [79]:
from google.cloud.aiplatform.prediction import LocalModel
from CPR.predictor import CprPredictor

import logging
# INFO level makes the SDK's docker build progress visible in the cell output
logging.basicConfig(level=logging.INFO)

import os

# Artifact Registry coordinates of the serving image, and the base image it is built from
REGION='us-central1'
PROJECT_ID='gcp-ml-sandbox'
REPOSITORY='vertex-custom-containers'
IMAGE='whisper-base-asr-pt-training-gpu'
BASE_IMAGE='pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime'

# Build the serving image around CprPredictor. The current working directory is passed as the
# source directory, and the pinned dependencies come from CPR/requirements.txt.
local_model = LocalModel.build_cpr_model(
    os.getcwd(),
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}",
    predictor=CprPredictor,
    base_image=BASE_IMAGE,
    requirements_path='CPR/requirements.txt',
    no_cache=False
)

INFO:google.cloud.aiplatform.docker_utils.build:Running command: docker build -t us-central1-docker.pkg.dev/gcp-ml-sandbox/vertex-custom-containers/whisper-base-asr-pt-training-gpu --rm -f- /home/jupyter/whisper/notebooks
  self.stdin = io.open(p2cwrite, 'wb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)
INFO:google.cloud.aiplatform.docker_utils.local_util:Sending build context to Docker daemon  7.448MB
INFO:google.cloud.aiplatform.docker_utils.local_util:

INFO:google.cloud.aiplatform.docker_utils.local_util:Step 1/14 : FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime

INFO:google.cloud.aiplatform.docker_utils.local_util: ---> 71eb2d092138

INFO:google.cloud.aiplatform.docker_utils.local_util:Step 2/14 : ENV PYTHONDONTWRITEBYTECODE=1

INFO:google.cloud.aiplatform.docker_utils.local_util: ---> Using cache

INFO:google.cloud.aiplatform.docker_utils.local_util: ---> 2f5dc93edcf2

INFO:google.cloud.aiplatform.docker_utils.local_util:Step 3/14 : EXPOSE 8080

INFO:google.clou

With our local model built, lets see what our `serving_container_specs` look like

In [80]:
serving_specs = local_model.get_serving_container_spec()
serving_specs

image_uri: "us-central1-docker.pkg.dev/gcp-ml-sandbox/vertex-custom-containers/whisper-base-asr-pt-training-gpu"
predict_route: "/predict"
health_route: "/health"

In order to reduce contention with accessing the GPU, let's pin the workers to 1.  See the [build_cpr_model](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.prediction.LocalModel#google_cloud_aiplatform_prediction_LocalModel_build_cpr_model) method for more info on controlling the # of workers in a custom prediction routine

In [81]:
serving_specs = local_model.get_serving_container_spec()

# Add the VERTEX_CPR_MAX_WORKERS environment variable to the container spec and
# store the modified spec back on the local model.
serving_specs.env = [{'name':"VERTEX_CPR_MAX_WORKERS",'value':"1"}]
local_model.serving_container_spec = serving_specs

# Display the spec again to confirm the env entry is present
local_model.get_serving_container_spec()



image_uri: "us-central1-docker.pkg.dev/gcp-ml-sandbox/vertex-custom-containers/whisper-base-asr-pt-training-gpu"
env {
  name: "VERTEX_CPR_MAX_WORKERS"
  value: "1"
}
predict_route: "/predict"
health_route: "/health"

Next, lets test out our model locally by deploying it to a local endpoint

In [83]:
from google.cloud.aiplatform.prediction import LocalModel,LocalEndpoint

# Run the container locally on host port 8081 and print the health check response.
# NOTE(review): gpu_count=-1 presumably requests all available GPUs — confirm against the SDK docs.
with local_model.deploy_to_local_endpoint(
    host_port=8081,
    gpu_count=-1
) as local_endpoint:
    health_check_response = local_endpoint.run_health_check()
    print(health_check_response, health_check_response.content)


INFO:google.cloud.aiplatform.prediction.local_endpoint:Got the project id from the global config: gcp-ml-sandbox.


<Response [200]> b'{}'


In [85]:
# Start our server back up so the local predict request later in the notebook has a running container.
# NOTE(review): nothing in this notebook stops it again — presumably local_endpoint.stop() should be called when finished; confirm.
local_endpoint.serve()



In [9]:
# NOTE(review): this duplicates the `local_model.push_image()` cell further down, and its recorded
# output names a different repository (whisper-base-stt-tf-training-gpu) — looks like a stale cell from an earlier run.
local_model.push_image()

  self.stdin = io.open(p2cwrite, 'wb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)
INFO:google.cloud.aiplatform.docker_utils.local_util:Using default tag: latest

INFO:google.cloud.aiplatform.docker_utils.local_util:The push refers to repository [us-central1-docker.pkg.dev/gcp-ml-sandbox/vertex-custom-containers/whisper-base-stt-tf-training-gpu]

INFO:google.cloud.aiplatform.docker_utils.local_util:505f42865e39: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:daf16329ed27: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:9cfdcbb035f2: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:c15df39cb355: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:e42695c7b436: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:e42695c7b436: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:72f0f663075e: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:0c307ceb7de8: Preparing

INFO:google.cl

In [88]:
import json

# The local endpoint takes the request body as a JSON string
json_instances = json.dumps(instances)

predict_response = local_endpoint.predict(
    request=json_instances,
    headers={"Content-Type": "application/json"}
)
# Surface an HTTP error from the container directly, rather than as a JSON decode failure below
predict_response.raise_for_status()
response = json.loads(predict_response.text)
response

{'predictions': [{'input_file': 'gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav',
   'output_format': 'inline',
   'output': {'text': " Like, gosh, she's already produced a PowerPoint. I think it's already back. Okay. Right? Well, this is the kickoff meeting for our project. And this is just what we're going to be doing over the next 25 minutes. So first of all, just to kind of make sure that we all know each other. I'm Laura and I'm the project manager. Do you want to introduce yourself again? I'm David and I'm supposed to be an industrial designer. Okay. I'm Andrew and I'm a marketing expert. I'm Greg and I'm a user interface. Great. So we're designing a new remote control. Oh, I have to record he's here actually. David, Andrea and Craig? And you all arrive don't time. Yes, we designed a new remote control. As you can see, it's supposed to be original trendy and user friendly. So that's kind of our brief, so we're, and so there are three different s

Great - now that we've tested our local endpoint, let's push the image up to Artifact Registry to prepare for deployment in Vertex

In [90]:
local_model.push_image()

  self.stdin = io.open(p2cwrite, 'wb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)
INFO:google.cloud.aiplatform.docker_utils.local_util:Using default tag: latest

INFO:google.cloud.aiplatform.docker_utils.local_util:The push refers to repository [us-central1-docker.pkg.dev/gcp-ml-sandbox/vertex-custom-containers/whisper-base-asr-pt-training-gpu]

INFO:google.cloud.aiplatform.docker_utils.local_util:64acb45d4f49: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:d39968f49404: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:08183ab97c71: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:1f11823ff78c: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:2f9392706e5a: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:1d1bf9a3cb96: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:aa7652a10f81: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:45bbe3d22998: Preparing

INFO:google.cl

With our CPR image in the cloud repository, let's now register the local model in the Vertex Model Registry, adding version aliases and labels

In [92]:
from google.cloud import aiplatform

#PARENT_MODEL='1335653740073451520'

# Register `local_model` in Vertex as the default version, with two version aliases
# and a label recording the base image it was built from.
model = aiplatform.Model.upload(
    local_model=local_model,
    #parent_model=PARENT_MODEL,
    is_default_version=True,
    version_aliases=['hf-pipeline','pytorch'],
    labels={'base-image':'pytorch-1-13-1-cuda11-6-cudnn8-runtime'},
    display_name='whisper-base-stt-hf-pytorch-gpu'
)

Creating Model


INFO:google.cloud.aiplatform.models:Creating Model


Create Model backing LRO: projects/357746845324/locations/us-central1/models/3068906277913493504/operations/7426688467049906176


INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/357746845324/locations/us-central1/models/3068906277913493504/operations/7426688467049906176


Model created. Resource name: projects/357746845324/locations/us-central1/models/3068906277913493504@1


INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/357746845324/locations/us-central1/models/3068906277913493504@1


To use this Model in another session:


INFO:google.cloud.aiplatform.models:To use this Model in another session:


model = aiplatform.Model('projects/357746845324/locations/us-central1/models/3068906277913493504@1')


INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/357746845324/locations/us-central1/models/3068906277913493504@1')


Now lets create a new Vertex Endpoint and deploy our model

In [97]:
vertex_endpoint = aiplatform.Endpoint.create(
    display_name='whisper-base-stt-endpoint'
)

Creating Endpoint


INFO:google.cloud.aiplatform.models:Creating Endpoint


Create Endpoint backing LRO: projects/357746845324/locations/us-central1/endpoints/2088346415100067840/operations/8957912340355874816


INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/357746845324/locations/us-central1/endpoints/2088346415100067840/operations/8957912340355874816


Endpoint created. Resource name: projects/357746845324/locations/us-central1/endpoints/2088346415100067840


INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/357746845324/locations/us-central1/endpoints/2088346415100067840


To use this Endpoint in another session:


INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:


endpoint = aiplatform.Endpoint('projects/357746845324/locations/us-central1/endpoints/2088346415100067840')


INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/357746845324/locations/us-central1/endpoints/2088346415100067840')


In [None]:
# Deploy the registered model to the endpoint on a single T4 and route all traffic to it.
# Only traffic_percentage is passed: the SDK documents that traffic_split should not be
# provided together with traffic_percentage.
vertex_endpoint.deploy(
    model=model,
    deployed_model_display_name='whisper-base-hf-pipeline',
    # NOTE(review): the recorded list_models() output below shows n1-standard-4 — confirm the intended machine type
    machine_type="n1-standard-8",
    accelerator_count=1,
    accelerator_type='NVIDIA_TESLA_T4',
    traffic_percentage=100,
    sync=True
)



Lets check the status of our endpoint:

In [3]:
from google.cloud import aiplatform

vertex_endpoint = aiplatform.Endpoint('projects/357746845324/locations/us-central1/endpoints/2088346415100067840')

vertex_endpoint.list_models()


[id: "2095864875610800128"
 model: "projects/357746845324/locations/us-central1/models/3068906277913493504"
 display_name: "whisper-base-hf-pipeline"
 create_time {
   seconds: 1679674997
   nanos: 429482000
 }
 dedicated_resources {
   machine_spec {
     machine_type: "n1-standard-4"
     accelerator_type: NVIDIA_TESLA_T4
     accelerator_count: 1
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 model_version_id: "1"]

We can see that our model is deployed now, so lets send it a prediction request for a single file

In [6]:
from google.cloud import aiplatform
instances = [
                {"input_file":"gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav","output_format":"inline"},
            ]

prediction = vertex_endpoint.predict(instances)
prediction


Prediction(predictions=[{'input_file': 'gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav', 'output_format': 'inline', 'output': {'text': " Like, gosh, she's already produced a PowerPoint. I think it's already back. Okay. Right? Well, this is the kickoff meeting for our project. And this is just what we're going to be doing over the next 25 minutes. So first of all, just to kind of make sure that we all know each other. I'm Laura and I'm the project manager. Do you want to introduce yourself again? I'm David and I'm supposed to be an industrial designer. Okay. I'm Andrew and I'm a marketing expert. I'm Greg and I'm a user interface. Great. So we're designing a new remote control. Oh, I have to record he's here actually. David, Andrea and Craig? And you all arrive don't time. Yes, we designed a new remote control. As you can see, it's supposed to be original trendy and user friendly. So that's kind of our brief, so we're, and so there are three different 

Great - we were able to get a realtime transcription of a 20 min .wav file in less than the 60s timeout of the endpoint.  Pretty amazing!

### Batch Prediction
Now lets prepare some data for Batch Prediction where we will transcribe a set of audio files and write the results back out to cloud storage

In [7]:
# Root audio directory:
BUCKET="gcp-ml-sandbox-whisper"
AUDIO_ROOT="audio/amicorpus"
AUDIO_ROOT

'audio/amicorpus'

Let's list out the audio files in cloud storage

In [8]:
AUDIO_GLOB_PATH=f"gs://{BUCKET}/{AUDIO_ROOT}/**/*.wav"
!gsutil ls $AUDIO_GLOB_PATH

gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002b/audio/ES2002b.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002c/audio/ES2002c.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2002d/audio/ES2002d.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003a/audio/ES2003a.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003b/audio/ES2003b.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003c/audio/ES2003c.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003d/audio/ES2003d.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2004a/audio/ES2004a.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2004b/audio/ES2004b.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2004c/audio/ES2004c.Mix-Headset.wav
gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2004d/audio/ES2004d.Mix-Headset.wav
gs://gcp-ml-sand

Now, lets create an output file that contains a listing of our audio files and copy it up to cloud storage for Batch Prediction

In [9]:
audio_files=!gsutil ls  $AUDIO_GLOB_PATH
with open('amicorpus.jsonl', 'w') as w:
    for file in audio_files:
        #w.write(f'\{"input_file":"{file}","output_format":"file"\}\n')
        w.write(f'{{"input_file":"{file}","output_format":"file"}}\n')
    w.close()
    

In [10]:
TARGET_DIR=f"gs://{BUCKET}/{AUDIO_ROOT}"
!gsutil cp amicorpus.jsonl $TARGET_DIR

Copying file://amicorpus.jsonl [Content-Type=application/octet-stream]...
/ [1 files][  5.2 KiB/  5.2 KiB]                                                
Operation completed over 1 objects/5.2 KiB.                                      


With our files in place, let's kick off our batch prediction job in Vertex

In [17]:
from google.cloud import aiplatform

# Grabbing a reference to our model from our earlier execution:
model = aiplatform.Model('projects/357746845324/locations/us-central1/models/3068906277913493504@1')

# Submit the batch job: instances are read from the amicorpus.jsonl listing under TARGET_DIR and
# predictions are written as JSONL under TARGET_DIR/batch-prediction-output.
# sync=False is passed; completion is checked afterwards with bp_job.done().
# NOTE(review): the model resource name and service account are hard-coded to one project — adjust when reusing.
bp_job = aiplatform.BatchPredictionJob.create(
    job_display_name='whisper-base-asr-stt',
    model_name=model,
    instances_format='jsonl',
    predictions_format= 'jsonl',
    gcs_source=[f'{TARGET_DIR}/amicorpus.jsonl'],
    gcs_destination_prefix=f'{TARGET_DIR}/batch-prediction-output',
    machine_type='n1-standard-4',
    accelerator_type='NVIDIA_TESLA_T4',
    accelerator_count=1,
    starting_replica_count= 2,
    max_replica_count=2,
    sync=False,
    batch_size=2,
    service_account='357746845324-compute@developer.gserviceaccount.com'
)


Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/357746845324/locations/us-central1/batchPredictionJobs/8118503176111390720
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/357746845324/locations/us-central1/batchPredictionJobs/8118503176111390720')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/8118503176111390720?project=357746845324
BatchPredictionJob projects/357746845324/locations/us-central1/batchPredictionJobs/8118503176111390720 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/357746845324/locations/us-central1/batchPredictionJobs/8118503176111390720 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/357746845324/locations/us-central1/batchPredictionJobs/8118503176111390720 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/357746845324/locations/us-central1/batchPredictionJobs/

Lets check the state of the batch job:

In [21]:
bp_job.done()

False

BatchPredictionJob projects/357746845324/locations/us-central1/batchPredictionJobs/8118503176111390720 current state:
JobState.JOB_STATE_RUNNING


In [26]:
bp_job.done()

True

We can iterate over the batch job to look at the results which gives us a `google.cloud.storage.blob.Blob` object

In [42]:
# Print the text content of every output object produced by the batch job.
# NOTE(review): `json` and `prediction_output` are not used in this cell — presumably the
# lines were meant to be parsed and collected; confirm or remove.
import json
prediction_output = []
for output_file in bp_job.iter_outputs():
    print(output_file.download_as_text())


{"instance": {"input_file": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003b/audio/ES2003b.Mix-Headset.wav", "output_format": "file"}, "prediction": {"input_file": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003b/audio/ES2003b.Mix-Headset.wav", "output_format": "file", "output": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003b/audio/ES2003b.Mix-Headset.wav.txt"}}
{"instance": {"input_file": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003c/audio/ES2003c.Mix-Headset.wav", "output_format": "file"}, "prediction": {"input_file": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003c/audio/ES2003c.Mix-Headset.wav", "output_format": "file", "output": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003c/audio/ES2003c.Mix-Headset.wav.txt"}}
{"instance": {"input_file": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003d/audio/ES2003d.Mix-Headset.wav", "output_format": "file"}, "prediction": {"input_file": "gs://gcp-ml-sandbox-whisper/audio/amicorpus/ES2003d/audio/ES2003d.Mix-Head

## Ideas for future testing
 * Finding the optimal # of workers for a given accelerator to best maximize GPU memory
 * Testing out with multiple accelerators on an instance (e.g 4x T4s)
 * Using BetterTransformers in the pipeline to improve inference performance
 * Testing different batch sizes in pipeline code
 * Testing different batch sizes in BatchPrediction code
    
    

## Graveyard
-----------------------------------------------------------

### DEBUGGING
Running CPR container manually with `bash` entrypoint on port 8081
```
docker run -it --entrypoint bash --gpus all \
--env VERTEX_CPR_MAX_WORKERS=1 \
--env AIP_HTTP_PORT=8080 --env AIP_HEALTH_ROUTE='/' \
--env AIP_PREDICT_ROUTE='/predict' \
-p 0.0.0.0:8081:8080/tcp us-central1-docker.pkg.dev/gcp-ml-sandbox/vertex-custom-containers/whisper-base-asr-pt-training-gpu
```

Running CPR in container

```
python -m google.cloud.aiplatform.prediction.model_server
```

Installing missing libsndfile library in container

```
subprocess.run(["apt update && apt install libsndfile1 -y"], shell=True,capture_output=True)
```


### Experiments/Archive

Testing out HuggingFace Whisper TensorFlow model with XLA compilation for inference speedup.  Note, this model is limited to 30 second audio samples and will clip any audio beyond that duration

In [None]:
import tensorflow as tf
from transformers import AutoProcessor, TFWhisperForConditionalGeneration
from datasets import Dataset, Audio

from google.cloud import storage

import warnings
warnings.filterwarnings('ignore')

def generate_transcriptions(audio_files: list):
    """Transcribe audio files with the XLA-compiled TF Whisper model and upload the text to GCS.

    Args:
        audio_files: Audio inputs that can be cast to ``datasets.Audio``. The
            ``path`` of each decoded item is parsed as a ``gs://bucket/object``
            URI when uploading — NOTE(review): confirm callers pass gs:// paths.

    Returns:
        list: for every input, in order, the location of the uploaded
        transcription (the input path with ``.txt`` appended). Empty input
        returns an empty list without loading the model.
    """
    if not audio_files:
        # Nothing to transcribe: skip loading the model and creating a GCS client.
        return []

    def load_model(model_name="openai/whisper-base"):
        """Return the processor and the model's generate function wrapped in a jit-compiled tf.function."""
        processor = AutoProcessor.from_pretrained(model_name)
        model_instance = TFWhisperForConditionalGeneration.from_pretrained(model_name)
        generator = tf.function(model_instance.generate, jit_compile=True)

        return processor, generator

    def get_input_features(audio_array, sampling_rate, processor):
        """Convert a raw audio array into the model's TF input features."""
        return processor(audio_array, sampling_rate=sampling_rate, return_tensors="tf").input_features

    def get_transcription(input_features, processor, generator):
        """Generate token ids and decode the first sequence to text."""
        generated_ids = generator(input_features)
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    def upload_transcription(file, contents):
        """Upload ``contents`` as ``<file>.txt`` next to ``file`` and return the new path."""
        parent, _, filename = file.rpartition('/')
        output_filename = f"{filename}.txt"

        parts = file.split('/')
        bucket_name = parts[2]
        prefix = '/'.join(parts[3:-1])
        # Objects at the bucket root have no prefix; avoid a leading '/' in the blob name.
        destination_blob_name = f"{prefix}/{output_filename}" if prefix else output_filename

        bucket = storage_client.bucket(bucket_name)
        bucket.blob(destination_blob_name).upload_from_string(contents)
        return f"{parent}/{output_filename}"

    # One client for all uploads instead of a new one per file
    storage_client = storage.Client()

    audio_dataset = Dataset.from_dict({"audio": audio_files}).cast_column("audio", Audio(sampling_rate=16000))
    processor, generator = load_model()
    sampling_rate = audio_dataset.features["audio"].sampling_rate

    transcriptions = []
    for file in audio_dataset:
        input_features = get_input_features(file['audio']['array'], sampling_rate, processor)
        text = get_transcription(input_features, processor, generator)
        transcriptions.append(upload_transcription(file['audio']['path'], text))

    return transcriptions
        


In [101]:
trans = generate_transcriptions(audio_files)
trans

All model checkpoint layers were used when initializing TFWhisperForConditionalGeneration.

All the layers of TFWhisperForConditionalGeneration were initialized from the model checkpoint at openai/whisper-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWhisperForConditionalGeneration for predictions without further training.
2023-02-28 20:26:42.822442: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at scatter_nd_op.cc:216 : INVALID_ARGUMENT: indices[0] = [0, -1] does not index into shape [1,51865]
2023-02-28 20:26:42.822997: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at scatter_nd_op.cc:216 : INVALID_ARGUMENT: indices[0] = [0, -1] does not index into shape [1,51865]
2023-02-28 20:26:42.823676: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at scatter_nd_op.cc:216 : INVALID_ARGUMENT: indices[0] = [0, -1] does not index into shape [1,51865]


{'gs://gcp-ml-sandbox-whisper/audio/librispeech_asr_dummy/1272/128104/1272-128104-0000.flac': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.',
 'gs://gcp-ml-sandbox-whisper/audio/librispeech_asr_dummy/1272/128104/1272-128104-0001.flac': " Nor is Mr. Quilter's manner less interesting than his matter.",
 'gs://gcp-ml-sandbox-whisper/audio/librispeech_asr_dummy/1272/128104/1272-128104-0002.flac': ' He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly is drawn from eating and its results occur most readily to the mind.',
 'gs://gcp-ml-sandbox-whisper/audio/librispeech_asr_dummy/1272/128104/1272-128104-0003.flac': " He has graved doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca.",
 'gs://gcp-ml-sandbox-whisper/audio/librispeech_asr_dummy/1272/128104/1272-128104-0004.flac': " Linnell's pictures are a sort of upgards and a

In [3]:
# NOTE(review): `xla_generate`, `input_features` and `processor` are not defined at top level anywhere
# in the visible notebook — this archived cell presumably relied on state from deleted cells; confirm before re-running.
generated_ids = xla_generate(input_features)

transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
transcription

' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'

###