# Transcribe Audio and Summarize

## Installs

In [6]:
!pip install --upgrade google-cloud-speech
!pip install vertexai
!pip install google-cloud-aiplatform>=1.25 "shapely<2.0.0"

[0mCollecting google-cloud-speech
  Downloading google_cloud_speech-2.20.0-py2.py3-none-any.whl (273 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.6/273.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[0mInstalling collected packages: google-cloud-speech
Successfully installed google-cloud-speech-2.20.0
[0m

## Setup

In [1]:
# Enable Vertex AI FOr This Project
!gcloud services enable aiplatform.googleapis.com

In [2]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'mg-ce-demos'

In [3]:
REGION = 'us-central1'
LOCATION = 'us'
EXPERIMENT = 'chirp-stt'
SERIES = 'applied-genai'

In [4]:
import os
import io
import json
import base64
import requests
import concurrent.futures
import time

import shapely

import numpy as np

import vertexai
from vertexai.preview.language_models import TextGenerationModel, TextEmbeddingModel, ChatModel
from google.cloud import aiplatform
from google.cloud import storage
from google.cloud import speech
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

In [5]:
vertexai.init(project = PROJECT_ID, location = REGION)
aiplatform.init(project = PROJECT_ID, location = REGION)

In [6]:
stt = speech.SpeechClient()
gcs = storage.Client(project = PROJECT_ID)

## Get URI's from GCS

In [11]:
example_audio_uri = "speech/brooklyn_bridge.flac"
example_audio_uri2 = "speech/sample-podcasts/GCPEpisode328-DatabaseMigrationService-2min sample.flac"

gcs_bucket = 'cloud-samples-data'
bucket = gcs.bucket(gcs_bucket)
blob = bucket.blob(example_audio_uri2)


In [7]:
gcs_bucket = 'cloud-samples-data'
bucket = gcs.bucket(gcs_bucket)
# Get the list of blobs
blobs = bucket.list_blobs()

# Loop through the blobs
pdf_data = []
for blob in blobs:
    if blob.name.startswith('speech/'):
        print(blob.name)
        #pdf_data.append([blob.name, blob.content_type, blob.download_as_bytes()])

speech/
speech/Google_Gnome.wav
speech/VER_video_series/
speech/VER_video_series/Anu1.flac
speech/VER_video_series/Anu1.m4a
speech/VER_video_series/Anu1.wav
speech/VER_video_series/Anu2.wav
speech/VER_video_series/estella.wav
speech/VER_video_series/restaurants.wav
speech/VER_video_series/restaurants2.wav
speech/VER_video_series/restaurants3.wav
speech/VER_video_series/restaurants4.wav
speech/VER_video_series/restaurants5.wav
speech/VER_video_series/restaurants6.wav
speech/VER_video_series/restaurants7.wav
speech/VER_video_series/restaurants8.wav
speech/audio.flac
speech/audio.raw
speech/audio.txt
speech/brooklyn_bridge.flac
speech/brooklyn_bridge.mp3
speech/brooklyn_bridge.raw
speech/brooklyn_bridge.wav
speech/clip.flac
speech/clip.txt
speech/commercial_mono.wav
speech/commercial_stereo.wav
speech/corbeau_renard.flac
speech/en-US.wav
speech/hello.flac
speech/hello.raw
speech/hello.wav
speech/listen/Alice_51_sample_mix.aif
speech/listen/Alice_BL.flac
speech/listen/Alice_BR.flac
speech/

## Setup recognizer and download data

In [38]:
#recognizer = 'chirp-recognizer'
#stt_model = 'chirp'

In [12]:
audio_bytes = blob.download_as_bytes()

In [13]:
type(audio_bytes)

bytes

## Quickstart

In [41]:
def stt_demo(gcs_uri) -> speech.RecognizeResponse:
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    gcs_uri = gcs_uri

    audio = speech.RecognitionAudio(uri=gcs_uri)
    
    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        #sample_rate_hertz=16000,
        audio_channel_count=2,
        language_code="en-US",
        enable_word_confidence=True,
        #enable_word_time_offsets=True,
        model="default",
        #enable_speaker_diarization=True,
        diarization_config=diarization_config,
    )

    # Detects speech in the audio file
    operation = stt.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    #for result in response.results:
    #    print(f"Transcript: {result.alternatives[0].transcript}")
        
    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}")

    return result

In [42]:
stt_demo(gcs_uri="gs://cloud-samples-data/speech/sample-podcasts/GCPEpisode328-DatabaseMigrationService-2min sample.flac")

ValueError: Unknown field for RecognitionConfig: enable_speaker_diarization

## Run transcriber

In [50]:
def transcribe_speech(audio_bytes):
  audio = speech.RecognitionAudio(content=audio_bytes)

  config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    language_code="en-US",
    model="default",
    audio_channel_count=1,
    enable_word_confidence=True,
    enable_word_time_offsets=True,
  )

  # Detects speech in the audio file
  operation = stt.long_running_recognize(config=config, audio=audio)

  print("Waiting for operation to complete...")
  response = operation.result(timeout=90)
    
  return response


In [51]:
%%time

text = transcribe_speech(audio_bytes)

Waiting for operation to complete...
CPU times: user 5.26 ms, sys: 3.15 ms, total: 8.41 ms
Wall time: 1.45 s


In [44]:
for result in text.results:
    print("Transcript: {}".format(result.alternatives[0].transcript))


Transcript: how old is the Brooklyn Bridge


In [37]:
config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.MP3,
    sample_rate_hertz = 16000,
    language_code = "en-US"
)

text = stt.recognize(
    config = config,
    audio = stt.RecognitionAudio(content = response.audio_bytes)
)

AttributeError: MP3