# Transcribe audio using STT

## Setup

### Installs

In [4]:
# Install Vertex AI LLM SDK, langchain and dependencies
#!pip install google-cloud-aiplatform vertexai --upgrade --quiet


### Imports

In [17]:
from datetime import datetime
import time
import pandas as pd
import numpy as np
import json
import os

from IPython.display import Markdown

import vertexai
from vertexai.language_models import TextGenerationModel, TextEmbeddingModel
from vertexai.preview.language_models import TextGenerationModel as TextGenerationModel_preview
from google.cloud import aiplatform
from google.cloud import storage
from google.cloud import bigquery

print("Vertex AI version: " + str(aiplatform.__version__))

Vertex AI version: 1.34.0


### Env variables and Setup

In [18]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'mg-ce-demos'

In [19]:
REGION = 'us-central1'
LOCATION = 'us'
GCS_BUCKET = PROJECT_ID
BLOB_PATH = 'audio_data/test' #update with your GCS blob where the audio file is

In [21]:
# gcs client
gcs = storage.Client(project = PROJECT_ID)

# vertex ai clients
vertexai.init(project = PROJECT_ID, location = REGION)
aiplatform.init(project = PROJECT_ID, location = REGION)

### Functions

In [22]:
# get audio data from GCS
def get_audio_data(gcs_bucket, blob_path):
    bucket = gcs.bucket(gcs_bucket)

    # Get the list of blobs
    blobs = bucket.list_blobs()

    # Loop through the blobs
    audio_data = []
    for blob in blobs:
        if blob.name.startswith(blob_path):
            if blob.name.endswith('.mp3'):
                #print(blob.name)
                audio_data.append([blob.name, blob.content_type, f'gs://{GCS_BUCKET}/{blob.name}'])
                
    return {
        'data_name': audio_data[-1][0],
        'data_type': audio_data[-1][1],
        'data_uri': audio_data[-1][2]
    }

## Data: Get data from GCS

Can loop through GCS to pick up multiple files.  This iteration is built for one file at a time but easy to update.

Loop through GCS bucket(s) for audio files to transcribe

In [25]:
sample_uri = get_audio_data(GCS_BUCKET, BLOB_PATH)['data_uri']
print(sample_uri)

gs://mg-ce-demos/audio_data/test/Mapping_uncharted_undersea_volcanoes,_and_elephant_seals_dive_deep_to_sleep.mp3


## Transcribe: STT v2 - long

In [83]:
# STT using cloud speech v2
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient as SpeechClient_v2
from google.cloud.speech_v2.types import cloud_speech as cloud_speech_v2

def transcribe_gcs_v2(model, gcs_uri: str) -> str:
    TIMEOUT_DEFAULT = 3600
    if model == "long":
        word_conf = True
    else:
        word_conf = False
    

    # Instantiates a client
    client = SpeechClient_v2(
        client_options=ClientOptions(
            api_endpoint="us-central1-speech.googleapis.com",
        )
    )
    
    config = cloud_speech_v2.RecognitionConfig(
        auto_decoding_config=cloud_speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model=model,
        features=cloud_speech_v2.RecognitionFeatures(
            enable_automatic_punctuation=True, #only if using long
            enable_word_confidence=word_conf
        )
    )
    
    file_metadata = cloud_speech_v2.BatchRecognizeFileMetadata(uri=gcs_uri)

    request = cloud_speech_v2.BatchRecognizeRequest(
        recognizer=f"projects/{PROJECT_ID}/locations/{REGION}/recognizers/long-recognizer",
        config=config,
        files=[file_metadata],
        recognition_output_config=cloud_speech_v2.RecognitionOutputConfig(
            inline_response_config=cloud_speech_v2.InlineOutputConfig(),
        ),
    )

    # Transcribes the audio into text
    operation = client.batch_recognize(request=request)

    print("Waiting for operation to complete...")
    try:
        response = operation.result(timeout=TIMEOUT_DEFAULT)       # The default is 3600 seconds, 1 hour

    except:
        response = operation.result(timeout=1.5 * TIMEOUT_DEFAULT) # 5400 seconds, 1 hour and 30 minutes

    finally:
        response = operation.result(timeout=2 * TIMEOUT_DEFAULT)   # 7200 seconds, 2 hours

    return response


In [84]:
start_time = time.time()

MODEL = "long"
response_v2 = transcribe_gcs_v2(MODEL, sample_uri)

end_time = time.time()
total_time = (end_time - start_time)/60
print(f'{total_time} minutes')


Waiting for operation to complete...
17.02648857831955 minutes


In [85]:
print(f'{response_v2.total_billed_duration.seconds/60} billed minutes')

18.4 billed minutes


Build transcription, by speaker, grabbing words passing the confidence threshold

In [86]:
response_v2

results {
  key: "gs://mg-ce-demos/audio_data/test/Mapping_uncharted_undersea_volcanoes,_and_elephant_seals_dive_deep_to_sleep.mp3"
  value {
    transcript {
      results {
        alternatives {
          transcript: "This podcast is supported by the icon school of medicine at Mount Sinai. The academic arm of the Mount Sinai health system in New York City. And one of America\'s leading research medical schools, what are researchers working on to advance our understanding of the brain and to improve care for such disorders as depression, dementia and drug addiction."
          confidence: 0.982306718826294
          words {
            word: "This"
            confidence: 0.9955734014511108
          }
          words {
            word: "podcast"
            confidence: 0.9902135729789734
          }
          words {
            word: "is"
            confidence: 0.9062939882278442
          }
          words {
            word: "supported"
            confidence: 0.997375249862670

In [87]:
transcript_df = pd.DataFrame(columns=['file', 'order', 'text'])

for result in response_v2.results[sample_uri].transcript.results:
    transcript_df.loc[len(transcript_df.index)] = [sample_uri, order, result.alternatives[0].transcript]
    order += 1

transcript_df.head()
transcript_string = transcript_df['text'].str.cat(sep=' ')

In [88]:
transcript_string

"This podcast is supported by the icon school of medicine at Mount Sinai. The academic arm of the Mount Sinai health system in New York City. And one of America's leading research medical schools, what are researchers working on to advance our understanding of the brain and to improve care for such disorders as depression, dementia and drug addiction.  To find out. We invite you to read a special supplement to Science magazine prepared by icon Mount Sinai in partnership with science.  Just visit our website at www.science.org and search for frontiers of medical research -brainscience. The icon school of medicine at Mount Sinai. Those little slopes these changes in the angle of the water. You can make a pretty, pretty good, guess of the seamount and depth of the cement. So the new study did this worldwide and how many new seamounts were identified or mapped in this research, 19,000 over 19,000. And how many do we know about before? I think it was about 25,000, we know about four a lot o

## Transcribe: STT v2 - chirp

In [94]:
# STT using cloud speech v2
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient as SpeechClient_v2
from google.cloud.speech_v2.types import cloud_speech as cloud_speech_v2

def transcribe_gcs_v2(model, gcs_uri: str) -> str:
    TIMEOUT_DEFAULT = 3600
    if model == "long":
        word_conf = True
    else:
        word_conf = False
    

    # Instantiates a client
    client = SpeechClient_v2(
        client_options=ClientOptions(
            api_endpoint="us-central1-speech.googleapis.com",
        )
    )
    
    config = cloud_speech_v2.RecognitionConfig(
        auto_decoding_config=cloud_speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model=model,
        features=cloud_speech_v2.RecognitionFeatures(
            enable_automatic_punctuation=True, #only if using long
            enable_word_confidence=word_conf
        )
    )
    
    file_metadata = cloud_speech_v2.BatchRecognizeFileMetadata(uri=gcs_uri)

    request = cloud_speech_v2.BatchRecognizeRequest(
        recognizer=f"projects/{PROJECT_ID}/locations/{REGION}/recognizers/long-recognizer",
        config=config,
        files=[file_metadata],
        recognition_output_config=cloud_speech_v2.RecognitionOutputConfig(
            inline_response_config=cloud_speech_v2.InlineOutputConfig(),
        ),
    )

    # Transcribes the audio into text
    operation = client.batch_recognize(request=request)

    print("Waiting for operation to complete...")
    try:
        response = operation.result(timeout=TIMEOUT_DEFAULT)       # The default is 3600 seconds, 1 hour

    except:
        response = operation.result(timeout=1.5 * TIMEOUT_DEFAULT) # 5400 seconds, 1 hour and 30 minutes

    finally:
        response = operation.result(timeout=2 * TIMEOUT_DEFAULT)   # 7200 seconds, 2 hours

    return response


In [95]:
start_time = time.time()

MODEL = "chirp"
response_v2 = transcribe_gcs_v2(MODEL, sample_uri)

end_time = time.time()
total_time = (end_time - start_time)/60
print(f'{total_time} minutes')


Waiting for operation to complete...
1.1238877177238464 minutes


In [96]:
print(f'{response_v2.total_billed_duration.seconds/60} billed minutes')

18.4 billed minutes


Build transcription, by speaker, grabbing words passing the confidence threshold

In [97]:
response_v2

results {
  key: "gs://mg-ce-demos/audio_data/test/Mapping_uncharted_undersea_volcanoes,_and_elephant_seals_dive_deep_to_sleep.mp3"
  value {
    transcript {
      results {
        alternatives {
          transcript: " This podcast is supported by the Icon School of Medicine at Mount Sinai, the academic arm of the Mount Sini health system in New York City and one of America\'s leading research medical schools. What are researcher is working on to advance our understanding of the brain and to improve care for such disorders as depression, dementia and drug addiction? To find out, we invite you to read a special supplement to science magazine prepared by Icon Mountsieni in partnership with science. Just visit our website at."
        }
        language_code: "en-US"
      }
      results {
        alternatives {
          transcript: " www.science.org and search for frontiers ofmedical research-brainscience. the icon school of medicine at Mount Sinai. We find a way. This is the scienc

In [109]:
transcript_df = pd.DataFrame(columns=['file', 'order', 'text'])

for result in response_v2.results[sample_uri].transcript.results:
    transcript_df.loc[len(transcript_df.index)] = [sample_uri, order, result.alternatives[0].transcript]
    order += 1

transcript_df.head()
transcript_string = transcript_df['text'].str.cat(sep=' ')

In [116]:
transcript_string

" This podcast is supported by the Icon School of Medicine at Mount Sinai, the academic arm of the Mount Sini health system in New York City and one of America's leading research medical schools. What are researcher is working on to advance our understanding of the brain and to improve care for such disorders as depression, dementia and drug addiction? To find out, we invite you to read a special supplement to science magazine prepared by Icon Mountsieni in partnership with science. Just visit our website at.  www.science.org and search for frontiers ofmedical research-brainscience. the icon school of medicine at Mount Sinai. We find a way. This is the science podcast for April 21st 2023. I'm Sarah Crespy. First up this week, so many seamounts. Staff newswriter Paul Vouson joins me to discuss a study that nearly doubled the number of these.  submarine volcanoes. Next up, how to mammals that spend 90% of their time in the water, get any sleep? Jessica Kendelbar is here to talk about her

## 32K Model Summarization Test

In [107]:
textgen_model_32k = TextGenerationModel_preview.from_pretrained('text-bison-32k')

In [113]:
if len(transcript_string) > 25000:
    print('full string is too long, chopping to 30K')
    transcript_string = transcript_string[0:30000]
    #print(transcript_string)

full string is too long, chopping to 30K


In [115]:
# 32k model summarization test
# Ask the LLM
prompt = """Summarize the text below.  Write the summary in bullet form with 1-2 sentences per bullet highlighting key points.

Text:
{}""".format(transcript_string)

# Send prompt to LLM
response_32k = textgen_model_32k.predict(
   (prompt),
    max_output_tokens=5000,
    temperature=0.4,
    top_p=0.8,
    top_k=40,
)
display(Markdown(str(response_32k)))

 - Newswriter Paul Vousden discusses a study that nearly doubled the number of known seamounts, submarine volcanoes, from 2,500 to over 19,000. 
 - The new study used radar-equipped satellites to measure the height of the ocean worldwide, detecting changes in slope that indicate the presence of seamounts. 
 - Seamounts are important because they provide habitats for diverse marine life, can affect ocean circulation, and pose hazards to shipping. 
 - Jessica Kendell Bar, a postdoctoral fellow at the Scripps Institution of Oceanography, discusses her research on the sleep patterns of elephant seals. 
 - Elephant seals spend most of their lives in the water and can dive to great depths. 
 - Kendell Bar's study found that elephant seals sleep in short periods of time, both during deep dives and at the surface. 
 - The study also found that elephant seals experience REM sleep, which is unusual for marine mammals. 
 - Eric Nesler and Paul Kenny, addiction researchers at the Icahn School of Medicine at Mount Sinai, discuss the latest research on addiction. 
 - Addiction is a chronic mental health condition that affects millions of people worldwide. 
 - Researchers define addiction as a compulsive behavior that persists despite negative consequences. 
 - Addiction-related deaths have increased dramatically in recent years, with over 100,000 Americans dying each year from drug overdoses. 
 - The increase in deaths is due to factors such as the excessive consumption of opioids and other drugs in the United States. 
 - Advances in the science of addiction have improved our understanding of how drugs of abuse affect the brain and have led to the development of new treatments.

## Transcribe: STT v1

In [64]:
# STT using cloud speech v1
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.speech_v1p1beta1 import SpeechClient
from google.cloud.speech_v1p1beta1.types import cloud_speech

def transcribe_gcs_v1(gcs_uri: str) -> str:

    TIMEOUT_DEFAULT = 3600

    client = SpeechClient()

    audio = cloud_speech.RecognitionAudio(uri=gcs_uri)
    config = cloud_speech.RecognitionConfig(
        encoding=cloud_speech.RecognitionConfig.AudioEncoding.MP3,    # Configure audio file type to MP3
        sample_rate_hertz = 16000,                                     # Sampling rate set to 44.1KHz
        language_code="en-US",                                        # Language: U. S. English
        enable_automatic_punctuation=True,                            # Enabled automatic punctuation while transforming
        enable_word_confidence=True,                                  # Enabled confidence scores for transcriptions
        model="latest_long",                                          # Works for most of the audio files including long files over 10 minutes
        #diarization_config=cloud_speech.SpeakerDiarizationConfig(     # Enabled speaker diarization
        #    enable_speaker_diarization=True
        #)
    )

    operation = client.long_running_recognize(config=config, audio=audio)   # Execute long running recognition job

    print("Waiting for operation to complete...")

    try:
        response = operation.result(timeout=TIMEOUT_DEFAULT)       # The default is 3600 seconds, 1 hour

    except:
        response = operation.result(timeout=1.5 * TIMEOUT_DEFAULT) # 5400 seconds, 1 hour and 30 minutes

    finally:
        response = operation.result(timeout=2 * TIMEOUT_DEFAULT)   # 7200 seconds, 2 hours

    #print("Building transcriptions...")
    #transcript_builder = []
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    #for result in response.results:
    #    alternative = result.alternatives[0]

        # The first alternative is the most likely one for this portion.
    #    transcript_builder.append(f"\n{result.alternatives[0].transcript}")
        # transcript_builder.append(f"\nConfidence: {result.alternatives[0].confidence}")

    #transcript = ''.join(transcript_builder)

    #return transcript_builder
    return response

In [61]:
start_time = time.time()

response_v1 = transcribe_gcs_v1(sample_uri)

end_time = time.time()
total_time = (end_time - start_time)/60
print(f'{total_time} minutes')

Waiting for operation to complete...
32.93471109867096 minutes


In [62]:
response_v1

results {
  alternatives {
    transcript: "cash power"
    confidence: 0.9721954464912415
    words {
      word: "cash"
      confidence: 0.9721954464912415
    }
    words {
      word: "power"
      confidence: 0.9721954464912415
    }
  }
  result_end_time {
    seconds: 11
    nanos: 530000000
  }
  language_code: "en-us"
}
results {
  alternatives {
    transcript: " Potter"
    confidence: 0.8984012603759766
    words {
      word: "Potter"
      confidence: 0.8984012603759766
    }
  }
  result_end_time {
    seconds: 21
    nanos: 740000000
  }
  language_code: "en-us"
}
results {
  alternatives {
  }
  result_end_time {
    seconds: 30
    nanos: 400000000
  }
  language_code: "en-us"
}
results {
  alternatives {
    transcript: " you are here"
    confidence: 0.9665749073028564
    words {
      word: "you"
      confidence: 0.9721954464912415
    }
    words {
      word: "are"
      confidence: 0.9721954464912415
    }
    words {
      word: "here"
      confidence: 0.95

In [63]:
print(f'{response_v1.total_billed_time.seconds/60} billed minutes')

101.3 billed minutes


In [67]:
transcript_builder = []
# Each result is for a consecutive portion of the audio. Iterate through
# them to get the transcripts for the entire audio file.
for result in response_v1.results:
    alternative = result.alternatives[0]
# The first alternative is the most likely one for this portion.
    transcript_builder.append(f"\n{result.alternatives[0].transcript}")
    #transcript_builder.append(f"\nConfidence: {result.alternatives[0].confidence}")

transcript = ''.join(transcript_builder)

In [68]:
transcript

"\ncash power\n Potter\n\n you are here\n sure you\n just\n oh you\n know\n I'm sure\n forces that show\n just\n ah sorry\n\n\n yes\nfashion\n ash good for you\n are sure\n yeah\n oh yeah\n yeah\n you\n you\nhmm\n wow\nif\nyou\n oh\n I wash more\n videos\nso how\n are you\n travel for you\nif\n you\n are\n yeah however\nsolution\n\nwas just\n right here\nFrozen\n and over here\n yes you\n how\n you\n not just\n and not copper fruits\n of water\nover here\n oh yeah\n\n Round Rock\n You\n\nwow\nI\n challenge\n you\n all part of\nif\nyou hammers for you\n first search\nthe second yeah\n sure\n how do you\n here\n for you\n\nhere\n\n here\nforever challenge\n oh sure\n here\n you\nyou you\n of course\n and\n the first\n you watch\npower\n of love for you\n yeah\n how short\n\n ones\nof you\n\nhere you\n\n\n for\n sure\n of subscribe\n for our\n\n her sister\n\n oh okay\n Luxor\n your\n must\nache\nso marvelous the last\n hot\n one\n\n\n\n\n and such an expert And subscribe\n fortunately re

In [55]:
result = response_v1.results[-1]
words_info = result.alternatives[0].words
#result
#words_info

In [56]:
speaker = 1
text = ''
order = 0
transcript = pd.DataFrame(columns=['file', 'order', 'speaker_tag', 'text'])

for word_info in words_info:
    if word_info.confidence >= 0.5:
        if word_info.speaker_tag==speaker:
            text=text+" "+word_info.word
        else:
            transcript.loc[len(transcript.index)] = [sample_uri, order, str('speaker_'+str(speaker)), text] 
            order += 1
            speaker=word_info.speaker_tag
            text=""+word_info.word

transcript.loc[len(transcript.index)] = [sample_uri, order, str('speaker_'+str(speaker)), text]

In [58]:
transcript['text'].replace('', np.nan, inplace=True)
transcript = transcript.dropna() 
' '.join(transcript["text"])

" cash power Potter you are here sure you just oh you know I'm sure Foster fish that show just are you fashion ashburton Church are sure yeah oh yeah yeah you hmm wow if oh I wash more videos so how are you travel for you if you are yeah whatever solution was just right here Frozen and over here if you however not just and not copper fruits of water over here oh yeah wow lunch wow I challenge you all part of if you hammers for you first search the second yeah sure how do you here for you however forever challenge oh sure here you you you of course and the first you watch power of love for you yeah how short ones of you here are for sure of subscribe for our prevention oh okay Luxor your must have happened so marvelous the last hot and just so and such that fortunately recognition sure you Russia that you are Versace oh welcome or financials and yeah wow are you however you her friends and traditions very interesting and and understand I said that has happened and you you know you are y

## UI (WIP)

In [31]:
def upload_file(file):
    file_path = file.name
    file_name = os.path.basename(file_path)
    bucket = gcs.bucket(GCS_BUCKET)
    blob = bucket.blob(f'audio_data/cumulus/{file_name}')
    blob.upload_from_filename(f'{file_path}')
    #print("Uploaded")
    
    gcs_path = f'gs://{GCS_BUCKET}/{blob.name}'
    
    return gcs_path
    
#with gr.Blocks() as demo:
#    file_output = gr.File()
#    upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio"])
#    upload_button.upload(upload_file, upload_button, file_output)

#demo.launch(share=False, debug=False)

In [32]:
# create a function that pulls all the steps together
def article_builder(gcs_uri):
    

IndentationError: expected an indented block (754168372.py, line 3)

In [None]:
get_audio_data('mg-ce-demos')

In [None]:
import gradio as gr

demo = gr.Blocks()

with demo:
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    label = gr.Label()

    b1 = gr.Button("Upload")
    b2 = gr.Button("Get GCS URI")
    b3 = gr.Button("Create Articles")

    b1.click(upload_file, inputs=audio_file, outputs=text)
    b2.click(get_audio_data, inputs=text, outputs=label)

if __name__ == "__main__":
    demo.launch()

In [None]:

with gr.Blocks() as demo:
    gr.Markdown(
    """
    ## Audio to Articles
    """)
    #with gr.Row():
    #    file_output = gr.File()
    #    upload_button = gr.UploadButton("Click to Transcribe and Draft Articles", file_types=["audio"])
    #    upload_button.upload(upload_file, upload_button, file_output)  
        
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Task", placeholder="Cloud Bucket Name")
    
    with gr.Row():
        find_gcs_uri = gr.Button("Select Audio File")
        
    with gr.Row():
        label1 = gr.Textbox(label="GCS URI")
        
    #with gr.Row():
    #    generate = gr.Button("Generate Response")

    #with gr.Row():
    #    label2 = gr.Textbox(label="Prompt")
    #with gr.Row():
    #    label3 = gr.Textbox(label="Response generated by LLM")

    generate.click(get_audio_data, input_text, label1)
    
demo.launch(share=False, debug=False)