In [327]:
import os
from uuid import uuid4
from google.cloud import speech_v2
from google.api_core import client_options
import json
from google.cloud import storage

# --- Notebook configuration -------------------------------------------------
# Placeholder value; presumably must be replaced with a real Google Cloud
# project ID before the cells below are run.
project_id = "PROJECT ID"

# Ground-truth text used for the WER comparison, given as bucket + object path.
groundtruth_bucket = "blogpost_data"
groundtruth_loc = "audiobooks/groundtruth/the_wonderful_wizard_of_oz.txt"  # Exclude bucket name.

# Cloud Storage URI of the audio to transcribe, and the URI under which the
# transcript output is written.
audio = "gs://blogpost_data/audiobooks/audios/the_wonderful_wizard_of_oz.wav"
output_loc = "gs://blogpost_data/stt_transcript"

In [328]:
# Instantiate a client.
# The options object gets its own name. Previously the result was assigned back
# to `client_options`, which is also the name of the module imported from
# google.api_core; after the first run that name no longer referred to the
# module, so executing this cell a second time failed on
# `client_options.ClientOptions`.
speech_client_options = client_options.ClientOptions(
    api_endpoint="us-central1-speech.googleapis.com"
)

client = speech_v2.SpeechClient(client_options=speech_client_options)

In [329]:
# Configure recognizer.
# The recognizer lives under the project's us-central1 location and is given
# an ID with a random UUID suffix.
recognizer_parent = f"projects/{project_id}/locations/us-central1"
recognizer_spec = speech_v2.types.cloud_speech.Recognizer(
    language_codes=["en-US"],
    model="chirp",
)
request = speech_v2.types.cloud_speech.CreateRecognizerRequest(
    parent=recognizer_parent,
    recognizer_id="recognizer-" + str(uuid4()),
    recognizer=recognizer_spec,
)

# Create recognizer
# result() is called on the returned operation to obtain the recognizer whose
# `name` is used by the batch request further down.
operation = client.create_recognizer(request=request)
recognizer = operation.result()

In [330]:
# Input: metadata pointing at the audio file's Cloud Storage URI.
files = speech_v2.types.cloud_speech.BatchRecognizeFileMetadata(
    uri=audio,
)

# Output: a Cloud Storage destination, wrapped in the recognition output config
# that is later passed to the batch request.
gcs_output_config = speech_v2.types.cloud_speech.GcsOutputConfig(
    uri=output_loc,
)
output_config = speech_v2.types.cloud_speech.RecognitionOutputConfig(
    gcs_output_config=gcs_output_config,
)

In [331]:
# Set the features.
# Word time offsets and automatic punctuation are both switched on.
features = speech_v2.types.cloud_speech.RecognitionFeatures(
    enable_word_time_offsets=True,
    enable_automatic_punctuation=True,
)

# Create the recognition config variable.
# An empty auto_decoding_config is supplied alongside the features.
config = speech_v2.types.cloud_speech.RecognitionConfig(
    auto_decoding_config={},
    features=features,
)

In [None]:
# Create the batch recognize request.
# It ties together the recognizer created above, the recognition config, the
# single input file, and the output destination.
request = speech_v2.types.cloud_speech.BatchRecognizeRequest(
    recognizer=recognizer.name,
    config=config,
    files=[files],
    recognition_output_config=output_config,
)

# Send the API request.
# result() is given a timeout of 10000; the response is printed once available.
operation = client.batch_recognize(request=request)
response = operation.result(timeout=10000)
print(response)

results {
  key: "gs://blogpost_data/audiobooks/audios/the_wonderful_wizard_of_oz.wav"
  value {
    uri: "gs://blogpost_data/stt_transcript/the_wonderful_wizard_of_oz_transcript_649cb4d8-0000-2ff6-ab18-3c286d3d9972.json"
  }
}
total_billed_duration {
  seconds: 408
}



In [354]:
storage_client = storage.Client()

# Locate the transcript from the batch response instead of hardcoding its path.
# The response maps each input audio URI to a result whose `uri` is the written
# transcript file, and that file name carries what appears to be a per-run
# identifier, so a hardcoded path only ever matches one past run.
transcript_uri = response.results[audio].uri
gcs_prefix = "gs://"
if not transcript_uri.startswith(gcs_prefix):
    raise ValueError(f"Unexpected transcript URI: {transcript_uri!r}")

# Split "gs://<bucket>/<object path>" into its bucket and object components.
bucket_name, _, blob_name = transcript_uri[len(gcs_prefix):].partition("/")

# Bucket the output transcript is stored in.
bucket = storage_client.bucket(bucket_name)

# Path from the bucket to the output transcript file.
blob = bucket.blob(blob_name)

with blob.open("r") as f:
    data = json.load(f)



In [355]:
# Parse the transcript

# Every result that has an "alternatives" key contributes the text of its first
# alternative, each piece prefixed with a single space; results without that
# key are skipped.
transcript = "".join(
    " " + entry["alternatives"][0]["transcript"]
    for entry in data["results"]
    if "alternatives" in entry
)

print(transcript)

  Dorothy lived in the midst of the great Kansas prairies with uncle Henry who was a farmer and aunt m who was the farmer's wife. Their house was small, for the lumber to build it had to be carried by wagon many miles. There were four walls, the floor and a roof which made one room, and this room contained a rusty looking cook stove, a cupbard for the dishes, a table, three or four chairs and the beds. Uncle Henry and aunt m had a big bed in one corner and  Dorothy, a little bed in another corner. There was no garret at all, and no celler, except a small hole dug in the ground, called a cyclone celler, where the family could go in case one of those great whirlwinds arose, mighty enough to crush any building in its path. It was reached by a trap door in the middle of the floor, from which a ladder led down into a small dark hole. When Dorothy stood in the doorway and looked around, she could see nothing but  the great gray prairie on every side, not a tree nor a house broke the broad sw

In [356]:
import simple_wer_v2 as wer
from IPython.display import display
from IPython.display import HTML
import re

# Read the ground truth file.
# The object is addressed by the configured bucket name and in-bucket path,
# then read in full as text.
bucket = storage_client.bucket(groundtruth_bucket)
blob = bucket.blob(groundtruth_loc)

with blob.open("r") as groundtruth_file:
    groundtruth = groundtruth_file.read()
    
# Remove punctuations to normalize the transcript and the ground truth
# Ground truth, applied in order: delete punctuation and quote characters,
# turn dashes/hyphens into spaces, then map the curly apostrophe to a straight one.
# NOTE(review): the ground truth loses straight apostrophes and hyphens while
# the transcript keeps them — confirm this asymmetry is intended.
for pattern, replacement in (
    (r"[;.:“”!,?'‘]", ""),
    (r"[—-]", " "),
    (r"[’]", "'"),
):
    groundtruth = re.sub(pattern, replacement, groundtruth)

# Transcript: only sentence punctuation is deleted.
transcript = re.sub(r"[.:;!,?]", "", transcript)


# Call the simple WER function.
analysis = wer.SimpleWER(
    cap_punct=0,
    html_handler=wer.HighlightAlignedHtml,
    preprocess_handler=wer.TxtPreprocess,
)

# Going by the method name, the arguments are presumably (hypothesis, reference):
# the STT transcript first, the ground truth second — confirm against simple_wer_v2.
analysis.AddHypRef(transcript, groundtruth)

# Print the output.
# GetSummaries() yields three values that are interpolated into the report
# header; the per-utterance alignment rows collected on the analysis object
# are concatenated into a table below them.
summary, details, keyphrases = analysis.GetSummaries()

# The wrapper tags are nested as <html><body> ... </body></html>; the previous
# markup opened <body> before <html>, which is not well-formed HTML.
aligned_html = f"""<html><body>
<h1>Chirp WER Results</h1>
<div>{summary}<br>{details}<br><b>{keyphrases}</b></div>
<table>{"".join(analysis.aligned_htmls)}</table>
</body></html>"""

results = HTML(aligned_html)
display(results)
