In [1]:
import urllib.request
import os
import glob
from google.cloud import storage
from google.cloud import speech
from google.protobuf.json_format import MessageToDict
import yaml
import json
import time

In [2]:
with open('properties.yaml') as file:
    properties = yaml.full_load(file)

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = properties['google_application_credentials']

In [3]:
bucket_name = properties['bucket_name']
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

In [4]:
url = "https://ia600702.us.archive.org/22/items/tobacco_dwu03f00/170500121_512kb.mp4"

In [5]:
source_file_name = url.split('/')[-1]
source_file_stem = source_file_name.split('.')[0]

In [6]:
start_time = time.time()    
print("downloading file")
urllib.request.urlretrieve(url, 'video_files/' + source_file_name)
print("run time:", time.time() - start_time)
start_time = time.time()
blob = bucket.blob("video_files/" + source_file_name)
blob.upload_from_filename('video_files/' + source_file_name)

downloading file
run time: 3.4007630348205566


In [7]:
# convert locally to flac format
print("converting to flac")
os.system('ffmpeg -v quiet -i video_files/' + source_file_name  + ' -c:a flac flac_files/' + source_file_stem + '.flac')
print("run time:", time.time() - start_time)
start_time = time.time()    

converting to flac
run time: 7.3047990798950195


In [8]:
print("uploading flac file to cloud")
blob = bucket.blob(source_file_name)
blob = bucket.blob("flac_files/" + source_file_stem + '.flac')

uploading flac file to cloud


In [9]:
# upload
blob.upload_from_filename('flac_files/' + source_file_stem + '.flac')
print("run time:", time.time() - start_time)
start_time = time.time()
# extract transcript

run time: 0.42754626274108887


In [10]:
print("extracting transcript")
client = speech.SpeechClient()

gcs_uri = "gs://" + bucket_name  + "/flac_files/" + source_file_stem + ".flac"
#https://cloud.google.com/speech-to-text/docs/encoding    
#You are not required to specify the encoding and sample rate for WAV or FLAC files. 
#If omitted, Speech-to-Text automatically determines the encoding and sample rate for 
#WAV or FLAC files based on the file header. 
#If you specify an encoding or sample rate value that does not match the value in the 
#file header, then Speech-to-Text returns an error.    
# model='video' is not required, costs more, but might lead to better transcription

audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    #encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    #sample_rate_hertz=16000,
    audio_channel_count=2,
    language_code="en-US",
    use_enhanced=True,
    model='video',
    enable_word_time_offsets=True
)

operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result()
print("run time:", time.time() - start_time)
start_time = time.time()

extracting transcript
run time: 17.821135997772217


In [11]:
result_dict = MessageToDict(response.__class__.pb(response))
print(result_dict)

{'results': [{'alternatives': [{'transcript': "meet the rule breaker the extra long menthol cigarettes that surprised everybody including us in a taste test get beat the leading mental brand and that's unheard of smokers almost always prefer their own brand but Menthol smokers preferred Capri more than two to one over their own Brands the rule breaker Capri Menthol but with a soft fresh", 'confidence': 0.9000328, 'words': [{'startTime': '3.400s', 'endTime': '3.900s', 'word': 'meet'}, {'startTime': '3.900s', 'endTime': '4.100s', 'word': 'the'}, {'startTime': '4.100s', 'endTime': '4.400s', 'word': 'rule'}, {'startTime': '4.400s', 'endTime': '5s', 'word': 'breaker'}, {'startTime': '5.700s', 'endTime': '5.800s', 'word': 'the'}, {'startTime': '5.800s', 'endTime': '6.100s', 'word': 'extra'}, {'startTime': '6.100s', 'endTime': '6.300s', 'word': 'long'}, {'startTime': '6.300s', 'endTime': '6.800s', 'word': 'menthol'}, {'startTime': '6.800s', 'endTime': '7.200s', 'word': 'cigarettes'}, {'startT

In [12]:
bucket.blob("video_files/" + source_file_name).delete()
bucket.blob("flac_files/" + source_file_stem + '.flac').delete()

In [13]:
os.remove("video_files/" + source_file_name)
os.remove("flac_files/" + source_file_stem + '.flac')