In [1]:
# note - you will need to be able to run the UNIX ffmpeg utility to run this notebook

In [2]:
# pip install google-cloud-storage
#!pip install google-cloud-language
#!pip install google-cloud-speech

In [34]:
import urllib.request
import os
import glob
from google.cloud import storage
from google.cloud import speech
from google.cloud import language_v1
from google.protobuf.json_format import MessageToDict
import yaml
import json
import time
import pandas as pd

In [37]:
pd.set_option('display.max_colwidth', None)

In [4]:
with open('properties.yaml') as file:
    properties = yaml.full_load(file)

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = properties['google_application_credentials']

In [5]:
bucket_name = properties['bucket_name']
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

In [6]:
url = "https://ia800304.us.archive.org/27/items/tobacco_pcy99d00/87784869_512kb.mp4"

In [7]:
source_file_name = url.split('/')[-1]
source_file_stem = source_file_name.split('.')[0]

In [8]:
start_time = time.time()    
print("downloading file")
urllib.request.urlretrieve(url, 'video_files/' + source_file_name)
print("run time:", time.time() - start_time)

downloading file
run time: 8.883095979690552


In [9]:
#start_time = time.time()
#blob = bucket.blob("video_files/" + source_file_name)
#blob.upload_from_filename('video_files/' + source_file_name)

In [10]:
# convert locally to flac format
print("converting to flac")
os.system('ffmpeg -v quiet -i video_files/' + source_file_name  + ' -c:a flac flac_files/' + source_file_stem + '.flac')
print("run time:", time.time() - start_time)
start_time = time.time()    

converting to flac
run time: 8.967916011810303


File 'flac_files/87784869_512kb.flac' already exists. Overwrite? [y/N] 

In [11]:
print("uploading flac file to cloud")
blob = bucket.blob(source_file_name)
blob = bucket.blob("flac_files/" + source_file_stem + '.flac')

uploading flac file to cloud


In [12]:
# upload
blob.upload_from_filename('flac_files/' + source_file_stem + '.flac')
print("run time:", time.time() - start_time)
start_time = time.time()
# extract transcript

run time: 19.367159128189087


In [13]:
print("extracting transcript")
client = speech.SpeechClient()

gcs_uri = "gs://" + bucket_name  + "/flac_files/" + source_file_stem + ".flac"
#https://cloud.google.com/speech-to-text/docs/encoding    
#You are not required to specify the encoding and sample rate for WAV or FLAC files. 
#If omitted, Speech-to-Text automatically determines the encoding and sample rate for 
#WAV or FLAC files based on the file header. 
#If you specify an encoding or sample rate value that does not match the value in the 
#file header, then Speech-to-Text returns an error.    
# model='video' is not required, costs more, but might lead to better transcription

audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    #encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    #sample_rate_hertz=16000,
    audio_channel_count=2,
    language_code="en-US",
    use_enhanced=True,
    model='video',
    enable_word_time_offsets=True
)

operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result()
print("run time:", time.time() - start_time)
start_time = time.time()

extracting transcript
run time: 39.78413510322571


In [None]:
result_dict = MessageToDict(response.__class__.pb(response))

In [39]:
result_dict

{'results': [{'alternatives': [{'transcript': 'gentlemen gentlemen tobacco industry has a very serious multi billion dollar problem we need more cigarette smokers pure and simple everyday 2,000 Americans stopped smoking another 1100 also quit actually technically they die that means that this business needs 3000 fresh new volunteers every day so forget about cancer heart disease emphysema',
     'confidence': 0.8005896,
     'words': [{'startTime': '9.800s',
       'endTime': '10.300s',
       'word': 'gentlemen'},
      {'startTime': '10.300s', 'endTime': '10.900s', 'word': 'gentlemen'},
      {'startTime': '11.900s', 'endTime': '12.200s', 'word': 'tobacco'},
      {'startTime': '12.200s', 'endTime': '12.600s', 'word': 'industry'},
      {'startTime': '12.600s', 'endTime': '12.700s', 'word': 'has'},
      {'startTime': '12.700s', 'endTime': '12.800s', 'word': 'a'},
      {'startTime': '12.800s', 'endTime': '13s', 'word': 'very'},
      {'startTime': '13s', 'endTime': '13.500s', 'word'

In [15]:
print(result_dict['results'][0]['alternatives'][0]['transcript'])

gentlemen gentlemen tobacco industry has a very serious multi billion dollar problem we need more cigarette smokers pure and simple everyday 2,000 Americans stopped smoking another 1100 also quit actually technically they die that means that this business needs 3000 fresh new volunteers every day so forget about cancer heart disease emphysema


In [16]:
full_text = ""
for r in result_dict['results']:
    if 'transcript' in r['alternatives'][0]:
        full_text += (r['alternatives'][0]['transcript'])

In [17]:
print(full_text)

gentlemen gentlemen tobacco industry has a very serious multi billion dollar problem we need more cigarette smokers pure and simple everyday 2,000 Americans stopped smoking another 1100 also quit actually technically they die that means that this business needs 3000 fresh new volunteers every day so forget about cancer heart disease emphysema Strokes them gentleman we're not in this business for help


In [38]:
transcript_data = []
for r in result_dict['results']:
    if 'transcript' in r['alternatives'][0].keys():
        #print(r['alternatives'][0]['transcript'], r['alternatives'][0]['confidence'])
        transcript_data.append((r['alternatives'][0]['transcript'], r['alternatives'][0]['confidence']))
df = pd.DataFrame(transcript_data, columns=['text', 'confidence'])
df

Unnamed: 0,text,confidence
0,"gentlemen gentlemen tobacco industry has a very serious multi billion dollar problem we need more cigarette smokers pure and simple everyday 2,000 Americans stopped smoking another 1100 also quit actually technically they die that means that this business needs 3000 fresh new volunteers every day so forget about cancer heart disease emphysema",0.80059
1,Strokes them gentleman we're not in this business for help,0.729682


In [18]:
with open('json_output/' + source_file_stem + '.json', 'w') as fp:
        json.dump(result_dict, fp)

In [19]:
#bucket.blob("video_files/" + source_file_name).delete()
bucket.blob("flac_files/" + source_file_stem + '.flac').delete()

In [20]:
os.remove("video_files/" + source_file_name)
os.remove("flac_files/" + source_file_stem + '.flac')

In [23]:
type_ = language_v1.Document.Type.PLAIN_TEXT

language = "en"
document = {"content": full_text, "type_": type_, "language": language}

client = language_v1.LanguageServiceClient()

sentiment_response = client.analyze_sentiment(request = {'document': document})

In [24]:
print(sentiment_response.document_sentiment.score)
print(sentiment_response.document_sentiment.magnitude)

-0.800000011920929
0.800000011920929
