# Speech-to-Text API Experimentation

## Import Libraries

In [26]:
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage

import spacy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Reshape, Bidirectional
from tensorflow.keras import backend as K

from tqdm import tqdm 

import gensim.downloader as api

import string
import json 
import re

import os
# Respect credentials that are already configured in the environment; only
# fall back to the author's local key file when the variable is unset.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "C:/Users/ekin/Downloads/hack.json")

## Using Client Library

In [3]:
def implicit():
    '''
    Test the credentials by listing the storage buckets.

    Builds a storage client without explicit credentials, requests the
    bucket list and prints it. Nothing is returned.
    '''
    # If you don't specify credentials when constructing the client, the
    # client library will look for credentials in the environment.
    # `storage` is the module imported at the top of the notebook.
    storage_client = storage.Client()

    # Make an authenticated API request
    buckets = list(storage_client.list_buckets())
    print(buckets)

In [4]:
# Run the credentials check; it prints the list of buckets it receives.
implicit()

[]


In [5]:
# Test the speech client and the transcription API on a sample audio file.

# Location of the audio file to transcribe
gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

# Client, audio reference and recognition settings (LINEAR16 at 16 kHz, US English)
client = speech.SpeechClient()
audio = speech.RecognitionAudio(uri=gcs_uri)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)

# Synchronous recognition request
response = client.recognize(config=config, audio=audio)
print(response.results)

# Report the first alternative of every result
for result in response.results:
    top_alternative = result.alternatives[0]
    print("Transcript: {}".format(top_alternative.transcript))
    print("Confidence: {}".format(top_alternative.confidence))

[alternatives {
  transcript: "how old is the Brooklyn Bridge"
  confidence: 0.9823954
}
language_code: "en-us"
]
Transcript: how old is the Brooklyn Bridge
Confidence: 0.9823954105377197


## Transcribing Long Audio Files

In [6]:
def transcribe_gcs(gcs_uri, timeout=90, sample_rate_hertz=48000,
                   audio_channel_count=2, language_code="en-US"):
    """Asynchronously transcribes the audio file specified by the gcs_uri.

    The audio is submitted as FLAC with word time offsets and per-word
    confidence enabled.

    Args:
        gcs_uri: URI of the audio file (e.g. "gs://bucket/file.flac").
        timeout: Seconds to wait for the long-running operation to finish.
            Longer recordings may need more than the default of 90.
        sample_rate_hertz: Sample rate passed to the recognition config.
        audio_channel_count: Channel count passed to the recognition config.
        language_code: Language tag passed to the recognition config.

    Returns:
        The `results` field of the recognition response.
    """
    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=sample_rate_hertz,
        audio_channel_count=audio_channel_count,
        language_code=language_code,
        enable_word_time_offsets=True,
        enable_word_confidence=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=timeout)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        if not result.alternatives:
            # Nothing to report for this portion.
            continue
        # The first alternative is the most likely one for this portion.
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))

    return response.results

In [7]:
# Transcribe the sample FLAC recording; `results` is what the analysis cells
# further down read from.
example_gcs_uri = "gs://hack_the_ne/hmms.flac"
results = transcribe_gcs(example_gcs_uri)

Waiting for operation to complete...
Transcript: I'm going to test maybe that's a bit different interesting absolutely totally no way
Confidence: 0.9492603540420532


## Transcript Processing Algorithms

In [8]:
# Work with the first alternative of the first result only.
# NOTE(review): any further results (later portions of the audio) are ignored
# here — confirm this is intended for longer recordings.
result = results[0].alternatives[0]
# Text of that alternative
transcript = result.transcript
# Per-word entries of that alternative (read later for timing and confidence)
words = result.words

In [9]:
# Show the transcript that the metrics below are computed on.
print(transcript)

I'm going to test maybe that's a bit different interesting absolutely totally no way


### Clarity
A measure of how understandable your speech is, based on the API's confidence in the overall transcription.

In [10]:
# Clarity score: the confidence reported for the selected alternative.
clarity = result.confidence

### Brevity
Count how many filler words and hedging-language phrases appear in the transcript.

In [11]:
def count_phrases(transcript, phrases, return_phrase_counts=True, ignore_case=False):
    '''
    Count how many times each phrase in the `phrases` list occurs in the
    `transcript` string.

    A phrase only counts when it is delimited by whitespace or by the
    start/end of the transcript, so 'so' does not match inside 'also'.
    Back-to-back repeats ("like like") are each counted.

    Parameters:
        transcript: text to search.
        phrases: iterable of words/phrases to look for; empty strings are skipped.
        return_phrase_counts: if True, return (per-phrase dict, total);
            otherwise return only the total.
        ignore_case: if True, match regardless of letter case
            (default False, i.e. case-sensitive).

    Returns:
        (phrase_counts, all_counts) or all_counts, where phrase_counts only
        contains phrases that occurred at least once.
    '''
    flags = re.IGNORECASE if ignore_case else 0
    phrase_counts = {}
    all_counts = 0
    for phrase in phrases:
        if not phrase:
            continue
        # Zero-width lookarounds keep the delimiters out of the match, so two
        # adjacent occurrences can share the whitespace between them.
        pattern = r'(?<!\S)' + re.escape(phrase) + r'(?!\S)'
        count = len(re.findall(pattern, transcript, flags))
        if count > 0:
            phrase_counts[phrase] = count
            all_counts = all_counts + count

    if return_phrase_counts:
        return phrase_counts, all_counts # Return the counts for each filler phrase
    return all_counts # Only return the total number of counts

In [12]:
# Filler words/phrases counted for the brevity metric.
filler_words = [
    'like', 
    'I mean',
    'you know', 
    'so', 
    'well', 
    'you see', 
]

# Hedging phrases counted for the brevity metric.
hedging_language = [
    'kind of', 
    'I think', 
    'maybe',
]

In [13]:
# Try count_phrases on a hand-written sentence using the filler-word list.
test_string = "so I think I will begin with this like kind of interesting thing maybe"
filler_phrase_counts, filler_all_counts = count_phrases(test_string, filler_words)
print(filler_phrase_counts) 

{'like': 1, 'so': 1}


In [14]:
# Same sentence, this time counted against the hedging-language list.
hedging_phrase_counts, hedging_all_counts = count_phrases(test_string, hedging_language)
print(hedging_phrase_counts)

{'kind of': 1, 'I think': 1, 'maybe': 1}


### Cadence
Counting the number of words spoken every second. 

* The average speaking rate is about 130 words per minute (≈ 2.17 words per second).
* That works out to roughly 0.46 seconds per word.

In [15]:
def speech_time(words):
    '''
    Compute the total duration of the speech in seconds.

    `words` is an ordered sequence of word entries, each exposing
    `start_time` and `end_time` objects with a `total_seconds()` method.
    The duration runs from the start of the first word to the end of the
    last word. An empty sequence yields 0.0.
    '''
    if len(words) == 0:
        return 0.0

    start_time = words[0].start_time.total_seconds()
    end_time = words[-1].end_time.total_seconds()

    return end_time - start_time

def words_per_second(words, time):
    '''
    Compute the number of words spoken per second.

    `words` is any sized collection of word entries and `time` is the
    speech duration in seconds. A non-positive duration yields 0.0
    instead of dividing by zero.
    '''
    if time <= 0:
        return 0.0
    return len(words) / time

In [16]:
'''
Perform actual computations
'''
# Overall duration of the transcribed speech, then the speaking rate over it.
total_time = speech_time(words)
pace = words_per_second(words, total_time)

print('Total Time: ', total_time)
print('Pace (words / second): ', pace) # slower than average

Total Time:  15.3
Pace (words / second):  0.9150326797385621


In [17]:
def seconds_per_word(words, verbose=False):
    '''
    Compute the number of seconds taken by each word in `words`.

    Returns a dict mapping the word text to its duration (end minus start,
    rounded to 3 decimals). Because the dict is keyed by the word text, a
    word that occurs more than once keeps only its last duration.
    With `verbose=True` every entry is printed as it is processed.
    '''
    durations = {}
    for entry in words:
        word, confidence = entry.word, entry.confidence
        start_time = entry.start_time.total_seconds()
        end_time = entry.end_time.total_seconds()

        if verbose:
            print(
                f"{word}, start_time: {start_time}, end_time: {end_time}, confidence: {confidence}"
            )

        durations[word] = round(end_time - start_time, 3)

    return durations

In [18]:
# Print every word's timing and confidence (verbose=True), then the resulting
# word -> duration mapping.
word_times = seconds_per_word(words, verbose=True)
# word_times = {k: v for k, v in sorted(word_times.items(), key=lambda item: item[1])} # sorted
print(word_times)

I'm, start_time: 1.3, end_time: 1.9, confidence: 0.9876290559768677
going, start_time: 1.9, end_time: 2.1, confidence: 0.9876290559768677
to, start_time: 2.1, end_time: 2.2, confidence: 0.9876290559768677
test, start_time: 2.2, end_time: 2.5, confidence: 1.0
maybe, start_time: 2.5, end_time: 4.7, confidence: 0.9239681959152222
that's, start_time: 4.7, end_time: 5.1, confidence: 0.9249534010887146
a, start_time: 5.1, end_time: 5.4, confidence: 0.9668338894844055
bit, start_time: 5.4, end_time: 5.6, confidence: 0.9876290559768677
different, start_time: 5.6, end_time: 6.2, confidence: 1.0
interesting, start_time: 6.2, end_time: 13.7, confidence: 0.8979593515396118
absolutely, start_time: 13.7, end_time: 14.9, confidence: 0.9249765872955322
totally, start_time: 14.9, end_time: 15.2, confidence: 0.9876290559768677
no, start_time: 15.2, end_time: 16.4, confidence: 0.8444064855575562
way, start_time: 16.4, end_time: 16.6, confidence: 0.8684027194976807
{"I'm": 0.6, 'going': 0.2, 'to': 0.1, 't

### TO-DO: Pause classification (work in progress)

In [19]:
# todo: plot each word on a histogram and also plot an average line 
# simpler, just count the number of words that are greater than average, and compare
def classify_pauses(baseline_time, durations=None):
    '''
    Split words into "normal" and "delayed" groups by their duration.

    Parameters:
        baseline_time: threshold in seconds; a word whose duration is
            strictly greater than this is classified as delayed.
        durations: optional dict mapping word -> duration in seconds.
            When omitted, the notebook-level `word_times` dict is used.

    Returns:
        (normal_words, delayed_words), two dicts of word -> duration.
    '''
    if durations is None:
        # Fall back to the notebook-level mapping so existing calls keep working.
        durations = word_times

    normal_words = {}
    delayed_words = {}
    for word, time in durations.items():
        if time > baseline_time:
            delayed_words[word] = time
        else:
            normal_words[word] = time
    return normal_words, delayed_words

In [20]:
# Threshold (seconds per word) handed to classify_pauses.
avg_time = 0.5 # half a second per word on avg. 
normal_words, delayed_words = classify_pauses(avg_time)
print('Normal words: ', normal_words)
print('Delayed words: ', delayed_words)

Normal words:  {'going': 0.2, 'to': 0.1, 'test': 0.3, "that's": 0.4, 'a': 0.3, 'bit': 0.2, 'totally': 0.3, 'way': 0.2}
Delayed words:  {"I'm": 0.6, 'maybe': 2.2, 'different': 0.6, 'interesting': 7.5, 'absolutely': 1.2, 'no': 1.2}


### Strong Vocabulary (Passion/Urgency)

In [21]:
# Load the 'en_core_web_sm' spaCy pipeline and run it over the transcript.
nlp = spacy.load('en_core_web_sm')
doc = nlp(transcript)

In [22]:
# Analyze syntax
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_phrases = [chunk.text for chunk in doc.noun_chunks]

print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])
print("Adjectives:", [token.lemma_ for token in doc if token.pos_ == "ADJ"])

# for token in doc: 
#     print(token.pos_)
# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['I', 'no way']
Verbs: ['go', 'test']
Adjectives: ['different', 'interesting']


In [23]:
# todo: rate the adjectives based on level of neutrality

## TED Analytics

In [37]:
# In-process cache so the (large) vectors are only loaded once per kernel.
_WV_CACHE = {}

def load_wv(model_name='word2vec-google-news-300'):
    '''
    Return the word vectors for `model_name`, loading them on first use.

    The first call for a given name goes through `api.load`, which is slow,
    so run it ahead of the cells that need the vectors. Later calls reuse
    the already-loaded object.
    '''
    if model_name not in _WV_CACHE:
        _WV_CACHE[model_name] = api.load(model_name)
    return _WV_CACHE[model_name]

def create_embedding(transcript, wv):
    '''
    Build a single averaged embedding for `transcript`.

    The transcript is split on whitespace and each word is looked up with
    `wv[word]`; words that raise KeyError (not in the vocabulary) are
    skipped. Any other lookup error propagates to the caller.

    Returns:
        A numpy array holding one row, the element-wise mean of the
        vectors that were found. If no word is found, an empty array
        (`np.array([])`) is returned.
    '''
    found_words = []
    for word in transcript.split():
        try:
            found_words.append(wv[word])
        except KeyError:
            # Out-of-vocabulary word: leave it out of the average.
            continue

    if not found_words:
        # Nothing to average; avoids taking the mean of an empty array.
        return np.array([])

    mean = np.mean(np.asarray(found_words), axis=0)
    # Going through a plain list keeps the result as a fresh one-row array.
    return np.array([mean.tolist()])

In [57]:
def perform_analysis(text=None, wv=None, model_filepath='./ted_analysis_model'):
    '''
    Score a transcript with the saved TED analysis model.

    Parameters:
        text: transcript to analyse; defaults to the notebook-level
            `transcript` variable.
        wv: word vectors to embed with; defaults to `load_wv()`.
        model_filepath: location of the saved Keras model.

    Returns:
        A dict mapping each rating label to the model's score as a plain
        Python float, so the dict can be passed straight to `json.dumps`.

    Raises:
        ValueError: if no word of the transcript has an embedding.
    '''
    if text is None:
        text = transcript
    if wv is None:
        wv = load_wv()

    embedding = create_embedding(text, wv)
    if embedding.size == 0:
        raise ValueError('No word in the transcript has an embedding; cannot run the TED analysis.')

    model = tf.keras.models.load_model(model_filepath)
    pred = model.predict(embedding)[0]

    # NOTE(review): label order is assumed to match the model's output order — confirm against training code.
    cols = ['Beautiful', 'Confusing', 'Courageous', 'Funny', 'Informative', 'Ingenious', 'Inspiring', 'Longwinded', 'Unconvincing', 'Fascinating', 'Jaw-dropping', 'Persuasive', 'OK', 'Obnoxious']

    # float() turns the numpy scalars into JSON-serializable Python floats.
    return {col: float(val) for val, col in zip(pred, cols)}

In [58]:
# Run the full TED analysis and keep the label -> score mapping.
ted_dict = perform_analysis()

In [59]:
# Show the label -> score mapping.
print(ted_dict)

{'Beautiful': 0.11447036, 'Confusing': 0.10280144, 'Courageous': 0.0026097298, 'Funny': 0.81440914, 'Informative': 0.11125913, 'Ingenious': 0.7225983, 'Inspiring': 0.111739606, 'Longwinded': 0.026772916, 'Unconvincing': 0.09550491, 'Fascinating': 0.8304244, 'Jaw-dropping': 0.35544878, 'Persuasive': 0.0057587028, 'OK': 0.45375717, 'Obnoxious': 0.10564661}


### TED Analysis Experimentation

In [38]:
# Step-by-step version of perform_analysis: load the word vectors.
wv = load_wv()

In [39]:
# Embed the transcript with the loaded word vectors.
embedding = create_embedding(transcript, wv)

In [42]:
# Load the saved Keras model from a path relative to the working directory.
model_filepath = './ted_analysis_model'
model = tf.keras.models.load_model(model_filepath)

In [50]:
# Keep the first entry of the prediction output.
pred = model.predict(embedding)[0]

In [51]:
# Show the transcript that was scored.
print(transcript)

I'm going to test maybe that's a bit different interesting absolutely totally no way


In [52]:
# Rating labels paired positionally with the prediction values in the next cell.
# NOTE(review): order presumably matches the model's output order — confirm.
cols = ['Beautiful', 'Confusing', 'Courageous', 'Funny', 'Informative', 'Ingenious', 'Inspiring', 'Longwinded', 'Unconvincing', 'Fascinating', 'Jaw-dropping', 'Persuasive', 'OK', 'Obnoxious']

In [55]:
# Pair each label with its predicted value, in label order; zip stops at the
# shorter of the two sequences.
ted_dict = dict(zip(cols, pred))
print(ted_dict)

{'Beautiful': 0.11447036, 'Confusing': 0.10280144, 'Courageous': 0.0026097298, 'Funny': 0.81440914, 'Informative': 0.11125913, 'Ingenious': 0.7225983, 'Inspiring': 0.111739606, 'Longwinded': 0.026772916, 'Unconvincing': 0.09550491, 'Fascinating': 0.8304244, 'Jaw-dropping': 0.35544878, 'Persuasive': 0.0057587028, 'OK': 0.45375717, 'Obnoxious': 0.10564661}
